1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6 * Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "primitives.h"
28 #include "x265.h"
29 #include "cpu.h"
30
/* Emits declarations for the five square sizes 4x4 .. 64x64 of one primitive.
 * Each declaration reads  ret PFX(name_<N>x<N>_cpu(args...))  - the parameter
 * list is written inside the PFX() invocation.  The last declaration carries
 * no semicolon; the invocation site supplies it. */
#define FUNCDEF_TU(ret, name, cpu, ...) \
    ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__))
37
/* Same idea as a per-size declaration list, but the size is spelled as a
 * single number surrounded by underscores: name_<N>_cpu for N = 4 .. 64.
 * The parameter list sits inside the PFX() invocation and the final
 * declaration is left without a semicolon. */
#define FUNCDEF_TU_S(ret, name, cpu, ...) \
    ret PFX(name ## _64_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## _4_ ## cpu(__VA_ARGS__))
44
/* Variant in which the size number is glued directly onto `name` with no
 * leading underscore: name<N>_cpu for N = 4 .. 64.  The parameter list sits
 * inside the PFX() invocation; the final declaration has no semicolon. */
#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
    ret PFX(name ## 64_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
    ret PFX(name ## 4_ ## cpu(__VA_ARGS__))
51
/* Emits one declaration  ret PFX(name_<W>x<H>_cpu)(args...)  for each of the
 * 25 block sizes below (listed by ascending width, then height).  Here the
 * parameter list follows the PFX() invocation rather than sitting inside it.
 * The final declaration has no semicolon; the invocation site supplies it. */
#define FUNCDEF_PU(ret, name, cpu, ...) \
    ret PFX(name ## _4x4_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _4x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _4x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x4_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x4_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__)
78
/* Expands FUNCDEF_PU for the same (ret, name, cpu, args) and then adds
 * declarations for 26 further block sizes (listed by ascending width, then
 * height).  NOTE: unlike FUNCDEF_PU, the expansion already ends with a
 * semicolon, so a semicolon at the invocation site yields an empty
 * declaration/statement. */
#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__);
107
108 extern "C" {
109 #include "pixel.h"
110 #include "pixel-util.h"
111 #include "mc.h"
112 #include "ipfilter8.h"
113 #include "loopfilter.h"
114 #include "blockcopy8.h"
115 #include "intrapred.h"
116 #include "dct8.h"
117 }
118
/* Assigns member `prim` of p.cu[BLOCK_8x8] .. p.cu[BLOCK_64x64] from
 * PFX(fname_<N>x<N>_cpu); `fncdef` is placed verbatim in front of the value
 * (e.g. a cast).  Requires a variable `p` in scope.  Expands to several
 * statements, the last one without a semicolon. */
#define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu)
/* Like the WxH form, but the function name is built as fname<N>_cpu (size
 * number glued straight onto `fname`) for BLOCK_8x8 .. BLOCK_64x64.
 * Requires a variable `p` in scope; last statement has no semicolon. */
#define ALL_LUMA_CU_TYPED_S(prim, fncdef, fname, cpu) \
    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## 64_ ## cpu); \
    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## 32_ ## cpu); \
    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## 16_ ## cpu); \
    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## 8_ ## cpu)
/* Assigns member `prim` of p.cu[BLOCK_4x4] .. p.cu[BLOCK_32x32] (no 64x64
 * entry) from PFX(fname_<N>x<N>_cpu), with `fncdef` placed verbatim before
 * the value.  Requires a variable `p` in scope; last statement has no
 * semicolon. */
#define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu)
/* BLOCK_4x4 .. BLOCK_32x32 assignment using the fname<N>_cpu naming scheme
 * (size number glued straight onto `fname`).  Requires a variable `p` in
 * scope; last statement has no semicolon. */
#define ALL_LUMA_TU_TYPED_S(prim, fncdef, fname, cpu) \
    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## 32_ ## cpu); \
    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## 16_ ## cpu); \
    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## 8_ ## cpu); \
    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## 4_ ## cpu)
/* Assigns member `prim` for all five square entries BLOCK_4x4 .. BLOCK_64x64
 * from PFX(fname_<N>x<N>_cpu).  Requires a variable `p` in scope.
 * NOTE: the expansion already ends with a semicolon, unlike the CU/TU
 * variants of this macro. */
#define ALL_LUMA_BLOCKS_TYPED(prim, fncdef, fname, cpu) \
    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu);
/* Untyped convenience forms: each forwards to its *_TYPED counterpart with
 * an empty `fncdef` argument, so the value is assigned without any prefix. */
#define ALL_LUMA_CU(prim, fname, cpu) \
    ALL_LUMA_CU_TYPED(prim, , fname, cpu)
#define ALL_LUMA_CU_S(prim, fname, cpu) \
    ALL_LUMA_CU_TYPED_S(prim, , fname, cpu)
#define ALL_LUMA_TU(prim, fname, cpu) \
    ALL_LUMA_TU_TYPED(prim, , fname, cpu)
#define ALL_LUMA_TU_S(prim, fname, cpu) \
    ALL_LUMA_TU_TYPED_S(prim, , fname, cpu)
#define ALL_LUMA_BLOCKS(prim, fname, cpu) \
    ALL_LUMA_BLOCKS_TYPED(prim, , fname, cpu)
150
/* Assigns member `prim` of 24 p.pu[LUMA_<W>x<H>] entries (every size listed
 * below; LUMA_4x4 is not touched) from PFX(fname_<W>x<H>_cpu), with `fncdef`
 * placed verbatim before the value.  Entries are listed by ascending width,
 * then height.  Requires a variable `p` in scope; the last statement has no
 * semicolon. */
#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu); \
    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
/* Untyped form: forwards with an empty `fncdef`. */
#define ALL_LUMA_PU(prim, fname, cpu) \
    ALL_LUMA_PU_TYPED(prim, , fname, cpu)
177
/* Assigns member `prim` of 24 p.pu[LUMA_<W>x<H>] entries (LUMA_4x4 is not
 * touched) from the template instantiation fname<LUMA_<W>x<H>>, i.e. the
 * table index itself is used as the template argument.  Entries are listed
 * by ascending width, then height.  Requires a variable `p` in scope; the
 * last statement has no semicolon. */
#define ALL_LUMA_PU_T(prim, fname) \
    p.pu[LUMA_4x8].prim   = fname<LUMA_4x8>; \
    p.pu[LUMA_4x16].prim  = fname<LUMA_4x16>; \
    p.pu[LUMA_8x4].prim   = fname<LUMA_8x4>; \
    p.pu[LUMA_8x8].prim   = fname<LUMA_8x8>; \
    p.pu[LUMA_8x16].prim  = fname<LUMA_8x16>; \
    p.pu[LUMA_8x32].prim  = fname<LUMA_8x32>; \
    p.pu[LUMA_12x16].prim = fname<LUMA_12x16>; \
    p.pu[LUMA_16x4].prim  = fname<LUMA_16x4>; \
    p.pu[LUMA_16x8].prim  = fname<LUMA_16x8>; \
    p.pu[LUMA_16x12].prim = fname<LUMA_16x12>; \
    p.pu[LUMA_16x16].prim = fname<LUMA_16x16>; \
    p.pu[LUMA_16x32].prim = fname<LUMA_16x32>; \
    p.pu[LUMA_16x64].prim = fname<LUMA_16x64>; \
    p.pu[LUMA_24x32].prim = fname<LUMA_24x32>; \
    p.pu[LUMA_32x8].prim  = fname<LUMA_32x8>; \
    p.pu[LUMA_32x16].prim = fname<LUMA_32x16>; \
    p.pu[LUMA_32x24].prim = fname<LUMA_32x24>; \
    p.pu[LUMA_32x32].prim = fname<LUMA_32x32>; \
    p.pu[LUMA_32x64].prim = fname<LUMA_32x64>; \
    p.pu[LUMA_48x64].prim = fname<LUMA_48x64>; \
    p.pu[LUMA_64x16].prim = fname<LUMA_64x16>; \
    p.pu[LUMA_64x32].prim = fname<LUMA_64x32>; \
    p.pu[LUMA_64x48].prim = fname<LUMA_64x48>; \
    p.pu[LUMA_64x64].prim = fname<LUMA_64x64>
203
/* Assigns member `prim` of the four p.chroma[X265_CSP_I420].cu[] entries
 * BLOCK_420_4x4 .. BLOCK_420_32x32 from PFX(fname_<N>x<N>_cpu), with
 * `fncdef` placed verbatim before the value.  Requires a variable `p` in
 * scope; the last statement has no semicolon. */
#define ALL_CHROMA_420_CU_TYPED(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu)
/* Same four entries, with the function name built as fname_<N>_cpu (single
 * size number surrounded by underscores). */
#define ALL_CHROMA_420_CU_TYPED_S(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef PFX(fname ## _32_ ## cpu); \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef PFX(fname ## _16_ ## cpu); \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim   = fncdef PFX(fname ## _8_ ## cpu); \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim   = fncdef PFX(fname ## _4_ ## cpu)
/* Untyped forms: forward with an empty `fncdef`. */
#define ALL_CHROMA_420_CU(prim, fname, cpu) \
    ALL_CHROMA_420_CU_TYPED(prim, , fname, cpu)
#define ALL_CHROMA_420_CU_S(prim, fname, cpu) \
    ALL_CHROMA_420_CU_TYPED_S(prim, , fname, cpu)
216
/* Assigns member `prim` of 24 p.chroma[X265_CSP_I420].pu[CHROMA_420_<W>x<H>]
 * entries from PFX(fname_<W>x<H>_cpu), with `fncdef` placed verbatim before
 * the value.  Entries are listed by ascending width, then height.  Requires
 * a variable `p` in scope; the last statement has no semicolon. */
#define ALL_CHROMA_420_PU_TYPED(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim   = fncdef PFX(fname ## _2x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim   = fncdef PFX(fname ## _4x2_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim   = fncdef PFX(fname ## _6x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim   = fncdef PFX(fname ## _8x2_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim   = fncdef PFX(fname ## _8x6_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
/* Untyped form: forwards with an empty `fncdef`. */
#define ALL_CHROMA_420_PU(prim, fname, cpu) \
    ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
243
/* Assigns member `prim` of 18 p.chroma[X265_CSP_I420].pu[CHROMA_420_<W>x<H>]
 * entries from PFX(fname_<W>x<H>_cpu).  Compared with the full 4:2:0 PU
 * list this one leaves out the 2xN, 6x8, 4x2, 8x2 and 8x6 entries.  Entries
 * are listed by ascending width, then height.  Requires a variable `p` in
 * scope; the last statement has no semicolon. */
#define ALL_CHROMA_420_4x4_PU_TYPED(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
/* Untyped form: forwards with an empty `fncdef`. */
#define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) \
    ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
264
/* Assigns member `prim` of the four p.chroma[X265_CSP_I422].cu[] entries
 * BLOCK_422_4x8 .. BLOCK_422_32x64 from PFX(fname_<W>x<H>_cpu), with
 * `fncdef` placed verbatim before the value.  Requires a variable `p` in
 * scope; the last statement has no semicolon. */
#define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu)
/* Untyped form: forwards with an empty `fncdef`. */
#define ALL_CHROMA_422_CU(prim, fname, cpu) \
    ALL_CHROMA_422_CU_TYPED(prim, , fname, cpu)
271
/* Assigns member `prim` of 24 p.chroma[X265_CSP_I422].pu[CHROMA_422_<W>x<H>]
 * entries from PFX(fname_<W>x<H>_cpu), with `fncdef` placed verbatim before
 * the value.  Entries are listed by ascending width, then height.  Requires
 * a variable `p` in scope; the last statement has no semicolon. */
#define ALL_CHROMA_422_PU_TYPED(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim   = fncdef PFX(fname ## _2x8_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim  = fncdef PFX(fname ## _2x16_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim  = fncdef PFX(fname ## _4x32_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim  = fncdef PFX(fname ## _6x16_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim  = fncdef PFX(fname ## _8x12_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim  = fncdef PFX(fname ## _8x64_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## cpu); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu)
/* Untyped form: forwards with an empty `fncdef`. */
#define ALL_CHROMA_422_PU(prim, fname, cpu) \
    ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
298
/* Assigns member `prim` of 25 p.chroma[X265_CSP_I444].pu[] entries from
 * PFX(fname_<W>x<H>_cpu).  The 4:4:4 table is indexed with the LUMA_<W>x<H>
 * constants, and LUMA_4x4 is included here.  Entries are listed by ascending
 * width, then height.  Requires a variable `p` in scope; the last statement
 * has no semicolon. */
#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
    p.chroma[X265_CSP_I444].pu[LUMA_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
    p.chroma[X265_CSP_I444].pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
/* Untyped form: forwards with an empty `fncdef`. */
#define ALL_CHROMA_444_PU(prim, fname, cpu) \
    ALL_CHROMA_444_PU_TYPED(prim, , fname, cpu)
326
/* For the eight sizes 4x4 .. 16x16 listed below, assigns p.pu[LUMA_<W>x<H>]
 * member `name` from PFX(pixel_<name>_<W>x<H>_<cpu>); `name` therefore
 * selects both the table member and the function-name stem.  Requires a
 * variable `p` in scope; the last statement has no semicolon. */
#define AVC_LUMA_PU(name, cpu) \
    p.pu[LUMA_4x4].name   = PFX(pixel_ ## name ## _4x4_ ## cpu); \
    p.pu[LUMA_4x8].name   = PFX(pixel_ ## name ## _4x8_ ## cpu); \
    p.pu[LUMA_4x16].name  = PFX(pixel_ ## name ## _4x16_ ## cpu); \
    p.pu[LUMA_8x4].name   = PFX(pixel_ ## name ## _8x4_ ## cpu); \
    p.pu[LUMA_8x8].name   = PFX(pixel_ ## name ## _8x8_ ## cpu); \
    p.pu[LUMA_8x16].name  = PFX(pixel_ ## name ## _8x16_ ## cpu); \
    p.pu[LUMA_16x8].name  = PFX(pixel_ ## name ## _16x8_ ## cpu); \
    p.pu[LUMA_16x16].name = PFX(pixel_ ## name ## _16x16_ ## cpu)
336
/* Assigns the `sad` member of 17 p.pu[LUMA_<W>x<H>] entries from
 * PFX(pixel_sad_<W>x<H>_<cpu>).  Only the sizes listed below are touched
 * (listed by ascending width, then height).  Requires a variable `p` in
 * scope; the last statement has no semicolon. */
#define HEVC_SAD(cpu) \
    p.pu[LUMA_8x32].sad  = PFX(pixel_sad_8x32_ ## cpu); \
    p.pu[LUMA_12x16].sad = PFX(pixel_sad_12x16_ ## cpu); \
    p.pu[LUMA_16x4].sad  = PFX(pixel_sad_16x4_ ## cpu); \
    p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_ ## cpu); \
    p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_ ## cpu); \
    p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_ ## cpu); \
    p.pu[LUMA_24x32].sad = PFX(pixel_sad_24x32_ ## cpu); \
    p.pu[LUMA_32x8].sad  = PFX(pixel_sad_32x8_ ## cpu); \
    p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_ ## cpu); \
    p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_ ## cpu); \
    p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_ ## cpu); \
    p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_ ## cpu); \
    p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_ ## cpu); \
    p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_ ## cpu); \
    p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_ ## cpu); \
    p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_ ## cpu); \
    p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_ ## cpu)
355
/* Assigns the `sad_x3` member of 16 p.pu[LUMA_<W>x<H>] entries from
 * PFX(pixel_sad_x3_<W>x<H>_<cpu>).  Only the sizes listed below are touched
 * (listed by descending width, then height).  Requires a variable `p` in
 * scope; the last statement has no semicolon. */
#define HEVC_SAD_X3(cpu) \
    p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_ ## cpu); \
    p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_ ## cpu); \
    p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_ ## cpu); \
    p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_ ## cpu); \
    p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_ ## cpu); \
    p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_ ## cpu); \
    p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_ ## cpu); \
    p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_ ## cpu); \
    p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_ ## cpu); \
    p.pu[LUMA_32x8].sad_x3  = PFX(pixel_sad_x3_32x8_ ## cpu); \
    p.pu[LUMA_24x32].sad_x3 = PFX(pixel_sad_x3_24x32_ ## cpu); \
    p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_ ## cpu); \
    p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_ ## cpu); \
    p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_ ## cpu); \
    p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_ ## cpu); \
    p.pu[LUMA_16x8].sad_x3  = PFX(pixel_sad_x3_16x8_ ## cpu)
373
/* Assigns the `sad_x4` member of 16 p.pu[LUMA_<W>x<H>] entries from
 * PFX(pixel_sad_x4_<W>x<H>_<cpu>).  Only the sizes listed below are touched
 * (listed by descending width, then height).  Requires a variable `p` in
 * scope; the last statement has no semicolon. */
#define HEVC_SAD_X4(cpu) \
    p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_ ## cpu); \
    p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_ ## cpu); \
    p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_ ## cpu); \
    p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_ ## cpu); \
    p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_ ## cpu); \
    p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_ ## cpu); \
    p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_ ## cpu); \
    p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_ ## cpu); \
    p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_ ## cpu); \
    p.pu[LUMA_32x8].sad_x4  = PFX(pixel_sad_x4_32x8_ ## cpu); \
    p.pu[LUMA_24x32].sad_x4 = PFX(pixel_sad_x4_24x32_ ## cpu); \
    p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_ ## cpu); \
    p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_ ## cpu); \
    p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_ ## cpu); \
    p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_ ## cpu); \
    p.pu[LUMA_16x8].sad_x4  = PFX(pixel_sad_x4_16x8_ ## cpu)
391
/* Assigns the `sse_pp` member from PFX(pixel_ssd_<W>x<H>_<cpu>) for three
 * luma CU entries (8x8, 16x16, 32x32) and three 4:2:2 chroma CU entries
 * (8x16, 16x32, 32x64).  Requires a variable `p` in scope.
 * NOTE: the expansion already ends with a semicolon. */
#define ASSIGN_SSE_PP(cpu) \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_ssd_32x64_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_ssd_16x32_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp  = PFX(pixel_ssd_8x16_ ## cpu); \
    p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_ ## cpu); \
    p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_ ## cpu); \
    p.cu[BLOCK_8x8].sse_pp   = PFX(pixel_ssd_8x8_ ## cpu);
399
/* Shorthand for ALL_LUMA_BLOCKS with member `sse_ss` and function-name stem
 * `pixel_ssd_ss`; `cpu` is passed through unchanged. */
#define ASSIGN_SSE_SS(cpu) \
    ALL_LUMA_BLOCKS(sse_ss, pixel_ssd_ss, cpu)
401
/* Assigns the `sa8d` member: the three 4:2:2 chroma CU entries 8x16, 16x32
 * and 32x64 from PFX(pixel_sa8d_<W>x<H>_<cpu>), and the luma CU entries via
 * ALL_LUMA_CU(sa8d, pixel_sa8d, cpu).  Requires a variable `p` in scope; the
 * last statement has no semicolon. */
#define ASSIGN_SA8D(cpu) \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d  = PFX(pixel_sa8d_8x16_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_ ## cpu); \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_ ## cpu); \
    ALL_LUMA_CU(sa8d, pixel_sa8d, cpu)
407
/* Assigns the `pixelavg_pp` member of 22 p.pu[LUMA_<W>x<H>] entries (widths
 * 8 and up; see list) from PFX(pixel_avg_<W>x<H>_<cpu>).  Entries are listed
 * by ascending width, then height.  Requires a variable `p` in scope.
 * NOTE: the expansion already ends with a semicolon. */
#define PIXEL_AVG(cpu) \
    p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_8x4_ ## cpu); \
    p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_8x8_ ## cpu); \
    p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_8x16_ ## cpu); \
    p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_8x32_ ## cpu); \
    p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_ ## cpu); \
    p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_16x4_ ## cpu); \
    p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_16x8_ ## cpu); \
    p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_ ## cpu); \
    p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_ ## cpu); \
    p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_ ## cpu); \
    p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_ ## cpu); \
    p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_ ## cpu); \
    p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_32x8_ ## cpu); \
    p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_ ## cpu); \
    p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_ ## cpu); \
    p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_ ## cpu); \
    p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_ ## cpu); \
    p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_ ## cpu); \
    p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_ ## cpu); \
    p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_ ## cpu); \
    p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_ ## cpu); \
    p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_ ## cpu);
431
/* Assigns the `pixelavg_pp` member of the three width-4 entries LUMA_4x4,
 * LUMA_4x8 and LUMA_4x16 from PFX(pixel_avg_4x<H>_<cpu>).  Requires a
 * variable `p` in scope.
 * NOTE: the expansion already ends with a semicolon. */
#define PIXEL_AVG_W4(cpu) \
    p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_4x16_ ## cpu); \
    p.pu[LUMA_4x8].pixelavg_pp  = PFX(pixel_avg_4x8_ ## cpu); \
    p.pu[LUMA_4x4].pixelavg_pp  = PFX(pixel_avg_4x4_ ## cpu);
436
/* Runs ALL_CHROMA_420_PU four times, pairing each filter_* member with the
 * matching interp_4tap_* function-name stem:
 *     filter_hpp <- interp_4tap_horiz_pp    filter_hps <- interp_4tap_horiz_ps
 *     filter_vpp <- interp_4tap_vert_pp     filter_vps <- interp_4tap_vert_ps
 * NOTE: the expansion already ends with a semicolon. */
#define CHROMA_420_FILTERS(cpu) \
    ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
    ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
    ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
    ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, cpu);
442
/* Runs ALL_CHROMA_422_PU four times, pairing each filter_* member with the
 * matching interp_4tap_* function-name stem:
 *     filter_hpp <- interp_4tap_horiz_pp    filter_hps <- interp_4tap_horiz_ps
 *     filter_vpp <- interp_4tap_vert_pp     filter_vps <- interp_4tap_vert_ps
 * NOTE: the expansion already ends with a semicolon. */
#define CHROMA_422_FILTERS(cpu) \
    ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
    ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
    ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
    ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, cpu);
448
/* Runs ALL_CHROMA_444_PU four times, pairing each filter_* member with the
 * matching interp_4tap_* function-name stem:
 *     filter_hpp <- interp_4tap_horiz_pp    filter_hps <- interp_4tap_horiz_ps
 *     filter_vpp <- interp_4tap_vert_pp     filter_vps <- interp_4tap_vert_ps
 * NOTE: the expansion already ends with a semicolon. */
#define CHROMA_444_FILTERS(cpu) \
    ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
    ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, cpu); \
    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, cpu);
454
/* Assigns filter_vsp of p.chroma[X265_CSP_I420].pu[CHROMA_420_<W>x<H>] from
 * PFX(interp_4tap_vert_sp_<W>x<H><cpu>).  `cpu` is pasted directly after the
 * height with no separator, so it must carry its own leading underscore
 * (if one is wanted).  The expansion ends with a semicolon. */
#define SETUP_CHROMA_420_VSP_FUNC_DEF(W, H, cpu) \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = \
        PFX(interp_4tap_vert_sp_ ## W ## x ## H ## cpu);

/* The 18 4:2:0 filter_vsp entries whose width is not 8 (ascending width,
 * then height). */
#define CHROMA_420_VSP_FILTERS_SSE4(cpu) \
    SETUP_CHROMA_420_VSP_FUNC_DEF(2, 4, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(2, 8, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(4, 2, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(4, 4, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(4, 8, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(4, 16, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(6, 8, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(12, 16, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(16, 4, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(16, 8, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(16, 12, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(16, 16, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(16, 32, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(24, 32, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(32, 8, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(32, 16, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(32, 24, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(32, 32, cpu);

/* The six width-8 4:2:0 filter_vsp entries. */
#define CHROMA_420_VSP_FILTERS(cpu) \
    SETUP_CHROMA_420_VSP_FUNC_DEF(8, 32, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(8, 16, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(8, 8, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(8, 6, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(8, 4, cpu); \
    SETUP_CHROMA_420_VSP_FUNC_DEF(8, 2, cpu);
485
/* Assigns filter_vsp of p.chroma[X265_CSP_I422].pu[CHROMA_422_<W>x<H>] from
 * PFX(interp_4tap_vert_sp_<W>x<H><cpu>).  `cpu` is pasted directly after the
 * height with no separator, so it must carry its own leading underscore
 * (if one is wanted).  The expansion ends with a semicolon. */
#define SETUP_CHROMA_422_VSP_FUNC_DEF(W, H, cpu) \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = \
        PFX(interp_4tap_vert_sp_ ## W ## x ## H ## cpu);

/* The 18 4:2:2 filter_vsp entries whose width is not 8 (ascending width,
 * then height). */
#define CHROMA_422_VSP_FILTERS_SSE4(cpu) \
    SETUP_CHROMA_422_VSP_FUNC_DEF(2, 8, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(2, 16, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(4, 4, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(4, 8, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(4, 16, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(4, 32, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(6, 16, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(12, 32, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(16, 8, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(16, 16, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(16, 24, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(16, 32, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(16, 64, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(24, 64, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(32, 16, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(32, 32, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(32, 48, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(32, 64, cpu);

/* The six width-8 4:2:2 filter_vsp entries. */
#define CHROMA_422_VSP_FILTERS(cpu) \
    SETUP_CHROMA_422_VSP_FUNC_DEF(8, 64, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(8, 32, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(8, 16, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(8, 12, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(8, 8, cpu); \
    SETUP_CHROMA_422_VSP_FUNC_DEF(8, 4, cpu);
516
517 #define SETUP_CHROMA_444_VSP_FUNC_DEF(W, H, cpu) \
518 p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = PFX(interp_4tap_vert_sp_ ## W ## x ## H ## cpu);
519
// Fills the 4:4:4 filter_vsp slots for every block size listed below (all
// widths other than 8), one SETUP_CHROMA_444_VSP_FUNC_DEF expansion per size.
// Entries are ordered by width, then height; each one writes a different slot.
#define CHROMA_444_VSP_FILTERS_SSE4(isa) \
    SETUP_CHROMA_444_VSP_FUNC_DEF(4, 4, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(4, 8, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(4, 16, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(12, 16, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(16, 4, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(16, 8, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(16, 12, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(16, 16, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(16, 32, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(16, 64, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(24, 32, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(32, 8, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(32, 16, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(32, 24, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(32, 32, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(32, 64, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(48, 64, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(64, 16, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(64, 32, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(64, 48, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(64, 64, isa);
542
// Fills the 4:4:4 filter_vsp slots for the 8-wide block sizes (8x4, 8x8,
// 8x16, 8x32) through SETUP_CHROMA_444_VSP_FUNC_DEF.
#define CHROMA_444_VSP_FILTERS(isa) \
    SETUP_CHROMA_444_VSP_FUNC_DEF(8, 4, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(8, 8, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(8, 16, isa); \
    SETUP_CHROMA_444_VSP_FUNC_DEF(8, 32, isa);
548
// Points the filter_vss slot of the 4:2:0 chroma PU entry
// CHROMA_420_<width>x<height> at PFX(interp_4tap_vert_ss_<width>x<height><isa>).
// <isa> is pasted directly after the height, so it must include its own
// leading underscore.
#define SETUP_CHROMA_420_VSS_FUNC_DEF(width, height, isa) \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## width ## x ## height].filter_vss = \
        PFX(interp_4tap_vert_ss_ ## width ## x ## height ## isa);
551
// Fills the 4:2:0 filter_vss slots for every block size listed below, one
// SETUP_CHROMA_420_VSS_FUNC_DEF expansion per size.  Entries are ordered by
// width, then height; each one writes a different slot.
#define CHROMA_420_VSS_FILTERS(isa) \
    SETUP_CHROMA_420_VSS_FUNC_DEF(4, 2, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(4, 4, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(4, 8, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(4, 16, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(8, 2, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(8, 4, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(8, 6, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(8, 8, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(8, 16, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(8, 32, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(12, 16, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(16, 4, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(16, 8, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(16, 12, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(16, 16, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(16, 32, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(24, 32, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(32, 8, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(32, 16, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(32, 24, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(32, 32, isa);
574
// Fills the 4:2:0 filter_vss slots for the 2x4, 2x8 and 6x8 block sizes,
// which CHROMA_420_VSS_FILTERS does not cover.
#define CHROMA_420_VSS_FILTERS_SSE4(isa) \
    SETUP_CHROMA_420_VSS_FUNC_DEF(2, 4, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(2, 8, isa); \
    SETUP_CHROMA_420_VSS_FUNC_DEF(6, 8, isa);
579
// Points the filter_vss slot of the 4:2:2 chroma PU entry
// CHROMA_422_<width>x<height> at PFX(interp_4tap_vert_ss_<width>x<height><isa>).
// <isa> is pasted directly after the height, so it must include its own
// leading underscore.
#define SETUP_CHROMA_422_VSS_FUNC_DEF(width, height, isa) \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_vss = \
        PFX(interp_4tap_vert_ss_ ## width ## x ## height ## isa);
582
// Fills the 4:2:2 filter_vss slots for every block size listed below, one
// SETUP_CHROMA_422_VSS_FUNC_DEF expansion per size.  Entries are ordered by
// width, then height; each one writes a different slot.
#define CHROMA_422_VSS_FILTERS(isa) \
    SETUP_CHROMA_422_VSS_FUNC_DEF(4, 4, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(4, 8, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(4, 16, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(4, 32, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(8, 4, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(8, 8, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(8, 12, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(8, 16, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(8, 32, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(8, 64, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(12, 32, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(16, 8, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(16, 16, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(16, 24, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(16, 32, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(16, 64, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(24, 64, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(32, 16, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(32, 32, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(32, 48, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(32, 64, isa);
605
// Fills the 4:2:2 filter_vss slots for the 2x8, 2x16 and 6x16 block sizes,
// which CHROMA_422_VSS_FILTERS does not cover.
#define CHROMA_422_VSS_FILTERS_SSE4(isa) \
    SETUP_CHROMA_422_VSS_FUNC_DEF(2, 8, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(2, 16, isa); \
    SETUP_CHROMA_422_VSS_FUNC_DEF(6, 16, isa);
610
// Hands the filter_vss field and the interp_4tap_vert_ss name stem to
// ALL_CHROMA_444_PU together with the given ISA token.
#define CHROMA_444_VSS_FILTERS(isa) \
    ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, isa)
612
// Wires up the 8-tap luma interpolation entries.  For each of luma_hpp,
// luma_hps, luma_vpp, luma_vps and luma_vsp, ALL_LUMA_PU is expanded and the
// LUMA_4x4 slot is then assigned explicitly.  luma_hvpp is bound to the
// interp_8tap_hv_pp_cpu template (ALL_LUMA_PU_T plus an explicit LUMA_4x4
// instantiation) rather than to a PFX() symbol.
#define LUMA_FILTERS(isa) \
    ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, isa); \
    p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_ ## isa); \
    ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, isa); \
    p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_ ## isa); \
    ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, isa); \
    p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_ ## isa); \
    ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, isa); \
    p.pu[LUMA_4x4].luma_vps = PFX(interp_8tap_vert_ps_4x4_ ## isa); \
    ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, isa); \
    p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_ ## isa); \
    ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); \
    p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
620
// Expands ALL_LUMA_PU for luma_vss and then assigns the LUMA_4x4 slot
// explicitly.  The expansion has no trailing semicolon; the call site supplies it.
#define LUMA_VSS_FILTERS(isa) \
    ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, isa); \
    p.pu[LUMA_4x4].luma_vss = PFX(interp_8tap_vert_ss_4x4_ ## isa)
622
// Sets the copy_<kind> member of the luma CU table: BLOCK_4x4 is assigned
// PFX(blockcopy_<kind>_4x4_<isa>) directly, then ALL_LUMA_CU is expanded with
// the same field / function stem.
#define LUMA_CU_BLOCKCOPY(kind, isa) \
    p.cu[BLOCK_4x4].copy_ ## kind = PFX(blockcopy_ ## kind ## _4x4_ ## isa); \
    ALL_LUMA_CU(copy_ ## kind, blockcopy_ ## kind, isa);
626
// Chroma CU block-copy setup: forwards the copy_<kind> field and the
// blockcopy_<kind> name stem to the 4:2:0 / 4:2:2 ALL_CHROMA_*_CU helper.
#define CHROMA_420_CU_BLOCKCOPY(kind, isa) \
    ALL_CHROMA_420_CU(copy_ ## kind, blockcopy_ ## kind, isa)
#define CHROMA_422_CU_BLOCKCOPY(kind, isa) \
    ALL_CHROMA_422_CU(copy_ ## kind, blockcopy_ ## kind, isa)
629
// PU block-copy setup.  The luma variant expands ALL_LUMA_PU and then assigns
// the LUMA_4x4 slot explicitly (no trailing semicolon); the chroma variants
// only forward to the matching ALL_CHROMA_*_PU helper.
#define LUMA_PU_BLOCKCOPY(kind, isa) \
    ALL_LUMA_PU(copy_ ## kind, blockcopy_ ## kind, isa); \
    p.pu[LUMA_4x4].copy_ ## kind = PFX(blockcopy_ ## kind ## _4x4_ ## isa)
#define CHROMA_420_PU_BLOCKCOPY(kind, isa) \
    ALL_CHROMA_420_PU(copy_ ## kind, blockcopy_ ## kind, isa)
#define CHROMA_422_PU_BLOCKCOPY(kind, isa) \
    ALL_CHROMA_422_PU(copy_ ## kind, blockcopy_ ## kind, isa)
633
// Sets the luma sub_ps and add_ps entries: the BLOCK_4x4 slots are assigned
// directly, then ALL_LUMA_CU is expanded for each of the two fields.
#define LUMA_PIXELSUB(isa) \
    p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_ ## isa); \
    p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_ ## isa); \
    ALL_LUMA_CU(sub_ps, pixel_sub_ps, isa); \
    ALL_LUMA_CU(add_ps, pixel_add_ps, isa);
639
// Expands ALL_CHROMA_420_CU for the sub_ps and add_ps fields.
#define CHROMA_420_PIXELSUB_PS(isa) \
    ALL_CHROMA_420_CU(sub_ps, pixel_sub_ps, isa); \
    ALL_CHROMA_420_CU(add_ps, pixel_add_ps, isa);
643
// Expands ALL_CHROMA_422_CU for the sub_ps and add_ps fields.
#define CHROMA_422_PIXELSUB_PS(isa) \
    ALL_CHROMA_422_CU(sub_ps, pixel_sub_ps, isa); \
    ALL_CHROMA_422_CU(add_ps, pixel_add_ps, isa);
647
// Expands ALL_LUMA_CU for the var field using the pixel_var name stem.
#define LUMA_VAR(isa) \
    ALL_LUMA_CU(var, pixel_var, isa)
649
// addAvg setup.  The luma variant expands ALL_LUMA_PU and then assigns the
// LUMA_4x4 slot explicitly (no trailing semicolon); the chroma variants expand
// the matching ALL_CHROMA_*_PU helper and do end in a semicolon.
#define LUMA_ADDAVG(isa) \
    ALL_LUMA_PU(addAvg, addAvg, isa); \
    p.pu[LUMA_4x4].addAvg = PFX(addAvg_4x4_ ## isa)
#define CHROMA_420_ADDAVG(isa) \
    ALL_CHROMA_420_PU(addAvg, addAvg, isa);
#define CHROMA_422_ADDAVG(isa) \
    ALL_CHROMA_422_PU(addAvg, addAvg, isa);
653
// Sets intra_pred[predMode] for the 4x4, 8x8, 16x16 and 32x32 CU entries to
// PFX(intra_pred_ang<N>_<kernelNo>_<isa>), where <N> is the block width.
// predMode selects the table slot; kernelNo selects which kernel is bound.
#define SETUP_INTRA_ANG_COMMON(predMode, kernelNo, isa) \
    p.cu[BLOCK_4x4].intra_pred[predMode]   = PFX(intra_pred_ang4_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_8x8].intra_pred[predMode]   = PFX(intra_pred_ang8_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_16x16].intra_pred[predMode] = PFX(intra_pred_ang16_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_32x32].intra_pred[predMode] = PFX(intra_pred_ang32_ ## kernelNo ## _ ## isa);
659
// Sets intra_pred[predMode] of the 4x4 CU entry only, to
// PFX(intra_pred_ang4_<kernelNo>_<isa>).
#define SETUP_INTRA_ANG4(predMode, kernelNo, isa) \
    p.cu[BLOCK_4x4].intra_pred[predMode] = \
        PFX(intra_pred_ang4_ ## kernelNo ## _ ## isa);
662
// Sets intra_pred[predMode] of the 16x16 and 32x32 CU entries to
// PFX(intra_pred_ang16_<kernelNo>_<isa>) and PFX(intra_pred_ang32_<kernelNo>_<isa>).
#define SETUP_INTRA_ANG16_32(predMode, kernelNo, isa) \
    p.cu[BLOCK_16x16].intra_pred[predMode] = PFX(intra_pred_ang16_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_32x32].intra_pred[predMode] = PFX(intra_pred_ang32_ ## kernelNo ## _ ## isa);
666
// Sets intra_pred[predMode] of the 4x4 and 8x8 CU entries to
// PFX(intra_pred_ang4_<kernelNo>_<isa>) and PFX(intra_pred_ang8_<kernelNo>_<isa>).
#define SETUP_INTRA_ANG4_8(predMode, kernelNo, isa) \
    p.cu[BLOCK_4x4].intra_pred[predMode] = PFX(intra_pred_ang4_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_8x8].intra_pred[predMode] = PFX(intra_pred_ang8_ ## kernelNo ## _ ## isa);
670
// Binds intra_pred modes 2 and 34 for all four block sizes; both modes are
// given kernel number 2.
#define INTRA_ANG_SSSE3(isa) \
    SETUP_INTRA_ANG_COMMON(2, 2, isa); \
    SETUP_INTRA_ANG_COMMON(34, 2, isa);
674
// Binds intra_pred modes 3 through 18 for all four block sizes; every mode is
// given the kernel carrying the same number.
#define INTRA_ANG_SSE4_COMMON(isa) \
    SETUP_INTRA_ANG_COMMON(3, 3, isa); \
    SETUP_INTRA_ANG_COMMON(4, 4, isa); \
    SETUP_INTRA_ANG_COMMON(5, 5, isa); \
    SETUP_INTRA_ANG_COMMON(6, 6, isa); \
    SETUP_INTRA_ANG_COMMON(7, 7, isa); \
    SETUP_INTRA_ANG_COMMON(8, 8, isa); \
    SETUP_INTRA_ANG_COMMON(9, 9, isa); \
    SETUP_INTRA_ANG_COMMON(10, 10, isa); \
    SETUP_INTRA_ANG_COMMON(11, 11, isa); \
    SETUP_INTRA_ANG_COMMON(12, 12, isa); \
    SETUP_INTRA_ANG_COMMON(13, 13, isa); \
    SETUP_INTRA_ANG_COMMON(14, 14, isa); \
    SETUP_INTRA_ANG_COMMON(15, 15, isa); \
    SETUP_INTRA_ANG_COMMON(16, 16, isa); \
    SETUP_INTRA_ANG_COMMON(17, 17, isa); \
    SETUP_INTRA_ANG_COMMON(18, 18, isa);
692
// Sets intra_pred[predMode] of the 8x8, 16x16 and 32x32 CU entries (4x4 is
// left untouched) to PFX(intra_pred_ang<N>_<kernelNo>_<isa>).
#define SETUP_INTRA_ANG_HIGH(predMode, kernelNo, isa) \
    p.cu[BLOCK_8x8].intra_pred[predMode]   = PFX(intra_pred_ang8_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_16x16].intra_pred[predMode] = PFX(intra_pred_ang16_ ## kernelNo ## _ ## isa); \
    p.cu[BLOCK_32x32].intra_pred[predMode] = PFX(intra_pred_ang32_ ## kernelNo ## _ ## isa);
697
// Binds intra_pred modes 19 through 33.  For 8x8, 16x16 and 32x32 each mode is
// given the kernel with the same number.  For 4x4 each mode is given kernel
// (36 - mode), except mode 26, which is given kernel 26.
// The two helpers write disjoint block sizes, so the entries are grouped per mode.
#define INTRA_ANG_SSE4_HIGH(isa) \
    SETUP_INTRA_ANG_HIGH(19, 19, isa); \
    SETUP_INTRA_ANG4(19, 17, isa); \
    SETUP_INTRA_ANG_HIGH(20, 20, isa); \
    SETUP_INTRA_ANG4(20, 16, isa); \
    SETUP_INTRA_ANG_HIGH(21, 21, isa); \
    SETUP_INTRA_ANG4(21, 15, isa); \
    SETUP_INTRA_ANG_HIGH(22, 22, isa); \
    SETUP_INTRA_ANG4(22, 14, isa); \
    SETUP_INTRA_ANG_HIGH(23, 23, isa); \
    SETUP_INTRA_ANG4(23, 13, isa); \
    SETUP_INTRA_ANG_HIGH(24, 24, isa); \
    SETUP_INTRA_ANG4(24, 12, isa); \
    SETUP_INTRA_ANG_HIGH(25, 25, isa); \
    SETUP_INTRA_ANG4(25, 11, isa); \
    SETUP_INTRA_ANG_HIGH(26, 26, isa); \
    SETUP_INTRA_ANG4(26, 26, isa); \
    SETUP_INTRA_ANG_HIGH(27, 27, isa); \
    SETUP_INTRA_ANG4(27, 9, isa); \
    SETUP_INTRA_ANG_HIGH(28, 28, isa); \
    SETUP_INTRA_ANG4(28, 8, isa); \
    SETUP_INTRA_ANG_HIGH(29, 29, isa); \
    SETUP_INTRA_ANG4(29, 7, isa); \
    SETUP_INTRA_ANG_HIGH(30, 30, isa); \
    SETUP_INTRA_ANG4(30, 6, isa); \
    SETUP_INTRA_ANG_HIGH(31, 31, isa); \
    SETUP_INTRA_ANG4(31, 5, isa); \
    SETUP_INTRA_ANG_HIGH(32, 32, isa); \
    SETUP_INTRA_ANG4(32, 4, isa); \
    SETUP_INTRA_ANG_HIGH(33, 33, isa); \
    SETUP_INTRA_ANG4(33, 3, isa);
729
// Binds intra_pred modes 19 through 33.  For 4x4 and 8x8 each mode is given
// kernel (36 - mode), except mode 26, which is given kernel 26.  For 16x16 and
// 32x32 each mode is given the kernel with the same number.
// The two helpers write disjoint block sizes, so the entries are grouped per mode.
#define INTRA_ANG_SSE4(isa) \
    SETUP_INTRA_ANG4_8(19, 17, isa); \
    SETUP_INTRA_ANG16_32(19, 19, isa); \
    SETUP_INTRA_ANG4_8(20, 16, isa); \
    SETUP_INTRA_ANG16_32(20, 20, isa); \
    SETUP_INTRA_ANG4_8(21, 15, isa); \
    SETUP_INTRA_ANG16_32(21, 21, isa); \
    SETUP_INTRA_ANG4_8(22, 14, isa); \
    SETUP_INTRA_ANG16_32(22, 22, isa); \
    SETUP_INTRA_ANG4_8(23, 13, isa); \
    SETUP_INTRA_ANG16_32(23, 23, isa); \
    SETUP_INTRA_ANG4_8(24, 12, isa); \
    SETUP_INTRA_ANG16_32(24, 24, isa); \
    SETUP_INTRA_ANG4_8(25, 11, isa); \
    SETUP_INTRA_ANG16_32(25, 25, isa); \
    SETUP_INTRA_ANG4_8(26, 26, isa); \
    SETUP_INTRA_ANG16_32(26, 26, isa); \
    SETUP_INTRA_ANG4_8(27, 9, isa); \
    SETUP_INTRA_ANG16_32(27, 27, isa); \
    SETUP_INTRA_ANG4_8(28, 8, isa); \
    SETUP_INTRA_ANG16_32(28, 28, isa); \
    SETUP_INTRA_ANG4_8(29, 7, isa); \
    SETUP_INTRA_ANG16_32(29, 29, isa); \
    SETUP_INTRA_ANG4_8(30, 6, isa); \
    SETUP_INTRA_ANG16_32(30, 30, isa); \
    SETUP_INTRA_ANG4_8(31, 5, isa); \
    SETUP_INTRA_ANG16_32(31, 31, isa); \
    SETUP_INTRA_ANG4_8(32, 4, isa); \
    SETUP_INTRA_ANG16_32(32, 32, isa); \
    SETUP_INTRA_ANG4_8(33, 3, isa); \
    SETUP_INTRA_ANG16_32(33, 33, isa);
761
// Expands ALL_CHROMA_420_4x4_PU for the four vertical 4-tap filter fields
// (filter_vss, filter_vpp, filter_vps, filter_vsp).  The expansion has no
// trailing semicolon; the call site supplies it.
#define CHROMA_420_VERT_FILTERS(isa) \
    ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, isa); \
    ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, isa); \
    ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, isa); \
    ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, isa)
767
// Sets all four vertical 4-tap filter slots (filter_vss, filter_vpp,
// filter_vps, filter_vsp) of the 4:2:0 chroma PU entry
// CHROMA_420_<width>x<height>.  <isa> is pasted directly after the height, so
// it must include its own leading underscore (for example _sse4).
#define SETUP_CHROMA_420_VERT_FUNC_DEF(width, height, isa) \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## width ## x ## height].filter_vss = \
        PFX(interp_4tap_vert_ss_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## width ## x ## height].filter_vpp = \
        PFX(interp_4tap_vert_pp_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## width ## x ## height].filter_vps = \
        PFX(interp_4tap_vert_ps_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## width ## x ## height].filter_vsp = \
        PFX(interp_4tap_vert_sp_ ## width ## x ## height ## isa);
773
// Sets the four vertical filter slots for the 4:2:0 block sizes 2x4, 2x8, 4x2
// and 6x8 via SETUP_CHROMA_420_VERT_FUNC_DEF.
#define CHROMA_420_VERT_FILTERS_SSE4(isa) \
    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 4, isa); \
    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, isa); \
    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 2, isa); \
    SETUP_CHROMA_420_VERT_FUNC_DEF(6, 8, isa);
779
// Sets all four vertical 4-tap filter slots (filter_vss, filter_vpp,
// filter_vps, filter_vsp) of the 4:2:2 chroma PU entry
// CHROMA_422_<width>x<height>.  <isa> is pasted directly after the height, so
// it must include its own leading underscore (for example _sse2).
#define SETUP_CHROMA_422_VERT_FUNC_DEF(width, height, isa) \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_vss = \
        PFX(interp_4tap_vert_ss_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_vpp = \
        PFX(interp_4tap_vert_pp_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_vps = \
        PFX(interp_4tap_vert_ps_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_vsp = \
        PFX(interp_4tap_vert_sp_ ## width ## x ## height ## isa);
785
// Sets the four vertical filter slots for every 4:2:2 block size listed below,
// one SETUP_CHROMA_422_VERT_FUNC_DEF expansion per size.  Entries are ordered
// by width, then height; each one writes a different PU entry.  4x4 is not in
// this list (it appears in CHROMA_422_VERT_FILTERS_SSE4 instead).
#define CHROMA_422_VERT_FILTERS(isa) \
    SETUP_CHROMA_422_VERT_FUNC_DEF(4, 8, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(4, 16, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(4, 32, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(8, 4, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(8, 8, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(8, 12, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(8, 16, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(8, 32, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(8, 64, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(12, 32, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(16, 8, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(16, 16, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(16, 24, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(16, 32, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(16, 64, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(24, 64, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(32, 16, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(32, 32, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(32, 48, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(32, 64, isa);
807
// Sets the four vertical filter slots for the 4:2:2 block sizes 2x8, 2x16, 4x4
// and 6x16 via SETUP_CHROMA_422_VERT_FUNC_DEF.
#define CHROMA_422_VERT_FILTERS_SSE4(isa) \
    SETUP_CHROMA_422_VERT_FUNC_DEF(2, 8, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(2, 16, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(4, 4, isa); \
    SETUP_CHROMA_422_VERT_FUNC_DEF(6, 16, isa);
813
// Expands ALL_CHROMA_444_PU for the four vertical 4-tap filter fields
// (filter_vss, filter_vpp, filter_vps, filter_vsp).  The expansion has no
// trailing semicolon; the call site supplies it.
#define CHROMA_444_VERT_FILTERS(isa) \
    ALL_CHROMA_444_PU(filter_vss, interp_4tap_vert_ss, isa); \
    ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, isa); \
    ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, isa); \
    ALL_CHROMA_444_PU(filter_vsp, interp_4tap_vert_sp, isa)
819
// Expands ALL_CHROMA_420_PU for the two horizontal 4-tap filter fields
// (filter_hpp, filter_hps).
#define CHROMA_420_HORIZ_FILTERS(isa) \
    ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, isa); \
    ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, isa);
823
// Sets the filter_hpp and filter_hps slots of the 4:2:2 chroma PU entry
// CHROMA_422_<width>x<height>.  <isa> is pasted directly after the height, so
// it must include its own leading underscore (for example _sse4).
#define SETUP_CHROMA_422_HORIZ_FUNC_DEF(width, height, isa) \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_hpp = \
        PFX(interp_4tap_horiz_pp_ ## width ## x ## height ## isa); \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## width ## x ## height].filter_hps = \
        PFX(interp_4tap_horiz_ps_ ## width ## x ## height ## isa);
827
// Sets the two horizontal filter slots for every 4:2:2 block size listed
// below, one SETUP_CHROMA_422_HORIZ_FUNC_DEF expansion per size.  Entries are
// ordered by width, then height; each one writes a different PU entry.
#define CHROMA_422_HORIZ_FILTERS(isa) \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(2, 8, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(2, 16, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 4, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 8, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 16, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(4, 32, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(6, 16, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 4, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 8, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 12, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 16, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 32, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(8, 64, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(12, 32, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 8, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 16, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 24, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 32, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(16, 64, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(24, 64, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 16, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 32, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 48, isa); \
    SETUP_CHROMA_422_HORIZ_FUNC_DEF(32, 64, isa);
853
// Expands ALL_CHROMA_444_PU for the two horizontal 4-tap filter fields
// (filter_hpp, filter_hps).
#define CHROMA_444_HORIZ_FILTERS(isa) \
    ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, isa); \
    ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, isa);
857
858 namespace X265_NS {
859 // private x265 namespace
860
861 template<int size>
interp_8tap_hv_pp_cpu(const pixel * src,intptr_t srcStride,pixel * dst,intptr_t dstStride,int idxX,int idxY)862 void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
863 {
864 ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA)]);
865 const int filterSize = NTAPS_LUMA;
866 const int halfFilterSize = filterSize >> 1;
867
868 primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
869 primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
870 }
871
872 #if HIGH_BIT_DEPTH
873
setupAssemblyPrimitives(EncoderPrimitives & p,int cpuMask)874 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
875 {
876 #if !defined(X86_64)
877 #error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
878 #endif
879
880 #if X86_64
881 p.scanPosLast = PFX(scanPosLast_x64);
882 #endif
883
884 if (cpuMask & X265_CPU_SSE2)
885 {
886 /* We do not differentiate CPUs which support MMX and not SSE2. We only check
887 * for SSE2 and then use both MMX and SSE2 functions */
888 AVC_LUMA_PU(sad, mmx2);
889
890 p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_sse2);
891 p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_sse2);
892 p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_sse2);
893 HEVC_SAD(sse2);
894
895 p.pu[LUMA_4x4].sad_x3 = PFX(pixel_sad_x3_4x4_mmx2);
896 p.pu[LUMA_4x8].sad_x3 = PFX(pixel_sad_x3_4x8_mmx2);
897 p.pu[LUMA_4x16].sad_x3 = PFX(pixel_sad_x3_4x16_mmx2);
898 p.pu[LUMA_8x4].sad_x3 = PFX(pixel_sad_x3_8x4_sse2);
899 p.pu[LUMA_8x8].sad_x3 = PFX(pixel_sad_x3_8x8_sse2);
900 p.pu[LUMA_8x16].sad_x3 = PFX(pixel_sad_x3_8x16_sse2);
901 p.pu[LUMA_8x32].sad_x3 = PFX(pixel_sad_x3_8x32_sse2);
902 p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_sse2);
903 p.pu[LUMA_12x16].sad_x3 = PFX(pixel_sad_x3_12x16_mmx2);
904 HEVC_SAD_X3(sse2);
905
906 p.pu[LUMA_4x4].sad_x4 = PFX(pixel_sad_x4_4x4_mmx2);
907 p.pu[LUMA_4x8].sad_x4 = PFX(pixel_sad_x4_4x8_mmx2);
908 p.pu[LUMA_4x16].sad_x4 = PFX(pixel_sad_x4_4x16_mmx2);
909 p.pu[LUMA_8x4].sad_x4 = PFX(pixel_sad_x4_8x4_sse2);
910 p.pu[LUMA_8x8].sad_x4 = PFX(pixel_sad_x4_8x8_sse2);
911 p.pu[LUMA_8x16].sad_x4 = PFX(pixel_sad_x4_8x16_sse2);
912 p.pu[LUMA_8x32].sad_x4 = PFX(pixel_sad_x4_8x32_sse2);
913 p.pu[LUMA_16x4].sad_x4 = PFX(pixel_sad_x4_16x4_sse2);
914 p.pu[LUMA_12x16].sad_x4 = PFX(pixel_sad_x4_12x16_mmx2);
915 HEVC_SAD_X4(sse2);
916
917 p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_mmx2);
918 ALL_LUMA_PU(satd, pixel_satd, sse2);
919
920 #if X265_DEPTH <= 10
921 ASSIGN_SA8D(sse2);
922 #endif /* X265_DEPTH <= 10 */
923 LUMA_PIXELSUB(sse2);
924 CHROMA_420_PIXELSUB_PS(sse2);
925 CHROMA_422_PIXELSUB_PS(sse2);
926
927 LUMA_CU_BLOCKCOPY(ss, sse2);
928 CHROMA_420_CU_BLOCKCOPY(ss, sse2);
929 CHROMA_422_CU_BLOCKCOPY(ss, sse2);
930
931 p.pu[LUMA_4x4].copy_pp = (copy_pp_t)PFX(blockcopy_ss_4x4_sse2);
932 ALL_LUMA_PU_TYPED(copy_pp, (copy_pp_t), blockcopy_ss, sse2);
933 ALL_CHROMA_420_PU_TYPED(copy_pp, (copy_pp_t), blockcopy_ss, sse2);
934 ALL_CHROMA_422_PU_TYPED(copy_pp, (copy_pp_t), blockcopy_ss, sse2);
935
936 CHROMA_420_VERT_FILTERS(sse2);
937 CHROMA_422_VERT_FILTERS(_sse2);
938 CHROMA_444_VERT_FILTERS(sse2);
939
940 ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
941 p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
942 ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
943 p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
944 ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
945 ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
946
947 p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
948 p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
949 PIXEL_AVG(sse2);
950 PIXEL_AVG_W4(mmx2);
951 LUMA_VAR(sse2);
952
953
954 ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
955 ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
956 ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
957 ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
958 ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
959 ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
960 ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
961 ALL_LUMA_TU_S(transpose, transpose, sse2);
962
963 #if X265_DEPTH <= 10
964 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
965 #endif /* X265_DEPTH <= 10 */
966 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
967
968 p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
969 p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
970 p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
971 p.cu[BLOCK_4x4].intra_pred[5] = PFX(intra_pred_ang4_5_sse2);
972 p.cu[BLOCK_4x4].intra_pred[6] = PFX(intra_pred_ang4_6_sse2);
973 p.cu[BLOCK_4x4].intra_pred[7] = PFX(intra_pred_ang4_7_sse2);
974 p.cu[BLOCK_4x4].intra_pred[8] = PFX(intra_pred_ang4_8_sse2);
975 p.cu[BLOCK_4x4].intra_pred[9] = PFX(intra_pred_ang4_9_sse2);
976 p.cu[BLOCK_4x4].intra_pred[10] = PFX(intra_pred_ang4_10_sse2);
977 p.cu[BLOCK_4x4].intra_pred[11] = PFX(intra_pred_ang4_11_sse2);
978 p.cu[BLOCK_4x4].intra_pred[12] = PFX(intra_pred_ang4_12_sse2);
979 p.cu[BLOCK_4x4].intra_pred[13] = PFX(intra_pred_ang4_13_sse2);
980 p.cu[BLOCK_4x4].intra_pred[14] = PFX(intra_pred_ang4_14_sse2);
981 p.cu[BLOCK_4x4].intra_pred[15] = PFX(intra_pred_ang4_15_sse2);
982 p.cu[BLOCK_4x4].intra_pred[16] = PFX(intra_pred_ang4_16_sse2);
983 p.cu[BLOCK_4x4].intra_pred[17] = PFX(intra_pred_ang4_17_sse2);
984 p.cu[BLOCK_4x4].intra_pred[18] = PFX(intra_pred_ang4_18_sse2);
985 p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_19_sse2);
986 p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_20_sse2);
987 p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_21_sse2);
988 p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_22_sse2);
989 p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
990 p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
991 p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
992 p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
993 p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
994 p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
995 p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
996 p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_30_sse2);
997 p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_31_sse2);
998 p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
999 p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
1000
1001 p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
1002 p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
1003 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
1004
1005 #if X265_DEPTH <= 10
1006 p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
1007 ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
1008 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
1009 #endif
1010
1011 p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
1012 p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
1013 p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
1014 p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
1015
1016 p.idst4x4 = PFX(idst4_sse2);
1017 p.dst4x4 = PFX(dst4_sse2);
1018
1019 LUMA_VSS_FILTERS(sse2);
1020
1021 p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
1022 // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
1023 //p.planecopy_sp = PFX(downShift_16_sse2);
1024 p.planecopy_sp_shl = PFX(upShift_16_sse2);
1025
1026 ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
1027 ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
1028 ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
1029 ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
1030 ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
1031 }
1032 if (cpuMask & X265_CPU_SSE3)
1033 {
1034 ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
1035 ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
1036 ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
1037 ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
1038 ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
1039 ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
1040 }
1041 if (cpuMask & X265_CPU_SSSE3)
1042 {
1043 p.scale1D_128to64 = PFX(scale1D_128to64_ssse3);
1044 p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
1045
1046 // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
1047 ALL_LUMA_PU(satd, pixel_satd, ssse3);
1048 #if X265_DEPTH <= 10
1049 ASSIGN_SA8D(ssse3);
1050 #endif
1051 INTRA_ANG_SSSE3(ssse3);
1052
1053 p.dst4x4 = PFX(dst4_ssse3);
1054 p.cu[BLOCK_8x8].idct = PFX(idct8_ssse3);
1055
1056 p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
1057
1058 ALL_LUMA_PU(convert_p2s, filterPixelToShort, ssse3);
1059
1060 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = PFX(filterPixelToShort_4x4_ssse3);
1061 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = PFX(filterPixelToShort_4x8_ssse3);
1062 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = PFX(filterPixelToShort_4x16_ssse3);
1063 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
1064 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
1065 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
1066 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
1067 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_ssse3);
1068 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
1069 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_ssse3);
1070 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
1071 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
1072 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_ssse3);
1073 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
1074 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_ssse3);
1075 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
1076 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = PFX(filterPixelToShort_4x4_ssse3);
1077 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = PFX(filterPixelToShort_4x8_ssse3);
1078 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = PFX(filterPixelToShort_4x16_ssse3);
1079 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = PFX(filterPixelToShort_4x32_ssse3);
1080 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
1081 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
1082 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = PFX(filterPixelToShort_8x12_ssse3);
1083 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
1084 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
1085 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = PFX(filterPixelToShort_8x64_ssse3);
1086 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = PFX(filterPixelToShort_12x32_ssse3);
1087 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
1088 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
1089 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_ssse3);
1090 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
1091 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_ssse3);
1092 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_ssse3);
1093 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
1094 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
1095 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
1096 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
1097 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = PFX(filterPixelToShort_4x2_ssse3);
1098 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
1099 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
1100 p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
1101 }
1102 if (cpuMask & X265_CPU_SSE4)
1103 {
1104 p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
1105 p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
1106 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
1107 p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
1108 p.saoCuOrgE2[1] = PFX(saoCuOrgE2_sse4);
1109 p.saoCuOrgE3[0] = PFX(saoCuOrgE3_sse4);
1110 p.saoCuOrgE3[1] = PFX(saoCuOrgE3_sse4);
1111 p.saoCuOrgB0 = PFX(saoCuOrgB0_sse4);
1112 p.sign = PFX(calSign_sse4);
1113
1114 LUMA_ADDAVG(sse4);
1115 CHROMA_420_ADDAVG(sse4);
1116 CHROMA_422_ADDAVG(sse4);
1117
1118 LUMA_FILTERS(sse4);
1119 CHROMA_420_HORIZ_FILTERS(sse4);
1120 CHROMA_420_VERT_FILTERS_SSE4(_sse4);
1121 CHROMA_422_HORIZ_FILTERS(_sse4);
1122 CHROMA_422_VERT_FILTERS_SSE4(_sse4);
1123 CHROMA_444_HORIZ_FILTERS(sse4);
1124
1125 p.cu[BLOCK_8x8].dct = PFX(dct8_sse4);
1126 p.quant = PFX(quant_sse4);
1127 p.nquant = PFX(nquant_sse4);
1128 p.dequant_normal = PFX(dequant_normal_sse4);
1129 p.dequant_scaling = PFX(dequant_scaling_sse4);
1130
1131 // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
1132 ALL_LUMA_PU(satd, pixel_satd, sse4);
1133 #if X265_DEPTH <= 10
1134 ASSIGN_SA8D(sse4);
1135 #endif
1136
1137 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
1138 p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
1139 p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
1140 p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
1141
1142 #if X265_DEPTH <= 10
1143 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
1144 #endif
1145 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
1146 INTRA_ANG_SSE4_COMMON(sse4);
1147 INTRA_ANG_SSE4_HIGH(sse4);
1148
1149 p.planecopy_cp = PFX(upShift_8_sse4);
1150 p.weight_pp = PFX(weight_pp_sse4);
1151 p.weight_sp = PFX(weight_sp_sse4);
1152
1153 p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
1154 p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
1155
1156 // TODO: check POPCNT flag!
1157 ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
1158 #if X265_DEPTH <= 10
1159 ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
1160 #endif
1161 ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
1162
1163 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
1164 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
1165 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s = PFX(filterPixelToShort_6x8_sse4);
1166 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
1167 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
1168 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
1169 }
1170 if (cpuMask & X265_CPU_AVX)
1171 {
1172 // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests
1173 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_avx);
1174 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx);
1175 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_avx);
1176 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_avx);
1177 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_avx);
1178 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_avx);
1179 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
1180 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
1181 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
1182 // p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
1183 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
1184 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
1185 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
1186 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx);
1187
1188 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx);
1189 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx);
1190 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx);
1191 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_avx);
1192 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx);
1193 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
1194 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
1195 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
1196
1197 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
1198 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
1199 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
1200
1201 // copy_pp primitives
1202 // 16 x N
1203 p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
1204 p.pu[LUMA_16x4].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x4_avx);
1205 p.pu[LUMA_16x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x8_avx);
1206 p.pu[LUMA_16x12].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x12_avx);
1207 p.pu[LUMA_16x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x16_avx);
1208 p.pu[LUMA_16x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x32_avx);
1209 p.pu[LUMA_16x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x64_avx);
1210 p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx);
1211 p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
1212 p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
1213 p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
1214 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x4_avx);
1215 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x8_avx);
1216 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x12_avx);
1217 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x16_avx);
1218 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x32_avx);
1219 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x16_avx);
1220 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x24_avx);
1221 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x32_avx);
1222 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x64_avx);
1223 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_16x8_avx);
1224
1225 // 24 X N
1226 p.pu[LUMA_24x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_24x32_avx);
1227 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_24x32_avx);
1228 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_24x64_avx);
1229
1230 // 32 x N
1231 p.pu[LUMA_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx);
1232 p.pu[LUMA_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx);
1233 p.pu[LUMA_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx);
1234 p.pu[LUMA_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx);
1235 p.pu[LUMA_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx);
1236 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx);
1237 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx);
1238 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx);
1239 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx);
1240 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx);
1241 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx);
1242 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x48_avx);
1243 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx);
1244
1245 // 48 X 64
1246 p.pu[LUMA_48x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_48x64_avx);
1247
1248 // copy_ss primitives
1249 // 16 X N
1250 p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx);
1251 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx);
1252 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_avx);
1253
1254 // 32 X N
1255 p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx);
1256 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx);
1257 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx);
1258
1259 // 64 X N
1260 p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx);
1261
1262 // copy_ps primitives
1263 // 16 X N
1264 p.cu[BLOCK_16x16].copy_ps = (copy_ps_t)PFX(blockcopy_ss_16x16_avx);
1265 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = (copy_ps_t)PFX(blockcopy_ss_16x16_avx);
1266 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_16x32_avx);
1267
1268 // 32 X N
1269 p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx);
1270 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx);
1271 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x64_avx);
1272
1273 // 64 X N
1274 p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx);
1275
1276 // copy_sp primitives
1277 // 16 X N
1278 p.cu[BLOCK_16x16].copy_sp = (copy_sp_t)PFX(blockcopy_ss_16x16_avx);
1279 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = (copy_sp_t)PFX(blockcopy_ss_16x16_avx);
1280 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_16x32_avx);
1281
1282 // 32 X N
1283 p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx);
1284 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx);
1285 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx);
1286
1287 // 64 X N
1288 p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx);
1289
1290 p.frameInitLowres = PFX(frame_init_lowres_core_avx);
1291
1292 p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx);
1293 p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
1294 p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
1295 p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
1296
1297 /* The following primitives have been disabled since performance compared to SSE is negligible/negative */
1298 #if 0
1299 ALL_LUMA_PU(satd, pixel_satd, avx);
1300
1301 p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
1302 p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
1303
1304 LUMA_VAR(avx);
1305
1306 #if X265_DEPTH <= 10
1307 ASSIGN_SA8D(avx);
1308 #endif
1309 #endif
1310 }
1311 if (cpuMask & X265_CPU_XOP)
1312 {
1313 //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
1314 ALL_LUMA_PU(satd, pixel_satd, xop);
1315 #if X265_DEPTH <= 10
1316 ASSIGN_SA8D(xop);
1317 #endif
1318 LUMA_VAR(xop);
1319 p.frameInitLowres = PFX(frame_init_lowres_core_xop);
1320 }
1321 if (cpuMask & X265_CPU_AVX2)
1322 {
1323 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
1324
1325 // TODO: planecopy_sp is really planecopy_SC now; this must be fixed
1326 //p.planecopy_sp = PFX(downShift_16_avx2);
1327 p.planecopy_sp_shl = PFX(upShift_16_avx2);
1328
1329 p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
1330 p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
1331 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
1332 p.saoCuOrgE2[0] = PFX(saoCuOrgE2_avx2);
1333 p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2);
1334 p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2);
1335 p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2);
1336 p.saoCuOrgB0 = PFX(saoCuOrgB0_avx2);
1337
1338 p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
1339 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
1340 p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2);
1341 p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx2);
1342 p.cu[BLOCK_16x16].intra_pred[6] = PFX(intra_pred_ang16_6_avx2);
1343 p.cu[BLOCK_16x16].intra_pred[7] = PFX(intra_pred_ang16_7_avx2);
1344 p.cu[BLOCK_16x16].intra_pred[8] = PFX(intra_pred_ang16_8_avx2);
1345 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx2);
1346 p.cu[BLOCK_16x16].intra_pred[10] = PFX(intra_pred_ang16_10_avx2);
1347 p.cu[BLOCK_16x16].intra_pred[11] = PFX(intra_pred_ang16_11_avx2);
1348 p.cu[BLOCK_16x16].intra_pred[12] = PFX(intra_pred_ang16_12_avx2);
1349 p.cu[BLOCK_16x16].intra_pred[13] = PFX(intra_pred_ang16_13_avx2);
1350 p.cu[BLOCK_16x16].intra_pred[14] = PFX(intra_pred_ang16_14_avx2);
1351 p.cu[BLOCK_16x16].intra_pred[15] = PFX(intra_pred_ang16_15_avx2);
1352 p.cu[BLOCK_16x16].intra_pred[16] = PFX(intra_pred_ang16_16_avx2);
1353 p.cu[BLOCK_16x16].intra_pred[17] = PFX(intra_pred_ang16_17_avx2);
1354 p.cu[BLOCK_16x16].intra_pred[18] = PFX(intra_pred_ang16_18_avx2);
1355 p.cu[BLOCK_16x16].intra_pred[19] = PFX(intra_pred_ang16_19_avx2);
1356 p.cu[BLOCK_16x16].intra_pred[20] = PFX(intra_pred_ang16_20_avx2);
1357 p.cu[BLOCK_16x16].intra_pred[21] = PFX(intra_pred_ang16_21_avx2);
1358 p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2);
1359 p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2);
1360 p.cu[BLOCK_16x16].intra_pred[24] = PFX(intra_pred_ang16_24_avx2);
1361 p.cu[BLOCK_16x16].intra_pred[25] = PFX(intra_pred_ang16_25_avx2);
1362 p.cu[BLOCK_16x16].intra_pred[26] = PFX(intra_pred_ang16_26_avx2);
1363 p.cu[BLOCK_16x16].intra_pred[27] = PFX(intra_pred_ang16_27_avx2);
1364 p.cu[BLOCK_16x16].intra_pred[28] = PFX(intra_pred_ang16_28_avx2);
1365 p.cu[BLOCK_16x16].intra_pred[29] = PFX(intra_pred_ang16_29_avx2);
1366 p.cu[BLOCK_16x16].intra_pred[30] = PFX(intra_pred_ang16_30_avx2);
1367 p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx2);
1368 p.cu[BLOCK_16x16].intra_pred[32] = PFX(intra_pred_ang16_32_avx2);
1369 p.cu[BLOCK_16x16].intra_pred[33] = PFX(intra_pred_ang16_33_avx2);
1370 p.cu[BLOCK_16x16].intra_pred[34] = PFX(intra_pred_ang16_2_avx2);
1371
1372 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
1373 p.cu[BLOCK_32x32].intra_pred[3] = PFX(intra_pred_ang32_3_avx2);
1374 p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx2);
1375 p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx2);
1376 p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx2);
1377 p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx2);
1378 p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2);
1379 p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2);
1380 p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
1381 p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
1382 p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2);
1383 p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2);
1384 p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2);
1385 p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2);
1386 p.cu[BLOCK_32x32].intra_pred[16] = PFX(intra_pred_ang32_16_avx2);
1387 p.cu[BLOCK_32x32].intra_pred[17] = PFX(intra_pred_ang32_17_avx2);
1388 p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx2);
1389 p.cu[BLOCK_32x32].intra_pred[19] = PFX(intra_pred_ang32_19_avx2);
1390 p.cu[BLOCK_32x32].intra_pred[20] = PFX(intra_pred_ang32_20_avx2);
1391 p.cu[BLOCK_32x32].intra_pred[21] = PFX(intra_pred_ang32_21_avx2);
1392 p.cu[BLOCK_32x32].intra_pred[22] = PFX(intra_pred_ang32_22_avx2);
1393 p.cu[BLOCK_32x32].intra_pred[23] = PFX(intra_pred_ang32_23_avx2);
1394 p.cu[BLOCK_32x32].intra_pred[24] = PFX(intra_pred_ang32_24_avx2);
1395 p.cu[BLOCK_32x32].intra_pred[25] = PFX(intra_pred_ang32_25_avx2);
1396 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
1397 p.cu[BLOCK_32x32].intra_pred[27] = PFX(intra_pred_ang32_27_avx2);
1398 p.cu[BLOCK_32x32].intra_pred[28] = PFX(intra_pred_ang32_28_avx2);
1399 p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx2);
1400 p.cu[BLOCK_32x32].intra_pred[30] = PFX(intra_pred_ang32_30_avx2);
1401 p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx2);
1402 p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx2);
1403 p.cu[BLOCK_32x32].intra_pred[33] = PFX(intra_pred_ang32_33_avx2);
1404 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx2);
1405
1406 p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
1407 p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
1408 p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
1409 p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
1410 p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
1411 p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
1412 p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
1413 p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2);
1414 p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
1415 p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
1416 p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
1417 p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
1418 p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
1419 p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
1420 p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
1421 p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
1422 p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
1423 p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
1424
1425 p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2);
1426 p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2);
1427 p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2);
1428 p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2);
1429 p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
1430 p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx2);
1431 p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx2);
1432 p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx2);
1433 p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_avx2);
1434 p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx2);
1435 p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx2);
1436 p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx2);
1437 p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx2);
1438 p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx2);
1439 p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx2);
1440 p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx2);
1441 p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx2);
1442 p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
1443 p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);
1444
1445 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2);
1446 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2);
1447 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2);
1448 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2);
1449 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2);
1450 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2);
1451 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2);
1452 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2);
1453 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx2);
1454 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx2);
1455 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx2);
1456 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2);
1457 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2);
1458 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2);
1459
1460 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2);
1461 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2);
1462 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2);
1463 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2);
1464 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2);
1465 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
1466 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
1467 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_avx2);
1468 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx2);
1469 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx2);
1470 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_avx2);
1471 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_avx2);
1472 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
1473 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
1474
1475 p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
1476 p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
1477 p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
1478 p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
1479 p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
1480 p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
1481 #if X265_DEPTH <= 10
1482 p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
1483 p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
1484 p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
1485 p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
1486 p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
1487 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
1488 #endif
1489
1490 p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
1491 p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
1492
1493 p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx2);
1494
1495 p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx2);
1496 p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx2);
1497 p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx2);
1498 p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx2);
1499
1500 p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx2);
1501 p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx2);
1502 p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx2);
1503 p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx2);
1504 p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx2);
1505
1506 p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx2);
1507 p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx2);
1508 p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx2);
1509 p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx2);
1510 p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx2);
1511 p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx2);
1512 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx2);
1513 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx2);
1514 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx2);
1515 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_avx2);
1516 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_avx2);
1517 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx2);
1518 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx2);
1519 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx2);
1520 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx2);
1521 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx2);
1522 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx2);
1523 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx2);
1524 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx2);
1525 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx2);
1526 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
1527 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
1528
1529 p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
1530 p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
1531
1532 #if X265_DEPTH <= 10
1533 p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
1534 p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
1535 p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
1536
1537 p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
1538 p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
1539 p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
1540 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
1541 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
1542 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
1543 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
1544 #endif
1545
1546 p.quant = PFX(quant_avx2);
1547 p.nquant = PFX(nquant_avx2);
1548 p.dequant_normal = PFX(dequant_normal_avx2);
1549 p.dequant_scaling = PFX(dequant_scaling_avx2);
1550 p.dst4x4 = PFX(dst4_avx2);
1551 p.idst4x4 = PFX(idst4_avx2);
1552 p.denoiseDct = PFX(denoise_dct_avx2);
1553
1554 p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
1555 p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
1556
1557 p.weight_pp = PFX(weight_pp_avx2);
1558 p.weight_sp = PFX(weight_sp_avx2);
1559 p.sign = PFX(calSign_avx2);
1560 p.planecopy_cp = PFX(upShift_8_avx2);
1561
1562 p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
1563 p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
1564
1565 p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_avx2);
1566 p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx2);
1567
1568 p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8x8_avx2);
1569 p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx2);
1570 p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32x32_avx2);
1571
1572 p.cu[BLOCK_16x16].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_16_avx2);
1573 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx2);
1574
1575 p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8_avx2);
1576 p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx2);
1577 p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx2);
1578
1579 p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
1580 p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
1581
1582 p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
1583 p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2);
1584 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2);
1585
1586 p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8_avx2);
1587 p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
1588 p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
1589
1590 #if X265_DEPTH <= 10
1591 ALL_LUMA_TU_S(dct, dct, avx2);
1592 ALL_LUMA_TU_S(idct, idct, avx2);
1593 #endif
1594 ALL_LUMA_CU_S(transpose, transpose, avx2);
1595
1596 ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
1597 ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
1598 #if X265_DEPTH <= 10
1599 ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
1600 #endif
1601 ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
1602 #if X265_DEPTH <= 10
1603 p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2); // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use
1604 #endif
1605
1606 p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
1607 p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
1608 p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx2);
1609 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
1610 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
1611 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_avx2);
1612 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
1613
1614 p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
1615 p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
1616 p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx2);
1617 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
1618 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
1619 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_avx2);
1620 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx2);
1621
1622 p.pu[LUMA_16x4].sad = PFX(pixel_sad_16x4_avx2);
1623 p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx2);
1624 p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
1625 p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
1626 p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
1627 #if X265_DEPTH <= 10
1628 p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
1629 p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
1630 p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
1631 p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx2);
1632 p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx2);
1633 p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx2);
1634 p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx2);
1635 p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx2);
1636 p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
1637 p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
1638 p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
1639 #endif
1640
1641 p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
1642 p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
1643 p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx2);
1644 p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx2);
1645 p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_avx2);
1646 p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_avx2);
1647 p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
1648 p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
1649 p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2);
1650 p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2);
1651 p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2);
1652 p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
1653 p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx2);
1654 p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx2);
1655 p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
1656 p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
1657
1658 p.pu[LUMA_16x4].sad_x4 = PFX(pixel_sad_x4_16x4_avx2);
1659 p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx2);
1660 p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx2);
1661 p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx2);
1662 p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx2);
1663 p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx2);
1664 p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx2);
1665 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx2);
1666 p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx2);
1667 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx2);
1668 p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx2);
1669 p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx2);
1670 p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx2);
1671 p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx2);
1672 p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx2);
1673 p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx2);
1674
1675 p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_avx2);
1676 p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_avx2);
1677 p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_avx2);
1678 p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_avx2);
1679 p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_avx2);
1680 p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_avx2);
1681 p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
1682 p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
1683 p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
1684 p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx2);
1685 p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx2);
1686 p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx2);
1687 p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx2);
1688 p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx2);
1689 p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx2);
1690 p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
1691 p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
1692
1693 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_avx2);
1694 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
1695 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_avx2);
1696 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
1697 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
1698 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
1699 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
1700 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
1701 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
1702 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
1703 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
1704 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
1705 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_avx2);
1706 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
1707 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_avx2);
1708 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
1709 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
1710 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
1711 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
1712 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
1713
1714 #if X265_DEPTH <= 10
1715 p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
1716 p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
1717 p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
1718 p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_avx2);
1719 p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_avx2);
1720 p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_avx2);
1721 p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_avx2);
1722 p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx2);
1723 p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_avx2);
1724 p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_avx2);
1725 p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx2);
1726 p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx2);
1727 p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx2);
1728 p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx2);
1729 p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx2);
1730 p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx2);
1731 p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx2);
1732 p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx2);
1733 p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx2);
1734 p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx2);
1735 p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx2);
1736 p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx2);
1737 p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
1738 p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
1739 p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
1740 #endif
1741
1742 p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
1743 p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
1744 p.pu[LUMA_4x16].luma_hpp = PFX(interp_8tap_horiz_pp_4x16_avx2);
1745 p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_avx2);
1746 p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_avx2);
1747 p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_avx2);
1748 p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_avx2);
1749 p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx2);
1750 p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx2);
1751 p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx2);
1752 p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx2);
1753 p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx2);
1754 p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx2);
1755 p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx2);
1756 p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx2);
1757 p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx2);
1758 p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx2);
1759 p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx2);
1760 p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx2);
1761 p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx2);
1762 p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx2);
1763 p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx2);
1764 p.pu[LUMA_12x16].luma_hpp = PFX(interp_8tap_horiz_pp_12x16_avx2);
1765 p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
1766 p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
1767
1768 #if X265_DEPTH <= 10
1769 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
1770 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
1771 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
1772 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hps = PFX(interp_4tap_horiz_ps_8x6_avx2);
1773 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hps = PFX(interp_4tap_horiz_ps_8x2_avx2);
1774 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx2);
1775 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx2);
1776 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx2);
1777 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx2);
1778 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx2);
1779 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx2);
1780 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
1781 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
1782 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx2);
1783 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hps = PFX(interp_4tap_horiz_ps_8x12_avx2);
1784 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
1785 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx2);
1786 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx2);
1787 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx2);
1788 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx2);
1789 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = PFX(interp_4tap_horiz_ps_24x64_avx2);
1790 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
1791 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
1792 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
1793 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx2);
1794 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx2);
1795 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx2);
1796 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx2);
1797 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx2);
1798 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx2);
1799 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx2);
1800 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx2);
1801 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx2);
1802 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx2);
1803 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx2);
1804 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx2);
1805 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx2);
1806
1807 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hpp = PFX(interp_4tap_horiz_pp_6x8_avx2);
1808 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hpp = PFX(interp_4tap_horiz_pp_8x2_avx2);
1809 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx2);
1810 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hpp = PFX(interp_4tap_horiz_pp_8x6_avx2);
1811 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx2);
1812 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx2);
1813 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx2);
1814 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx2);
1815 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx2);
1816 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx2);
1817 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx2);
1818 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx2);
1819 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx2);
1820 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx2);
1821 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx2);
1822 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx2);
1823 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx2);
1824 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hpp = PFX(interp_4tap_horiz_pp_6x16_avx2);
1825 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx2);
1826 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx2);
1827 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = PFX(interp_4tap_horiz_pp_8x12_avx2);
1828 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx2);
1829 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx2);
1830 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = PFX(interp_4tap_horiz_pp_8x64_avx2);
1831 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx2);
1832 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx2);
1833 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx2);
1834 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx2);
1835 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx2);
1836 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx2);
1837 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx2);
1838 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx2);
1839 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx2);
1840 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = PFX(interp_4tap_horiz_pp_24x64_avx2);
1841 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx2);
1842 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx2);
1843 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx2);
1844 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx2);
1845 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx2);
1846 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx2);
1847 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx2);
1848 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx2);
1849 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx2);
1850 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx2);
1851 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx2);
1852 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx2);
1853 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx2);
1854 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx2);
1855 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx2);
1856 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx2);
1857 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx2);
1858 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx2);
1859 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx2);
1860 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx2);
1861 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx2);
1862 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = PFX(interp_4tap_vert_pp_4x2_avx2);
1863 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = PFX(interp_4tap_vert_ps_4x2_avx2);
1864 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vsp = PFX(interp_4tap_vert_sp_4x2_avx2);
1865 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vss = PFX(interp_4tap_vert_ss_4x2_avx2);
1866 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
1867 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_avx2);
1868 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vsp = PFX(interp_4tap_vert_sp_4x4_avx2);
1869 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vss = PFX(interp_4tap_vert_ss_4x4_avx2);
1870 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_avx2);
1871 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_avx2);
1872 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vsp = PFX(interp_4tap_vert_sp_4x8_avx2);
1873 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vss = PFX(interp_4tap_vert_ss_4x8_avx2);
1874 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_avx2);
1875 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_avx2);
1876 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vsp = PFX(interp_4tap_vert_sp_4x16_avx2);
1877 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vss = PFX(interp_4tap_vert_ss_4x16_avx2);
1878 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vpp = PFX(interp_4tap_vert_pp_8x2_avx2);
1879 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vps = PFX(interp_4tap_vert_ps_8x2_avx2);
1880 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vsp = PFX(interp_4tap_vert_sp_8x2_avx2);
1881 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vss = PFX(interp_4tap_vert_ss_8x2_avx2);
1882 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_avx2);
1883 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_avx2);
1884 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_avx2);
1885 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx2);
1886 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vpp = PFX(interp_4tap_vert_pp_8x6_avx2);
1887 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vps = PFX(interp_4tap_vert_ps_8x6_avx2);
1888 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vsp = PFX(interp_4tap_vert_sp_8x6_avx2);
1889 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vss = PFX(interp_4tap_vert_ss_8x6_avx2);
1890 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx2);
1891 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx2);
1892 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx2);
1893 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx2);
1894 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vpp = PFX(interp_4tap_vert_pp_8x12_avx2);
1895 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vps = PFX(interp_4tap_vert_ps_8x12_avx2);
1896 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vsp = PFX(interp_4tap_vert_sp_8x12_avx2);
1897 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = PFX(interp_4tap_vert_ss_8x12_avx2);
1898 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx2);
1899 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx2);
1900 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx2);
1901 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx2);
1902 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx2);
1903 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx2);
1904 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx2);
1905 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx2);
1906
1907 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
1908 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_avx2);
1909 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vsp = PFX(interp_4tap_vert_sp_4x4_avx2);
1910 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vss = PFX(interp_4tap_vert_ss_4x4_avx2);
1911 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_avx2);
1912 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_avx2);
1913 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vsp = PFX(interp_4tap_vert_sp_4x8_avx2);
1914 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vss = PFX(interp_4tap_vert_ss_4x8_avx2);
1915 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_avx2);
1916 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_avx2);
1917 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vsp = PFX(interp_4tap_vert_sp_4x16_avx2);
1918 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vss = PFX(interp_4tap_vert_ss_4x16_avx2);
1919 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vpp = PFX(interp_4tap_vert_pp_4x32_avx2);
1920 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = PFX(interp_4tap_vert_ps_4x32_avx2);
1921 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vsp = PFX(interp_4tap_vert_sp_4x32_avx2);
1922 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vss = PFX(interp_4tap_vert_ss_4x32_avx2);
1923 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_avx2);
1924 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_avx2);
1925 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_avx2);
1926 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx2);
1927 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx2);
1928 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx2);
1929 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx2);
1930 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx2);
1931 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx2);
1932 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx2);
1933 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx2);
1934 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx2);
1935 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx2);
1936 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx2);
1937 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx2);
1938 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx2);
1939 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = PFX(interp_4tap_vert_pp_8x64_avx2);
1940 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = PFX(interp_4tap_vert_ps_8x64_avx2);
1941 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vsp = PFX(interp_4tap_vert_sp_8x64_avx2);
1942 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
1943 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_avx2);
1944 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vsp = PFX(interp_4tap_vert_sp_4x4_avx2);
1945 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vss = PFX(interp_4tap_vert_ss_4x4_avx2);
1946 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_avx2);
1947 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_avx2);
1948 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vsp = PFX(interp_4tap_vert_sp_4x8_avx2);
1949 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vss = PFX(interp_4tap_vert_ss_4x8_avx2);
1950 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_avx2);
1951 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_avx2);
1952 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vsp = PFX(interp_4tap_vert_sp_4x16_avx2);
1953 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vss = PFX(interp_4tap_vert_ss_4x16_avx2);
1954 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_avx2);
1955 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_avx2);
1956 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_avx2);
1957 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx2);
1958 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx2);
1959 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx2);
1960 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx2);
1961 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx2);
1962 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx2);
1963 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx2);
1964 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx2);
1965 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx2);
1966 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx2);
1967 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx2);
1968 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx2);
1969 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx2);
1970
1971 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vss = PFX(interp_4tap_vert_ss_6x8_avx2);
1972 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vsp = PFX(interp_4tap_vert_sp_6x8_avx2);
1973 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vps = PFX(interp_4tap_vert_ps_6x8_avx2);
1974 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vpp = PFX(interp_4tap_vert_pp_6x8_avx2);
1975 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vpp = PFX(interp_4tap_vert_pp_12x16_avx2);
1976 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vps = PFX(interp_4tap_vert_ps_12x16_avx2);
1977 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vss = PFX(interp_4tap_vert_ss_12x16_avx2);
1978 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vsp = PFX(interp_4tap_vert_sp_12x16_avx2);
1979 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx2);
1980 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx2);
1981 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx2);
1982 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx2);
1983 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx2);
1984 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx2);
1985 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx2);
1986 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx2);
1987 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx2);
1988 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx2);
1989 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx2);
1990 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx2);
1991 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx2);
1992 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx2);
1993 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx2);
1994 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx2);
1995 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx2);
1996 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx2);
1997 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx2);
1998 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx2);
1999 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx2);
2000 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx2);
2001 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx2);
2002 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx2);
2003 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx2);
2004 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx2);
2005 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx2);
2006 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx2);
2007 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx2);
2008 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx2);
2009 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx2);
2010 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx2);
2011 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx2);
2012 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx2);
2013 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx2);
2014 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx2);
2015 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx2);
2016 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx2);
2017 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx2);
2018 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx2);
2019 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vpp = PFX(interp_4tap_vert_pp_12x32_avx2);
2020 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vps = PFX(interp_4tap_vert_ps_12x32_avx2);
2021 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vss = PFX(interp_4tap_vert_ss_12x32_avx2);
2022 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vsp = PFX(interp_4tap_vert_sp_12x32_avx2);
2023 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx2);
2024 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx2);
2025 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx2);
2026 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx2);
2027 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx2);
2028 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx2);
2029 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx2);
2030 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx2);
2031 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx2);
2032 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx2);
2033 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx2);
2034 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx2);
2035 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx2);
2036 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx2);
2037 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx2);
2038 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx2);
2039 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx2);
2040 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_avx2);
2041 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx2);
2042 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx2);
2043 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = PFX(interp_4tap_vert_pp_24x64_avx2);
2044 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vps = PFX(interp_4tap_vert_ps_24x64_avx2);
2045 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = PFX(interp_4tap_vert_ss_24x64_avx2);
2046 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vsp = PFX(interp_4tap_vert_sp_24x64_avx2);
2047 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx2);
2048 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx2);
2049 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx2);
2050 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx2);
2051 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx2);
2052 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx2);
2053 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx2);
2054 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx2);
2055 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx2);
2056 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx2);
2057 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx2);
2058 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx2);
2059 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx2);
2060 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx2);
2061 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx2);
2062 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx2);
2063 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vpp = PFX(interp_4tap_vert_pp_12x16_avx2);
2064 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vps = PFX(interp_4tap_vert_ps_12x16_avx2);
2065 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vss = PFX(interp_4tap_vert_ss_12x16_avx2);
2066 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vsp = PFX(interp_4tap_vert_sp_12x16_avx2);
2067 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx2);
2068 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx2);
2069 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx2);
2070 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx2);
2071 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx2);
2072 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx2);
2073 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx2);
2074 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx2);
2075 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx2);
2076 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx2);
2077 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx2);
2078 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx2);
2079 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx2);
2080 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx2);
2081 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx2);
2082 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx2);
2083 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx2);
2084 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx2);
2085 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx2);
2086 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx2);
2087 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx2);
2088 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx2);
2089 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx2);
2090 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx2);
2091 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx2);
2092 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx2);
2093 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx2);
2094 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx2);
2095 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx2);
2096 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx2);
2097 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx2);
2098 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx2);
2099 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx2);
2100 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx2);
2101 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx2);
2102 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx2);
2103 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx2);
2104 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx2);
2105 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx2);
2106 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx2);
2107 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx2);
2108 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx2);
2109 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx2);
2110 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx2);
2111 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx2);
2112 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx2);
2113 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx2);
2114 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx2);
2115 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx2);
2116 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx2);
2117 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx2);
2118 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx2);
2119 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
2120 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx2);
2121 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx2);
2122 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx2);
2123 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx2);
2124 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx2);
2125 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx2);
2126 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx2);
2127 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx2);
2128 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx2);
2129 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx2);
2130 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx2);
2131 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx2);
2132 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx2);
2133 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
2134 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
2135
2136 /* The following primitives have been disabled since performance compared to SSE is negligible/negative */
2137 #if 0
2138 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vss = PFX(interp_4tap_vert_ss_6x16_avx2);
2139 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vps = PFX(interp_4tap_vert_ps_6x16_avx2);
2140
2141 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hpp = PFX(interp_4tap_horiz_pp_12x16_avx2);
2142 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hpp = PFX(interp_4tap_horiz_pp_12x16_avx2);
2143 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = PFX(interp_4tap_horiz_pp_12x32_avx2);
2144
2145 p.cu[BLOCK_4x4].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_4_avx2);
2146 p.cu[BLOCK_8x8].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_8_avx2);
2147 p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4_avx2);
2148 p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4x4_avx2);
2149 p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
2150
2151 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hps = PFX(interp_4tap_horiz_ps_12x16_avx2);
2152 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hps = PFX(interp_4tap_horiz_ps_12x16_avx2);
2153 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hps = PFX(interp_4tap_horiz_ps_12x32_avx2);
2154 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hps = PFX(interp_4tap_horiz_ps_6x16_avx2);
2155 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps = PFX(interp_4tap_horiz_ps_6x8_avx2);
2156 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx2);
2157 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx2);
2158 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx2);
2159 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx2);
2160 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx2);
2161 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx2);
2162 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx2);
2163 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx2);
2164 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx2);
2165 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hps = PFX(interp_4tap_horiz_ps_8x64_avx2);
2166 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx2);
2167 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx2);
2168 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx2);
2169 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx2);
2170 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx2);
2171 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx2);
2172 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx2);
2173 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vsp = PFX(interp_4tap_vert_sp_6x16_avx2);
2174 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vpp = PFX(interp_4tap_vert_pp_6x16_avx2);
2175
2176 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
2177 p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2);
2178 p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
2179 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2);
2180 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2);
2181 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2);
2182 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = PFX(addAvg_8x4_avx2);
2183 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_avx2);
2184 #endif
2185 #endif
2186
2187 p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
2188
2189 #if X265_DEPTH <= 10
2190 // TODO: depends on hps and vsp
2191 ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes
2192 p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4]
2193 #endif
2194
2195 if (cpuMask & X265_CPU_BMI2)
2196 p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
2197 }
2198 }
2199 #else // if HIGH_BIT_DEPTH
2200
2201 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main
2202 {
2203 #if X86_64
2204 p.scanPosLast = PFX(scanPosLast_x64);
2205 #endif
2206
2207 if (cpuMask & X265_CPU_SSE2)
2208 {
2209 /* We do not differentiate CPUs which support MMX and not SSE2. We only check
2210 * for SSE2 and then use both MMX and SSE2 functions */
2211 AVC_LUMA_PU(sad, mmx2);
2212 AVC_LUMA_PU(sad_x3, mmx2);
2213 AVC_LUMA_PU(sad_x4, mmx2);
2214
2215 p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_sse2);
2216 p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_sse2);
2217 p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_sse2);
2218 p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_sse2);
2219 p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_sse2);
2220 p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_sse2);
2221 HEVC_SAD(sse2);
2222
2223 p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_mmx2);
2224 ALL_LUMA_PU(satd, pixel_satd, sse2);
2225
2226 p.cu[BLOCK_4x4].sse_pp = PFX(pixel_ssd_4x4_mmx);
2227 p.cu[BLOCK_8x8].sse_pp = PFX(pixel_ssd_8x8_mmx);
2228 p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_mmx);
2229
2230 PIXEL_AVG_W4(mmx2);
2231 PIXEL_AVG(sse2);
2232 LUMA_VAR(sse2);
2233
2234 ASSIGN_SA8D(sse2);
2235 p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = PFX(pixel_ssd_4x8_mmx);
2236 ASSIGN_SSE_PP(sse2);
2237 ASSIGN_SSE_SS(sse2);
2238
2239 LUMA_PU_BLOCKCOPY(pp, sse2);
2240 CHROMA_420_PU_BLOCKCOPY(pp, sse2);
2241 CHROMA_422_PU_BLOCKCOPY(pp, sse2);
2242
2243 LUMA_CU_BLOCKCOPY(ss, sse2);
2244 LUMA_CU_BLOCKCOPY(sp, sse2);
2245 CHROMA_420_CU_BLOCKCOPY(ss, sse2);
2246 CHROMA_422_CU_BLOCKCOPY(ss, sse2);
2247 CHROMA_420_CU_BLOCKCOPY(sp, sse2);
2248 CHROMA_422_CU_BLOCKCOPY(sp, sse2);
2249
2250 LUMA_VSS_FILTERS(sse2);
2251 CHROMA_420_VSS_FILTERS(_sse2);
2252 CHROMA_422_VSS_FILTERS(_sse2);
2253 CHROMA_444_VSS_FILTERS(sse2);
2254 CHROMA_420_VSP_FILTERS(_sse2);
2255 CHROMA_422_VSP_FILTERS(_sse2);
2256 CHROMA_444_VSP_FILTERS(_sse2);
2257 #if X86_64
2258 ALL_CHROMA_420_PU(filter_vpp, interp_4tap_vert_pp, sse2);
2259 ALL_CHROMA_422_PU(filter_vpp, interp_4tap_vert_pp, sse2);
2260 ALL_CHROMA_444_PU(filter_vpp, interp_4tap_vert_pp, sse2);
2261 ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, sse2);
2262 ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, sse2);
2263 ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sse2);
2264 ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
2265 ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
2266 #else
2267 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = PFX(interp_4tap_vert_pp_2x4_sse2);
2268 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = PFX(interp_4tap_vert_pp_2x8_sse2);
2269 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = PFX(interp_4tap_vert_pp_4x2_sse2);
2270 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_sse2);
2271 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_sse2);
2272 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_sse2);
2273 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = PFX(interp_4tap_vert_pp_2x16_sse2);
2274 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_sse2);
2275 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_sse2);
2276 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_sse2);
2277 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vpp = PFX(interp_4tap_vert_pp_4x32_sse2);
2278 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_sse2);
2279 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_sse2);
2280 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_sse2);
2281 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vps = PFX(interp_4tap_vert_ps_2x4_sse2);
2282 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vps = PFX(interp_4tap_vert_ps_2x8_sse2);
2283 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = PFX(interp_4tap_vert_ps_4x2_sse2);
2284 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_sse2);
2285 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_sse2);
2286 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vps = PFX(interp_4tap_vert_ps_2x16_sse2);
2287 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_sse2);
2288 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_sse2);
2289 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_sse2);
2290 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = PFX(interp_4tap_vert_ps_4x32_sse2);
2291 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_sse2);
2292 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_sse2);
2293 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_sse2);
2294 #endif
2295
2296 ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
2297 p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
2298 ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
2299 p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
2300 p.pu[LUMA_8x8].luma_hvpp = PFX(interp_8tap_hv_pp_8x8_sse3);
2301
2302 //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
2303 p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
2304
2305 ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
2306 ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
2307 ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
2308 ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
2309 ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
2310 ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
2311
2312 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
2313 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
2314
2315 p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
2316 p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
2317 p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
2318 p.cu[BLOCK_4x4].intra_pred[5] = PFX(intra_pred_ang4_5_sse2);
2319 p.cu[BLOCK_4x4].intra_pred[6] = PFX(intra_pred_ang4_6_sse2);
2320 p.cu[BLOCK_4x4].intra_pred[7] = PFX(intra_pred_ang4_7_sse2);
2321 p.cu[BLOCK_4x4].intra_pred[8] = PFX(intra_pred_ang4_8_sse2);
2322 p.cu[BLOCK_4x4].intra_pred[9] = PFX(intra_pred_ang4_9_sse2);
2323 p.cu[BLOCK_4x4].intra_pred[10] = PFX(intra_pred_ang4_10_sse2);
2324 p.cu[BLOCK_4x4].intra_pred[11] = PFX(intra_pred_ang4_11_sse2);
2325 p.cu[BLOCK_4x4].intra_pred[12] = PFX(intra_pred_ang4_12_sse2);
2326 p.cu[BLOCK_4x4].intra_pred[13] = PFX(intra_pred_ang4_13_sse2);
2327 p.cu[BLOCK_4x4].intra_pred[14] = PFX(intra_pred_ang4_14_sse2);
2328 p.cu[BLOCK_4x4].intra_pred[15] = PFX(intra_pred_ang4_15_sse2);
2329 p.cu[BLOCK_4x4].intra_pred[16] = PFX(intra_pred_ang4_16_sse2);
2330 p.cu[BLOCK_4x4].intra_pred[17] = PFX(intra_pred_ang4_17_sse2);
2331 p.cu[BLOCK_4x4].intra_pred[18] = PFX(intra_pred_ang4_18_sse2);
2332 p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_19_sse2);
2333 p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_20_sse2);
2334 p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_21_sse2);
2335 p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_22_sse2);
2336 p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
2337 p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
2338 p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
2339 p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
2340 p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
2341 p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
2342 p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
2343 p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_30_sse2);
2344 p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_31_sse2);
2345 p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
2346 p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
2347
2348 p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_sse2);
2349
2350 p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_sse2);
2351 p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_sse2);
2352
2353 ALL_LUMA_TU_S(transpose, transpose, sse2);
2354 p.cu[BLOCK_64x64].transpose = PFX(transpose64_sse2);
2355
2356 p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
2357 p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
2358
2359 p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
2360 p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
2361 p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
2362 #if X86_64
2363 p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
2364
2365 // TODO: this has passed the smoke test but still needs a testbench; it was meant to be temporarily disabled, yet the assignment below is currently active
2366 p.costC1C2Flag = x265_costC1C2Flag_sse2;
2367 #endif
2368 p.idst4x4 = PFX(idst4_sse2);
2369 p.dst4x4 = PFX(dst4_sse2);
2370
2371 p.planecopy_sp = PFX(downShift_16_sse2);
2372 ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
2373 ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
2374 ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
2375 ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
2376 ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
2377 }
2378 if (cpuMask & X265_CPU_SSE3)
2379 {
2380 ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
2381 ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
2382 ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
2383 ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
2384 ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
2385 ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
2386 }
2387 if (cpuMask & X265_CPU_SSSE3)
2388 {
2389 p.pu[LUMA_8x16].sad_x3 = PFX(pixel_sad_x3_8x16_ssse3);
2390 p.pu[LUMA_8x32].sad_x3 = PFX(pixel_sad_x3_8x32_ssse3);
2391 p.pu[LUMA_12x16].sad_x3 = PFX(pixel_sad_x3_12x16_ssse3);
2392 HEVC_SAD_X3(ssse3);
2393
2394 p.pu[LUMA_8x4].sad_x4 = PFX(pixel_sad_x4_8x4_ssse3);
2395 p.pu[LUMA_8x8].sad_x4 = PFX(pixel_sad_x4_8x8_ssse3);
2396 p.pu[LUMA_8x16].sad_x4 = PFX(pixel_sad_x4_8x16_ssse3);
2397 p.pu[LUMA_8x32].sad_x4 = PFX(pixel_sad_x4_8x32_ssse3);
2398 p.pu[LUMA_12x16].sad_x4 = PFX(pixel_sad_x4_12x16_ssse3);
2399 HEVC_SAD_X4(ssse3);
2400
2401 p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3);
2402 ALL_LUMA_PU(satd, pixel_satd, ssse3);
2403
2404 ASSIGN_SA8D(ssse3);
2405 PIXEL_AVG(ssse3);
2406 PIXEL_AVG_W4(ssse3);
2407 INTRA_ANG_SSSE3(ssse3);
2408
2409 ASSIGN_SSE_PP(ssse3);
2410 p.cu[BLOCK_4x4].sse_pp = PFX(pixel_ssd_4x4_ssse3);
2411 p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = PFX(pixel_ssd_4x8_ssse3);
2412
2413 p.dst4x4 = PFX(dst4_ssse3);
2414 p.cu[BLOCK_8x8].idct = PFX(idct8_ssse3);
2415
2416 // MUST be done after LUMA_FILTERS() to overwrite default version
2417 p.pu[LUMA_8x8].luma_hvpp = PFX(interp_8tap_hv_pp_8x8_ssse3);
2418
2419 p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
2420 p.scale1D_128to64 = PFX(scale1D_128to64_ssse3);
2421 p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
2422
2423 p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_ssse3);
2424 p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_ssse3);
2425 p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_ssse3);
2426 p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_ssse3);
2427 p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_ssse3);
2428 p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_ssse3);
2429 p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_ssse3);
2430 p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_ssse3);
2431 p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_ssse3);
2432 p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_ssse3);
2433 p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_ssse3);
2434 p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_ssse3);
2435 p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_ssse3);
2436 p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_ssse3);
2437 p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_ssse3);
2438 p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_ssse3);
2439 p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_ssse3);
2440 p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_ssse3);
2441 p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_ssse3);
2442 p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_ssse3);
2443 p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_ssse3);
2444 p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_ssse3);
2445
2446 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
2447 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
2448 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
2449 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
2450 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
2451 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
2452 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_ssse3);
2453 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
2454 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_ssse3);
2455 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
2456 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
2457 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_ssse3);
2458 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
2459 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_ssse3);
2460 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
2461 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
2462 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
2463 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = PFX(filterPixelToShort_8x12_ssse3);
2464 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
2465 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
2466 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = PFX(filterPixelToShort_8x64_ssse3);
2467 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = PFX(filterPixelToShort_12x32_ssse3);
2468 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
2469 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
2470 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_ssse3);
2471 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
2472 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_ssse3);
2473 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_ssse3);
2474 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
2475 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
2476 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
2477 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
2478 p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
2479 }
2480 if (cpuMask & X265_CPU_SSE4)
2481 {
2482 p.sign = PFX(calSign_sse4);
2483 p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
2484 p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
2485 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
2486 p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
2487 p.saoCuOrgE2[1] = PFX(saoCuOrgE2_sse4);
2488 p.saoCuOrgE3[0] = PFX(saoCuOrgE3_sse4);
2489 p.saoCuOrgE3[1] = PFX(saoCuOrgE3_sse4);
2490 p.saoCuOrgB0 = PFX(saoCuOrgB0_sse4);
2491
2492 LUMA_ADDAVG(sse4);
2493 CHROMA_420_ADDAVG(sse4);
2494 CHROMA_422_ADDAVG(sse4);
2495
2496 // TODO: check POPCNT flag!
2497 ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
2498
2499 p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4);
2500 ALL_LUMA_PU(satd, pixel_satd, sse4);
2501 ASSIGN_SA8D(sse4);
2502 ASSIGN_SSE_SS(sse4);
2503 p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_sse4);
2504
2505 LUMA_PIXELSUB(sse4);
2506 CHROMA_420_PIXELSUB_PS(sse4);
2507 CHROMA_422_PIXELSUB_PS(sse4);
2508
2509 LUMA_FILTERS(sse4);
2510 CHROMA_420_FILTERS(sse4);
2511 CHROMA_422_FILTERS(sse4);
2512 CHROMA_444_FILTERS(sse4);
2513 CHROMA_420_VSS_FILTERS_SSE4(_sse4);
2514 CHROMA_422_VSS_FILTERS_SSE4(_sse4);
2515 CHROMA_420_VSP_FILTERS_SSE4(_sse4);
2516 CHROMA_422_VSP_FILTERS_SSE4(_sse4);
2517 CHROMA_444_VSP_FILTERS_SSE4(_sse4);
2518
2519 // MUST be done after LUMA_FILTERS() to overwrite default version
2520 p.pu[LUMA_8x8].luma_hvpp = PFX(interp_8tap_hv_pp_8x8_ssse3);
2521
2522 LUMA_CU_BLOCKCOPY(ps, sse4);
2523 CHROMA_420_CU_BLOCKCOPY(ps, sse4);
2524 CHROMA_422_CU_BLOCKCOPY(ps, sse4);
2525
2526 p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_sse4);
2527 p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_sse4);
2528 p.cu[BLOCK_8x8].dct = PFX(dct8_sse4);
2529 p.denoiseDct = PFX(denoise_dct_sse4);
2530 p.quant = PFX(quant_sse4);
2531 p.nquant = PFX(nquant_sse4);
2532 p.dequant_normal = PFX(dequant_normal_sse4);
2533 p.dequant_scaling = PFX(dequant_scaling_sse4);
2534
2535 p.weight_pp = PFX(weight_pp_sse4);
2536 p.weight_sp = PFX(weight_sp_sse4);
2537
2538 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
2539 p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
2540 p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
2541 p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
2542
2543 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
2544 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
2545 ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
2546
2547 INTRA_ANG_SSE4_COMMON(sse4);
2548 INTRA_ANG_SSE4(sse4);
2549
2550 p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
2551 p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
2552
2553 p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_sse4);
2554 p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_sse4);
2555 p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_sse4);
2556
2557 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
2558 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
2559 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = PFX(filterPixelToShort_4x2_sse4);
2560 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = PFX(filterPixelToShort_4x4_sse4);
2561 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = PFX(filterPixelToShort_4x8_sse4);
2562 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = PFX(filterPixelToShort_4x16_sse4);
2563 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s = PFX(filterPixelToShort_6x8_sse4);
2564 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
2565 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
2566 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = PFX(filterPixelToShort_4x4_sse4);
2567 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = PFX(filterPixelToShort_4x8_sse4);
2568 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = PFX(filterPixelToShort_4x16_sse4);
2569 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = PFX(filterPixelToShort_4x32_sse4);
2570 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
2571
2572 #if X86_64
2573 p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
2574 p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
2575 p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
2576 p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
2577 p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
2578
2579 ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
2580 ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
2581
2582 p.costCoeffNxN = PFX(costCoeffNxN_sse4);
2583 #endif
2584 p.costCoeffRemain = PFX(costCoeffRemain_sse4);
2585 }
2586 if (cpuMask & X265_CPU_AVX)
2587 {
2588 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_avx);
2589 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx);
2590 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_avx);
2591 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_avx);
2592 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_avx);
2593 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_avx);
2594 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);
2595 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx);
2596 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx);
2597 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx);
2598 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx);
2599 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx);
2600 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx);
2601 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
2602 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
2603 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
2604 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx);
2605 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx);
2606 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx);
2607 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx);
2608 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx);
2609 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx);
2610 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_avx);
2611 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx);
2612 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
2613 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_avx);
2614 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
2615 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
2616 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
2617 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = PFX(pixel_ssd_8x8_avx);
2618 p.pu[LUMA_16x4].sad_x4 = PFX(pixel_sad_x4_16x4_avx);
2619 p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx);
2620 p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx);
2621 p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx);
2622 p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx);
2623 p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx);
2624 p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_avx);
2625 p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx);
2626
2627 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = PFX(blockcopy_pp_32x8_avx);
2628 p.pu[LUMA_32x8].copy_pp = PFX(blockcopy_pp_32x8_avx);
2629
2630 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx);
2631 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx);
2632 p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx);
2633
2634 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx);
2635 p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx);
2636
2637 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx);
2638 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx);
2639 p.pu[LUMA_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx);
2640
2641 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx);
2642
2643 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx);
2644 p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx);
2645
2646 p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx);
2647 p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx);
2648 p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx);
2649 p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_avx);
2650
2651 p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
2652
2653 /* The following primitives have been disabled since performance compared to SSE4.2 is negligible/negative */
2654 #if 0
2655 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_avx);
2656 p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx);
2657 p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx);
2658 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx);
2659 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx);
2660 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx);
2661 p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx);
2662 p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
2663 p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
2664 p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
2665 p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_avx);
2666 p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = PFX(pixel_satd_4x4_avx);
2667 p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_avx);
2668 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_avx);
2669 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_avx);
2670 p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx);
2671
2672 ALL_LUMA_PU(satd, pixel_satd, avx);
2673 p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx);
2674 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx);
2675 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
2676 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
2677 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
2678 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx);
2679 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx);
2680 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
2681 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx);
2682 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
2683 p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx);
2684 p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx);
2685 p.cu[BLOCK_8x8].sse_pp = PFX(pixel_ssd_8x8_avx);
2686 p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_ssd_8x16_avx);
2687 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_ssd_16x32_avx);
2688 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_ssd_32x64_avx);
2689 ASSIGN_SSE_SS(avx);
2690
2691 LUMA_VAR(avx);
2692 p.pu[LUMA_12x16].sad_x3 = PFX(pixel_sad_x3_12x16_avx);
2693 p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx);
2694 HEVC_SAD_X3(avx);
2695
2696 p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx);
2697 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx);
2698 p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx);
2699 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx);
2700 p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx);
2701 p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx);
2702 p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx);
2703 p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx);
2704 p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx);
2705 p.pu[LUMA_12x16].sad_x4 = PFX(pixel_sad_x4_12x16_avx);
2706 p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx);
2707 p.pu[LUMA_24x32].sad_x4 = PFX(pixel_sad_x4_24x32_avx);
2708 p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx);
2709 p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx);
2710 p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx);
2711 p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx);
2712 p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx)
2713
2714 p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
2715 p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
2716 p.frameInitLowres = PFX(frame_init_lowres_core_avx);
2717 #endif
2718 }
2719 if (cpuMask & X265_CPU_XOP)
2720 {
2721 //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
2722 ALL_LUMA_PU(satd, pixel_satd, xop);
2723 ASSIGN_SA8D(xop);
2724 LUMA_VAR(xop);
2725 p.cu[BLOCK_8x8].sse_pp = PFX(pixel_ssd_8x8_xop);
2726 p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
2727 p.frameInitLowres = PFX(frame_init_lowres_core_xop);
2728 }
2729 #if X86_64
2730 if (cpuMask & X265_CPU_AVX2)
2731 {
2732 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
2733
2734 p.planecopy_sp = PFX(downShift_16_avx2);
2735
2736 p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
2737
2738 p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
2739 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
2740
2741 p.idst4x4 = PFX(idst4_avx2);
2742 p.dst4x4 = PFX(dst4_avx2);
2743 p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
2744 p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
2745 p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
2746 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
2747 p.saoCuOrgE2[0] = PFX(saoCuOrgE2_avx2);
2748 p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2);
2749 p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2);
2750 p.saoCuOrgB0 = PFX(saoCuOrgB0_avx2);
2751 p.sign = PFX(calSign_avx2);
2752
2753 p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
2754 p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
2755 p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
2756 p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
2757 p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
2758
2759 p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
2760 p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
2761 p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
2762 p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
2763 p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
2764 p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2);
2765 p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2);
2766 p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
2767 p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx2);
2768 p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx2);
2769 p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx2);
2770 p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_avx2);
2771 p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx2);
2772 p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx2);
2773 p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx2);
2774 p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx2);
2775 p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx2);
2776 p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx2);
2777 p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx2);
2778 p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx2);
2779 p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
2780 p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);
2781 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2);
2782 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2);
2783 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2);
2784 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx2);
2785 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx2);
2786 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx2);
2787 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2);
2788 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2);
2789 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2);
2790 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx2);
2791 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2);
2792 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx2);
2793 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2);
2794 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
2795 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_avx2);
2796 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
2797 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
2798 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
2799 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2);
2800
2801 p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
2802 p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
2803 p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
2804 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
2805 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
2806 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
2807
2808 p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
2809 p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
2810 p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx2);
2811 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
2812 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
2813 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_avx2);
2814 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
2815
2816 p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
2817 p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
2818 p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx2);
2819 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
2820 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
2821 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_avx2);
2822 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx2);
2823
2824 p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
2825 p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
2826 p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
2827 p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
2828 p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
2829 p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
2830 p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
2831 p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
2832 p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
2833
2834 p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx2);
2835 p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx2);
2836 p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx2);
2837 p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
2838 p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx2);
2839 p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx2);
2840 p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx2);
2841 p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx2);
2842 p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx2);
2843 p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx2);
2844 p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx2);
2845 p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx2);
2846 p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx2);
2847 p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx2);
2848 p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx2);
2849 p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx2);
2850 p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx2);
2851 p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx2);
2852
2853 p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
2854 p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
2855 p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx2);
2856 p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx2);
2857 p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx2);
2858 p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx2);
2859 p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx2);
2860 p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
2861 p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
2862 p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
2863 p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx2);
2864 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx2);
2865 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx2);
2866 p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx2);
2867 p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx2);
2868 p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx2);
2869
2870 p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
2871 p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
2872 p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
2873 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
2874 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
2875
2876 p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
2877 p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
2878
2879 p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
2880 p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
2881 p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
2882
2883 p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_avx2);
2884 p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx2);
2885 p.cu[BLOCK_16x16].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_16_avx2);
2886 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx2);
2887 p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8_avx2);
2888 p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx2);
2889 p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx2);
2890
2891 p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
2892 p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2);
2893 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2);
2894
2895 p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8_avx2);
2896 p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
2897 p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
2898
2899 p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8x8_avx2);
2900 p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx2);
2901 p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32x32_avx2);
2902
2903 p.denoiseDct = PFX(denoise_dct_avx2);
2904 p.quant = PFX(quant_avx2);
2905 p.nquant = PFX(nquant_avx2);
2906 p.dequant_normal = PFX(dequant_normal_avx2);
2907 p.dequant_scaling = PFX(dequant_scaling_avx2);
2908
2909 p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
2910 p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
2911
2912 p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
2913 p.weight_pp = PFX(weight_pp_avx2);
2914 p.weight_sp = PFX(weight_sp_avx2);
2915
2916 // intra_pred functions
2917 p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_avx2);
2918 p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_avx2);
2919 p.cu[BLOCK_4x4].intra_pred[5] = PFX(intra_pred_ang4_5_avx2);
2920 p.cu[BLOCK_4x4].intra_pred[6] = PFX(intra_pred_ang4_6_avx2);
2921 p.cu[BLOCK_4x4].intra_pred[7] = PFX(intra_pred_ang4_7_avx2);
2922 p.cu[BLOCK_4x4].intra_pred[8] = PFX(intra_pred_ang4_8_avx2);
2923 p.cu[BLOCK_4x4].intra_pred[9] = PFX(intra_pred_ang4_9_avx2);
2924 p.cu[BLOCK_4x4].intra_pred[11] = PFX(intra_pred_ang4_11_avx2);
2925 p.cu[BLOCK_4x4].intra_pred[12] = PFX(intra_pred_ang4_12_avx2);
2926 p.cu[BLOCK_4x4].intra_pred[13] = PFX(intra_pred_ang4_13_avx2);
2927 p.cu[BLOCK_4x4].intra_pred[14] = PFX(intra_pred_ang4_14_avx2);
2928 p.cu[BLOCK_4x4].intra_pred[15] = PFX(intra_pred_ang4_15_avx2);
2929 p.cu[BLOCK_4x4].intra_pred[16] = PFX(intra_pred_ang4_16_avx2);
2930 p.cu[BLOCK_4x4].intra_pred[17] = PFX(intra_pred_ang4_17_avx2);
2931 p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_19_avx2);
2932 p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_20_avx2);
2933 p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_21_avx2);
2934 p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_22_avx2);
2935 p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_avx2);
2936 p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_avx2);
2937 p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_avx2);
2938 p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_avx2);
2939 p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_avx2);
2940 p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_avx2);
2941 p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_30_avx2);
2942 p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_31_avx2);
2943 p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_avx2);
2944 p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_avx2);
2945 p.cu[BLOCK_8x8].intra_pred[3] = PFX(intra_pred_ang8_3_avx2);
2946 p.cu[BLOCK_8x8].intra_pred[4] = PFX(intra_pred_ang8_4_avx2);
2947 p.cu[BLOCK_8x8].intra_pred[5] = PFX(intra_pred_ang8_5_avx2);
2948 p.cu[BLOCK_8x8].intra_pred[6] = PFX(intra_pred_ang8_6_avx2);
2949 p.cu[BLOCK_8x8].intra_pred[7] = PFX(intra_pred_ang8_7_avx2);
2950 p.cu[BLOCK_8x8].intra_pred[8] = PFX(intra_pred_ang8_8_avx2);
2951 p.cu[BLOCK_8x8].intra_pred[9] = PFX(intra_pred_ang8_9_avx2);
2952 p.cu[BLOCK_8x8].intra_pred[11] = PFX(intra_pred_ang8_11_avx2);
2953 p.cu[BLOCK_8x8].intra_pred[12] = PFX(intra_pred_ang8_12_avx2);
2954 p.cu[BLOCK_8x8].intra_pred[13] = PFX(intra_pred_ang8_13_avx2);
2955 p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
2956 p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
2957 p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
2958 p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
2959 p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
2960 p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
2961 p.cu[BLOCK_8x8].intra_pred[23] = PFX(intra_pred_ang8_23_avx2);
2962 p.cu[BLOCK_8x8].intra_pred[24] = PFX(intra_pred_ang8_24_avx2);
2963 p.cu[BLOCK_8x8].intra_pred[25] = PFX(intra_pred_ang8_25_avx2);
2964 p.cu[BLOCK_8x8].intra_pred[27] = PFX(intra_pred_ang8_27_avx2);
2965 p.cu[BLOCK_8x8].intra_pred[28] = PFX(intra_pred_ang8_28_avx2);
2966 p.cu[BLOCK_8x8].intra_pred[29] = PFX(intra_pred_ang8_29_avx2);
2967 p.cu[BLOCK_8x8].intra_pred[30] = PFX(intra_pred_ang8_30_avx2);
2968 p.cu[BLOCK_8x8].intra_pred[31] = PFX(intra_pred_ang8_31_avx2);
2969 p.cu[BLOCK_8x8].intra_pred[32] = PFX(intra_pred_ang8_32_avx2);
2970 p.cu[BLOCK_8x8].intra_pred[33] = PFX(intra_pred_ang8_33_avx2);
2971 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
2972 p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2);
2973 p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx2);
2974 p.cu[BLOCK_16x16].intra_pred[6] = PFX(intra_pred_ang16_6_avx2);
2975 p.cu[BLOCK_16x16].intra_pred[7] = PFX(intra_pred_ang16_7_avx2);
2976 p.cu[BLOCK_16x16].intra_pred[8] = PFX(intra_pred_ang16_8_avx2);
2977 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx2);
2978 p.cu[BLOCK_16x16].intra_pred[12] = PFX(intra_pred_ang16_12_avx2);
2979 p.cu[BLOCK_16x16].intra_pred[11] = PFX(intra_pred_ang16_11_avx2);
2980 p.cu[BLOCK_16x16].intra_pred[13] = PFX(intra_pred_ang16_13_avx2);
2981 p.cu[BLOCK_16x16].intra_pred[14] = PFX(intra_pred_ang16_14_avx2);
2982 p.cu[BLOCK_16x16].intra_pred[15] = PFX(intra_pred_ang16_15_avx2);
2983 p.cu[BLOCK_16x16].intra_pred[16] = PFX(intra_pred_ang16_16_avx2);
2984 p.cu[BLOCK_16x16].intra_pred[17] = PFX(intra_pred_ang16_17_avx2);
2985 p.cu[BLOCK_16x16].intra_pred[25] = PFX(intra_pred_ang16_25_avx2);
2986 p.cu[BLOCK_16x16].intra_pred[28] = PFX(intra_pred_ang16_28_avx2);
2987 p.cu[BLOCK_16x16].intra_pred[27] = PFX(intra_pred_ang16_27_avx2);
2988 p.cu[BLOCK_16x16].intra_pred[29] = PFX(intra_pred_ang16_29_avx2);
2989 p.cu[BLOCK_16x16].intra_pred[30] = PFX(intra_pred_ang16_30_avx2);
2990 p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx2);
2991 p.cu[BLOCK_16x16].intra_pred[32] = PFX(intra_pred_ang16_32_avx2);
2992 p.cu[BLOCK_16x16].intra_pred[33] = PFX(intra_pred_ang16_33_avx2);
2993 p.cu[BLOCK_16x16].intra_pred[24] = PFX(intra_pred_ang16_24_avx2);
2994 p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2);
2995 p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx2);
2996 p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx2);
2997 p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx2);
2998 p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx2);
2999 p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2);
3000 p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2);
3001 p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
3002 p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
3003 p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2);
3004 p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2);
3005 p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2);
3006 p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2);
3007 p.cu[BLOCK_32x32].intra_pred[16] = PFX(intra_pred_ang32_16_avx2);
3008 p.cu[BLOCK_32x32].intra_pred[17] = PFX(intra_pred_ang32_17_avx2);
3009 p.cu[BLOCK_32x32].intra_pred[19] = PFX(intra_pred_ang32_19_avx2);
3010 p.cu[BLOCK_32x32].intra_pred[20] = PFX(intra_pred_ang32_20_avx2);
3011 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
3012 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
3013 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
3014 p.cu[BLOCK_32x32].intra_pred[27] = PFX(intra_pred_ang32_27_avx2);
3015 p.cu[BLOCK_32x32].intra_pred[28] = PFX(intra_pred_ang32_28_avx2);
3016 p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx2);
3017 p.cu[BLOCK_32x32].intra_pred[30] = PFX(intra_pred_ang32_30_avx2);
3018 p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx2);
3019 p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx2);
3020 p.cu[BLOCK_32x32].intra_pred[33] = PFX(intra_pred_ang32_33_avx2);
3021 p.cu[BLOCK_32x32].intra_pred[25] = PFX(intra_pred_ang32_25_avx2);
3022 p.cu[BLOCK_32x32].intra_pred[24] = PFX(intra_pred_ang32_24_avx2);
3023 p.cu[BLOCK_32x32].intra_pred[23] = PFX(intra_pred_ang32_23_avx2);
3024 p.cu[BLOCK_32x32].intra_pred[22] = PFX(intra_pred_ang32_22_avx2);
3025 p.cu[BLOCK_32x32].intra_pred[21] = PFX(intra_pred_ang32_21_avx2);
3026 p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx2);
3027 p.cu[BLOCK_32x32].intra_pred[3] = PFX(intra_pred_ang32_3_avx2);
3028
3029 // all_angs primitives
3030 p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_avx2);
3031
3032 p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx2);
3033 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx2);
3034 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx2);
3035
3036 p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx2);
3037
3038 // copy_ps primitives
3039 p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_avx2);
3040 p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_avx2);
3041 p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_avx2);
3042
3043 ALL_LUMA_TU_S(dct, dct, avx2);
3044 ALL_LUMA_TU_S(idct, idct, avx2);
3045 ALL_LUMA_CU_S(transpose, transpose, avx2);
3046
3047 ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
3048 ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
3049 ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
3050 ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
3051 p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);
3052
// luma_hpp for the full set of luma PU; 4x8, 4x16, 24x32 and 12x16 (previously noted here as missing) are assigned below
3054 p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
3055 p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
3056 p.pu[LUMA_4x16].luma_hpp = PFX(interp_8tap_horiz_pp_4x16_avx2);
3057 p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_avx2);
3058 p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_avx2);
3059 p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_avx2);
3060 p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_avx2);
3061 p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx2);
3062 p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx2);
3063 p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx2);
3064 p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx2);
3065 p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx2);
3066 p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx2);
3067 p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx2);
3068 p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx2);
3069 p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx2);
3070 p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx2);
3071 p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx2);
3072 p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx2);
3073 p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx2);
3074 p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx2);
3075 p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx2);
3076 p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
3077 p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
3078 p.pu[LUMA_12x16].luma_hpp = PFX(interp_8tap_horiz_pp_12x16_avx2);
3079
3080 p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
3081 p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
3082 p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
3083 p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_avx2);
3084 p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_avx2);
3085 p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_avx2);
3086 p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_avx2);
3087 p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_avx2);
3088 p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx2);
3089 p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_avx2);
3090 p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx2);
3091 p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx2);
3092 p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx2);
3093 p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx2);
3094 p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx2);
3095 p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx2);
3096 p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx2);
3097 p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx2);
3098 p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
3099 p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx2);
3100 p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx2);
3101 p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx2);
3102 p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx2);
3103 p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
3104 p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
3105 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx2);
3106 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx2);
3107 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx2);
3108 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = PFX(interp_4tap_horiz_pp_2x4_avx2);
3109 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = PFX(interp_4tap_horiz_pp_2x8_avx2);
3110 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = PFX(interp_4tap_horiz_pp_4x2_avx2);
3111 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx2);
3112 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx2);
3113 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx2);
3114 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx2);
3115 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hpp = PFX(interp_4tap_horiz_pp_6x8_avx2);
3116 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hpp = PFX(interp_4tap_horiz_pp_6x16_avx2);
3117 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx2);
3118 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx2);
3119 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx2);
3120 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hpp = PFX(interp_4tap_horiz_pp_8x2_avx2);
3121 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx2);
3122 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hpp = PFX(interp_4tap_horiz_pp_8x6_avx2);
3123 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx2);
3124 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx2);
3125 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hpp = PFX(interp_4tap_horiz_pp_12x16_avx2);
3126 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx2);
3127 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx2);
3128 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps = PFX(interp_4tap_horiz_ps_4x4_avx2);
3129 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
3130 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hps = PFX(interp_4tap_horiz_ps_4x2_avx2);
3131 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hps = PFX(interp_4tap_horiz_ps_4x8_avx2);
3132 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hps = PFX(interp_4tap_horiz_ps_4x16_avx2);
3133 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hps = PFX(interp_4tap_horiz_ps_8x2_avx2);
3134 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
3135 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hps = PFX(interp_4tap_horiz_ps_8x6_avx2);
3136 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx2);
3137 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
3138 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx2);
3139 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx2);
3140 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx2);
3141 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx2);
3142 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx2);
3143 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx2);
3144 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx2);
3145 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx2);
3146 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hps = PFX(interp_4tap_horiz_ps_2x4_avx2);
3147 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hps = PFX(interp_4tap_horiz_ps_2x8_avx2);
3148 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps = PFX(interp_4tap_horiz_ps_6x8_avx2);
3149 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx2);
3150 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
3151 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
3152 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_avx2);
3153 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx2);
3154 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = PFX(interp_4tap_vert_pp_2x4_avx2);
3155 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = PFX(interp_4tap_vert_pp_2x8_avx2);
3156 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_avx2);
3157 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vpp = PFX(interp_4tap_vert_pp_8x2_avx2);
3158 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_avx2);
3159 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vpp = PFX(interp_4tap_vert_pp_8x6_avx2);
3160 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx2);
3161 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx2);
3162 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vpp = PFX(interp_4tap_vert_pp_12x16_avx2);
3163 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx2);
3164 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx2);
3165 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx2);
3166 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx2);
3167
3168 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vps = PFX(interp_4tap_vert_ps_2x4_avx2);
3169 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vps = PFX(interp_4tap_vert_ps_2x8_avx2);
3170 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_avx2);
3171 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_avx2);
3172 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vps = PFX(interp_4tap_vert_ps_8x2_avx2);
3173 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_avx2);
3174 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vps = PFX(interp_4tap_vert_ps_8x6_avx2);
3175 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx2);
3176 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vps = PFX(interp_4tap_vert_ps_12x16_avx2);
3177 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx2);
3178 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx2);
3179 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx2);
3180 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_avx2);
3181 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx2);
3182 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx2);
3183 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx2);
3184 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx2);
3185 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx2);
3186 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx2);
3187
3188 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vsp = PFX(interp_4tap_vert_sp_4x4_avx2);
3189 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx2);
3190 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx2);
3191 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx2);
3192 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vsp = PFX(interp_4tap_vert_sp_2x4_avx2);
3193 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vsp = PFX(interp_4tap_vert_sp_2x8_avx2);
3194 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vsp = PFX(interp_4tap_vert_sp_4x2_avx2);
3195 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vsp = PFX(interp_4tap_vert_sp_4x8_avx2);
3196 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vsp = PFX(interp_4tap_vert_sp_4x16_avx2);
3197 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vsp = PFX(interp_4tap_vert_sp_6x8_avx2);
3198 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vsp = PFX(interp_4tap_vert_sp_8x2_avx2);
3199 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_avx2);
3200 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vsp = PFX(interp_4tap_vert_sp_8x6_avx2);
3201 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx2);
3202 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx2);
3203 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vsp = PFX(interp_4tap_vert_sp_12x16_avx2);
3204 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx2);
3205 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx2);
3206 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx2);
3207 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx2);
3208 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx2);
3209 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx2);
3210 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx2);
3211 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx2);
3212
3213 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vss = PFX(interp_4tap_vert_ss_4x4_avx2);
3214 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx2);
3215 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx2);
3216 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx2);
3217 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vss = PFX(interp_4tap_vert_ss_2x4_avx2);
3218 p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vss = PFX(interp_4tap_vert_ss_2x8_avx2);
3219 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vss = PFX(interp_4tap_vert_ss_4x2_avx2);
3220 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vss = PFX(interp_4tap_vert_ss_4x8_avx2);
3221 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vss = PFX(interp_4tap_vert_ss_4x16_avx2);
3222 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vss = PFX(interp_4tap_vert_ss_6x8_avx2);
3223 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vss = PFX(interp_4tap_vert_ss_8x2_avx2);
3224 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx2);
3225 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vss = PFX(interp_4tap_vert_ss_8x6_avx2);
3226 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx2);
3227 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx2);
3228 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vss = PFX(interp_4tap_vert_ss_12x16_avx2);
3229 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx2);
3230 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx2);
3231 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx2);
3232 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx2);
3233 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx2);
3234 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx2);
3235 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx2);
3236
3237 //i422 for chroma_vss
3238 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vss = PFX(interp_4tap_vert_ss_4x8_avx2);
3239 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx2);
3240 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx2);
3241 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vss = PFX(interp_4tap_vert_ss_4x4_avx2);
3242 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vss = PFX(interp_4tap_vert_ss_2x8_avx2);
3243 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx2);
3244 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vss = PFX(interp_4tap_vert_ss_4x16_avx2);
3245 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx2);
3246 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx2);
3247 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx2);
3248 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx2);
3249 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx2);
3250 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx2);
3251 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = PFX(interp_4tap_vert_ss_8x12_avx2);
3252 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vss = PFX(interp_4tap_vert_ss_6x16_avx2);
3253 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vss = PFX(interp_4tap_vert_ss_2x16_avx2);
3254 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx2);
3255 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vss = PFX(interp_4tap_vert_ss_12x32_avx2);
3256 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vss = PFX(interp_4tap_vert_ss_4x32_avx2);
3257 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vss = PFX(interp_4tap_vert_ss_2x4_avx2);
3258
3259 //i444 for chroma_vss
3260 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vss = PFX(interp_4tap_vert_ss_4x4_avx2);
3261 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx2);
3262 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx2);
3263 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx2);
3264 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx2);
3265 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vss = PFX(interp_4tap_vert_ss_4x8_avx2);
3266 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx2);
3267 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx2);
3268 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx2);
3269 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx2);
3270 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx2);
3271 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vss = PFX(interp_4tap_vert_ss_12x16_avx2);
3272 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx2);
3273 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vss = PFX(interp_4tap_vert_ss_4x16_avx2);
3274 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx2);
3275 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx2);
3276 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx2);
3277 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx2);
3278 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx2);
3279 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx2);
3280 p.pu[LUMA_16x16].luma_hvpp = PFX(interp_8tap_hv_pp_16x16_avx2);
3281
3282 ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
3283 p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
3284
3285 p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
3286 p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
3287 p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
3288 p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx2);
3289 p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx2);
3290 p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx2);
3291 p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx2);
3292 p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx2);
3293 p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx2);
3294 p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
3295 p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
3296
3297 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
3298 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
3299 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
3300 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
3301 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
3302 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
3303 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
3304 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
3305 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
3306 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
3307
3308 //i422 for chroma_hpp
3309 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = PFX(interp_4tap_horiz_pp_12x32_avx2);
3310 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = PFX(interp_4tap_horiz_pp_24x64_avx2);
3311 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = PFX(interp_4tap_horiz_pp_2x16_avx2);
3312 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = PFX(interp_4tap_horiz_pp_2x16_avx2);
3313 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx2);
3314 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx2);
3315 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx2);
3316 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx2);
3317 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = PFX(interp_4tap_horiz_pp_8x64_avx2);
3318 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = PFX(interp_4tap_horiz_pp_8x12_avx2);
3319 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx2);
3320 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx2);
3321 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx2);
3322 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx2);
3323 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx2);
3324 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx2);
3325 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx2);
3326 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx2);
3327 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx2);
3328 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = PFX(interp_4tap_horiz_pp_2x8_avx2);
3329
3330 //i444 filters hpp
3331 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx2);
3332 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx2);
3333 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx2);
3334 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx2);
3335 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx2);
3336 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx2);
3337
3338 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx2);
3339 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx2);
3340 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx2);
3341 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx2);
3342 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx2);
3343
3344 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hpp = PFX(interp_4tap_horiz_pp_12x16_avx2);
3345 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx2);
3346
3347 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx2);
3348 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx2);
3349 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx2);
3350 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx2);
3351
3352 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx2);
3353 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx2);
3354 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx2);
3355 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx2);
3356 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx2);
3357
3358 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hps = PFX(interp_4tap_horiz_ps_4x4_avx2);
3359 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hps = PFX(interp_4tap_horiz_ps_4x8_avx2);
3360 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hps = PFX(interp_4tap_horiz_ps_4x16_avx2);
3361
3362 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
3363 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
3364 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
3365 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx2);
3366 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hps = PFX(interp_4tap_horiz_ps_8x64_avx2);
3367 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hps = PFX(interp_4tap_horiz_ps_8x12_avx2);
3368
3369 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx2);
3370 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx2);
3371 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx2);
3372 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx2);
3373 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx2);
3374
3375 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx2);
3376 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx2);
3377 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx2);
3378 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx2);
3379
3380 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hps = PFX(interp_4tap_horiz_ps_2x8_avx2);
3381 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = PFX(interp_4tap_horiz_ps_24x64_avx2);
3382 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hps = PFX(interp_4tap_horiz_ps_2x16_avx2);
3383
3384 //i444 chroma_hps
3385 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx2);
3386 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx2);
3387 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx2);
3388 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx2);
3389
3390 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hps = PFX(interp_4tap_horiz_ps_4x4_avx2);
3391 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
3392 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx2);
3393 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx2);
3394
3395 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hps = PFX(interp_4tap_horiz_ps_4x8_avx2);
3396 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hps = PFX(interp_4tap_horiz_ps_4x16_avx2);
3397
3398 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
3399 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
3400 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx2);
3401
3402 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx2);
3403 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx2);
3404 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx2);
3405 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx2);
3406 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx2);
3407
3408 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx2);
3409 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx2);
3410
3411 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx2);
3412 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx2);
3413 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx2);
3414 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx2);
3415
3416 //i422 for chroma_vsp
3417 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vsp = PFX(interp_4tap_vert_sp_4x8_avx2);
3418 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx2);
3419 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx2);
3420 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vsp = PFX(interp_4tap_vert_sp_4x4_avx2);
3421 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vsp = PFX(interp_4tap_vert_sp_2x8_avx2);
3422 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx2);
3423 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vsp = PFX(interp_4tap_vert_sp_4x16_avx2);
3424 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx2);
3425 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx2);
3426 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx2);
3427 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_avx2);
3428 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx2);
3429 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx2);
3430 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx2);
3431 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx2);
3432 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vsp = PFX(interp_4tap_vert_sp_24x64_avx2);
3433 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vsp = PFX(interp_4tap_vert_sp_8x64_avx2);
3434 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx2);
3435 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vsp = PFX(interp_4tap_vert_sp_8x12_avx2);
3436 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vsp = PFX(interp_4tap_vert_sp_6x16_avx2);
3437 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vsp = PFX(interp_4tap_vert_sp_2x16_avx2);
3438 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_avx2);
3439 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vsp = PFX(interp_4tap_vert_sp_12x32_avx2);
3440 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vsp = PFX(interp_4tap_vert_sp_4x32_avx2);
3441 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vsp = PFX(interp_4tap_vert_sp_2x4_avx2);
3442
3443 //i444 for chroma_vsp
3444 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vsp = PFX(interp_4tap_vert_sp_4x4_avx2);
3445 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx2);
3446 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx2);
3447 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx2);
3448 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
3449 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_avx2);
3450 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vsp = PFX(interp_4tap_vert_sp_4x8_avx2);
3451 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx2);
3452 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx2);
3453 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx2);
3454 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx2);
3455 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx2);
3456 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vsp = PFX(interp_4tap_vert_sp_12x16_avx2);
3457 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx2);
3458 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vsp = PFX(interp_4tap_vert_sp_4x16_avx2);
3459 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx2);
3460 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx2);
3461 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx2);
3462 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx2);
3463 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx2);
3464 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx2);
3465 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
3466 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx2);
3467 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx2);
3468 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx2);
3469
3470 //i422 for chroma_vps
3471 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_avx2);
3472 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx2);
3473 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_avx2);
3474 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vps = PFX(interp_4tap_vert_ps_2x8_avx2);
3475 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx2);
3476 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_avx2);
3477 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx2);
3478 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx2);
3479 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_avx2);
3480 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx2);
3481 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx2);
3482 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx2);
3483 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = PFX(interp_4tap_vert_ps_8x64_avx2);
3484 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx2);
3485 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx2);
3486 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vps = PFX(interp_4tap_vert_ps_12x32_avx2);
3487 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vps = PFX(interp_4tap_vert_ps_8x12_avx2);
3488 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vps = PFX(interp_4tap_vert_ps_2x4_avx2);
3489 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx2);
3490 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vps = PFX(interp_4tap_vert_ps_2x16_avx2);
3491 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = PFX(interp_4tap_vert_ps_4x32_avx2);
3492 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vps = PFX(interp_4tap_vert_ps_24x64_avx2);
3493
3494 //i444 for chroma_vps
3495 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = PFX(interp_4tap_vert_ps_4x4_avx2);
3496 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx2);
3497 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx2);
3498 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx2);
3499 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_avx2);
3500 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = PFX(interp_4tap_vert_ps_4x8_avx2);
3501 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx2);
3502 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx2);
3503 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx2);
3504 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx2);
3505 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vps = PFX(interp_4tap_vert_ps_12x16_avx2);
3506 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx2);
3507 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = PFX(interp_4tap_vert_ps_4x16_avx2);
3508 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx2);
3509 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx2);
3510 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx2);
3511 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx2);
3512 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx2);
3513 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx2);
3514 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx2);
3515 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx2);
3516 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx2);
3517
3518 //i422 for chroma_vpp
3519 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_avx2);
3520 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx2);
3521 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
3522 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vpp = PFX(interp_4tap_vert_pp_2x8_avx2);
3523 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx2);
3524 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_avx2);
3525 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx2);
3526 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx2);
3527 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_avx2);
3528 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx2);
3529 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = PFX(interp_4tap_vert_pp_8x64_avx2);
3530 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx2);
3531 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx2);
3532 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vpp = PFX(interp_4tap_vert_pp_12x32_avx2);
3533 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vpp = PFX(interp_4tap_vert_pp_8x12_avx2);
3534 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vpp = PFX(interp_4tap_vert_pp_2x4_avx2);
3535 p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = PFX(interp_4tap_vert_pp_2x16_avx2);
3536 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vpp = PFX(interp_4tap_vert_pp_4x32_avx2);
3537 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = PFX(interp_4tap_vert_pp_24x64_avx2);
3538
3539 //i444 for chroma_vpp
3540 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = PFX(interp_4tap_vert_pp_4x4_avx2);
3541 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx2);
3542 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx2);
3543 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_avx2);
3544 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vpp = PFX(interp_4tap_vert_pp_4x8_avx2);
3545 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx2);
3546 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx2);
3547 p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vpp = PFX(interp_4tap_vert_pp_12x16_avx2);
3548 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vpp = PFX(interp_4tap_vert_pp_4x16_avx2);
3549 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx2);
3550 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx2);
3551 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx2);
3552 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx2);
3553 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx2);
3554 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx2);
3555 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx2);
3556 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx2);
3557 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
3558
3559 p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
3560
3561 if (cpuMask & X265_CPU_BMI2)
3562 p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
3563
3564 p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
3565 p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
3566 p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
3567 p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
3568 p.planeClipAndMax = PFX(planeClipAndMax_avx2);
3569
3570 /* The following primitives have been disabled since performance compared to SSE is negligible/negative */
3571 #if 0
3572 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
3573 p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2);
3574 p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
3575 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2);
3576 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2);
3577 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2);
3578 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2);
3579 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = PFX(addAvg_8x4_avx2);
3580 p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_avx2);
3581 p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
3582 p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
3583 p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
3584 p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
3585 p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
3586 p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
3587 p.pu[LUMA_8x4].sad_x3 = PFX(pixel_sad_x3_8x4_avx2);
3588 p.pu[LUMA_8x16].sad_x3 = PFX(pixel_sad_x3_8x16_avx2);
3589 p.pu[LUMA_8x8].sad_x4 = PFX(pixel_sad_x4_8x8_avx2);
3590 p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx2);
3591 p.cu[BLOCK_4x4].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_4_avx2);
3592 p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4_avx2);
3593 p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4x4_avx2);
3594 p.cu[BLOCK_16x16].intra_pred[13] = PFX(intra_pred_ang16_13_avx2);
3595 p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_avx2);
3596 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_avx2);
3597 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_avx2);
3598 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = PFX(interp_4tap_horiz_pp_4x8_avx2);
3599 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = PFX(interp_4tap_horiz_pp_4x16_avx2);
3600 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx2);
3601 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx2);
3602 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx2);
3603 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = PFX(interp_4tap_vert_ps_4x2_avx2);
3604 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vps = PFX(interp_4tap_vert_ps_6x8_avx2);
3605 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = PFX(interp_4tap_horiz_pp_4x8_avx2);
3606 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = PFX(interp_4tap_horiz_pp_4x16_avx2);
3607 p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hpp = PFX(interp_4tap_horiz_pp_4x8_avx2);
3608 p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hpp = PFX(interp_4tap_horiz_pp_4x16_avx2);
3609 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx2);
3610 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx2);
3611 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx2);
3612 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx2);
3613 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx2);
3614 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx2);
3615
3616 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = PFX(interp_4tap_horiz_pp_4x4_avx2);
3617 p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hpp = PFX(interp_4tap_horiz_pp_4x4_avx2);
3618 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = PFX(interp_4tap_horiz_pp_4x4_avx2);
3619
3620 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx2);
3621 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx2);
3622 p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vpp = PFX(interp_4tap_vert_pp_6x8_avx2);
3623 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx2);
3624 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx2);
3625 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx2);
3626 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx2);
3627 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx2);
3628 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx2);
3629 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx2);
3630 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = PFX(interp_4tap_vert_pp_4x2_avx2);
3631
3632 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx2);
3633 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx2);
3634 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx2);
3635 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx2);
3636 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx2);
3637 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx2);
3638 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx2);
3639 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = PFX(interp_4tap_vert_ss_24x64_avx2);
3640 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx2);
3641 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx2);
3642
3643 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx2);
3644 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx2);
3645 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx2);
3646 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx2);
3647 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx2);
3648 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx2);
3649 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx2);
3650 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx2);
3651
3652 p.pu[LUMA_8x8].sad_x3 = PFX(pixel_sad_x3_8x8_avx2);
3653 p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx2);
3654 p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx2);
3655
3656 p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2);
3657 p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2);
3658 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2);
3659 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2);
3660 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2);
3661 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2);
3662 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_avx2);
3663 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2);
3664 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2);
3665 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_avx2);
3666
3667 p.cu[BLOCK_8x8].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_8_avx2);
3668 p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2);
3669
3670 p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2);
3671 p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
3672 p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
3673 #endif
3674 }
3675 #endif
3676 }
3677 #endif // if HIGH_BIT_DEPTH
3678
3679 } // namespace X265_NS
3680
3681 extern "C" {
3682 #ifdef __INTEL_COMPILER
3683
3684 /* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
3685 * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
3686 * adapted to x265's cpu schema. */
3687
3688 // Global variable indicating cpu
3689 int __intel_cpu_indicator = 0;
3690 // CPU dispatcher function
PFX(intel_cpu_indicator_init)3691 void PFX(intel_cpu_indicator_init)(void)
3692 {
3693 uint32_t cpu = x265::cpu_detect();
3694
3695 if (cpu & X265_CPU_AVX)
3696 __intel_cpu_indicator = 0x20000;
3697 else if (cpu & X265_CPU_SSE42)
3698 __intel_cpu_indicator = 0x8000;
3699 else if (cpu & X265_CPU_SSE4)
3700 __intel_cpu_indicator = 0x2000;
3701 else if (cpu & X265_CPU_SSSE3)
3702 __intel_cpu_indicator = 0x1000;
3703 else if (cpu & X265_CPU_SSE3)
3704 __intel_cpu_indicator = 0x800;
3705 else if (cpu & X265_CPU_SSE2 && !(cpu & X265_CPU_SSE2_IS_SLOW))
3706 __intel_cpu_indicator = 0x200;
3707 else if (cpu & X265_CPU_SSE)
3708 __intel_cpu_indicator = 0x80;
3709 else if (cpu & X265_CPU_MMX2)
3710 __intel_cpu_indicator = 8;
3711 else
3712 __intel_cpu_indicator = 1;
3713 }
3714
3715 /* __intel_cpu_indicator_init appears to have a non-standard calling convention that
3716 * assumes certain registers aren't preserved, so we'll route it through a function
3717 * that backs up all the registers. */
__intel_cpu_indicator_init(void)3718 void __intel_cpu_indicator_init(void)
3719 {
3720 x265_safe_intel_cpu_indicator_init();
3721 }
3722
3723 #else // ifdef __INTEL_COMPILER
3724 void PFX(intel_cpu_indicator_init)(void) {}
3725
3726 #endif // ifdef __INTEL_COMPILER
3727 }
3728