/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"

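// Eight 2-tap bilinear filters, one per eighth-pel offset 0..7. Each filter
// occupies 32 bytes (one __m256i) of interleaved taps {16 - 2 * offset,
// 2 * offset}, which is why the callers below index the table with
// (offset << 5). The taps sum to 16, matching the round-and-shift by 4 in
// FILTER_SRC.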
/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */

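// FILTER_SRC applies the 2-tap filter to the byte pairs previously interleaved
// into exp_src_lo/exp_src_hi: each 16-bit result is
// (tap0 * a + tap1 * b + 8) >> 4, i.e. a bilinear blend with rounding.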
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                    \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

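// CALC_SUM_SSE_INSIDE_LOOP accumulates, per row, the signed (src - dst)
// differences into sum_reg (16-bit lanes) and their squares into sse_reg
// (32-bit lanes).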
#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final reduction of the sum and sse accumulators
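// CALC_SUM_AND_SSE sign-extends the 16-bit sums to 32 bits (the cmpgt against
// zero produces the sign mask used by the unpacks), folds the 256-bit
// accumulators down with shifted adds, and writes the totals to *sse and sum.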
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

// Functions related to sub pixel variance width 16
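// The width-16 kernels below process two rows per iteration: each 256-bit
// register holds the current row in its low 128-bit lane and the next row in
// its high lane, loaded with the *_INSERT macros.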
#define LOAD_SRC_DST_INSERT(src_stride, dst_stride)              \
  /* load source and destination of 2 rows and insert */         \
  src_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);        \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

#define AVG_NEXT_SRC_INSERT(src_reg, size_stride)                              \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1);              \
  /* average between current and next stride source */                         \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride)                            \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1);      \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define LOAD_SRC_NEXT_BYTE_INSERT                                    \
  /* load source and another source from next row */                 \
  src_reg = _mm256_inserti128_si256(                                 \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),     \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);            \
  /* load source and next row source from 1 byte onwards */          \
  src_next_reg = _mm256_inserti128_si256(                            \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);

#define LOAD_DST_INSERT                                          \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

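// 128-bit helpers used by the width-16 kernels to finish the last row pair,
// where only a single extra source row is needed.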
#define LOAD_SRC_MERGE_128BIT(filter)                        \
  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));     \
  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);  \
  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);  \
  __m128i filter_128bit = _mm256_castsi256_si128(filter);    \
  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);

#define FILTER_SRC_128BIT(filter)             \
  /* filter the source */                     \
  src_lo = _mm_maddubs_epi16(src_lo, filter); \
  src_hi = _mm_maddubs_epi16(src_hi, filter); \
                                              \
  /* add 8 to source */                       \
  src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
  src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
                                              \
  /* divide source by 16 */                   \
  src_lo = _mm_srai_epi16(src_lo, 4);         \
  src_hi = _mm_srai_epi16(src_hi, 4);

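// Computes, for a 32-wide block of `height` rows, the sum of differences and
// the sum of squared differences between the sub-pixel interpolated source and
// dst. x_offset/y_offset select the eighth-pel phase: 0 means copy, 4 means a
// half-pel average, anything else uses the bilinear filter table. The SSE is
// written to *sse and the (signed) sum is returned; the fixed-size wrappers
// elsewhere in the library are expected to turn these into the final variance
// (roughly sse - sum * sum / (32 * height)).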
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 4 and y_offset = 0
  } else if (x_offset == 4) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 4 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg, src_avg;
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        // save current source average
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 4 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 4
    } else if (y_offset == 4) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}

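// Same computation as aom_sub_pixel_variance32xh_avx2 but for 16-wide blocks:
// two rows are packed into one 256-bit register per iteration, so height is
// assumed to be even.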
unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 0 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg;
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        AVG_NEXT_SRC_INSERT(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        MERGE_NEXT_SRC_INSERT(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
    }
    // x_offset = 4 and y_offset = 0
  } else if (x_offset == 4) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        // average between the current source and the source one byte later
        src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 4 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg, src_avg, src_temp;
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
        src_temp = _mm256_avg_epu8(src_avg, src_temp);
        LOAD_DST_INSERT
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_temp, zero_reg)
        // save current source average
        src_avg = src_next_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride << 1;
        src += src_stride << 1;
      }
      // last 2 rows processing happens here
      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
      src_next_reg = _mm256_permute2x128_si256(
          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
      LOAD_DST_INSERT
      src_avg = _mm256_avg_epu8(src_avg, src_next_reg);
      MERGE_WITH_SRC(src_avg, zero_reg)
      CALC_SUM_SSE_INSIDE_LOOP
    } else {
      // x_offset = 4 and y_offset = bilin interpolation
      __m256i filter, pw8, src_next_reg, src_avg, src_temp;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_avg, src_temp)
        // save current source average
        src_avg = src_next_reg;
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride << 1;
        src += src_stride << 1;
      }
      // last 2 rows processing happens here
      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
      src_next_reg = _mm256_permute2x128_si256(
          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
      LOAD_DST_INSERT
      MERGE_WITH_SRC(src_avg, src_next_reg)
      FILTER_SRC(filter)
      CALC_SUM_SSE_INSIDE_LOOP
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        MERGE_NEXT_SRC_INSERT(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = bilin interpolation and y_offset = 4
    } else if (y_offset == 4) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      MERGE_WITH_SRC(src_reg, src_next_reg)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_reg, src_next_reg)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        src += src_stride << 1;
        dst += dst_stride << 1;
      }
      // last 2 rows processing happens here
      LOAD_SRC_MERGE_128BIT(filter)
      LOAD_DST_INSERT
      FILTER_SRC_128BIT(filter_128bit)
      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
      src_next_reg = _mm256_permute2x128_si256(
          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
      // average the previous packed row with the current one
      src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
      MERGE_WITH_SRC(src_pack, zero_reg)
      CALC_SUM_SSE_INSIDE_LOOP
    } else {
      // x_offset = bilin interpolation and y_offset = bilin interpolation
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      MERGE_WITH_SRC(src_reg, src_next_reg)
      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_reg, src_next_reg)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_next_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride << 1;
        dst += dst_stride << 1;
      }
      // last 2 rows processing happens here
      LOAD_SRC_MERGE_128BIT(xfilter)
      LOAD_DST_INSERT
      FILTER_SRC_128BIT(filter_128bit)
      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
      src_next_reg = _mm256_permute2x128_si256(
          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
      MERGE_WITH_SRC(src_pack, src_next_reg)
      FILTER_SRC(yfilter)
      CALC_SUM_SSE_INSIDE_LOOP
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}

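// Same as aom_sub_pixel_variance32xh_avx2, except the interpolated source is
// first averaged with the second predictor `sec` (advancing by sec_stride per
// row) before the differences against dst are accumulated.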
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 4) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 4 and y_offset = 0
  } else if (x_offset == 4) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 4 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg, src_avg;
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 4 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 4
    } else if (y_offset == 4) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}
