/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"
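
// 2-tap bilinear filter weights: each (16 - k, k) weight pair is repeated to
// fill 32-byte rows for use with _mm256_maddubs_epi16. The table is indexed
// below with the sub-pel offset shifted left by 5.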
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
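
// Filter a row of expanded (byte-interleaved) source: each 16-bit result is
// (a * (16 - k) + b * k + 8) >> 4, i.e. a rounded 2-tap bilinear interpolation.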
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
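
// The packed 16-bit difference sums are sign-extended to 32 bits using the
// mask from _mm256_cmpgt_epi16, then the sum and SSE accumulators are reduced
// horizontally across both 128-bit lanes.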
// final calculation to sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
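
// Computes, for a 32-pixel-wide block of `height` rows, the sum of differences
// (returned) and the sum of squared differences (*sse) between the bilinearly
// interpolated source and dst. Offset 0 selects a plain copy, offset 8 the
// half-pel average of neighbouring pixels, any other offset the 2-tap filter.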
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source starting from the following byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save the current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting from the following byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save the current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting from the following byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
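
// Same as vp9_sub_pixel_variance32xh_avx2, except that the filtered prediction
// is first averaged with the second predictor `sec` (advancing by sec_stride
// each row) before the sum and SSE against dst are accumulated.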
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source starting from the following byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save the current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting from the following byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save the current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec+= sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting from the following byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}