/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

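// Each pair of rows below (32 bytes, one __m256i) holds one 2-tap bilinear
// filter, stored as the byte pair (16 - w, w) repeated across the register
// so it can be fed directly to _mm256_maddubs_epi16. A filter is selected
// below by shifting the sub-pel offset left by 5 to form a byte index into
// the table.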
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};

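// Apply the 2-tap filter to the interleaved (a, b) byte pairs held in
// exp_src_lo/exp_src_hi and round: result = (a * f0 + b * f1 + 8) >> 4.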
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

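// Interleave the bytes of src_reg with reg into 16-bit lanes; with
// reg == zero_reg this zero-extends each source byte to 16 bits.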
#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  /* average the current source with the source size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

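// Load the source size_stride bytes ahead and interleave it byte-wise with
// the current source, producing the (a, b) pairs consumed by FILTER_SRC.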
#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

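// Subtract dst from the (filtered) source and accumulate the differences
// into sum_reg (16-bit lanes) and their squares into sse_reg (32-bit lanes).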
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation to sum and sse
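// Sign-extend the 16-bit sums to 32 bits, reduce sum_reg and sse_reg
// horizontally across both 128-bit lanes, then store the SSE to *sse and
// leave the signed sum in `sum`.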
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

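// Sum/SSE kernel for a 32-pixel-wide block over `height` rows.
// x_offset/y_offset select the sub-pel treatment of the source (0: use the
// pixel/row as-is, 8: average with the next pixel or next row, otherwise the
// 2-tap bilinear filter from the table above). The sum of squared
// differences against dst is written to *sse and the signed sum of
// differences is returned; the caller is expected to combine the two into
// the variance.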
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    }
  // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous row's average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically with yfilter
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

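// Same as vp9_sub_pixel_variance32xh_avx2, except that each filtered source
// row is additionally averaged with the corresponding row of the second
// predictor `sec` (advanced by sec_stride per row) before the differences
// against dst are accumulated.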
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             const uint8_t *sec,
                                             int sec_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    }
  // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous row's average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height ; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height ; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec+= sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height ; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically with yfilter
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}