/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/mem_neon.h"
#include "config/aom_dsp_rtcd.h"

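// Blend a single row of eight 16-bit samples:
//   *res = (mask * src_0 + (v_maxval - mask) * src_1) >> AOM_BLEND_A64_ROUND_BITS
// where v_maxval is expected to hold AOM_BLEND_A64_MAX_ALPHA (64).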
static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
                            const int16x8_t v_maxval, int16x8_t *res) {
  int32x4_t im_res_low, im_res_high;
  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);

  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
  im_res_low =
      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));

  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
                          vget_high_s16(src_1));

  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
}

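// Blend an 8x4 block of 16-bit intermediates from src0 and src1 into 8-bit
// pixels in dst: blend each row with blend8x1, subtract the compound round
// offset with unsigned saturation, shift down by the (negated) shift count in
// vec_round_bits, then narrow with saturation to 8 bits.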
static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
                             int16x8_t mask3, const int16x8_t v_maxval,
                             const uint16x8_t vec_round_offset,
                             const int16x8_t vec_round_bits) {
  int16x8_t src0_0, src0_1, src0_2, src0_3;
  int16x8_t src1_0, src1_1, src1_2, src1_3;
  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;

  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
               &src0_3);
  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
               &src1_3);

  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);

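  // Remove the compound round offset (saturating at zero), then shift down to
  // the output range; vec_round_bits holds a negative shift count, so vshlq
  // performs a right shift.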
  uint16x8_t im_res1_0 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
  uint16x8_t im_res1_1 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
  uint16x8_t im_res1_2 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
  uint16x8_t im_res1_3 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);

  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);

  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
}

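// Same as blend_8x4 but for a 4x4 block: the four 4-sample rows are packed in
// pairs so that each 128-bit vector holds two rows.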
static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
                             int16x4_t mask3, const int16x8_t v_maxval,
                             const uint16x8_t vec_round_offset,
                             const int16x8_t vec_round_bits) {
  int16x8_t src0_0, src0_1;
  int16x8_t src1_0, src1_1;
  uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
             tu3 = vdupq_n_u64(0);
  int16x8_t mask0_1, mask2_3;
  int16x8_t res0, res1;

  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);

  src0_0 = vreinterpretq_s16_u64(tu0);
  src0_1 = vreinterpretq_s16_u64(tu1);

  src1_0 = vreinterpretq_s16_u64(tu2);
  src1_1 = vreinterpretq_s16_u64(tu3);

  mask0_1 = vcombine_s16(mask0, mask1);
  mask2_3 = vcombine_s16(mask2, mask3);

  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);

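  // Remove the compound round offset, shift down to the output range,
  // saturate to 8 bits, then store each 4-byte row.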
  uint16x8_t im_res_0 =
      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
  uint16x8_t im_res_1 =
      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);

  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);

  uint8x8_t res_0 = vqmovun_s16(src0_0);
  uint8x8_t res_1 = vqmovun_s16(src0_1);

  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
                0);
  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
                1);
  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
                0);
  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
                1);
}

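// Blend two 16-bit (d16) intermediate prediction buffers into an 8-bit
// destination using a 0..64 alpha mask. subw/subh indicate that the mask is
// stored at twice the block width/height and must be downsampled by averaging
// before use.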
void aom_lowbd_blend_a64_d16_mask_neon(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  int i = 0;
  const int bd = 8;
  int w_tmp = w;
  const uint8_t *mask_tmp = mask;
  const CONV_BUF_TYPE *src0_tmp = src0;
  const CONV_BUF_TYPE *src1_tmp = src1;
  uint8_t *dst_tmp = dst;

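  // The d16 inputs carry the compound prediction offset and extra precision
  // from the two convolution rounding stages; derive the offset to remove and
  // the number of bits to shift down to recover 8-bit pixel values.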
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  uint8x8_t s0, s1, s2, s3;
  uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
             tu3 = vdup_n_u32(0);
  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
  int16x8_t mask0, mask1, mask2, mask3;
  int16x8_t mask4, mask5, mask6, mask7;
  int32x4_t m0_32, m1_32, m2_32, m3_32;
  int32x4_t m4_32, m5_32, m6_32, m7_32;
  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
  const uint16x4_t vec_zero = vdup_n_u16(0);
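  // Fold the rounding term of the final right shift into the offset so a
  // single saturating subtract (which also clamps negative results to zero)
  // followed by a plain shift completes the conversion.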
  const uint16_t offset = round_offset - (1 << (round_bits - 1));
  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
  const uint16x8_t vec_offset = vdupq_n_u16(offset);

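  // Mask at full resolution: one mask value per output pixel.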
  if (subw == 0 && subh == 0) {
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);

          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);

          w_tmp -= 8;
          mask_tmp += 8;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (4 * mask_stride) - w;
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);

        mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
        mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));

        mask0_low = vget_low_s16(mask0);
        mask1_low = vget_high_s16(mask0);
        mask2_low = vget_low_s16(mask1);
        mask3_low = vget_high_s16(mask1);

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (4 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  } else if (subw == 1 && subh == 1) {
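    // Mask at twice the width and height: average each 2x2 group of mask
    // values (add row pairs, pairwise-add within rows, then round by 2 bits).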
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                       &t7);

          mask0 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
          mask1 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
          mask2 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
          mask3 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));

          mask4 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
          mask5 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
          mask6 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
          mask7 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));

          m0_32 = vpaddlq_s16(mask0);
          m1_32 = vpaddlq_s16(mask1);
          m2_32 = vpaddlq_s16(mask2);
          m3_32 = vpaddlq_s16(mask3);

          m4_32 = vpaddlq_s16(mask4);
          m5_32 = vpaddlq_s16(mask5);
          m6_32 = vpaddlq_s16(mask6);
          m7_32 = vpaddlq_s16(mask7);

          mask0 =
              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
          mask1 =
              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
          mask2 =
              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
          mask3 =
              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);

          w_tmp -= 8;
          mask_tmp += 16;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (8 * mask_stride) - (2 * w);
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);

        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));

        m0_32 = vpaddlq_s16(mask0);
        m1_32 = vpaddlq_s16(mask1);
        m2_32 = vpaddlq_s16(mask2);
        m3_32 = vpaddlq_s16(mask3);

        mask0_low = vqrshrn_n_s32(m0_32, 2);
        mask1_low = vqrshrn_n_s32(m1_32, 2);
        mask2_low = vqrshrn_n_s32(m2_32, 2);
        mask3_low = vqrshrn_n_s32(m3_32, 2);

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (8 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  } else if (subw == 1 && subh == 0) {
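    // Mask at twice the width only: average horizontally adjacent mask values
    // (pairwise add within each row, then rounding shift by 1 bit).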
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);

          mask0 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
          mask1 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
          mask2 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
          mask3 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));

          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);
          w_tmp -= 8;
          mask_tmp += 16;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (4 * mask_stride) - (2 * w);
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
                    &mask3_l);

        mask0 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
        mask1 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
        mask2 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
        mask3 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));

        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (4 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  } else {
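    // Remaining case (subw == 0, subh == 1), mask at twice the height only:
    // average vertically adjacent mask rows (add row pairs, then rounding
    // shift by 1 bit).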
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);

          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));

          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);

          w_tmp -= 8;
          mask_tmp += 8;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (8 * mask_stride) - w;
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
                              &tu3);

        s0 = vreinterpret_u8_u32(tu0);
        s1 = vreinterpret_u8_u32(tu1);
        s2 = vreinterpret_u8_u32(tu2);
        s3 = vreinterpret_u8_u32(tu3);

        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));

        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));

        mask0_low = vget_low_s16(mask0);
        mask1_low = vget_high_s16(mask0);
        mask2_low = vget_low_s16(mask1);
        mask3_low = vget_high_s16(mask1);

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (8 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  }
}