/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/mem_neon.h"
#include "config/aom_dsp_rtcd.h"

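// Blend one row of 8 16-bit pixels:
// *res = (mask * src_0 + (AOM_BLEND_A64_MAX_ALPHA - mask) * src_1) >>
//        AOM_BLEND_A64_ROUND_BITS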
static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
                            const int16x8_t v_maxval, int16x8_t *res) {
  int32x4_t im_res_low, im_res_high;
  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);

  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
  im_res_low =
      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));

  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
                          vget_high_s16(src_1));

  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
}

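// Blend an 8x4 block of 16-bit (d16) intermediate pixels: blend each row,
// subtract the rounding offset with unsigned saturation, apply the final
// rounding shift (vec_round_bits holds the negated shift amount), then
// saturating-narrow to 8 bits and store.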
static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
                             int16x8_t mask3, const int16x8_t v_maxval,
                             const uint16x8_t vec_round_offset,
                             const int16x8_t vec_round_bits) {
  int16x8_t src0_0, src0_1, src0_2, src0_3;
  int16x8_t src1_0, src1_1, src1_2, src1_3;
  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;

  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
               &src0_3);
  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
               &src1_3);

  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);

  uint16x8_t im_res1_0 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
  uint16x8_t im_res1_1 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
  uint16x8_t im_res1_2 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
  uint16x8_t im_res1_3 =
      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);

  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);

  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
}

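// Same as blend_8x4 but for 4-pixel-wide blocks; two rows are packed into
// each 128-bit vector and the results are written with 32-bit lane stores.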
static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
                             int16x4_t mask3, const int16x8_t v_maxval,
                             const uint16x8_t vec_round_offset,
                             const int16x8_t vec_round_bits) {
  int16x8_t src0_0, src0_1;
  int16x8_t src1_0, src1_1;
  uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
             tu3 = vdupq_n_u64(0);
  int16x8_t mask0_1, mask2_3;
  int16x8_t res0, res1;

  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);

  src0_0 = vreinterpretq_s16_u64(tu0);
  src0_1 = vreinterpretq_s16_u64(tu1);

  src1_0 = vreinterpretq_s16_u64(tu2);
  src1_1 = vreinterpretq_s16_u64(tu3);

  mask0_1 = vcombine_s16(mask0, mask1);
  mask2_3 = vcombine_s16(mask2, mask3);

  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);

  uint16x8_t im_res_0 =
      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
  uint16x8_t im_res_1 =
      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);

  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);

  uint8x8_t res_0 = vqmovun_s16(src0_0);
  uint8x8_t res_1 = vqmovun_s16(src0_1);

  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
                0);
  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
                1);
  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
                0);
  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
                1);
}

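// Low bit-depth d16 mask blend: src0 and src1 hold CONV_BUF_TYPE intermediate
// values from compound prediction, mask holds 8-bit A64 blend weights, and
// subw/subh indicate whether the mask is stored at twice the horizontal
// and/or vertical resolution of the output block.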
void aom_lowbd_blend_a64_d16_mask_neon(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  int i = 0;
  const int bd = 8;
  int w_tmp = w;
  const uint8_t *mask_tmp = mask;
  const CONV_BUF_TYPE *src0_tmp = src0;
  const CONV_BUF_TYPE *src1_tmp = src1;
  uint8_t *dst_tmp = dst;

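  // Derive the compound-prediction round offset and the size of the final
  // rounding shift from the convolve rounding parameters.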
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  uint8x8_t s0, s1, s2, s3;
  uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
             tu3 = vdup_n_u32(0);
  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
  int16x8_t mask0, mask1, mask2, mask3;
  int16x8_t mask4, mask5, mask6, mask7;
  int32x4_t m0_32, m1_32, m2_32, m3_32;
  int32x4_t m4_32, m5_32, m6_32, m7_32;
  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
  const uint16x4_t vec_zero = vdup_n_u16(0);
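  // Fold the rounding bias of the final right shift into the offset that is
  // subtracted, so a single saturating subtract followed by an arithmetic
  // shift performs both the round_offset removal and the rounded shift.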
  const uint16_t offset = round_offset - (1 << (round_bits - 1));
  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
  const uint16x8_t vec_offset = vdupq_n_u16(offset);

  if (subw == 0 && subh == 0) {
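    // No mask subsampling: widen each 8-bit mask value directly to 16 bits.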
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);

          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);

          w_tmp -= 8;
          mask_tmp += 8;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (4 * mask_stride) - w;
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);

        mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
        mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));

        mask0_low = vget_low_s16(mask0);
        mask1_low = vget_high_s16(mask0);
        mask2_low = vget_low_s16(mask1);
        mask3_low = vget_high_s16(mask1);

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (4 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  } else if (subw == 1 && subh == 1) {
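    // Mask is subsampled 2x in both directions: each blend weight is the
    // rounded average of a 2x2 block of mask values (vertical add, pairwise
    // horizontal add, then a rounding shift by 2).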
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                       &t7);

          mask0 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
          mask1 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
          mask2 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
          mask3 =
              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));

          mask4 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
          mask5 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
          mask6 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
          mask7 = vreinterpretq_s16_u16(
              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));

          m0_32 = vpaddlq_s16(mask0);
          m1_32 = vpaddlq_s16(mask1);
          m2_32 = vpaddlq_s16(mask2);
          m3_32 = vpaddlq_s16(mask3);

          m4_32 = vpaddlq_s16(mask4);
          m5_32 = vpaddlq_s16(mask5);
          m6_32 = vpaddlq_s16(mask6);
          m7_32 = vpaddlq_s16(mask7);

          mask0 =
              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
          mask1 =
              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
          mask2 =
              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
          mask3 =
              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);

          w_tmp -= 8;
          mask_tmp += 16;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (8 * mask_stride) - (2 * w);
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);

        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));

        m0_32 = vpaddlq_s16(mask0);
        m1_32 = vpaddlq_s16(mask1);
        m2_32 = vpaddlq_s16(mask2);
        m3_32 = vpaddlq_s16(mask3);

        mask0_low = vqrshrn_n_s32(m0_32, 2);
        mask1_low = vqrshrn_n_s32(m1_32, 2);
        mask2_low = vqrshrn_n_s32(m2_32, 2);
        mask3_low = vqrshrn_n_s32(m3_32, 2);

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (8 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  } else if (subw == 1 && subh == 0) {
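    // Mask is subsampled 2x horizontally only: pairwise-add horizontally
    // adjacent mask values and apply a rounding halve.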
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);

          mask0 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
          mask1 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
          mask2 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
          mask3 = vreinterpretq_s16_u16(vcombine_u16(
              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));

          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);
          w_tmp -= 8;
          mask_tmp += 16;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (4 * mask_stride) - (2 * w);
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
                    &mask3_l);

        mask0 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
        mask1 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
        mask2 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
        mask3 =
            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));

        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (4 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  } else {
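    // Remaining case (subw == 0, subh == 1): mask is subsampled 2x vertically
    // only, so add vertically adjacent mask rows and apply a rounding halve.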
    if (w_tmp > 7) {
      do {
        w_tmp = w;
        do {
          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);

          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));

          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));

          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
                    vec_offset, vec_round_bits);

          w_tmp -= 8;
          mask_tmp += 8;
          dst_tmp += 8;
          src0_tmp += 8;
          src1_tmp += 8;
        } while (w_tmp > 7);
        i += 4;
        mask_tmp += (8 * mask_stride) - w;
        dst_tmp += (4 * dst_stride) - w;
        src0_tmp += (4 * src0_stride) - w;
        src1_tmp += (4 * src1_stride) - w;
      } while (i < h);
    } else {
      do {
        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
                              &tu3);

        s0 = vreinterpret_u8_u32(tu0);
        s1 = vreinterpret_u8_u32(tu1);
        s2 = vreinterpret_u8_u32(tu2);
        s3 = vreinterpret_u8_u32(tu3);

        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));

        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));

        mask0_low = vget_low_s16(mask0);
        mask1_low = vget_high_s16(mask0);
        mask2_low = vget_low_s16(mask1);
        mask3_low = vget_high_s16(mask1);

        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
                  v_maxval, vec_offset, vec_round_bits);

        i += 4;
        mask_tmp += (8 * mask_stride);
        dst_tmp += (4 * dst_stride);
        src0_tmp += (4 * src0_stride);
        src1_tmp += (4 * src1_stride);
      } while (i < h);
    }
  }
}