/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/loopfilter_msa.h"

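/* VP9 "MB" (16-sample) loop filters, MSA version.
 *
 * Each edge is filtered in two stages: the first stage computes the 4-tap
 * and 8-tap outputs and stashes the 8-tap results in a 128-byte filter48
 * scratch area (six 16-byte rows: p2, p1, p0, q0, q1, q2, followed by the
 * flat mask at offset 96); the second stage runs only when some lane was
 * flat, and blends in the 15-tap output where the flat2 mask is set.  This
 * layout description is inferred from the store/load offsets used below. */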
int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

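  /* If no lane is flat, the 4-tap output is final for the whole edge:
   * store it and signal the caller to skip the 15-tap stage. */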
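/* Stage two: 15-tap filtering across the eight pixels on each side of the
 * edge.  The first output is built as a full sum,
 *   p6' = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4,
 * and every later output reuses the previous sum as a sliding window,
 * adding the sample that enters the 16-sample support and subtracting the
 * one that leaves it.  The identity follows directly from the adds and
 * subtracts performed on tmp1_r/tmp1_l below. */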
void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

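    /* p6: seed the running sum; tmp0_r = 7 * p7 + p6 + q0, and the
     * p6..p0 terms are accumulated into tmp1_r below. */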
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

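/* Driver for the 16-wide horizontal edge: run the 4/8-tap stage first and
 * only fall through to the 15-tap stage when at least one lane was flat. */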
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                        limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    vpx_hz_lpf_t16_16w(src, pitch, filter48);
  }
}

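/* count == 1 filters a single 8-pixel-wide edge entirely within this
 * function, using 64-bit scalar stores for the 8 valid lanes; any other
 * count is handed to the 16-wide dual path above. */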
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

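    /* Only the low 8 lanes are valid here; clear the upper half of the
     * flat mask so the all-zero test reflects just those lanes. */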
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

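        /* Seed the 15-tap running sum: tmp0 = 7 * p7 + p6 + q0.  Each pair
         * of outputs below then updates the sum incrementally. */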
        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}

void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}

void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}

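/* Vertical edges are filtered by transposing the pixels into a contiguous
 * scratch buffer, reusing the horizontal filter logic at pitch 16, and
 * transposing the result back. */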
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

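/* Full 16x16 transpose: the top half comes from TRANSPOSE16x8_UB_UB and the
 * bottom half is assembled by hand from even/odd interleaves (ilvev/ilvod),
 * which appears to be done to stay within the available registers. */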
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate registers and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

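/* Vertical 8-row variant: filters the transposed scratch at pitch 16.  On
 * early exit only the four pixels around the edge change, so they are
 * re-transposed on the fly and written straight back to the image. */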
int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                uint8_t *src_org, int32_t pitch_org,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

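/* 15-tap stage for the 8-row vertical case.  Only the right (low) halves of
 * the vectors carry pixels here, so a single set of _r accumulators is
 * enough, and results go back into the scratch rows with ST8x1_UB. */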
int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                          uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}

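/* The 16x24-byte scratch holds the transposed pixels first; the final
 * 16 x 8 bytes are reused as the filter48 area. */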
void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit =
      vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                   &filter48[0]);

    if (0 == early_exit) {
      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
    }
  }
}

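/* 16-row vertical variant: both vector halves carry pixels, so the left
 * (_l) lanes are processed as well, and the early-exit path writes two
 * groups of eight 4-wide columns back to the image. */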
int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

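/* 15-tap stage for the 16-row vertical case: the same arithmetic as
 * vpx_hz_lpf_t16_16w, but reading from and writing to the scratch at
 * pitch 16, with the early exit scattering columns back to the image. */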
int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}

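/* Dual (16-row) vertical driver: transpose 16x16 into the scratch, filter
 * it as a horizontal edge, and transpose back unless every stage exited
 * early. */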
void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit =
      vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);

    if (0 == early_exit) {
      transpose_16x16(transposed_input, 16, (src - 8), pitch);
    }
  }
}