/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
12 #define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
13 
14 #include <stdlib.h>
15 
16 #include "./vpx_dsp_rtcd.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_mem/vpx_mem.h"
19 
20 #ifdef __cplusplus
21 extern "C" {
22 #endif
23 
24 #if HAVE_DSPR2
25 /* processing 4 pixels at the same time
26  * compute hev and mask in the same function */
filter_hev_mask_dspr2(uint32_t limit,uint32_t flimit,uint32_t p1,uint32_t p0,uint32_t p3,uint32_t p2,uint32_t q0,uint32_t q1,uint32_t q2,uint32_t q3,uint32_t thresh,uint32_t * hev,uint32_t * mask)27 static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
28                                          uint32_t p1, uint32_t p0, uint32_t p3,
29                                          uint32_t p2, uint32_t q0, uint32_t q1,
30                                          uint32_t q2, uint32_t q3,
31                                          uint32_t thresh, uint32_t *hev,
32                                          uint32_t *mask) {
33   uint32_t c, r, r3, r_k;
34   uint32_t s1, s2, s3;
35   uint32_t ones = 0xFFFFFFFF;
36   uint32_t hev1;
37 
38   __asm__ __volatile__(
39       /* mask |= (abs(p3 - p2) > limit) */
40       "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
41       "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
42       "or             %[r_k], %[r_k],    %[c]         \n\t"
43       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
44       "or             %[r],   $0,        %[c]         \n\t"
45 
46       /* mask |= (abs(p2 - p1) > limit) */
47       "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
48       "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
49       "or             %[r_k], %[r_k],    %[c]         \n\t"
50       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
51       "or             %[r],   %[r],      %[c]         \n\t"
52 
53       /* mask |= (abs(p1 - p0) > limit)
54        * hev  |= (abs(p1 - p0) > thresh)
55        */
56       "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
57       "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
58       "or             %[r_k], %[r_k],    %[c]         \n\t"
59       "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
60       "or             %[r3],  $0,        %[c]         \n\t"
61       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
62       "or             %[r],   %[r],      %[c]         \n\t"
63 
64       /* mask |= (abs(q1 - q0) > limit)
65        * hev  |= (abs(q1 - q0) > thresh)
66        */
67       "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
68       "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
69       "or             %[r_k], %[r_k],    %[c]         \n\t"
70       "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
71       "or             %[r3],  %[r3],     %[c]         \n\t"
72       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
73       "or             %[r],   %[r],      %[c]         \n\t"
74 
75       /* mask |= (abs(q2 - q1) > limit) */
76       "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
77       "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
78       "or             %[r_k], %[r_k],    %[c]         \n\t"
79       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
80       "or             %[r],   %[r],      %[c]         \n\t"
81       "sll            %[r3],    %[r3],    24          \n\t"
82 
83       /* mask |= (abs(q3 - q2) > limit) */
84       "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
85       "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
86       "or             %[r_k], %[r_k],    %[c]         \n\t"
87       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
88       "or             %[r],   %[r],      %[c]         \n\t"
89 
90       : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
91       : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
92         [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
93         [thresh] "r"(thresh));
94 
95   __asm__ __volatile__(
96       /* abs(p0 - q0) */
97       "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
98       "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
99       "wrdsp          %[r3]                           \n\t"
100       "or             %[s1],  %[r_k],    %[c]         \n\t"
101 
102       /* abs(p1 - q1) */
103       "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
104       "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
105       "pick.qb        %[hev1], %[ones],  $0           \n\t"
106       "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
107       "or             %[s2],   %[r_k],   %[c]         \n\t"
108 
109       /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
110       "shrl.qb        %[s2],   %[s2],     1           \n\t"
111       "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
112       "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
113       "or             %[r],    %[r],      %[c]        \n\t"
114       "sll            %[r],    %[r],      24          \n\t"
115 
116       "wrdsp          %[r]                            \n\t"
117       "pick.qb        %[s2],  $0,         %[ones]     \n\t"
118 
119       : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
120         [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
121       : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
122         [ones] "r"(ones), [flimit] "r"(flimit));
123 
124   *hev = hev1;
125   *mask = s2;
126 }
127 
filter_hev_mask_flatmask4_dspr2(uint32_t limit,uint32_t flimit,uint32_t thresh,uint32_t p1,uint32_t p0,uint32_t p3,uint32_t p2,uint32_t q0,uint32_t q1,uint32_t q2,uint32_t q3,uint32_t * hev,uint32_t * mask,uint32_t * flat)128 static INLINE void filter_hev_mask_flatmask4_dspr2(
129     uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
130     uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
131     uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
132   uint32_t c, r, r3, r_k, r_flat;
133   uint32_t s1, s2, s3;
134   uint32_t ones = 0xFFFFFFFF;
135   uint32_t flat_thresh = 0x01010101;
136   uint32_t hev1;
137   uint32_t flat1;
138 
139   __asm__ __volatile__(
140       /* mask |= (abs(p3 - p2) > limit) */
141       "subu_s.qb      %[c],       %[p3],          %[p2]        \n\t"
142       "subu_s.qb      %[r_k],     %[p2],          %[p3]        \n\t"
143       "or             %[r_k],     %[r_k],         %[c]         \n\t"
144       "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
145       "or             %[r],       $0,             %[c]         \n\t"
146 
147       /* mask |= (abs(p2 - p1) > limit) */
148       "subu_s.qb      %[c],       %[p2],          %[p1]        \n\t"
149       "subu_s.qb      %[r_k],     %[p1],          %[p2]        \n\t"
150       "or             %[r_k],     %[r_k],         %[c]         \n\t"
151       "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
152       "or             %[r],       %[r],           %[c]         \n\t"
153 
154       /* mask |= (abs(p1 - p0) > limit)
155        * hev  |= (abs(p1 - p0) > thresh)
156        * flat |= (abs(p1 - p0) > thresh)
157        */
158       "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
159       "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
160       "or             %[r_k],     %[r_k],         %[c]         \n\t"
161       "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
162       "or             %[r3],      $0,             %[c]         \n\t"
163       "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
164       "or             %[r],       %[r],           %[c]         \n\t"
165       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
166       "or             %[r_flat],  $0,             %[c]         \n\t"
167 
168       /* mask |= (abs(q1 - q0) > limit)
169        * hev  |= (abs(q1 - q0) > thresh)
170        * flat |= (abs(q1 - q0) > thresh)
171        */
172       "subu_s.qb      %[c],       %[q1],          %[q0]        \n\t"
173       "subu_s.qb      %[r_k],     %[q0],          %[q1]        \n\t"
174       "or             %[r_k],     %[r_k],         %[c]         \n\t"
175       "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
176       "or             %[r3],      %[r3],          %[c]         \n\t"
177       "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
178       "or             %[r],       %[r],           %[c]         \n\t"
179       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
180       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
181 
182       /* flat |= (abs(p0 - p2) > thresh) */
183       "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
184       "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
185       "or             %[r_k],     %[r_k],         %[c]         \n\t"
186       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
187       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
188 
189       /* flat |= (abs(q0 - q2) > thresh) */
190       "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
191       "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
192       "or             %[r_k],     %[r_k],         %[c]         \n\t"
193       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
194       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
195 
196       /* flat |= (abs(p3 - p0) > thresh) */
197       "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
198       "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
199       "or             %[r_k],     %[r_k],         %[c]         \n\t"
200       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
201       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
202 
203       /* flat |= (abs(q3 - q0) > thresh) */
204       "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
205       "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
206       "or             %[r_k],     %[r_k],         %[c]         \n\t"
207       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
208       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
209       "sll            %[r_flat],  %[r_flat],      24           \n\t"
210       /* look at stall here */
211       "wrdsp          %[r_flat]                                \n\t"
212       "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
213 
214       /* mask |= (abs(q2 - q1) > limit) */
215       "subu_s.qb      %[c],       %[q2],          %[q1]        \n\t"
216       "subu_s.qb      %[r_k],     %[q1],          %[q2]        \n\t"
217       "or             %[r_k],     %[r_k],         %[c]         \n\t"
218       "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
219       "or             %[r],       %[r],           %[c]         \n\t"
220       "sll            %[r3],      %[r3],          24           \n\t"
221 
222       /* mask |= (abs(q3 - q2) > limit) */
223       "subu_s.qb      %[c],       %[q3],          %[q2]        \n\t"
224       "subu_s.qb      %[r_k],     %[q2],          %[q3]        \n\t"
225       "or             %[r_k],     %[r_k],         %[c]         \n\t"
226       "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
227       "or             %[r],       %[r],           %[c]         \n\t"
228 
229       : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
230         [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
231       : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
232         [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
233         [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
234 
235   __asm__ __volatile__(
236       /* abs(p0 - q0) */
237       "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
238       "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
239       "wrdsp          %[r3]                           \n\t"
240       "or             %[s1],  %[r_k],    %[c]         \n\t"
241 
242       /* abs(p1 - q1) */
243       "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
244       "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
245       "pick.qb        %[hev1], %[ones],  $0           \n\t"
246       "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
247       "or             %[s2],   %[r_k],   %[c]         \n\t"
248 
249       /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
250       "shrl.qb        %[s2],   %[s2],     1           \n\t"
251       "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
252       "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
253       "or             %[r],    %[r],      %[c]        \n\t"
254       "sll            %[r],    %[r],      24          \n\t"
255 
256       "wrdsp          %[r]                            \n\t"
257       "pick.qb        %[s2],   $0,        %[ones]     \n\t"
258 
259       : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
260         [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
261       : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
262         [ones] "r"(ones), [flimit] "r"(flimit));
263 
264   *hev = hev1;
265   *mask = s2;
266   *flat = flat1;
267 }
268 
flatmask5(uint32_t p4,uint32_t p3,uint32_t p2,uint32_t p1,uint32_t p0,uint32_t q0,uint32_t q1,uint32_t q2,uint32_t q3,uint32_t q4,uint32_t * flat2)269 static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
270                              uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
271                              uint32_t q3, uint32_t q4, uint32_t *flat2) {
272   uint32_t c, r, r_k, r_flat;
273   uint32_t ones = 0xFFFFFFFF;
274   uint32_t flat_thresh = 0x01010101;
275   uint32_t flat1, flat3;
276 
277   __asm__ __volatile__(
278       /* flat |= (abs(p4 - p0) > thresh) */
279       "subu_s.qb      %[c],   %[p4],           %[p0]        \n\t"
280       "subu_s.qb      %[r_k], %[p0],           %[p4]        \n\t"
281       "or             %[r_k], %[r_k],          %[c]         \n\t"
282       "cmpgu.lt.qb    %[c],   %[flat_thresh],  %[r_k]       \n\t"
283       "or             %[r],   $0,              %[c]         \n\t"
284 
285       /* flat |= (abs(q4 - q0) > thresh) */
286       "subu_s.qb      %[c],     %[q4],           %[q0]     \n\t"
287       "subu_s.qb      %[r_k],   %[q0],           %[q4]     \n\t"
288       "or             %[r_k],   %[r_k],          %[c]      \n\t"
289       "cmpgu.lt.qb    %[c],     %[flat_thresh],  %[r_k]    \n\t"
290       "or             %[r],     %[r],            %[c]      \n\t"
291       "sll            %[r],     %[r],            24        \n\t"
292       "wrdsp          %[r]                                 \n\t"
293       "pick.qb        %[flat3], $0,           %[ones]      \n\t"
294 
295       /* flat |= (abs(p1 - p0) > thresh) */
296       "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
297       "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
298       "or             %[r_k],     %[r_k],         %[c]         \n\t"
299       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
300       "or             %[r_flat],  $0,             %[c]         \n\t"
301 
302       /* flat |= (abs(q1 - q0) > thresh) */
303       "subu_s.qb      %[c],      %[q1],           %[q0]        \n\t"
304       "subu_s.qb      %[r_k],    %[q0],           %[q1]        \n\t"
305       "or             %[r_k],    %[r_k],          %[c]         \n\t"
306       "cmpgu.lt.qb    %[c],      %[flat_thresh],  %[r_k]       \n\t"
307       "or             %[r_flat], %[r_flat],       %[c]         \n\t"
308 
309       /* flat |= (abs(p0 - p2) > thresh) */
310       "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
311       "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
312       "or             %[r_k],     %[r_k],         %[c]         \n\t"
313       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
314       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
315 
316       /* flat |= (abs(q0 - q2) > thresh) */
317       "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
318       "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
319       "or             %[r_k],     %[r_k],         %[c]         \n\t"
320       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
321       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
322 
323       /* flat |= (abs(p3 - p0) > thresh) */
324       "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
325       "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
326       "or             %[r_k],     %[r_k],         %[c]         \n\t"
327       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
328       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
329 
330       /* flat |= (abs(q3 - q0) > thresh) */
331       "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
332       "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
333       "or             %[r_k],     %[r_k],         %[c]         \n\t"
334       "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
335       "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
336       "sll            %[r_flat],  %[r_flat],      24           \n\t"
337       "wrdsp          %[r_flat]                                \n\t"
338       "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
339       /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
340       "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
341 
342       : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
343         [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
344       : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
345         [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
346         [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
347 
348   *flat2 = flat1;
349 }
350 #endif  // #if HAVE_DSPR2
351 #ifdef __cplusplus
352 }  // extern "C"
353 #endif
354 
355 #endif  // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
356