/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;
  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
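  /* replv.qb replicates the least-significant byte of its source into all
   * four byte lanes, so e.g. a thresh of 0x03 yields
   * thresh_vec == 0x03030303 and the limit checks can be applied to four
   * pixels at once. */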

  /* prefetch data for store */
  prefetch_store(s);

  /* the loop filter works on bytes (chars) so that maximum use can be
     made of the 8-bit SIMD instructions. */
  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;
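    /* sm1..s2 address the four rows above the horizontal edge at s, and
     * s3..s6 the four rows at and below it; only the two rows on each
     * side of the edge (s1, s2, s3, s4) are rewritten by the filter. */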

    __asm__ __volatile__(
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and
       filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__(
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);
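      /* mask and hev come back as per-pixel flags packed one byte per
       * lane: mask selects lanes that fall within the filter limits, and
       * hev marks lanes with high edge variance that take the stronger
       * filter. */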

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__(
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4-byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));
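    /* p2, p1, p0, pm1 hold the four pixels to the left of the vertical
     * edge for rows s1, s2, s3, s4 respectively; p6, p5, p4, p3 hold the
     * four pixels to its right. The transposes below turn each group
     * into one register per pixel column. */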

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);
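    /* The pack/append sequence above is a 4x4 byte transpose: writing the
     * input rows as p2 = A3A2A1A0, p1 = B3B2B1B0, p0 = C3C2C1C0 and
     * pm1 = D3D2D1D0 (MSB first), the outputs are the byte columns
     * p2 = A3B3C3D3, p1 = A2B2C2D2, p0 = A1B1C1D1, pm1 = A0B0C0D0; each
     * register now holds one pixel column, with row s4 in the low byte. */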

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and
     * filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack the processed 4x4 neighborhood byte by byte; the output
         * is not transposed back for word stores because the destination
         * memory is not word-aligned
         */
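        /* Each register holds one filtered column with one byte per row
         * and the bottom row (s4) in the low byte: store the low bytes
         * around the edge, then shift right by 8 so the next row's byte
         * moves into the low position for the following store. */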
        __asm__ __volatile__(
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s4] "r"(s4));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r"(p1)
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s2] "r"(s2));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s1] "r"(s1));
      }
    }
  }
}
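
/* The dual variants below filter two adjacent 8-pixel edges in one call:
 * horizontal pairs sit side by side (s and s + 8), vertical pairs are
 * stacked (s and s + 8 * p). All but the 16 variant take independent
 * limits and thresholds for each half. */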
void vpx_lpf_horizontal_4_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_horizontal_8_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2