1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <stdlib.h>
12 #include "vp8_rtcd.h"
13 #include "vp8/common/onyxc_int.h"
14 
15 #if HAVE_DSPR2
16 typedef unsigned char uc;
17 
18 /* prefetch data for load */
19 inline void prefetch_load_lf(unsigned char *src) {
20   __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
21 }
22 
23 /* prefetch data for store */
24 inline void prefetch_store_lf(unsigned char *dst) {
25   __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
26 }
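
/* The helpers above issue the MIPS "pref" instruction with hint 0 (prefetch
 * for load) and hint 1 (prefetch for store).  A minimal usage sketch, not
 * part of the build ("process_row", "rows" and "pitch" are hypothetical),
 * would prefetch the next row while the current one is processed:
 *
 *   for (r = 0; r < rows; ++r) {
 *     prefetch_load_lf(src + (r + 1) * pitch);
 *     prefetch_store_lf(dst + (r + 1) * pitch);
 *     process_row(src + r * pitch, dst + r * pitch);
 *   }
 */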
27 
28 /* processing 4 pixels at the same time
29  * compute hev and mask in the same function
30  */
31 static __inline void vp8_filter_mask_vec_mips(
32     uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3,
33     uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3,
34     uint32_t thresh, uint32_t *hev, uint32_t *mask) {
35   uint32_t c, r, r3, r_k;
36   uint32_t s1, s2, s3;
37   uint32_t ones = 0xFFFFFFFF;
38   uint32_t hev1;
39 
40   __asm__ __volatile__(
41       /* mask |= (abs(p3 - p2) > limit) */
42       "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
43       "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
44       "or             %[r_k], %[r_k],    %[c]         \n\t"
45       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
46       "or             %[r],   $0,        %[c]         \n\t"
47 
48       /* mask |= (abs(p2 - p1) > limit) */
49       "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
50       "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
51       "or             %[r_k], %[r_k],    %[c]         \n\t"
52       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
53       "or             %[r],   %[r],      %[c]         \n\t"
54 
55       /* mask |= (abs(p1 - p0) > limit)
56        * hev  |= (abs(p1 - p0) > thresh)
57        */
58       "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
59       "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
60       "or             %[r_k], %[r_k],    %[c]         \n\t"
61       "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
62       "or             %[r3],  $0,        %[c]         \n\t"
63       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
64       "or             %[r],   %[r],      %[c]         \n\t"
65 
66       /* mask |= (abs(q1 - q0) > limit)
67        * hev  |= (abs(q1 - q0) > thresh)
68        */
69       "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
70       "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
71       "or             %[r_k], %[r_k],    %[c]         \n\t"
72       "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
73       "or             %[r3],  %[r3],     %[c]         \n\t"
74       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
75       "or             %[r],   %[r],      %[c]         \n\t"
76 
77       /* mask |= (abs(q2 - q1) > limit) */
78       "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
79       "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
80       "or             %[r_k], %[r_k],    %[c]         \n\t"
81       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
82       "or             %[r],   %[r],      %[c]         \n\t"
83       "sll            %[r3],    %[r3],    24          \n\t"
84 
85       /* mask |= (abs(q3 - q2) > limit) */
86       "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
87       "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
88       "or             %[r_k], %[r_k],    %[c]         \n\t"
89       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
90       "or             %[r],   %[r],      %[c]         \n\t"
91 
92       : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
93       : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
94         [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
95         [thresh] "r"(thresh));
96 
97   __asm__ __volatile__(
98       /* abs(p0 - q0) */
99       "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
100       "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
101       "wrdsp          %[r3]                           \n\t"
102       "or             %[s1],  %[r_k],    %[c]         \n\t"
103 
104       /* abs(p1 - q1) */
105       "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
106       "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
107       "pick.qb        %[hev1], %[ones],  $0           \n\t"
108       "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
109       "or             %[s2],   %[r_k],   %[c]         \n\t"
110 
111       /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
112       "shrl.qb        %[s2],   %[s2],     1           \n\t"
113       "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
114       "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
115       "or             %[r],    %[r],      %[c]        \n\t"
116       "sll            %[r],    %[r],      24          \n\t"
117 
118       "wrdsp          %[r]                            \n\t"
119       "pick.qb        %[s2],  $0,         %[ones]     \n\t"
120 
121       : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
122         [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
123       : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
124         [ones] "r"(ones), [flimit] "r"(flimit));
125 
126   *hev = hev1;
127   *mask = s2;
128 }
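
/* For reference: per byte (i.e. per pixel position), the two asm blocks above
 * compute the following scalar logic, assuming "flimit" is passed in already
 * combined as flimit * 2 + limit, as the comment inside the block indicates:
 *
 *   over = (abs(p3 - p2) > limit) | (abs(p2 - p1) > limit) |
 *          (abs(p1 - p0) > limit) | (abs(q1 - q0) > limit) |
 *          (abs(q2 - q1) > limit) | (abs(q3 - q2) > limit) |
 *          (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit);
 *   mask_byte = over ? 0x00 : 0xFF;                     // 0xFF: apply filter
 *   hev_byte  = (abs(p1 - p0) > thresh) ||
 *               (abs(q1 - q0) > thresh) ? 0xFF : 0x00;  // high edge variance
 */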
129 
130 /* inputs & outputs are quad-byte vectors */
131 static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1,
132                                      uint32_t *ps0, uint32_t *qs0,
133                                      uint32_t *qs1) {
134   int32_t vp8_filter_l, vp8_filter_r;
135   int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
136   int32_t subr_r, subr_l;
137   uint32_t t1, t2, HWM, t3;
138   uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
139 
140   int32_t vps1, vps0, vqs0, vqs1;
141   int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
142   uint32_t N128;
143 
144   N128 = 0x80808080;
145   t1 = 0x03000300;
146   t2 = 0x04000400;
147   t3 = 0x01000100;
148   HWM = 0xFF00FF00;
149 
150   vps0 = (*ps0) ^ N128;
151   vps1 = (*ps1) ^ N128;
152   vqs0 = (*qs0) ^ N128;
153   vqs1 = (*qs1) ^ N128;
154 
155   /* use halfword pairs instead of quad-bytes for accuracy */
156   vps0_l = vps0 & HWM;
157   vps0_r = vps0 << 8;
158   vps0_r = vps0_r & HWM;
159 
160   vps1_l = vps1 & HWM;
161   vps1_r = vps1 << 8;
162   vps1_r = vps1_r & HWM;
163 
164   vqs0_l = vqs0 & HWM;
165   vqs0_r = vqs0 << 8;
166   vqs0_r = vqs0_r & HWM;
167 
168   vqs1_l = vqs1 & HWM;
169   vqs1_r = vqs1 << 8;
170   vqs1_r = vqs1_r & HWM;
171 
172   mask_l = mask & HWM;
173   mask_r = mask << 8;
174   mask_r = mask_r & HWM;
175 
176   hev_l = hev & HWM;
177   hev_r = hev << 8;
178   hev_r = hev_r & HWM;
179 
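  /* The left/right split above puts each byte b into the upper half of a
   * 16-bit lane (b << 8), so the saturating halfword ops below (addq_s.ph /
   * subq_s.ph) clip at +/-128 in that upper byte and thus behave like
   * vp8_signed_char_clamp() on four pixels at once.  Per-byte sketch
   * (sat16/clamp8 are shorthand here, not real helpers):
   *
   *   int16_t lane = (int16_t)(b << 8);
   *   lane = sat16(lane + (d << 8));   // equals clamp8(b + d) << 8
   */
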
180   __asm__ __volatile__(
181       /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
182       "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
183       "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
184 
185       /* qs0 - ps0 */
186       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
187       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
188 
189       /* vp8_filter &= hev; */
190       "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
191       "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
192 
193       /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
194       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
195       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
196       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
197       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
198       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
199       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
200       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
201       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
202 
203       /* vp8_filter &= mask; */
204       "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
205       "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
206 
207       : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r),
208         [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
209         [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
210 
211       : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
212         [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
213         [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
214         [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
215         [HWM] "r"(HWM));
216 
217   /* save bottom 3 bits so that we round one side +4 and the other +3 */
218   __asm__ __volatile__(
219       /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */
220       "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
221       "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
222 
223       /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */
224       "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
225       "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
226       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
227       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
228 
229       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
230       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
231 
232       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
233       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
234 
235       /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
236       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
237       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
238 
239       /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
240       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
241       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
242 
243       : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
244         [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
245         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
246         [vqs0_r] "+r"(vqs0_r)
247 
248       : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l),
249         [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM));
250 
251   __asm__ __volatile__(
252       /* (vp8_filter += 1) >>= 1 */
253       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
254       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
255 
256       /* vp8_filter &= ~hev; */
257       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
258       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
259 
260       /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
261       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
262       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
263 
264       /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
265       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
266       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
267 
268       : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
269         [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
270         [vqs1_r] "+r"(vqs1_r)
271 
272       : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
273 
274   /* Create quad-bytes from halfword pairs */
275   vqs0_l = vqs0_l & HWM;
276   vqs1_l = vqs1_l & HWM;
277   vps0_l = vps0_l & HWM;
278   vps1_l = vps1_l & HWM;
279 
280   __asm__ __volatile__(
281       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
282       "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
283       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
284       "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
285 
286       : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
287         [vqs0_r] "+r"(vqs0_r)
288       :);
289 
290   vqs0 = vqs0_l | vqs0_r;
291   vqs1 = vqs1_l | vqs1_r;
292   vps0 = vps0_l | vps0_r;
293   vps1 = vps1_l | vps1_r;
294 
295   *ps0 = vps0 ^ N128;
296   *ps1 = vps1 ^ N128;
297   *qs0 = vqs0 ^ N128;
298   *qs1 = vqs1 ^ N128;
299 }
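
/* Scalar equivalent for one pixel, after the ^0x80 conversion to signed values
 * done above; this is the standard VP8 loop-filter arithmetic that the
 * halfword code implements (clamp() meaning vp8_signed_char_clamp()):
 *
 *   vp8_filter = clamp(ps1 - qs1) & hev;
 *   vp8_filter = clamp(vp8_filter + 3 * (qs0 - ps0)) & mask;
 *   Filter1 = clamp(vp8_filter + 4) >> 3;
 *   Filter2 = clamp(vp8_filter + 3) >> 3;
 *   qs0 = clamp(qs0 - Filter1);
 *   ps0 = clamp(ps0 + Filter2);
 *   vp8_filter = ((Filter1 + 1) >> 1) & ~hev;
 *   qs1 = clamp(qs1 - vp8_filter);
 *   ps1 = clamp(ps1 + vp8_filter);
 */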
300 
301 void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
302                                           unsigned int flimit,
303                                           unsigned int limit,
304                                           unsigned int thresh, int count) {
305   uint32_t mask;
306   uint32_t hev;
307   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
308   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
309   (void)count;
310 
311   mask = 0;
312   hev = 0;
313   p1 = 0;
314   p2 = 0;
315   p3 = 0;
316   p4 = 0;
317 
318   /* prefetch data for store */
319   prefetch_store_lf(s);
320 
321   /* loop filter designed to work using chars so that we can make maximum use
322    * of 8 bit simd instructions.
323    */
324 
325   sm1 = s - (p << 2);
326   s0 = s - p - p - p;
327   s1 = s - p - p;
328   s2 = s - p;
329   s3 = s;
330   s4 = s + p;
331   s5 = s + p + p;
332   s6 = s + p + p + p;
333 
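  /* Row naming: sm1..s6 walk rows s - 4*p .. s + 3*p, so pm1, p0, p1, p2 hold
   * rows P3, P2, P1, P0 above the edge and p3, p4, p5, p6 hold rows Q0..Q3
   * below it.  The quick test below skips the group when P1 == Q1 and
   * P0 == Q0 for all four pixels.
   */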
334   /* load quad-byte vectors
335    * memory is 4 byte aligned
336    */
337   p1 = *((uint32_t *)(s1));
338   p2 = *((uint32_t *)(s2));
339   p3 = *((uint32_t *)(s3));
340   p4 = *((uint32_t *)(s4));
341 
342   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
343    * mask will be zero and filtering is not needed
344    */
345   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
346     pm1 = *((uint32_t *)(sm1));
347     p0 = *((uint32_t *)(s0));
348     p5 = *((uint32_t *)(s5));
349     p6 = *((uint32_t *)(s6));
350 
351     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
352                              thresh, &hev, &mask);
353 
354     /* if mask == 0, filtering is not needed */
355     if (mask) {
356       /* filtering */
357       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
358 
359       /* unpack processed 4x4 neighborhood */
360       *((uint32_t *)s1) = p1;
361       *((uint32_t *)s2) = p2;
362       *((uint32_t *)s3) = p3;
363       *((uint32_t *)s4) = p4;
364     }
365   }
366 
367   sm1 += 4;
368   s0 += 4;
369   s1 += 4;
370   s2 += 4;
371   s3 += 4;
372   s4 += 4;
373   s5 += 4;
374   s6 += 4;
375 
376   /* load quad-byte vectors
377    * memory is 4 byte aligned
378    */
379   p1 = *((uint32_t *)(s1));
380   p2 = *((uint32_t *)(s2));
381   p3 = *((uint32_t *)(s3));
382   p4 = *((uint32_t *)(s4));
383 
384   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
385    * mask will be zero and filtering is not needed
386    */
387   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
388     pm1 = *((uint32_t *)(sm1));
389     p0 = *((uint32_t *)(s0));
390     p5 = *((uint32_t *)(s5));
391     p6 = *((uint32_t *)(s6));
392 
393     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
394                              thresh, &hev, &mask);
395 
396     /* if mask == 0, filtering is not needed */
397     if (mask) {
398       /* filtering */
399       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
400 
401       /* unpack processed 4x4 neighborhood */
402       *((uint32_t *)s1) = p1;
403       *((uint32_t *)s2) = p2;
404       *((uint32_t *)s3) = p3;
405       *((uint32_t *)s4) = p4;
406     }
407   }
408 
409   sm1 += 4;
410   s0 += 4;
411   s1 += 4;
412   s2 += 4;
413   s3 += 4;
414   s4 += 4;
415   s5 += 4;
416   s6 += 4;
417 
418   /* load quad-byte vectors
419    * memory is 4 byte aligned
420    */
421   p1 = *((uint32_t *)(s1));
422   p2 = *((uint32_t *)(s2));
423   p3 = *((uint32_t *)(s3));
424   p4 = *((uint32_t *)(s4));
425 
426   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
427    * mask will be zero and filtering is not needed
428    */
429   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
430     pm1 = *((uint32_t *)(sm1));
431     p0 = *((uint32_t *)(s0));
432     p5 = *((uint32_t *)(s5));
433     p6 = *((uint32_t *)(s6));
434 
435     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
436                              thresh, &hev, &mask);
437 
438     /* if mask == 0, filtering is not needed */
439     if (mask) {
440       /* filtering */
441       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
442 
443       /* unpack processed 4x4 neighborhood */
444       *((uint32_t *)s1) = p1;
445       *((uint32_t *)s2) = p2;
446       *((uint32_t *)s3) = p3;
447       *((uint32_t *)s4) = p4;
448     }
449   }
450 
451   sm1 += 4;
452   s0 += 4;
453   s1 += 4;
454   s2 += 4;
455   s3 += 4;
456   s4 += 4;
457   s5 += 4;
458   s6 += 4;
459 
460   /* load quad-byte vectors
461    * memory is 4 byte aligned
462    */
463   p1 = *((uint32_t *)(s1));
464   p2 = *((uint32_t *)(s2));
465   p3 = *((uint32_t *)(s3));
466   p4 = *((uint32_t *)(s4));
467 
468   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
469    * mask will be zero and filtering is not needed
470    */
471   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
472     pm1 = *((uint32_t *)(sm1));
473     p0 = *((uint32_t *)(s0));
474     p5 = *((uint32_t *)(s5));
475     p6 = *((uint32_t *)(s6));
476 
477     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
478                              thresh, &hev, &mask);
479 
480     /* if mask == 0, filtering is not needed */
481     if (mask) {
482       /* filtering */
483       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
484 
485       /* unpack processed 4x4 neighborhood */
486       *((uint32_t *)s1) = p1;
487       *((uint32_t *)s2) = p2;
488       *((uint32_t *)s3) = p3;
489       *((uint32_t *)s4) = p4;
490     }
491   }
492 }
493 
494 void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
495                                             unsigned int flimit,
496                                             unsigned int limit,
497                                             unsigned int thresh, int count) {
498   uint32_t mask;
499   uint32_t hev;
500   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
501   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
502   (void)count;
503 
504   mask = 0;
505   hev = 0;
506   p1 = 0;
507   p2 = 0;
508   p3 = 0;
509   p4 = 0;
510 
511   /* loop filter designed to work using chars so that we can make maximum use
512    * of 8 bit simd instructions.
513    */
514 
515   sm1 = s - (p << 2);
516   s0 = s - p - p - p;
517   s1 = s - p - p;
518   s2 = s - p;
519   s3 = s;
520   s4 = s + p;
521   s5 = s + p + p;
522   s6 = s + p + p + p;
523 
524   /* load quad-byte vectors
525    * memory is 4 byte aligned
526    */
527   p1 = *((uint32_t *)(s1));
528   p2 = *((uint32_t *)(s2));
529   p3 = *((uint32_t *)(s3));
530   p4 = *((uint32_t *)(s4));
531 
532   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
533    * mask will be zero and filtering is not needed
534    */
535   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
536     pm1 = *((uint32_t *)(sm1));
537     p0 = *((uint32_t *)(s0));
538     p5 = *((uint32_t *)(s5));
539     p6 = *((uint32_t *)(s6));
540 
541     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
542                              thresh, &hev, &mask);
543 
544     /* if mask == 0, filtering is not needed */
545     if (mask) {
546       /* filtering */
547       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
548 
549       /* unpack processed 4x4 neighborhood */
550       *((uint32_t *)s1) = p1;
551       *((uint32_t *)s2) = p2;
552       *((uint32_t *)s3) = p3;
553       *((uint32_t *)s4) = p4;
554     }
555   }
556 
557   sm1 += 4;
558   s0 += 4;
559   s1 += 4;
560   s2 += 4;
561   s3 += 4;
562   s4 += 4;
563   s5 += 4;
564   s6 += 4;
565 
566   /* load quad-byte vectors
567    * memory is 4 byte aligned
568    */
569   p1 = *((uint32_t *)(s1));
570   p2 = *((uint32_t *)(s2));
571   p3 = *((uint32_t *)(s3));
572   p4 = *((uint32_t *)(s4));
573 
574   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
575    * mask will be zero and filtering is not needed
576    */
577   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
578     pm1 = *((uint32_t *)(sm1));
579     p0 = *((uint32_t *)(s0));
580     p5 = *((uint32_t *)(s5));
581     p6 = *((uint32_t *)(s6));
582 
583     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
584                              thresh, &hev, &mask);
585 
586     /* if mask == 0, filtering is not needed */
587     if (mask) {
588       /* filtering */
589       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
590 
591       /* unpack processed 4x4 neighborhood */
592       *((uint32_t *)s1) = p1;
593       *((uint32_t *)s2) = p2;
594       *((uint32_t *)s3) = p3;
595       *((uint32_t *)s4) = p4;
596     }
597   }
598 }
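
/* Note: the UV variant above filters only two 4-pixel groups (8 pixels wide,
 * the chroma block width); the Y variant filters four groups (16 pixels). */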
599 
600 void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p,
601                                         const unsigned int flimit,
602                                         const unsigned int limit,
603                                         const unsigned int thresh, int count) {
604   int i;
605   uint32_t mask, hev;
606   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
607   unsigned char *s1, *s2, *s3, *s4;
608   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
609 
610   hev = 0;
611   mask = 0;
612   i = 0;
613   pm1 = 0;
614   p0 = 0;
615   p1 = 0;
616   p2 = 0;
617   p3 = 0;
618   p4 = 0;
619   p5 = 0;
620   p6 = 0;
621 
622   /* loop filter designed to work using chars so that we can make maximum use
623    * of 8 bit simd instructions.
624    */
625 
626   /* apply filter on 4 pixels at the same time */
627   do {
628     /* prefetch data for store */
629     prefetch_store_lf(s + p);
630 
631     s1 = s;
632     s2 = s + p;
633     s3 = s2 + p;
634     s4 = s3 + p;
635     s = s4 + p;
636 
637     /* load quad-byte vectors
638      * memory is 4 byte aligned
639      */
640     p2 = *((uint32_t *)(s1 - 4));
641     p6 = *((uint32_t *)(s1));
642     p1 = *((uint32_t *)(s2 - 4));
643     p5 = *((uint32_t *)(s2));
644     p0 = *((uint32_t *)(s3 - 4));
645     p4 = *((uint32_t *)(s3));
646     pm1 = *((uint32_t *)(s4 - 4));
647     p3 = *((uint32_t *)(s4));
648 
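    /* The two asm blocks below transpose the two loaded 4x4 byte blocks so
     * that each of pm1..p6 ends up holding one column (the same x offset from
     * the edge) across the four rows s1..s4; the vertical edge can then reuse
     * the same mask/filter code as the horizontal case.  Conceptually
     * (illustrative sketch only):
     *
     *   for (r = 0; r < 4; ++r)
     *     for (c = 0; c < 4; ++c) out[c][r] = in[r][c];
     */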
649     /* transpose pm1, p0, p1, p2 */
650     __asm__ __volatile__(
651         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
652         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
653         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
654         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
655 
656         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
657         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
658         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
659         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
660 
661         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
662         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
663         "append         %[p1],      %[sec3],    16          \n\t"
664         "append         %[pm1],     %[sec4],    16          \n\t"
665 
666         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
667           [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
668           [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
669         :);
670 
671     /* transpose p3, p4, p5, p6 */
672     __asm__ __volatile__(
673         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
674         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
675         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
676         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
677 
678         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
679         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
680         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
681         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
682 
683         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
684         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
685         "append         %[p5],      %[sec3],    16          \n\t"
686         "append         %[p3],      %[sec4],    16          \n\t"
687 
688         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
689           [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
690           [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
691         :);
692 
693     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
694      * mask will be zero and filtering is not needed
695      */
696     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
697       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
698                                thresh, &hev, &mask);
699 
700       /* if mask == 0, filtering is not needed */
701       if (mask) {
702         /* filtering */
703         vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
704 
705         /* unpack processed 4x4 neighborhood
706          * don't use transpose on output data
707          * because memory isn't aligned
708          */
709         __asm__ __volatile__(
710             "sb         %[p4],  1(%[s4])    \n\t"
711             "sb         %[p3],  0(%[s4])    \n\t"
712             "sb         %[p2], -1(%[s4])    \n\t"
713             "sb         %[p1], -2(%[s4])    \n\t"
714             :
715             : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
716               [p1] "r"(p1));
717 
718         __asm__ __volatile__(
719             "srl        %[p4], %[p4], 8     \n\t"
720             "srl        %[p3], %[p3], 8     \n\t"
721             "srl        %[p2], %[p2], 8     \n\t"
722             "srl        %[p1], %[p1], 8     \n\t"
723             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
724             :);
725 
726         __asm__ __volatile__(
727             "sb         %[p4],  1(%[s3])    \n\t"
728             "sb         %[p3],  0(%[s3])    \n\t"
729             "sb         %[p2], -1(%[s3])    \n\t"
730             "sb         %[p1], -2(%[s3])    \n\t"
731             : [p1] "+r"(p1)
732             : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
733 
734         __asm__ __volatile__(
735             "srl        %[p4], %[p4], 8     \n\t"
736             "srl        %[p3], %[p3], 8     \n\t"
737             "srl        %[p2], %[p2], 8     \n\t"
738             "srl        %[p1], %[p1], 8     \n\t"
739             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
740             :);
741 
742         __asm__ __volatile__(
743             "sb         %[p4],  1(%[s2])    \n\t"
744             "sb         %[p3],  0(%[s2])    \n\t"
745             "sb         %[p2], -1(%[s2])    \n\t"
746             "sb         %[p1], -2(%[s2])    \n\t"
747             :
748             : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
749               [p1] "r"(p1));
750 
751         __asm__ __volatile__(
752             "srl        %[p4], %[p4], 8     \n\t"
753             "srl        %[p3], %[p3], 8     \n\t"
754             "srl        %[p2], %[p2], 8     \n\t"
755             "srl        %[p1], %[p1], 8     \n\t"
756             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
757             :);
758 
759         __asm__ __volatile__(
760             "sb         %[p4],  1(%[s1])    \n\t"
761             "sb         %[p3],  0(%[s1])    \n\t"
762             "sb         %[p2], -1(%[s1])    \n\t"
763             "sb         %[p1], -2(%[s1])    \n\t"
764             :
765             : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
766               [p1] "r"(p1));
767       }
768     }
769 
770     s1 = s;
771     s2 = s + p;
772     s3 = s2 + p;
773     s4 = s3 + p;
774     s = s4 + p;
775 
776     /* load quad-byte vectors
777      * memory is 4 byte aligned
778      */
779     p2 = *((uint32_t *)(s1 - 4));
780     p6 = *((uint32_t *)(s1));
781     p1 = *((uint32_t *)(s2 - 4));
782     p5 = *((uint32_t *)(s2));
783     p0 = *((uint32_t *)(s3 - 4));
784     p4 = *((uint32_t *)(s3));
785     pm1 = *((uint32_t *)(s4 - 4));
786     p3 = *((uint32_t *)(s4));
787 
788     /* transpose pm1, p0, p1, p2 */
789     __asm__ __volatile__(
790         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
791         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
792         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
793         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
794 
795         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
796         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
797         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
798         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
799 
800         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
801         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
802         "append         %[p1],      %[sec3],    16          \n\t"
803         "append         %[pm1],     %[sec4],    16          \n\t"
804 
805         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
806           [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
807           [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
808         :);
809 
810     /* transpose p3, p4, p5, p6 */
811     __asm__ __volatile__(
812         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
813         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
814         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
815         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
816 
817         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
818         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
819         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
820         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
821 
822         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
823         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
824         "append         %[p5],      %[sec3],    16          \n\t"
825         "append         %[p3],      %[sec4],    16          \n\t"
826 
827         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
828           [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
829           [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
830         :);
831 
832     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
833      * mask will be zero and filtering is not needed
834      */
835     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
836       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
837                                thresh, &hev, &mask);
838 
839       /* if mask == 0, filtering is not needed */
840       if (mask) {
841         /* filtering */
842         vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
843 
844         /* unpack processed 4x4 neighborhood
845          * don't use transpose on output data
846          * because memory isn't aligned
847          */
848         __asm__ __volatile__(
849             "sb         %[p4],  1(%[s4])    \n\t"
850             "sb         %[p3],  0(%[s4])    \n\t"
851             "sb         %[p2], -1(%[s4])    \n\t"
852             "sb         %[p1], -2(%[s4])    \n\t"
853             :
854             : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
855               [p1] "r"(p1));
856 
857         __asm__ __volatile__(
858             "srl        %[p4], %[p4], 8     \n\t"
859             "srl        %[p3], %[p3], 8     \n\t"
860             "srl        %[p2], %[p2], 8     \n\t"
861             "srl        %[p1], %[p1], 8     \n\t"
862             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
863             :);
864 
865         __asm__ __volatile__(
866             "sb         %[p4],  1(%[s3])    \n\t"
867             "sb         %[p3],  0(%[s3])    \n\t"
868             "sb         %[p2], -1(%[s3])    \n\t"
869             "sb         %[p1], -2(%[s3])    \n\t"
870             : [p1] "+r"(p1)
871             : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
872 
873         __asm__ __volatile__(
874             "srl        %[p4], %[p4], 8     \n\t"
875             "srl        %[p3], %[p3], 8     \n\t"
876             "srl        %[p2], %[p2], 8     \n\t"
877             "srl        %[p1], %[p1], 8     \n\t"
878             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
879             :);
880 
881         __asm__ __volatile__(
882             "sb         %[p4],  1(%[s2])    \n\t"
883             "sb         %[p3],  0(%[s2])    \n\t"
884             "sb         %[p2], -1(%[s2])    \n\t"
885             "sb         %[p1], -2(%[s2])    \n\t"
886             :
887             : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
888               [p1] "r"(p1));
889 
890         __asm__ __volatile__(
891             "srl        %[p4], %[p4], 8     \n\t"
892             "srl        %[p3], %[p3], 8     \n\t"
893             "srl        %[p2], %[p2], 8     \n\t"
894             "srl        %[p1], %[p1], 8     \n\t"
895             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
896             :);
897 
898         __asm__ __volatile__(
899             "sb         %[p4],  1(%[s1])    \n\t"
900             "sb         %[p3],  0(%[s1])    \n\t"
901             "sb         %[p2], -1(%[s1])    \n\t"
902             "sb         %[p1], -2(%[s1])    \n\t"
903             :
904             : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
905               [p1] "r"(p1));
906       }
907     }
908 
909     i += 8;
910   }
911 
912   while (i < count);
913 }
914 
915 void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
916                                           unsigned int flimit,
917                                           unsigned int limit,
918                                           unsigned int thresh, int count) {
919   uint32_t mask, hev;
920   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
921   unsigned char *s1, *s2, *s3, *s4;
922   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
923   (void)count;
924 
925   /* loop filter designed to work using chars so that we can make maximum use
926    * of 8 bit simd instructions.
927    */
928 
929   /* apply filter on 4 pixels at the same time */
930 
931   s1 = s;
932   s2 = s + p;
933   s3 = s2 + p;
934   s4 = s3 + p;
935 
936   /* load quad-byte vectors
937    * memory is 4 byte aligned
938    */
939   p2 = *((uint32_t *)(s1 - 4));
940   p6 = *((uint32_t *)(s1));
941   p1 = *((uint32_t *)(s2 - 4));
942   p5 = *((uint32_t *)(s2));
943   p0 = *((uint32_t *)(s3 - 4));
944   p4 = *((uint32_t *)(s3));
945   pm1 = *((uint32_t *)(s4 - 4));
946   p3 = *((uint32_t *)(s4));
947 
948   /* transpose pm1, p0, p1, p2 */
949   __asm__ __volatile__(
950       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
951       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
952       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
953       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
954 
955       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
956       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
957       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
958       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
959 
960       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
961       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
962       "append         %[p1],      %[sec3],    16          \n\t"
963       "append         %[pm1],     %[sec4],    16          \n\t"
964 
965       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
966         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
967         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
968       :);
969 
970   /* transpose p3, p4, p5, p6 */
971   __asm__ __volatile__(
972       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
973       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
974       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
975       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
976 
977       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
978       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
979       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
980       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
981 
982       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
983       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
984       "append         %[p5],      %[sec3],    16          \n\t"
985       "append         %[p3],      %[sec4],    16          \n\t"
986 
987       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
988         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
989         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
990       :);
991 
992   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
993    * mask will be zero and filtering is not needed
994    */
995   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
996     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
997                              thresh, &hev, &mask);
998 
999     /* if mask == 0, filtering is not needed */
1000     if (mask) {
1001       /* filtering */
1002       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1003 
1004       /* unpack processed 4x4 neighborhood
1005        * don't use transpose on output data
1006        * because memory isn't aligned
1007        */
1008       __asm__ __volatile__(
1009           "sb         %[p4],  1(%[s4])    \n\t"
1010           "sb         %[p3],  0(%[s4])    \n\t"
1011           "sb         %[p2], -1(%[s4])    \n\t"
1012           "sb         %[p1], -2(%[s4])    \n\t"
1013           :
1014           :
1015           [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
1016 
1017       __asm__ __volatile__(
1018           "srl        %[p4], %[p4], 8     \n\t"
1019           "srl        %[p3], %[p3], 8     \n\t"
1020           "srl        %[p2], %[p2], 8     \n\t"
1021           "srl        %[p1], %[p1], 8     \n\t"
1022           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1023           :);
1024 
1025       __asm__ __volatile__(
1026           "sb         %[p4],  1(%[s3])    \n\t"
1027           "sb         %[p3],  0(%[s3])    \n\t"
1028           "sb         %[p2], -1(%[s3])    \n\t"
1029           "sb         %[p1], -2(%[s3])    \n\t"
1030           : [p1] "+r"(p1)
1031           : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
1032 
1033       __asm__ __volatile__(
1034           "srl        %[p4], %[p4], 8     \n\t"
1035           "srl        %[p3], %[p3], 8     \n\t"
1036           "srl        %[p2], %[p2], 8     \n\t"
1037           "srl        %[p1], %[p1], 8     \n\t"
1038           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1039           :);
1040 
1041       __asm__ __volatile__(
1042           "sb         %[p4],  1(%[s2])    \n\t"
1043           "sb         %[p3],  0(%[s2])    \n\t"
1044           "sb         %[p2], -1(%[s2])    \n\t"
1045           "sb         %[p1], -2(%[s2])    \n\t"
1046           :
1047           :
1048           [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
1049 
1050       __asm__ __volatile__(
1051           "srl        %[p4], %[p4], 8     \n\t"
1052           "srl        %[p3], %[p3], 8     \n\t"
1053           "srl        %[p2], %[p2], 8     \n\t"
1054           "srl        %[p1], %[p1], 8     \n\t"
1055           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1056           :);
1057 
1058       __asm__ __volatile__(
1059           "sb         %[p4],  1(%[s1])    \n\t"
1060           "sb         %[p3],  0(%[s1])    \n\t"
1061           "sb         %[p2], -1(%[s1])    \n\t"
1062           "sb         %[p1], -2(%[s1])    \n\t"
1063           :
1064           :
1065           [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
1066     }
1067   }
1068 
1069   s1 = s4 + p;
1070   s2 = s1 + p;
1071   s3 = s2 + p;
1072   s4 = s3 + p;
1073 
1074   /* load quad-byte vectors
1075    * memory is 4 byte aligned
1076    */
1077   p2 = *((uint32_t *)(s1 - 4));
1078   p6 = *((uint32_t *)(s1));
1079   p1 = *((uint32_t *)(s2 - 4));
1080   p5 = *((uint32_t *)(s2));
1081   p0 = *((uint32_t *)(s3 - 4));
1082   p4 = *((uint32_t *)(s3));
1083   pm1 = *((uint32_t *)(s4 - 4));
1084   p3 = *((uint32_t *)(s4));
1085 
1086   /* transpose pm1, p0, p1, p2 */
1087   __asm__ __volatile__(
1088       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1089       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1090       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1091       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1092 
1093       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1094       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1095       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1096       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1097 
1098       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1099       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1100       "append         %[p1],      %[sec3],    16          \n\t"
1101       "append         %[pm1],     %[sec4],    16          \n\t"
1102 
1103       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1104         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1105         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1106       :);
1107 
1108   /* transpose p3, p4, p5, p6 */
1109   __asm__ __volatile__(
1110       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1111       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1112       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1113       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1114 
1115       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1116       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1117       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1118       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1119 
1120       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1121       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1122       "append         %[p5],      %[sec3],    16          \n\t"
1123       "append         %[p3],      %[sec4],    16          \n\t"
1124 
1125       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1126         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
1127         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1128       :);
1129 
1130   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1131    * mask will be zero and filtering is not needed
1132    */
1133   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1134     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1135                              thresh, &hev, &mask);
1136 
1137     /* if mask == 0, filtering is not needed */
1138     if (mask) {
1139       /* filtering */
1140       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1141 
1142       /* unpack processed 4x4 neighborhood
1143        * don't use transpose on output data
1144        * because memory isn't aligned
1145        */
1146       __asm__ __volatile__(
1147           "sb         %[p4],  1(%[s4])    \n\t"
1148           "sb         %[p3],  0(%[s4])    \n\t"
1149           "sb         %[p2], -1(%[s4])    \n\t"
1150           "sb         %[p1], -2(%[s4])    \n\t"
1151           :
1152           :
1153           [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
1154 
1155       __asm__ __volatile__(
1156           "srl        %[p4], %[p4], 8     \n\t"
1157           "srl        %[p3], %[p3], 8     \n\t"
1158           "srl        %[p2], %[p2], 8     \n\t"
1159           "srl        %[p1], %[p1], 8     \n\t"
1160           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1161           :);
1162 
1163       __asm__ __volatile__(
1164           "sb         %[p4],  1(%[s3])    \n\t"
1165           "sb         %[p3],  0(%[s3])    \n\t"
1166           "sb         %[p2], -1(%[s3])    \n\t"
1167           "sb         %[p1], -2(%[s3])    \n\t"
1168           : [p1] "+r"(p1)
1169           : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
1170 
1171       __asm__ __volatile__(
1172           "srl        %[p4], %[p4], 8     \n\t"
1173           "srl        %[p3], %[p3], 8     \n\t"
1174           "srl        %[p2], %[p2], 8     \n\t"
1175           "srl        %[p1], %[p1], 8     \n\t"
1176           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1177           :);
1178 
1179       __asm__ __volatile__(
1180           "sb         %[p4],  1(%[s2])    \n\t"
1181           "sb         %[p3],  0(%[s2])    \n\t"
1182           "sb         %[p2], -1(%[s2])    \n\t"
1183           "sb         %[p1], -2(%[s2])    \n\t"
1184           :
1185           :
1186           [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
1187 
1188       __asm__ __volatile__(
1189           "srl        %[p4], %[p4], 8     \n\t"
1190           "srl        %[p3], %[p3], 8     \n\t"
1191           "srl        %[p2], %[p2], 8     \n\t"
1192           "srl        %[p1], %[p1], 8     \n\t"
1193           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1194           :);
1195 
1196       __asm__ __volatile__(
1197           "sb         %[p4],  1(%[s1])    \n\t"
1198           "sb         %[p3],  0(%[s1])    \n\t"
1199           "sb         %[p2], -1(%[s1])    \n\t"
1200           "sb         %[p1], -2(%[s1])    \n\t"
1201           :
1202           :
1203           [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
1204     }
1205   }
1206 }
1207 
1208 /* inputs & outputs are quad-byte vectors */
1209 static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev,
1210                                        uint32_t *ps2, uint32_t *ps1,
1211                                        uint32_t *ps0, uint32_t *qs0,
1212                                        uint32_t *qs1, uint32_t *qs2) {
1213   int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
1214   int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
1215   int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
1216   uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r,
1217       subr_r, subr_l;
1218   uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l,
1219       invhev_r;
1220   uint32_t N128, R63;
1221   uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
1222 
1223   R63 = 0x003F003F;
1224   HWM = 0xFF00FF00;
1225   N128 = 0x80808080;
1226   t1 = 0x03000300;
1227   t2 = 0x04000400;
1228 
1229   vps0 = (*ps0) ^ N128;
1230   vps1 = (*ps1) ^ N128;
1231   vps2 = (*ps2) ^ N128;
1232   vqs0 = (*qs0) ^ N128;
1233   vqs1 = (*qs1) ^ N128;
1234   vqs2 = (*qs2) ^ N128;
1235 
1236   /* use halfword pairs instead of quad-bytes for accuracy */
1237   vps0_l = vps0 & HWM;
1238   vps0_r = vps0 << 8;
1239   vps0_r = vps0_r & HWM;
1240 
1241   vqs0_l = vqs0 & HWM;
1242   vqs0_r = vqs0 << 8;
1243   vqs0_r = vqs0_r & HWM;
1244 
1245   vps1_l = vps1 & HWM;
1246   vps1_r = vps1 << 8;
1247   vps1_r = vps1_r & HWM;
1248 
1249   vqs1_l = vqs1 & HWM;
1250   vqs1_r = vqs1 << 8;
1251   vqs1_r = vqs1_r & HWM;
1252 
1253   vqs2_l = vqs2 & HWM;
1254   vqs2_r = vqs2 << 8;
1255   vqs2_r = vqs2_r & HWM;
1256 
1257   __asm__ __volatile__(
1258       /* qs0 - ps0 */
1259       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
1260       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
1261 
1262       /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
1263       "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
1264       "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
1265 
1266       : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r),
1267         [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r)
1268       : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
1269         [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
1270         [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r));
1271 
1272   vps2_l = vps2 & HWM;
1273   vps2_r = vps2 << 8;
1274   vps2_r = vps2_r & HWM;
1275 
1276   /* add outer taps if we have high edge variance */
1277   __asm__ __volatile__(
1278       /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
1279       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1280       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1281       "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
1282       "sll          %[mask_r],       %[mask],         8               \n\t"
1283       "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
1284       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1285       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1286       "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
1287       "sll          %[hev_r],        %[hev],          8               \n\t"
1288       "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
1289       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1290       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1291 
1292       /* vp8_filter &= mask; */
1293       "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
1294       "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
1295 
1296       /* Filter2 = vp8_filter & hev; */
1297       "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
1298       "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"
1299 
1300       : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r),
1301         [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l),
1302         [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l),
1303         [Filter2_r] "=&r"(Filter2_r)
1304       : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM),
1305         [hev] "r"(hev), [mask] "r"(mask));
1306 
1307   /* save bottom 3 bits so that we round one side +4 and the other +3 */
1308   __asm__ __volatile__(
1309       /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
1310       "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
1311       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
1312       "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"
1313 
1314       /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
1315       "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
1316       "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"
1317 
1318       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
1319       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
1320 
1321       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
1322       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
1323       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
1324       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
1325       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
1326 
1327       /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
1328       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
1329       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
1330 
1331       /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
1332       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
1333       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
1334 
1335       : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r),
1336         [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
1337         [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r),
1338         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
1339         [vqs0_r] "+r"(vqs0_r)
1340       : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l),
1341         [hev_r] "r"(hev_r));
1342 
1343   /* only apply wider filter if not high edge variance */
1344   __asm__ __volatile__(
1345       /* vp8_filter &= ~hev; */
1346       "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
1347       "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"
1348 
1349       "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
1350       "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"
1351 
1352       : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r)
1353       : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r),
1354         [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
1355 
1356   /* roughly 3/7th difference across boundary */
1357   __asm__ __volatile__(
1358       "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
1359       "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"
1360 
1361       "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
1362       "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"
1363 
1364       "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
1365       "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"
1366 
1367       "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
1368       "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"
1369 
1370       "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
1371       "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"
1372 
1373       "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
1374       "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"
1375 
1376       /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
1377        * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
1378        */
1379       "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
1380       "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
1381       "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
1382       "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
1383       "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
1384       "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
1385       "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
1386       "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
1387       "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
1388       "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"
1389 
1390       /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
1391       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
1392       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"
1393 
1394       /* vps0 = vp8_signed_char_clamp(ps0 + u); */
1395       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
1396       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"
1397 
1398       : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l),
1399         [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r),
1400         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
1401         [vqs0_r] "+r"(vqs0_r)
1402       : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r));
1403 
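  /* roughly 2/7th difference across boundary */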
1404   __asm__ __volatile__(
1405       /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
1406       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
1407       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"
1408 
1409       /* vps1 = vp8_signed_char_clamp(ps1 + u); */
1410       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
1411       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"
1412 
1413       : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
1414         [vqs1_r] "+r"(vqs1_r)
1415       : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r));
1416 
1417   /* roughly 1/7th difference across boundary */
1418   __asm__ __volatile__(
1419       /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
1420       "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
1421       "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
1422       "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
1423       "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"
1424 
1425       /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
1426       "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
1427       "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"
1428 
1429       /* vps2 = vp8_signed_char_clamp(ps2 + u); */
1430       "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
1431       "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"
1432 
1433       : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l),
1434         [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r)
1435       :);
1436 
1437   /* Create quad-bytes from halfword pairs */
1438   __asm__ __volatile__(
1439       "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
1440       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
1441 
1442       "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
1443       "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
1444 
1445       "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
1446       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
1447 
1448       "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
1449       "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
1450 
1451       "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
1452       "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"
1453 
1454       "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
1455       "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"
1456 
1457       "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
1458       "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
1459       "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
1460       "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
1461       "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
1462       "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"
1463 
1464       : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
1465         [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r),
1466         [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l),
1467         [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l)
1468       : [HWM] "r"(HWM));
1469 
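  /* convert the filtered values back from the signed representation used while
   * filtering to unsigned pixel values before writing them out
   */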
1470   *ps0 = vps0_r ^ N128;
1471   *ps1 = vps1_r ^ N128;
1472   *ps2 = vps2_r ^ N128;
1473   *qs0 = vqs0_r ^ N128;
1474   *qs1 = vqs1_r ^ N128;
1475   *qs2 = vqs2_r ^ N128;
1476 }
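
/* For reference, an illustrative scalar sketch of the per-pixel arithmetic the
 * packed code above follows (clamp() denotes signed-char saturation):
 *
 *   Filter2 = vp8_filter & hev;
 *   Filter1 = clamp(Filter2 + 4) >> 3;
 *   Filter2 = clamp(Filter2 + 3) >> 3;
 *   qs0 = clamp(qs0 - Filter1);
 *   ps0 = clamp(ps0 + Filter2);
 *
 *   Filter2 = vp8_filter & ~hev;
 *   u = clamp((63 + Filter2 * 27) >> 7);  qs0 = clamp(qs0 - u);  ps0 = clamp(ps0 + u);
 *   u = clamp((63 + Filter2 * 18) >> 7);  qs1 = clamp(qs1 - u);  ps1 = clamp(ps1 + u);
 *   u = clamp((63 + Filter2 * 9) >> 7);   qs2 = clamp(qs2 - u);  ps2 = clamp(ps2 + u);
 */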
1477 
1478 void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p,
1479                                             unsigned int flimit,
1480                                             unsigned int limit,
1481                                             unsigned int thresh, int count) {
1482   int i;
1483   uint32_t mask, hev;
1484   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1485   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1486 
1487   mask = 0;
1488   hev = 0;
1489   i = 0;
1490   p1 = 0;
1491   p2 = 0;
1492   p3 = 0;
1493   p4 = 0;
1494 
1495   /* loop filter designed to work using chars so that we can make maximum use
1496    * of 8-bit SIMD instructions.
1497    */
1498 
1499   sm1 = s - (p << 2);
1500   s0 = s - p - p - p;
1501   s1 = s - p - p;
1502   s2 = s - p;
1503   s3 = s;
1504   s4 = s + p;
1505   s5 = s + p + p;
1506   s6 = s + p + p + p;
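  /* sm1 .. s2 point at the four rows above the horizontal edge, s3 .. s6 at the
   * four rows at and below it; each 32-bit load below fetches 4 horizontally
   * adjacent pixels from one row
   */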
1507 
1508   /* prefetch data for load */
1509   prefetch_load_lf(s + p);
1510 
1511   /* apply filter on 4 pixels at the same time */
1512   do {
1513     /* load quad-byte vectors
1514      * memory is 4 byte aligned
1515      */
1516     p1 = *((uint32_t *)(s1));
1517     p2 = *((uint32_t *)(s2));
1518     p3 = *((uint32_t *)(s3));
1519     p4 = *((uint32_t *)(s4));
1520 
1521     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1522      * mask will be zero and filtering is not needed
1523      */
1524     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1525       pm1 = *((uint32_t *)(sm1));
1526       p0 = *((uint32_t *)(s0));
1527       p5 = *((uint32_t *)(s5));
1528       p6 = *((uint32_t *)(s6));
1529 
1530       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1531                                thresh, &hev, &mask);
1532 
1533       /* if mask == 0, filtering is not needed */
1534       if (mask) {
1535         /* filtering */
1536         vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1537 
1538         /* unpack processed 4x4 neighborhood
1539          * memory is 4 byte aligned
1540          */
1541         *((uint32_t *)s0) = p0;
1542         *((uint32_t *)s1) = p1;
1543         *((uint32_t *)s2) = p2;
1544         *((uint32_t *)s3) = p3;
1545         *((uint32_t *)s4) = p4;
1546         *((uint32_t *)s5) = p5;
1547       }
1548     }
1549 
1550     sm1 += 4;
1551     s0 += 4;
1552     s1 += 4;
1553     s2 += 4;
1554     s3 += 4;
1555     s4 += 4;
1556     s5 += 4;
1557     s6 += 4;
1558 
1559     /* load quad-byte vectors
1560      * memory is 4 byte aligned
1561      */
1562     p1 = *((uint32_t *)(s1));
1563     p2 = *((uint32_t *)(s2));
1564     p3 = *((uint32_t *)(s3));
1565     p4 = *((uint32_t *)(s4));
1566 
1567     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1568      * mask will be zero and filtering is not needed
1569      */
1570     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1571       pm1 = *((uint32_t *)(sm1));
1572       p0 = *((uint32_t *)(s0));
1573       p5 = *((uint32_t *)(s5));
1574       p6 = *((uint32_t *)(s6));
1575 
1576       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1577                                thresh, &hev, &mask);
1578 
1579       /* if mask == 0, filtering is not needed */
1580       if (mask) {
1581         /* filtering */
1582         vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1583 
1584         /* unpack processed 4x4 neighborhood
1585          * memory is 4 byte aligned
1586          */
1587         *((uint32_t *)s0) = p0;
1588         *((uint32_t *)s1) = p1;
1589         *((uint32_t *)s2) = p2;
1590         *((uint32_t *)s3) = p3;
1591         *((uint32_t *)s4) = p4;
1592         *((uint32_t *)s5) = p5;
1593       }
1594     }
1595 
1596     sm1 += 4;
1597     s0 += 4;
1598     s1 += 4;
1599     s2 += 4;
1600     s3 += 4;
1601     s4 += 4;
1602     s5 += 4;
1603     s6 += 4;
1604 
1605     i += 8;
1606   } while (i < count);
1609 }
1610 
1611 void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
1612                                               unsigned int flimit,
1613                                               unsigned int limit,
1614                                               unsigned int thresh, int count) {
1615   uint32_t mask, hev;
1616   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1617   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1618   (void)count;
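  /* the chroma planes are only 8 pixels across per macroblock, so exactly two
   * groups of 4 pixels are filtered and the count argument is unused
   */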
1619 
1620   mask = 0;
1621   hev = 0;
1622   p1 = 0;
1623   p2 = 0;
1624   p3 = 0;
1625   p4 = 0;
1626 
1627   /* loop filter designed to work using chars so that we can make maximum use
1628    * of 8-bit SIMD instructions.
1629    */
1630 
1631   sm1 = s - (p << 2);
1632   s0 = s - p - p - p;
1633   s1 = s - p - p;
1634   s2 = s - p;
1635   s3 = s;
1636   s4 = s + p;
1637   s5 = s + p + p;
1638   s6 = s + p + p + p;
1639 
1640   /* load quad-byte vectors
1641    * memory is 4 byte aligned
1642    */
1643   p1 = *((uint32_t *)(s1));
1644   p2 = *((uint32_t *)(s2));
1645   p3 = *((uint32_t *)(s3));
1646   p4 = *((uint32_t *)(s4));
1647 
1648   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1649    * mask will be zero and filtering is not needed
1650    */
1651   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1652     pm1 = *((uint32_t *)(sm1));
1653     p0 = *((uint32_t *)(s0));
1654     p5 = *((uint32_t *)(s5));
1655     p6 = *((uint32_t *)(s6));
1656 
1657     /* if mask == 0, filtering is not needed */
1658     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1659                              thresh, &hev, &mask);
1660 
1661     if (mask) {
1662       /* filtering */
1663       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1664 
1665       /* unpack processed 4x4 neighborhood
1666        * memory is 4 byte aligned
1667        */
1668       *((uint32_t *)s0) = p0;
1669       *((uint32_t *)s1) = p1;
1670       *((uint32_t *)s2) = p2;
1671       *((uint32_t *)s3) = p3;
1672       *((uint32_t *)s4) = p4;
1673       *((uint32_t *)s5) = p5;
1674     }
1675   }
1676 
1677   sm1 += 4;
1678   s0 += 4;
1679   s1 += 4;
1680   s2 += 4;
1681   s3 += 4;
1682   s4 += 4;
1683   s5 += 4;
1684   s6 += 4;
1685 
1686   /* load quad-byte vectors
1687    * memory is 4 byte aligned
1688    */
1689   p1 = *((uint32_t *)(s1));
1690   p2 = *((uint32_t *)(s2));
1691   p3 = *((uint32_t *)(s3));
1692   p4 = *((uint32_t *)(s4));
1693 
1694   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1695    * mask will be zero and filtering is not needed
1696    */
1697   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1698     pm1 = *((uint32_t *)(sm1));
1699     p0 = *((uint32_t *)(s0));
1700     p5 = *((uint32_t *)(s5));
1701     p6 = *((uint32_t *)(s6));
1702 
1703     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1704                              thresh, &hev, &mask);
1705 
1706     /* if mask == 0, filtering is not needed */
1707     if (mask) {
1708       /* filtering */
1709       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1710 
1711       /* unpack processed 4x4 neighborhood
1712        * memory is 4 byte aligned
1713        */
1714       *((uint32_t *)s0) = p0;
1715       *((uint32_t *)s1) = p1;
1716       *((uint32_t *)s2) = p2;
1717       *((uint32_t *)s3) = p3;
1718       *((uint32_t *)s4) = p4;
1719       *((uint32_t *)s5) = p5;
1720     }
1721   }
1722 }
1723 
1724 void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p,
1725                                           unsigned int flimit,
1726                                           unsigned int limit,
1727                                           unsigned int thresh, int count) {
1728   int i;
1729   uint32_t mask, hev;
1730   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1731   unsigned char *s1, *s2, *s3, *s4;
1732   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1733 
1734   mask = 0;
1735   hev = 0;
1736   i = 0;
1737   pm1 = 0;
1738   p0 = 0;
1739   p1 = 0;
1740   p2 = 0;
1741   p3 = 0;
1742   p4 = 0;
1743   p5 = 0;
1744   p6 = 0;
1745 
1746   /* loop filter designed to work using chars so that we can make maximum use
1747    * of 8-bit SIMD instructions.
1748    */
1749 
1750   /* apply filter on 4 pixels at the same time */
1751   do {
1752     s1 = s;
1753     s2 = s + p;
1754     s3 = s2 + p;
1755     s4 = s3 + p;
1756     s = s4 + p;
1757 
1758     /* load quad-byte vectors
1759      * memory is 4 byte aligned
1760      */
1761     p2 = *((uint32_t *)(s1 - 4));
1762     p6 = *((uint32_t *)(s1));
1763     p1 = *((uint32_t *)(s2 - 4));
1764     p5 = *((uint32_t *)(s2));
1765     p0 = *((uint32_t *)(s3 - 4));
1766     p4 = *((uint32_t *)(s3));
1767     pm1 = *((uint32_t *)(s4 - 4));
1768     p3 = *((uint32_t *)(s4));
1769 
1770     /* transpose pm1, p0, p1, p2 */
1771     __asm__ __volatile__(
1772         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1773         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1774         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1775         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1776 
1777         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1778         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1779         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1780         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1781 
1782         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1783         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1784         "append         %[p1],      %[sec3],    16          \n\t"
1785         "append         %[pm1],     %[sec4],    16          \n\t"
1786 
1787         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1788           [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1789           [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1790         :);
1791 
1792     /* transpose p3, p4, p5, p6 */
1793     __asm__ __volatile__(
1794         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1795         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1796         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1797         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1798 
1799         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1800         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1801         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1802         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1803 
1804         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1805         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1806         "append         %[p5],      %[sec3],    16          \n\t"
1807         "append         %[p3],      %[sec4],    16          \n\t"
1808 
1809         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1810           [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
1811           [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1812         :);
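    /* after the two transposes, each of pm1 .. p6 holds, for the four rows
     * being processed, the pixel at one fixed horizontal offset from the
     * vertical edge, so the same packed filter as the horizontal case applies
     */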
1813 
1814     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1815      * mask will be zero and filtering is not needed
1816      */
1817     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1818       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1819                                thresh, &hev, &mask);
1820 
1821       /* if mask == 0, filtering is not needed */
1822       if (mask) {
1823         /* filtering */
1824         vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1825 
1826         /* don't use transpose on output data
1827          * because memory isn't aligned
1828          */
1829         __asm__ __volatile__(
1830             "sb         %[p5],  2(%[s4])        \n\t"
1831             "sb         %[p4],  1(%[s4])        \n\t"
1832             "sb         %[p3],  0(%[s4])        \n\t"
1833             "sb         %[p2], -1(%[s4])        \n\t"
1834             "sb         %[p1], -2(%[s4])        \n\t"
1835             "sb         %[p0], -3(%[s4])        \n\t"
1836             :
1837             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
1838               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1839 
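        /* each register packs one filtered column for the four rows; the byte
         * for row s4 sits lowest, and the shifts by 8 below expose the bytes
         * for s3, s2 and s1 in turn
         */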
1840         __asm__ __volatile__(
1841             "srl        %[p5], %[p5], 8         \n\t"
1842             "srl        %[p4], %[p4], 8         \n\t"
1843             "srl        %[p3], %[p3], 8         \n\t"
1844             "srl        %[p2], %[p2], 8         \n\t"
1845             "srl        %[p1], %[p1], 8         \n\t"
1846             "srl        %[p0], %[p0], 8         \n\t"
1847             : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
1848               [p1] "+r"(p1), [p0] "+r"(p0)
1849             :);
1850 
1851         __asm__ __volatile__(
1852             "sb         %[p5],  2(%[s3])        \n\t"
1853             "sb         %[p4],  1(%[s3])        \n\t"
1854             "sb         %[p3],  0(%[s3])        \n\t"
1855             "sb         %[p2], -1(%[s3])        \n\t"
1856             "sb         %[p1], -2(%[s3])        \n\t"
1857             "sb         %[p0], -3(%[s3])        \n\t"
1858             :
1859             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
1860               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1861 
1862         __asm__ __volatile__(
1863             "srl        %[p5], %[p5], 8         \n\t"
1864             "srl        %[p4], %[p4], 8         \n\t"
1865             "srl        %[p3], %[p3], 8         \n\t"
1866             "srl        %[p2], %[p2], 8         \n\t"
1867             "srl        %[p1], %[p1], 8         \n\t"
1868             "srl        %[p0], %[p0], 8         \n\t"
1869             : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
1870               [p1] "+r"(p1), [p0] "+r"(p0)
1871             :);
1872 
1873         __asm__ __volatile__(
1874             "sb         %[p5],  2(%[s2])        \n\t"
1875             "sb         %[p4],  1(%[s2])        \n\t"
1876             "sb         %[p3],  0(%[s2])        \n\t"
1877             "sb         %[p2], -1(%[s2])        \n\t"
1878             "sb         %[p1], -2(%[s2])        \n\t"
1879             "sb         %[p0], -3(%[s2])        \n\t"
1880             :
1881             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
1882               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1883 
1884         __asm__ __volatile__(
1885             "srl        %[p5], %[p5], 8         \n\t"
1886             "srl        %[p4], %[p4], 8         \n\t"
1887             "srl        %[p3], %[p3], 8         \n\t"
1888             "srl        %[p2], %[p2], 8         \n\t"
1889             "srl        %[p1], %[p1], 8         \n\t"
1890             "srl        %[p0], %[p0], 8         \n\t"
1891             : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
1892               [p1] "+r"(p1), [p0] "+r"(p0)
1893             :);
1894 
1895         __asm__ __volatile__(
1896             "sb         %[p5],  2(%[s1])        \n\t"
1897             "sb         %[p4],  1(%[s1])        \n\t"
1898             "sb         %[p3],  0(%[s1])        \n\t"
1899             "sb         %[p2], -1(%[s1])        \n\t"
1900             "sb         %[p1], -2(%[s1])        \n\t"
1901             "sb         %[p0], -3(%[s1])        \n\t"
1902             :
1903             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
1904               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1905       }
1906     }
1907 
1908     i += 4;
1909   } while (i < count);
1912 }
1913 
1914 void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
1915                                             unsigned int flimit,
1916                                             unsigned int limit,
1917                                             unsigned int thresh, int count) {
1918   uint32_t mask, hev;
1919   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1920   unsigned char *s1, *s2, *s3, *s4;
1921   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1922   (void)count;
1923 
1924   mask = 0;
1925   hev = 0;
1926   pm1 = 0;
1927   p0 = 0;
1928   p1 = 0;
1929   p2 = 0;
1930   p3 = 0;
1931   p4 = 0;
1932   p5 = 0;
1933   p6 = 0;
1934 
1935   /* loop filter designed to work using chars so that we can make maximum use
1936    * of 8-bit SIMD instructions.
1937    */
1938 
1939   /* apply filter on 4 pixels at the same time */
1940 
1941   s1 = s;
1942   s2 = s + p;
1943   s3 = s2 + p;
1944   s4 = s3 + p;
1945 
1946   /* prefetch data for load */
1947   prefetch_load_lf(s + 2 * p);
1948 
1949   /* load quad-byte vectors
1950    * memory is 4 byte aligned
1951    */
1952   p2 = *((uint32_t *)(s1 - 4));
1953   p6 = *((uint32_t *)(s1));
1954   p1 = *((uint32_t *)(s2 - 4));
1955   p5 = *((uint32_t *)(s2));
1956   p0 = *((uint32_t *)(s3 - 4));
1957   p4 = *((uint32_t *)(s3));
1958   pm1 = *((uint32_t *)(s4 - 4));
1959   p3 = *((uint32_t *)(s4));
1960 
1961   /* transpose pm1, p0, p1, p2 */
1962   __asm__ __volatile__(
1963       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1964       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1965       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1966       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1967 
1968       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1969       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1970       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1971       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1972 
1973       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1974       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1975       "append         %[p1],      %[sec3],    16          \n\t"
1976       "append         %[pm1],     %[sec4],    16          \n\t"
1977 
1978       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1979         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1980         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1981       :);
1982 
1983   /* transpose p3, p4, p5, p6 */
1984   __asm__ __volatile__(
1985       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1986       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1987       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1988       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1989 
1990       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1991       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1992       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1993       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1994 
1995       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1996       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1997       "append         %[p5],      %[sec3],    16          \n\t"
1998       "append         %[p3],      %[sec4],    16          \n\t"
1999 
2000       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
2001         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
2002         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
2003       :);
2004 
2005   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2006    * mask will be zero and filtering is not needed
2007    */
2008   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
2009     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2010                              thresh, &hev, &mask);
2011 
2012     /* if mask == 0, filtering is not needed */
2013     if (mask) {
2014       /* filtering */
2015       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2016 
2017       /* don't use transpose on output data
2018        * because memory isn't aligned
2019        */
2020       __asm__ __volatile__(
2021           "sb         %[p5],  2(%[s4])        \n\t"
2022           "sb         %[p4],  1(%[s4])        \n\t"
2023           "sb         %[p3],  0(%[s4])        \n\t"
2024           "sb         %[p2], -1(%[s4])        \n\t"
2025           "sb         %[p1], -2(%[s4])        \n\t"
2026           "sb         %[p0], -3(%[s4])        \n\t"
2027           :
2028           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
2029             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2030 
2031       __asm__ __volatile__(
2032           "srl        %[p5], %[p5], 8         \n\t"
2033           "srl        %[p4], %[p4], 8         \n\t"
2034           "srl        %[p3], %[p3], 8         \n\t"
2035           "srl        %[p2], %[p2], 8         \n\t"
2036           "srl        %[p1], %[p1], 8         \n\t"
2037           "srl        %[p0], %[p0], 8         \n\t"
2038           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2039             [p1] "+r"(p1), [p0] "+r"(p0)
2040           :);
2041 
2042       __asm__ __volatile__(
2043           "sb         %[p5],  2(%[s3])        \n\t"
2044           "sb         %[p4],  1(%[s3])        \n\t"
2045           "sb         %[p3],  0(%[s3])        \n\t"
2046           "sb         %[p2], -1(%[s3])        \n\t"
2047           "sb         %[p1], -2(%[s3])        \n\t"
2048           "sb         %[p0], -3(%[s3])        \n\t"
2049           :
2050           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
2051             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2052 
2053       __asm__ __volatile__(
2054           "srl        %[p5], %[p5], 8         \n\t"
2055           "srl        %[p4], %[p4], 8         \n\t"
2056           "srl        %[p3], %[p3], 8         \n\t"
2057           "srl        %[p2], %[p2], 8         \n\t"
2058           "srl        %[p1], %[p1], 8         \n\t"
2059           "srl        %[p0], %[p0], 8         \n\t"
2060           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2061             [p1] "+r"(p1), [p0] "+r"(p0)
2062           :);
2063 
2064       __asm__ __volatile__(
2065           "sb         %[p5],  2(%[s2])        \n\t"
2066           "sb         %[p4],  1(%[s2])        \n\t"
2067           "sb         %[p3],  0(%[s2])        \n\t"
2068           "sb         %[p2], -1(%[s2])        \n\t"
2069           "sb         %[p1], -2(%[s2])        \n\t"
2070           "sb         %[p0], -3(%[s2])        \n\t"
2071           :
2072           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
2073             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2074 
2075       __asm__ __volatile__(
2076           "srl        %[p5], %[p5], 8         \n\t"
2077           "srl        %[p4], %[p4], 8         \n\t"
2078           "srl        %[p3], %[p3], 8         \n\t"
2079           "srl        %[p2], %[p2], 8         \n\t"
2080           "srl        %[p1], %[p1], 8         \n\t"
2081           "srl        %[p0], %[p0], 8         \n\t"
2082           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2083             [p1] "+r"(p1), [p0] "+r"(p0)
2084           :);
2085 
2086       __asm__ __volatile__(
2087           "sb         %[p5],  2(%[s1])        \n\t"
2088           "sb         %[p4],  1(%[s1])        \n\t"
2089           "sb         %[p3],  0(%[s1])        \n\t"
2090           "sb         %[p2], -1(%[s1])        \n\t"
2091           "sb         %[p1], -2(%[s1])        \n\t"
2092           "sb         %[p0], -3(%[s1])        \n\t"
2093           :
2094           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
2095             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2096     }
2097   }
2098 
2099   s1 = s4 + p;
2100   s2 = s1 + p;
2101   s3 = s2 + p;
2102   s4 = s3 + p;
2103 
2104   /* load quad-byte vectors
2105    * memory is 4 byte aligned
2106    */
2107   p2 = *((uint32_t *)(s1 - 4));
2108   p6 = *((uint32_t *)(s1));
2109   p1 = *((uint32_t *)(s2 - 4));
2110   p5 = *((uint32_t *)(s2));
2111   p0 = *((uint32_t *)(s3 - 4));
2112   p4 = *((uint32_t *)(s3));
2113   pm1 = *((uint32_t *)(s4 - 4));
2114   p3 = *((uint32_t *)(s4));
2115 
2116   /* transpose pm1, p0, p1, p2 */
2117   __asm__ __volatile__(
2118       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
2119       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
2120       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
2121       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
2122 
2123       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
2124       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
2125       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
2126       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
2127 
2128       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
2129       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
2130       "append         %[p1],      %[sec3],    16          \n\t"
2131       "append         %[pm1],     %[sec4],    16          \n\t"
2132 
2133       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
2134         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
2135         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
2136       :);
2137 
2138   /* transpose p3, p4, p5, p6 */
2139   __asm__ __volatile__(
2140       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
2141       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
2142       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
2143       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
2144 
2145       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
2146       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
2147       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
2148       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
2149 
2150       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
2151       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
2152       "append         %[p5],      %[sec3],    16          \n\t"
2153       "append         %[p3],      %[sec4],    16          \n\t"
2154 
2155       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
2156         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
2157         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
2158       :);
2159 
2160   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2161    * mask will be zero and filtering is not needed
2162    */
2163   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
2164     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2165                              thresh, &hev, &mask);
2166 
2167     /* if mask == 0, filtering is not needed */
2168     if (mask) {
2169       /* filtering */
2170       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2171 
2172       /* don't use transpose on output data
2173        * because memory isn't aligned
2174        */
2175       __asm__ __volatile__(
2176           "sb         %[p5],  2(%[s4])        \n\t"
2177           "sb         %[p4],  1(%[s4])        \n\t"
2178           "sb         %[p3],  0(%[s4])        \n\t"
2179           "sb         %[p2], -1(%[s4])        \n\t"
2180           "sb         %[p1], -2(%[s4])        \n\t"
2181           "sb         %[p0], -3(%[s4])        \n\t"
2182           :
2183           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
2184             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2185 
2186       __asm__ __volatile__(
2187           "srl        %[p5], %[p5], 8         \n\t"
2188           "srl        %[p4], %[p4], 8         \n\t"
2189           "srl        %[p3], %[p3], 8         \n\t"
2190           "srl        %[p2], %[p2], 8         \n\t"
2191           "srl        %[p1], %[p1], 8         \n\t"
2192           "srl        %[p0], %[p0], 8         \n\t"
2193           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2194             [p1] "+r"(p1), [p0] "+r"(p0)
2195           :);
2196 
2197       __asm__ __volatile__(
2198           "sb         %[p5],  2(%[s3])        \n\t"
2199           "sb         %[p4],  1(%[s3])        \n\t"
2200           "sb         %[p3],  0(%[s3])        \n\t"
2201           "sb         %[p2], -1(%[s3])        \n\t"
2202           "sb         %[p1], -2(%[s3])        \n\t"
2203           "sb         %[p0], -3(%[s3])        \n\t"
2204           :
2205           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
2206             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2207 
2208       __asm__ __volatile__(
2209           "srl        %[p5], %[p5], 8         \n\t"
2210           "srl        %[p4], %[p4], 8         \n\t"
2211           "srl        %[p3], %[p3], 8         \n\t"
2212           "srl        %[p2], %[p2], 8         \n\t"
2213           "srl        %[p1], %[p1], 8         \n\t"
2214           "srl        %[p0], %[p0], 8         \n\t"
2215           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2216             [p1] "+r"(p1), [p0] "+r"(p0)
2217           :);
2218 
2219       __asm__ __volatile__(
2220           "sb         %[p5],  2(%[s2])        \n\t"
2221           "sb         %[p4],  1(%[s2])        \n\t"
2222           "sb         %[p3],  0(%[s2])        \n\t"
2223           "sb         %[p2], -1(%[s2])        \n\t"
2224           "sb         %[p1], -2(%[s2])        \n\t"
2225           "sb         %[p0], -3(%[s2])        \n\t"
2226           :
2227           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
2228             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2229 
2230       __asm__ __volatile__(
2231           "srl        %[p5], %[p5], 8         \n\t"
2232           "srl        %[p4], %[p4], 8         \n\t"
2233           "srl        %[p3], %[p3], 8         \n\t"
2234           "srl        %[p2], %[p2], 8         \n\t"
2235           "srl        %[p1], %[p1], 8         \n\t"
2236           "srl        %[p0], %[p0], 8         \n\t"
2237           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2238             [p1] "+r"(p1), [p0] "+r"(p0)
2239           :);
2240 
2241       __asm__ __volatile__(
2242           "sb         %[p5],  2(%[s1])        \n\t"
2243           "sb         %[p4],  1(%[s1])        \n\t"
2244           "sb         %[p3],  0(%[s1])        \n\t"
2245           "sb         %[p2], -1(%[s1])        \n\t"
2246           "sb         %[p1], -2(%[s1])        \n\t"
2247           "sb         %[p0], -3(%[s1])        \n\t"
2248           :
2249           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
2250             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2251     }
2252   }
2253 }
2254 
2255 /* Horizontal MB filtering */
2256 void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2257                                unsigned char *v_ptr, int y_stride,
2258                                int uv_stride, loop_filter_info *lfi) {
2259   unsigned int thresh_vec, flimit_vec, limit_vec;
2260   unsigned char thresh, flimit, limit, flimit_temp;
2261 
2262   /* use direct values instead of pointers */
2263   limit = *(lfi->lim);
2264   flimit_temp = *(lfi->mblim);
2265   thresh = *(lfi->hev_thr);
2266   flimit = flimit_temp;
2267 
2268   /* create quad-byte */
2269   __asm__ __volatile__(
2270       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2271       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2272       "replv.qb       %[limit_vec],  %[limit]     \n\t"
2273       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2274         [limit_vec] "=r"(limit_vec)
2275       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
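  /* thresh, flimit and limit are now replicated into all four byte lanes, so
   * the packed edge filters below test four pixels per operation
   */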
2276 
2277   vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
2278                                          thresh_vec, 16);
2279 
2280   if (u_ptr) {
2281     vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec,
2282                                              limit_vec, thresh_vec, 0);
2283   }
2284 
2285   if (v_ptr) {
2286     vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec,
2287                                              limit_vec, thresh_vec, 0);
2288   }
2289 }
2290 
2291 /* Vertical MB Filtering */
2292 void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2293                                unsigned char *v_ptr, int y_stride,
2294                                int uv_stride, loop_filter_info *lfi) {
2295   unsigned int thresh_vec, flimit_vec, limit_vec;
2296   unsigned char thresh, flimit, limit, flimit_temp;
2297 
2298   /* use direct values instead of pointers */
2299   limit = *(lfi->lim);
2300   flimit_temp = *(lfi->mblim);
2301   thresh = *(lfi->hev_thr);
2302   flimit = flimit_temp;
2303 
2304   /* create quad-byte */
2305   __asm__ __volatile__(
2306       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2307       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2308       "replv.qb       %[limit_vec],  %[limit]     \n\t"
2309       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2310         [limit_vec] "=r"(limit_vec)
2311       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2312 
2313   vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
2314                                        thresh_vec, 16);
2315 
2316   if (u_ptr)
2317     vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec,
2318                                            limit_vec, thresh_vec, 0);
2319 
2320   if (v_ptr)
2321     vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec,
2322                                            limit_vec, thresh_vec, 0);
2323 }
2324 
2325 /* Horizontal B Filtering */
2326 void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2327                               unsigned char *v_ptr, int y_stride, int uv_stride,
2328                               loop_filter_info *lfi) {
2329   unsigned int thresh_vec, flimit_vec, limit_vec;
2330   unsigned char thresh, flimit, limit, flimit_temp;
2331 
2332   /* use direct values instead of pointers */
2333   limit = *(lfi->lim);
2334   flimit_temp = *(lfi->blim);
2335   thresh = *(lfi->hev_thr);
2336   flimit = flimit_temp;
2337 
2338   /* create quad-byte */
2339   __asm__ __volatile__(
2340       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2341       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2342       "replv.qb       %[limit_vec],  %[limit]     \n\t"
2343       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2344         [limit_vec] "=r"(limit_vec)
2345       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2346 
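  /* filter the three inner horizontal block edges of the macroblock: rows 4, 8
   * and 12 of the luma plane and row 4 of each chroma plane
   */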
2347   vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride,
2348                                        flimit_vec, limit_vec, thresh_vec, 16);
2349   vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride,
2350                                        flimit_vec, limit_vec, thresh_vec, 16);
2351   vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride,
2352                                        flimit_vec, limit_vec, thresh_vec, 16);
2353 
2354   if (u_ptr)
2355     vp8_loop_filter_uvhorizontal_edge_mips(
2356         u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2357 
2358   if (v_ptr)
2359     vp8_loop_filter_uvhorizontal_edge_mips(
2360         v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2361 }
2362 
2363 /* Vertical B Filtering */
2364 void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2365                               unsigned char *v_ptr, int y_stride, int uv_stride,
2366                               loop_filter_info *lfi) {
2367   unsigned int thresh_vec, flimit_vec, limit_vec;
2368   unsigned char thresh, flimit, limit, flimit_temp;
2369 
2370   /* use direct values instead of pointers */
2371   limit = *(lfi->lim);
2372   flimit_temp = *(lfi->blim);
2373   thresh = *(lfi->hev_thr);
2374   flimit = flimit_temp;
2375 
2376   /* create quad-byte */
2377   __asm__ __volatile__(
2378       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2379       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2380       "replv.qb       %[limit_vec],  %[limit]     \n\t"
2381       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2382         [limit_vec] "=r"(limit_vec)
2383       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2384 
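  /* filter the three inner vertical block edges of the macroblock: columns 4, 8
   * and 12 of the luma plane and column 4 of each chroma plane
   */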
2385   vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec,
2386                                      thresh_vec, 16);
2387   vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec,
2388                                      thresh_vec, 16);
2389   vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec,
2390                                      limit_vec, thresh_vec, 16);
2391 
2392   if (u_ptr)
2393     vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec,
2394                                          limit_vec, thresh_vec, 0);
2395 
2396   if (v_ptr)
2397     vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec,
2398                                          limit_vec, thresh_vec, 0);
2399 }
2400 
2401 #endif
2402