1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
13 #define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
14 
15 #include <stdlib.h>
16 
17 #include "config/aom_dsp_rtcd.h"
18 
19 #include "aom/aom_integer.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_ports/mem.h"
22 
23 #ifdef __cplusplus
24 extern "C" {
25 #endif
26 
27 #if HAVE_DSPR2
28 /* inputs & outputs are quad-byte vectors */
/*
 * filter_dspr2: filters the four packed words *ps1, *ps0, *qs0, *qs1 in place.
 *
 *   mask, hev          - per-byte select words; after being split into lanes
 *                        they are ANDed with the filter value.
 *                        NOTE(review): presumably every byte is 0x00 or 0xFF;
 *                        confirm against the callers.
 *   ps1, ps0, qs0, qs1 - in/out. Each points to one 32-bit word holding four
 *                        packed bytes; the word is read, filtered and written
 *                        back through the same pointer.
 *
 * Every word is XORed with 0x80808080 on entry and on exit (flips the top bit
 * of each byte; together with the aom_signed_char_clamp notes below this looks
 * like an unsigned <-> signed byte conversion).
 *
 * Each word is split into two "halfword pair" words so that the paired
 * halfword (.ph) instructions handle one byte per 16-bit lane, with the byte
 * sitting in the upper 8 bits of its lane:
 *   x_l = x & 0xFF00FF00          bytes 3 and 1
 *   x_r = (x << 8) & 0xFF00FF00   bytes 2 and 0
 * The lanes are merged back into four bytes at the end.
 */
filter_dspr2(uint32_t mask,uint32_t hev,uint32_t * ps1,uint32_t * ps0,uint32_t * qs0,uint32_t * qs1)29 static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
30                                 uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
31   int32_t aom_filter_l, aom_filter_r;
32   int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
33   int32_t subr_r, subr_l;
34   uint32_t t1, t2, HWM, t3;
35   uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
36   int32_t vps1, vps0, vqs0, vqs1;
37   int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
38   uint32_t N128;
39 
  /* N128 flips the top bit of every byte. t1, t2 and t3 hold 3, 4 and 1 in
     the upper byte of each halfword, i.e. in the lane layout used below.
     HWM (halfword mask) selects the upper byte of each halfword. */
40   N128 = 0x80808080;
41   t1 = 0x03000300;
42   t2 = 0x04000400;
43   t3 = 0x01000100;
44   HWM = 0xFF00FF00;
45 
46   vps0 = (*ps0) ^ N128;
47   vps1 = (*ps1) ^ N128;
48   vqs0 = (*qs0) ^ N128;
49   vqs1 = (*qs1) ^ N128;
50 
51   /* use halfword pairs instead of quad-bytes because of accuracy */
52   vps0_l = vps0 & HWM;
53   vps0_r = vps0 << 8;
54   vps0_r = vps0_r & HWM;
55 
56   vps1_l = vps1 & HWM;
57   vps1_r = vps1 << 8;
58   vps1_r = vps1_r & HWM;
59 
60   vqs0_l = vqs0 & HWM;
61   vqs0_r = vqs0 << 8;
62   vqs0_r = vqs0_r & HWM;
63 
64   vqs1_l = vqs1 & HWM;
65   vqs1_r = vqs1 << 8;
66   vqs1_r = vqs1_r & HWM;
67 
68   mask_l = mask & HWM;
69   mask_r = mask << 8;
70   mask_r = mask_r & HWM;
71 
72   hev_l = hev & HWM;
73   hev_r = hev << 8;
74   hev_r = hev_r & HWM;
75 
  /* Stage 1: build the masked filter value in both lane sets and, in between
     the adds, derive invhev = hev ^ HWM (the complement of hev within the
     upper byte of each halfword) for use in stage 3. */
76   __asm__ __volatile__(
77       /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
78       "subq_s.ph    %[aom_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
79       "subq_s.ph    %[aom_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
80 
81       /* qs0 - ps0 */
82       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
83       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
84 
85       /* aom_filter &= hev; */
86       "and          %[aom_filter_l], %[aom_filter_l], %[hev_l]        \n\t"
87       "and          %[aom_filter_r], %[aom_filter_r], %[hev_r]        \n\t"
88 
89       /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0));
         done as three successive saturating adds of (qs0 - ps0) */
90       "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
91       "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
92       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
93       "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
94       "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
95       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
96       "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
97       "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
98 
99       /* aom_filter &= mask; */
100       "and          %[aom_filter_l], %[aom_filter_l], %[mask_l]       \n\t"
101       "and          %[aom_filter_r], %[aom_filter_r], %[mask_r]       \n\t"
102 
      /* outputs are early-clobber ("=&r"): they are written while inputs
         are still being read */
103       : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
104         [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
105         [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
106       : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
107         [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
108         [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
109         [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
110         [HWM] "r"(HWM));
111 
112   /* save bottom 3 bits so that we round one side +4 and the other +3 */
113   __asm__ __volatile__(
114       /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3;
         (t2 is the +4 constant; the shift follows below) */
115       "addq_s.ph    %[Filter1_l],    %[aom_filter_l], %[t2]           \n\t"
116       "addq_s.ph    %[Filter1_r],    %[aom_filter_r], %[t2]           \n\t"
117 
118       /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3;
         (t1 is the +3 constant) */
119       "addq_s.ph    %[Filter2_l],    %[aom_filter_l], %[t1]           \n\t"
120       "addq_s.ph    %[Filter2_r],    %[aom_filter_r], %[t1]           \n\t"
121       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
122       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
123 
124       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
125       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
126 
      /* drop the bits that the shift moved into the low byte of each
         halfword; Filter1 is reused by the next asm block. Filter2 is
         left unmasked here. */
127       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
128       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
129 
130       /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
131       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
132       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
133 
134       /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
135       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
136       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
137 
138       : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
139         [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
140         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
141         [vqs0_r] "+r"(vqs0_r)
142       : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
143         [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
144 
  /* Stage 3: derive the outer-tap adjustment from Filter1 and apply it to
     ps1 / qs1 only where hev is clear. */
145   __asm__ __volatile__(
146       /* (aom_filter += 1) >>= 1; operates on Filter1, t3 is the +1 constant */
147       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
148       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
149 
150       /* aom_filter &= ~hev; */
151       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
152       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
153 
154       /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
155       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
156       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
157 
158       /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
159       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
160       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
161 
162       : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
163         [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
164         [vqs1_r] "+r"(vqs1_r)
165       : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
166 
167   /* Create quad-bytes from halfword pairs: keep the upper byte of each
     halfword in the _l words, shift the _r words back down by 8 within each
     halfword, then OR the two together. */
168   vqs0_l = vqs0_l & HWM;
169   vqs1_l = vqs1_l & HWM;
170   vps0_l = vps0_l & HWM;
171   vps1_l = vps1_l & HWM;
172 
173   __asm__ __volatile__(
174       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
175       "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
176       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
177       "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
178 
179       : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
180         [vqs0_r] "+r"(vqs0_r)
181       :);
182 
183   vqs0 = vqs0_l | vqs0_r;
184   vqs1 = vqs1_l | vqs1_r;
185   vps0 = vps0_l | vps0_r;
186   vps1 = vps1_l | vps1_r;
187 
  /* undo the entry XOR and store the results through the in/out pointers */
188   *ps0 = vps0 ^ N128;
189   *ps1 = vps1 ^ N128;
190   *qs0 = vqs0 ^ N128;
191   *qs1 = vqs1 ^ N128;
192 }
193 
/*
 * filter1_dspr2: same arithmetic as filter_dspr2, but the four packed words
 * are passed by value and the results go to separate output pointers.
 *
 *   mask, hev          - per-byte select words; after being split into lanes
 *                        they are ANDed with the filter value.
 *                        NOTE(review): presumably every byte is 0x00 or 0xFF;
 *                        confirm against the callers.
 *   ps1, ps0, qs0, qs1 - inputs, four packed bytes per 32-bit word.
 *   p1_f0, p0_f0,
 *   q0_f0, q1_f0       - outputs; receive the filtered ps1, ps0, qs0 and qs1
 *                        words respectively.
 *
 * Every input word is XORed with 0x80808080 on entry and every result is
 * XORed with it again before being stored (flips the top bit of each byte).
 *
 * Each word is split into two "halfword pair" words so that the paired
 * halfword (.ph) instructions handle one byte per 16-bit lane, with the byte
 * sitting in the upper 8 bits of its lane:
 *   x_l = x & 0xFF00FF00          bytes 3 and 1
 *   x_r = (x << 8) & 0xFF00FF00   bytes 2 and 0
 */
filter1_dspr2(uint32_t mask,uint32_t hev,uint32_t ps1,uint32_t ps0,uint32_t qs0,uint32_t qs1,uint32_t * p1_f0,uint32_t * p0_f0,uint32_t * q0_f0,uint32_t * q1_f0)194 static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
195                                  uint32_t ps0, uint32_t qs0, uint32_t qs1,
196                                  uint32_t *p1_f0, uint32_t *p0_f0,
197                                  uint32_t *q0_f0, uint32_t *q1_f0) {
198   int32_t aom_filter_l, aom_filter_r;
199   int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
200   int32_t subr_r, subr_l;
201   uint32_t t1, t2, HWM, t3;
202   uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
203   int32_t vps1, vps0, vqs0, vqs1;
204   int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
205   uint32_t N128;
206 
  /* N128 flips the top bit of every byte. t1, t2 and t3 hold 3, 4 and 1 in
     the upper byte of each halfword, i.e. in the lane layout used below.
     HWM (halfword mask) selects the upper byte of each halfword. */
207   N128 = 0x80808080;
208   t1 = 0x03000300;
209   t2 = 0x04000400;
210   t3 = 0x01000100;
211   HWM = 0xFF00FF00;
212 
213   vps0 = (ps0) ^ N128;
214   vps1 = (ps1) ^ N128;
215   vqs0 = (qs0) ^ N128;
216   vqs1 = (qs1) ^ N128;
217 
218   /* use halfword pairs instead of quad-bytes because of accuracy */
219   vps0_l = vps0 & HWM;
220   vps0_r = vps0 << 8;
221   vps0_r = vps0_r & HWM;
222 
223   vps1_l = vps1 & HWM;
224   vps1_r = vps1 << 8;
225   vps1_r = vps1_r & HWM;
226 
227   vqs0_l = vqs0 & HWM;
228   vqs0_r = vqs0 << 8;
229   vqs0_r = vqs0_r & HWM;
230 
231   vqs1_l = vqs1 & HWM;
232   vqs1_r = vqs1 << 8;
233   vqs1_r = vqs1_r & HWM;
234 
235   mask_l = mask & HWM;
236   mask_r = mask << 8;
237   mask_r = mask_r & HWM;
238 
239   hev_l = hev & HWM;
240   hev_r = hev << 8;
241   hev_r = hev_r & HWM;
242 
  /* Stage 1: build the masked filter value in both lane sets and, in between
     the adds, derive invhev = hev ^ HWM (the complement of hev within the
     upper byte of each halfword) for use in stage 3. */
243   __asm__ __volatile__(
244       /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
245       "subq_s.ph    %[aom_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
246       "subq_s.ph    %[aom_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
247 
248       /* qs0 - ps0 */
249       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
250       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
251 
252       /* aom_filter &= hev; */
253       "and          %[aom_filter_l], %[aom_filter_l], %[hev_l]        \n\t"
254       "and          %[aom_filter_r], %[aom_filter_r], %[hev_r]        \n\t"
255 
256       /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0));
         done as three successive saturating adds of (qs0 - ps0) */
257       "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
258       "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
259       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
260       "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
261       "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
262       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
263       "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
264       "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
265 
266       /* aom_filter &= mask; */
267       "and          %[aom_filter_l], %[aom_filter_l], %[mask_l]       \n\t"
268       "and          %[aom_filter_r], %[aom_filter_r], %[mask_r]       \n\t"
269 
      /* outputs are early-clobber ("=&r"): they are written while inputs
         are still being read */
270       : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
271         [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
272         [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
273       : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
274         [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
275         [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
276         [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
277         [HWM] "r"(HWM));
278 
279   /* save bottom 3 bits so that we round one side +4 and the other +3 */
280   __asm__ __volatile__(
281       /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3;
         (t2 is the +4 constant; the shift follows below) */
282       "addq_s.ph    %[Filter1_l],    %[aom_filter_l], %[t2]           \n\t"
283       "addq_s.ph    %[Filter1_r],    %[aom_filter_r], %[t2]           \n\t"
284 
285       /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3;
         (t1 is the +3 constant) */
286       "addq_s.ph    %[Filter2_l],    %[aom_filter_l], %[t1]           \n\t"
287       "addq_s.ph    %[Filter2_r],    %[aom_filter_r], %[t1]           \n\t"
288       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
289       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
290 
291       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
292       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
293 
      /* drop the bits that the shift moved into the low byte of each
         halfword; Filter1 is reused by the next asm block. Filter2 is
         left unmasked here. */
294       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
295       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
296 
297       /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
298       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
299       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
300 
301       /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
302       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
303       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
304 
305       : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
306         [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
307         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
308         [vqs0_r] "+r"(vqs0_r)
309       : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
310         [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
311 
  /* Stage 3: derive the outer-tap adjustment from Filter1 and apply it to
     ps1 / qs1 only where hev is clear. */
312   __asm__ __volatile__(
313       /* (aom_filter += 1) >>= 1; operates on Filter1, t3 is the +1 constant */
314       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
315       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
316 
317       /* aom_filter &= ~hev; */
318       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
319       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
320 
321       /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
322       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
323       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
324 
325       /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
326       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
327       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
328 
329       : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
330         [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
331         [vqs1_r] "+r"(vqs1_r)
332       : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
333 
334   /* Create quad-bytes from halfword pairs: keep the upper byte of each
     halfword in the _l words, shift the _r words back down by 8 within each
     halfword, then OR the two together. */
335   vqs0_l = vqs0_l & HWM;
336   vqs1_l = vqs1_l & HWM;
337   vps0_l = vps0_l & HWM;
338   vps1_l = vps1_l & HWM;
339 
340   __asm__ __volatile__(
341       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
342       "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
343       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
344       "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
345 
346       : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
347         [vqs0_r] "+r"(vqs0_r)
348       :);
349 
350   vqs0 = vqs0_l | vqs0_r;
351   vqs1 = vqs1_l | vqs1_r;
352   vps0 = vps0_l | vps0_r;
353   vps1 = vps1_l | vps1_r;
354 
  /* undo the entry XOR and store the results through the output pointers */
355   *p0_f0 = vps0 ^ N128;
356   *p1_f0 = vps1 ^ N128;
357   *q0_f0 = vqs0 ^ N128;
358   *q1_f0 = vqs1 ^ N128;
359 }
360 
/*
 * mbfilter_dspr2: computes the six flat-line filter outputs listed below from
 * the eight words read through op3..oq3, and stores them in place.
 *
 *   op3, oq3            - read only; *op3 and *oq3 are never written.
 *   op2, op1, op0,
 *   oq0, oq1, oq2       - in/out; read on entry, overwritten with the results.
 *
 * All inputs are loaded into locals before the assembly runs, so every output
 * is computed from the original values. Each 32-bit word is processed as two
 * independent 16-bit lanes by the paired halfword (.ph) instructions.
 * NOTE(review): presumably each lane holds one unsigned pixel value small
 * enough that the 8-term sum cannot overflow 16 bits; confirm against callers.
 */
mbfilter_dspr2(uint32_t * op3,uint32_t * op2,uint32_t * op1,uint32_t * op0,uint32_t * oq0,uint32_t * oq1,uint32_t * oq2,uint32_t * oq3)361 static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
362                                   uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
363                                   uint32_t *oq2, uint32_t *oq3) {
364   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
365   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
366   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
367   uint32_t res_op2, res_op1, res_op0;
368   uint32_t res_oq0, res_oq1, res_oq2;
369   uint32_t tmp;
370   uint32_t add_p210_q012;
  /* rounding term: 4 in each halfword lane, added before the >> 3 */
371   uint32_t u32Four = 0x00040004;
372 
373   /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
374   /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
375   /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
376   /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
377   /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
378   /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */
379 
380   __asm__ __volatile__(
      /* common term shared by all six outputs:
         add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 */
381       "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
382       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
383       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
384       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
385       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
386       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"
387 
      /* op2 and op1, interleaved:
           res_op2 = (3 * p3 + p2 + common - q1 - q2) >> 3
           res_op1 = (2 * p3 + p1 + common - q2) >> 3 */
388       "shll.ph    %[tmp],            %[p3],             1                \n\t"
389       "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
390       "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
391       "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
392       "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
393       "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
394       "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
395       "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
396       "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
397       "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
398       "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
399       "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
      /* op0, oq0, oq1 and oq2, interleaved:
           res_op0 = (p3 + p0 + common) >> 3
           res_oq0 = (q0 + q3 + common) >> 3
           res_oq1 = (2 * q3 + q1 + common - p2) >> 3
           res_oq2 = (3 * q3 + q2 + common - p2 - p1) >> 3 */
400       "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
401       "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
402       "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
403       "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
404       "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
405       "shll.ph    %[tmp],            %[q3],             1                \n\t"
406       "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
407       "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
408       "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
409       "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
410       "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
411       "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
412       "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
413       "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
414       "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
415       "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
416       "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
417       "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"
418 
      /* outputs are early-clobber ("=&r"): they are written while inputs
         are still being read */
419       : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
420         [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
421         [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
422         [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
423       : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
424         [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
425 
426   *op2 = res_op2;
427   *op1 = res_op1;
428   *op0 = res_op0;
429   *oq0 = res_oq0;
430   *oq1 = res_oq1;
431   *oq2 = res_oq2;
432 }
433 
/*
 * mbfilter1_dspr2: same arithmetic as mbfilter_dspr2, but the eight input
 * words are passed by value and the six results go to separate outputs.
 *
 *   p3..p0, q0..q3      - inputs.
 *   op2_f1, op1_f1,
 *   op0_f1, oq0_f1,
 *   oq1_f1, oq2_f1      - outputs; receive the filtered values for positions
 *                         p2, p1, p0, q0, q1 and q2 respectively.
 *
 * Each 32-bit word is processed as two independent 16-bit lanes by the paired
 * halfword (.ph) instructions.
 * NOTE(review): presumably each lane holds one unsigned pixel value small
 * enough that the 8-term sum cannot overflow 16 bits; confirm against callers.
 */
mbfilter1_dspr2(uint32_t p3,uint32_t p2,uint32_t p1,uint32_t p0,uint32_t q0,uint32_t q1,uint32_t q2,uint32_t q3,uint32_t * op2_f1,uint32_t * op1_f1,uint32_t * op0_f1,uint32_t * oq0_f1,uint32_t * oq1_f1,uint32_t * oq2_f1)434 static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
435                                    uint32_t p0, uint32_t q0, uint32_t q1,
436                                    uint32_t q2, uint32_t q3, uint32_t *op2_f1,
437                                    uint32_t *op1_f1, uint32_t *op0_f1,
438                                    uint32_t *oq0_f1, uint32_t *oq1_f1,
439                                    uint32_t *oq2_f1) {
440   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
441   uint32_t res_op2, res_op1, res_op0;
442   uint32_t res_oq0, res_oq1, res_oq2;
443   uint32_t tmp;
444   uint32_t add_p210_q012;
  /* rounding term: 4 in each halfword lane, added before the >> 3 */
445   uint32_t u32Four = 0x00040004;
446 
447   /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
448   /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
449   /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
450   /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
451   /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
452   /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */
453 
454   __asm__ __volatile__(
      /* common term shared by all six outputs:
         add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 */
455       "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
456       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
457       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
458       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
459       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
460       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"
461 
      /* op2 and op1, interleaved:
           res_op2 = (3 * p3 + p2 + common - q1 - q2) >> 3
           res_op1 = (2 * p3 + p1 + common - q2) >> 3 */
462       "shll.ph    %[tmp],            %[p3],             1                 \n\t"
463       "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
464       "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
465       "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
466       "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
467       "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
468       "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
469       "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
470       "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
471       "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
472       "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
473       "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
      /* op0, oq0, oq1 and oq2, interleaved:
           res_op0 = (p3 + p0 + common) >> 3
           res_oq0 = (q0 + q3 + common) >> 3
           res_oq1 = (2 * q3 + q1 + common - p2) >> 3
           res_oq2 = (3 * q3 + q2 + common - p2 - p1) >> 3 */
474       "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
475       "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
476       "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
477       "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
478       "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
479       "shll.ph    %[tmp],            %[q3],             1                 \n\t"
480       "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
481       "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
482       "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
483       "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
484       "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
485       "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
486       "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
487       "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
488       "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
489       "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
490       "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
491       "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"
492 
      /* outputs are early-clobber ("=&r"): they are written while inputs
         are still being read */
493       : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
494         [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
495         [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
496         [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
497       : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
498         [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
499 
500   *op2_f1 = res_op2;
501   *op1_f1 = res_op1;
502   *op0_f1 = res_op0;
503   *oq0_f1 = res_oq0;
504   *oq1_f1 = res_oq1;
505   *oq2_f1 = res_oq2;
506 }
507 
wide_mbfilter_dspr2(uint32_t * op7,uint32_t * op6,uint32_t * op5,uint32_t * op4,uint32_t * op3,uint32_t * op2,uint32_t * op1,uint32_t * op0,uint32_t * oq0,uint32_t * oq1,uint32_t * oq2,uint32_t * oq3,uint32_t * oq4,uint32_t * oq5,uint32_t * oq6,uint32_t * oq7)508 static INLINE void wide_mbfilter_dspr2(
509     uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
510     uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
511     uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
512     uint32_t *oq7) {
513   const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
514   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
515   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
516   const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
517   uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
518   uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
519   uint32_t tmp;
520   uint32_t add_p6toq6;
521   uint32_t u32Eight = 0x00080008;
522 
523   __asm__ __volatile__(
524       /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
525          which is used most of the time */
526       "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
527       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
528       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
529       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
530       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
531       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
532       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
533       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
534       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
535       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
536       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
537       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
538       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
539       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
540 
541       : [add_p6toq6] "=&r"(add_p6toq6)
542       : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
543         [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
544         [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
545         [u32Eight] "r"(u32Eight));
546 
547   __asm__ __volatile__(
548       /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
549                                    p3 + p2 + p1 + p0 + q0, 4) */
550       "shll.ph       %[tmp],            %[p7],            3               \n\t"
551       "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
552       "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
553       "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
554       "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
555       "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
556       "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
557       "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
558       "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
559       "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
560       "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
561 
562       /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
563                                    p2 + p1 + p0 + q0 + q1, 4) */
564       "shll.ph       %[tmp],            %[p7],            2               \n\t"
565       "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
566       "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
567       "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
568       "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
569       "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
570       "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
571       "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
572       "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
573       "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
574       "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
575 
576       /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
577                                    p1 + p0 + q0 + q1 + q2, 4) */
578       "shll.ph       %[tmp],            %[p7],            2               \n\t"
579       "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
580       "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
581       "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
582       "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
583       "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
584       "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
585       "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
586       "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
587 
588       /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
589                                    p1 + p0 + q0 + q1 + q2 + q3, 4) */
590       "shll.ph       %[tmp],            %[p7],            2               \n\t"
591       "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
592       "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
593       "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
594       "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
595       "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
596       "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
597 
598       /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
599                                    p0 + q0 + q1 + q2 + q3 + q4, 4) */
600       "shll.ph       %[tmp],            %[p7],            1               \n\t"
601       "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
602       "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
603       "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
604       "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
605       "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
606       "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
607 
608       /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
609                                    p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
610       "shll.ph       %[tmp],            %[p7],            1               \n\t"
611       "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
612       "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
613       "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
614       "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
615 
616       /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
617                                   q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
618       "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
619       "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
620       "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
621 
622       : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
623         [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
624         [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
625         [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
626       : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
627         [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
628         [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
629         [add_p6toq6] "r"(add_p6toq6));
630 
631   *op6 = res_op6;
632   *op5 = res_op5;
633   *op4 = res_op4;
634   *op3 = res_op3;
635   *op2 = res_op2;
636   *op1 = res_op1;
637   *op0 = res_op0;
638 
639   __asm__ __volatile__(
640       /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
641                                    q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
642       "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
643       "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
644       "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
645 
646       /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
647                                    q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
648       "shll.ph       %[tmp],            %[q7],            1               \n\t"
649       "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
650       "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
651       "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
652       "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
653 
654       /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
655                                    q3 + q4 + q5 + q6 + q7 * 3, 4) */
656       "shll.ph       %[tmp],            %[q7],            1               \n\t"
657       "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
658       "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
659       "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
660       "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
661       "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
662       "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
663 
664       /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
665                                    q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
666       "shll.ph       %[tmp],            %[q7],            2               \n\t"
667       "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
668       "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
669       "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
670       "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
671       "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
672       "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
673 
674       /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
675                                    q4 * 2 + q5 + q6 + q7 * 5, 4) */
676       "shll.ph       %[tmp],            %[q7],            2               \n\t"
677       "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
678       "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
679       "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
680       "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
681       "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
682       "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
683       "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
684       "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
685 
686       /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
687                                    q5 * 2 + q6 + q7 * 6, 4) */
688       "shll.ph       %[tmp],            %[q7],            2               \n\t"
689       "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
690       "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
691       "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
692       "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
693       "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
694       "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
695       "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
696       "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
697       "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
698       "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
699 
700       /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
701                                    q4 + q5 + q6 * 2 + q7 * 7, 4) */
702       "shll.ph       %[tmp],            %[q7],            3               \n\t"
703       "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
704       "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
705       "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
706       "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
707       "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
708       "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
709       "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
710       "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
711       "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
712       "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
713 
714       : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
715         [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
716         [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
717         [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
718       : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
719         [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
720         [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
721         [add_p6toq6] "r"(add_p6toq6));
722 
723   *oq0 = res_oq0;
724   *oq1 = res_oq1;
725   *oq2 = res_oq2;
726   *oq3 = res_oq3;
727   *oq4 = res_oq4;
728   *oq5 = res_oq5;
729   *oq6 = res_oq6;
730 }
731 #endif  // #if HAVE_DSPR2
732 #ifdef __cplusplus
733 }  // extern "C"
734 #endif
735 
736 #endif  // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
737