1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
13 #define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
14
15 #include <stdlib.h>
16
17 #include "config/aom_dsp_rtcd.h"
18
19 #include "aom/aom_integer.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_ports/mem.h"
22
23 #ifdef __cplusplus
24 extern "C" {
25 #endif
26
27 #if HAVE_DSPR2
28 /* inputs & outputs are quad-byte vectors */
filter_dspr2(uint32_t mask,uint32_t hev,uint32_t * ps1,uint32_t * ps0,uint32_t * qs0,uint32_t * qs1)29 static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
30 uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
31 int32_t aom_filter_l, aom_filter_r;
32 int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
33 int32_t subr_r, subr_l;
34 uint32_t t1, t2, HWM, t3;
35 uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
36 int32_t vps1, vps0, vqs0, vqs1;
37 int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
38 uint32_t N128;
39
40 N128 = 0x80808080;
41 t1 = 0x03000300;
42 t2 = 0x04000400;
43 t3 = 0x01000100;
44 HWM = 0xFF00FF00;
45
46 vps0 = (*ps0) ^ N128;
47 vps1 = (*ps1) ^ N128;
48 vqs0 = (*qs0) ^ N128;
49 vqs1 = (*qs1) ^ N128;
50
51 /* use halfword pairs instead quad-bytes because of accuracy */
52 vps0_l = vps0 & HWM;
53 vps0_r = vps0 << 8;
54 vps0_r = vps0_r & HWM;
55
56 vps1_l = vps1 & HWM;
57 vps1_r = vps1 << 8;
58 vps1_r = vps1_r & HWM;
59
60 vqs0_l = vqs0 & HWM;
61 vqs0_r = vqs0 << 8;
62 vqs0_r = vqs0_r & HWM;
63
64 vqs1_l = vqs1 & HWM;
65 vqs1_r = vqs1 << 8;
66 vqs1_r = vqs1_r & HWM;
67
68 mask_l = mask & HWM;
69 mask_r = mask << 8;
70 mask_r = mask_r & HWM;
71
72 hev_l = hev & HWM;
73 hev_r = hev << 8;
74 hev_r = hev_r & HWM;
75
76 __asm__ __volatile__(
77 /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
78 "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
79 "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
80
81 /* qs0 - ps0 */
82 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
83 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
84
85 /* aom_filter &= hev; */
86 "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
87 "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
88
89 /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
90 "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
91 "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
92 "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
93 "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
94 "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
95 "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
96 "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
97 "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
98
99 /* aom_filter &= mask; */
100 "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
101 "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
102
103 : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
104 [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
105 [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
106 : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
107 [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
108 [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
109 [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
110 [HWM] "r"(HWM));
111
112 /* save bottom 3 bits so that we round one side +4 and the other +3 */
113 __asm__ __volatile__(
114 /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
115 "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
116 "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
117
118 /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
119 "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
120 "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
121 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
122 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
123
124 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
125 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
126
127 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
128 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
129
130 /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
131 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
132 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
133
134 /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
135 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
136 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
137
138 : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
139 [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
140 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
141 [vqs0_r] "+r"(vqs0_r)
142 : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
143 [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
144
145 __asm__ __volatile__(
146 /* (aom_filter += 1) >>= 1 */
147 "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
148 "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
149
150 /* aom_filter &= ~hev; */
151 "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
152 "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
153
154 /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
155 "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
156 "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
157
158 /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
159 "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
160 "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
161
162 : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
163 [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
164 [vqs1_r] "+r"(vqs1_r)
165 : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
166
167 /* Create quad-bytes from halfword pairs */
168 vqs0_l = vqs0_l & HWM;
169 vqs1_l = vqs1_l & HWM;
170 vps0_l = vps0_l & HWM;
171 vps1_l = vps1_l & HWM;
172
173 __asm__ __volatile__(
174 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
175 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
176 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
177 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
178
179 : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
180 [vqs0_r] "+r"(vqs0_r)
181 :);
182
183 vqs0 = vqs0_l | vqs0_r;
184 vqs1 = vqs1_l | vqs1_r;
185 vps0 = vps0_l | vps0_r;
186 vps1 = vps1_l | vps1_r;
187
188 *ps0 = vps0 ^ N128;
189 *ps1 = vps1 ^ N128;
190 *qs0 = vqs0 ^ N128;
191 *qs1 = vqs1 ^ N128;
192 }
193
filter1_dspr2(uint32_t mask,uint32_t hev,uint32_t ps1,uint32_t ps0,uint32_t qs0,uint32_t qs1,uint32_t * p1_f0,uint32_t * p0_f0,uint32_t * q0_f0,uint32_t * q1_f0)194 static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
195 uint32_t ps0, uint32_t qs0, uint32_t qs1,
196 uint32_t *p1_f0, uint32_t *p0_f0,
197 uint32_t *q0_f0, uint32_t *q1_f0) {
198 int32_t aom_filter_l, aom_filter_r;
199 int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
200 int32_t subr_r, subr_l;
201 uint32_t t1, t2, HWM, t3;
202 uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
203 int32_t vps1, vps0, vqs0, vqs1;
204 int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
205 uint32_t N128;
206
207 N128 = 0x80808080;
208 t1 = 0x03000300;
209 t2 = 0x04000400;
210 t3 = 0x01000100;
211 HWM = 0xFF00FF00;
212
213 vps0 = (ps0) ^ N128;
214 vps1 = (ps1) ^ N128;
215 vqs0 = (qs0) ^ N128;
216 vqs1 = (qs1) ^ N128;
217
218 /* use halfword pairs instead quad-bytes because of accuracy */
219 vps0_l = vps0 & HWM;
220 vps0_r = vps0 << 8;
221 vps0_r = vps0_r & HWM;
222
223 vps1_l = vps1 & HWM;
224 vps1_r = vps1 << 8;
225 vps1_r = vps1_r & HWM;
226
227 vqs0_l = vqs0 & HWM;
228 vqs0_r = vqs0 << 8;
229 vqs0_r = vqs0_r & HWM;
230
231 vqs1_l = vqs1 & HWM;
232 vqs1_r = vqs1 << 8;
233 vqs1_r = vqs1_r & HWM;
234
235 mask_l = mask & HWM;
236 mask_r = mask << 8;
237 mask_r = mask_r & HWM;
238
239 hev_l = hev & HWM;
240 hev_r = hev << 8;
241 hev_r = hev_r & HWM;
242
243 __asm__ __volatile__(
244 /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
245 "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
246 "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
247
248 /* qs0 - ps0 */
249 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
250 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
251
252 /* aom_filter &= hev; */
253 "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
254 "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
255
256 /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
257 "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
258 "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
259 "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
260 "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
261 "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
262 "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
263 "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
264 "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
265
266 /* aom_filter &= mask; */
267 "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
268 "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
269
270 : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
271 [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
272 [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
273 : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
274 [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
275 [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
276 [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
277 [HWM] "r"(HWM));
278
279 /* save bottom 3 bits so that we round one side +4 and the other +3 */
280 __asm__ __volatile__(
281 /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
282 "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
283 "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
284
285 /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
286 "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
287 "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
288 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
289 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
290
291 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
292 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
293
294 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
295 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
296
297 /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
298 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
299 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
300
301 /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
302 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
303 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
304
305 : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
306 [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
307 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
308 [vqs0_r] "+r"(vqs0_r)
309 : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
310 [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
311
312 __asm__ __volatile__(
313 /* (aom_filter += 1) >>= 1 */
314 "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
315 "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
316
317 /* aom_filter &= ~hev; */
318 "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
319 "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
320
321 /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
322 "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
323 "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
324
325 /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
326 "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
327 "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
328
329 : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
330 [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
331 [vqs1_r] "+r"(vqs1_r)
332 : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
333
334 /* Create quad-bytes from halfword pairs */
335 vqs0_l = vqs0_l & HWM;
336 vqs1_l = vqs1_l & HWM;
337 vps0_l = vps0_l & HWM;
338 vps1_l = vps1_l & HWM;
339
340 __asm__ __volatile__(
341 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
342 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
343 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
344 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
345
346 : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
347 [vqs0_r] "+r"(vqs0_r)
348 :);
349
350 vqs0 = vqs0_l | vqs0_r;
351 vqs1 = vqs1_l | vqs1_r;
352 vps0 = vps0_l | vps0_r;
353 vps1 = vps1_l | vps1_r;
354
355 *p0_f0 = vps0 ^ N128;
356 *p1_f0 = vps1 ^ N128;
357 *q0_f0 = vqs0 ^ N128;
358 *q1_f0 = vqs1 ^ N128;
359 }
360
/*
 * In-place flat filter over eight input words.
 *
 * Reads *op3..*op0 and *oq0..*oq3, then overwrites *op2, *op1, *op0, *oq0,
 * *oq1 and *oq2 with the filtered values. *op3 and *oq3 are read only.
 *
 * All arithmetic uses paired-halfword (.ph) instructions, so the two 16-bit
 * halves of every word are processed independently.
 * NOTE(review): presumably each halfword carries one zero-extended 8-bit
 * pixel, so the sums cannot overflow 16 bits -- confirm against callers.
 */
static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
                                  uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
                                  uint32_t *oq2, uint32_t *oq3) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  uint32_t res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2;
  uint32_t tmp;
  uint32_t add_p210_q012;
  /* rounding term: +4 in each halfword ahead of the shift right by 3 */
  uint32_t u32Four = 0x00040004;

  /* Per-halfword results computed by the assembly below:
   *   *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)
   *   *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)
   *   *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)
   *   *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)
   *   *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)
   *   *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)
   *
   * add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 is shared by all six
   * outputs; each output adds its extra terms and subtracts the terms of
   * the shared sum that its formula does not contain. The instruction
   * streams of the different outputs are interleaved.
   */

  __asm__ __volatile__(
      /* shared sum plus rounding term */
      "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"

      /* tmp = 2 * p3, so res_op2 starts as 3 * p3 and res_op1 as 2 * p3 */
      "shll.ph %[tmp], %[p3], 1 \n\t"
      "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
      "addu.ph %[res_op1], %[p3], %[p3] \n\t"
      "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
      "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
      "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
      "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
      "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
      "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
      "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
      "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
      "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
      "addu.ph %[res_op0], %[p3], %[p0] \n\t"
      "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
      "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
      "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
      /* res_oq1 starts as 2 * q3; tmp = 2 * q3 so res_oq2 starts as 3 * q3 */
      "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
      "shll.ph %[tmp], %[q3], 1 \n\t"
      "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
      "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
      "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
      "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
      "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
      "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
      "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
      "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
      "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
      "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
      "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
      "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"

      /* every output is early-clobber ("=&r"): each is written while input
         operands are still needed by later instructions */
      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));

  /* stores happen only after all inputs were copied into locals above */
  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;
  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
}
433
/*
 * Out-of-place flat filter over eight input words.
 *
 * Takes p3..p0 and q0..q3 by value and writes the six filtered words to
 * *op2_f1, *op1_f1, *op0_f1, *oq0_f1, *oq1_f1 and *oq2_f1. No output is
 * produced for p3 or q3.
 *
 * All arithmetic uses paired-halfword (.ph) instructions, so the two 16-bit
 * halves of every word are processed independently.
 * NOTE(review): presumably each halfword carries one zero-extended 8-bit
 * pixel, so the sums cannot overflow 16 bits -- confirm against callers.
 */
static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
                                   uint32_t p0, uint32_t q0, uint32_t q1,
                                   uint32_t q2, uint32_t q3, uint32_t *op2_f1,
                                   uint32_t *op1_f1, uint32_t *op0_f1,
                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
                                   uint32_t *oq2_f1) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  uint32_t res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2;
  uint32_t tmp;
  uint32_t add_p210_q012;
  /* rounding term: +4 in each halfword ahead of the shift right by 3 */
  uint32_t u32Four = 0x00040004;

  /* Per-halfword results computed by the assembly below:
   *   *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)
   *   *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)
   *   *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)
   *   *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)
   *   *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)
   *   *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)
   *
   * add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 is shared by all six
   * outputs; each output adds its extra terms and subtracts the terms of
   * the shared sum that its formula does not contain. The instruction
   * streams of the different outputs are interleaved.
   */

  __asm__ __volatile__(
      /* shared sum plus rounding term */
      "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
      "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"

      /* tmp = 2 * p3, so res_op2 starts as 3 * p3 and res_op1 as 2 * p3 */
      "shll.ph %[tmp], %[p3], 1 \n\t"
      "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
      "addu.ph %[res_op1], %[p3], %[p3] \n\t"
      "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
      "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
      "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
      "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
      "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
      "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
      "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
      "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
      "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
      "addu.ph %[res_op0], %[p3], %[p0] \n\t"
      "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
      "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
      "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
      /* res_oq1 starts as 2 * q3; tmp = 2 * q3 so res_oq2 starts as 3 * q3 */
      "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
      "shll.ph %[tmp], %[q3], 1 \n\t"
      "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
      "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
      "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
      "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
      "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
      "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
      "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
      "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
      "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
      "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
      "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
      "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"

      /* every output is early-clobber ("=&r"): each is written while input
         operands are still needed by later instructions */
      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));

  *op2_f1 = res_op2;
  *op1_f1 = res_op1;
  *op0_f1 = res_op0;
  *oq0_f1 = res_oq0;
  *oq1_f1 = res_oq1;
  *oq2_f1 = res_oq2;
}
507
/*
 * In-place wide filter over sixteen input words.
 *
 * Reads *op7..*op0 and *oq0..*oq7, then overwrites *op6..*op0 and
 * *oq0..*oq6 with the filtered values. *op7 and *oq7 are read only.
 *
 * The formula for each output is given in the comment ahead of its
 * instruction group. Every formula's weights add up to 16, and the result is
 * rounded (+8) and shifted right by 4.
 *
 * All arithmetic uses paired-halfword (.ph) instructions, so the two 16-bit
 * halves of every word are processed independently.
 * NOTE(review): presumably each halfword carries one zero-extended 8-bit
 * pixel, so the sums cannot overflow 16 bits -- confirm against callers.
 */
static INLINE void wide_mbfilter_dspr2(
    uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
    uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
    uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
    uint32_t *oq7) {
  /* All inputs are copied into locals first, so the stores further down
     cannot disturb values that are still needed. */
  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
  uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
  uint32_t tmp;
  uint32_t add_p6toq6;
  /* rounding term: +8 in each halfword ahead of the shift right by 4 */
  uint32_t u32Eight = 0x00080008;

  __asm__ __volatile__(
      /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
         which is used most of the time; the rounding term is folded in
         as the last step */
      "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
      "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"

      : [add_p6toq6] "=&r"(add_p6toq6)
      : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
        [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
        [u32Eight] "r"(u32Eight));

  /* p side: each output starts from a multiple of p7, adds one extra copy
     of its own tap and the shared sum, then subtracts the q terms that its
     formula does not contain. */
  __asm__ __volatile__(
      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
                                   p3 + p2 + p1 + p0 + q0, 4) */
      /* 7 * p7 is formed as (p7 << 3) - p7 */
      "shll.ph %[tmp], %[p7], 3 \n\t"
      "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
      "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
      "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
      "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
      "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
      "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
      "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
      "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
      "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
      "shrl.ph %[res_op6], %[res_op6], 4 \n\t"

      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
                                   p2 + p1 + p0 + q0 + q1, 4) */
      "shll.ph %[tmp], %[p7], 2 \n\t"
      "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
      "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
      "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
      "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
      "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
      "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
      "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
      "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
      "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
      "shrl.ph %[res_op5], %[res_op5], 4 \n\t"

      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
                                   p1 + p0 + q0 + q1 + q2, 4) */
      "shll.ph %[tmp], %[p7], 2 \n\t"
      "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
      "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
      "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
      "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
      "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
      "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
      "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
      "shrl.ph %[res_op4], %[res_op4], 4 \n\t"

      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
      "shll.ph %[tmp], %[p7], 2 \n\t"
      "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
      "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
      "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
      "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
      "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
      "shrl.ph %[res_op3], %[res_op3], 4 \n\t"

      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
      "shll.ph %[tmp], %[p7], 1 \n\t"
      "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
      "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
      "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
      "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
      "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
      "shrl.ph %[res_op2], %[res_op2], 4 \n\t"

      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
      "shll.ph %[tmp], %[p7], 1 \n\t"
      "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
      "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
      "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
      "shrl.ph %[res_op1], %[res_op1], 4 \n\t"

      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                                   q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
      "addu.ph %[res_op0], %[p7], %[p0] \n\t"
      "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
      "shrl.ph %[res_op0], %[res_op0], 4 \n\t"

      : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
        [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
      : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
        [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
        [add_p6toq6] "r"(add_p6toq6));

  *op6 = res_op6;
  *op5 = res_op5;
  *op4 = res_op4;
  *op3 = res_op3;
  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;

  /* q side: mirror image of the p side, built from multiples of q7 and
     subtracting the p terms that each formula does not contain. */
  __asm__ __volatile__(
      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
      "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
      "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
      "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"

      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
      "shll.ph %[tmp], %[q7], 1 \n\t"
      "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
      "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
      "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
      "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"

      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
      "shll.ph %[tmp], %[q7], 1 \n\t"
      "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
      "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
      "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
      "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
      "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
      "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"

      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
      "shll.ph %[tmp], %[q7], 2 \n\t"
      "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
      "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
      "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
      "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
      "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
      "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"

      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
      "shll.ph %[tmp], %[q7], 2 \n\t"
      "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
      "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
      "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
      "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
      "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
      "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
      "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
      "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"

      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
                                   q5 * 2 + q6 + q7 * 6, 4) */
      "shll.ph %[tmp], %[q7], 2 \n\t"
      "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
      "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
      "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
      "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
      "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
      "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
      "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
      "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
      "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
      "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"

      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
      /* 7 * q7 is formed as (q7 << 3) - q7 */
      "shll.ph %[tmp], %[q7], 3 \n\t"
      "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
      "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
      "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
      "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
      "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
      "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
      "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
      "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
      "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
      "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"

      : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
        [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
        [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
        [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
      : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
        [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
        [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
        [add_p6toq6] "r"(add_p6toq6));

  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
  *oq3 = res_oq3;
  *oq4 = res_oq4;
  *oq5 = res_oq5;
  *oq6 = res_oq6;
}
731 #endif // #if HAVE_DSPR2
732 #ifdef __cplusplus
733 } // extern "C"
734 #endif
735
736 #endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
737