1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stdlib.h>
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx/vpx_integer.h"
15 #include "vpx_dsp/mips/common_dspr2.h"
16 #include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
17 #include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
18 #include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
19 #include "vpx_mem/vpx_mem.h"
20
21 #if HAVE_DSPR2
/* 8-tap ("mb") loop filter applied across a horizontal edge, DSPr2 version.
 *
 * s      points at the first pixel row BELOW the edge (row q0); the four
 *        rows above the edge (p3..p0) are reached with negative pitch
 *        offsets, the rows below (q0..q3) with positive ones.
 * pitch  byte stride between rows.
 * blimit/limit/thresh  single-byte filter parameters; each byte is
 *        replicated into all four lanes of a 32-bit register (replv.qb)
 *        so four pixel columns are filtered per pass.
 *
 * The edge is 8 pixels wide; the loop body processes 4 columns at a time
 * and runs twice (s advances by 4 between iterations).
 */
void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint32_t mask;
  uint32_t hev, flat;
  uint8_t i;
  /* One pointer per row touched by the filter (4 above, 4 below). */
  uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
  /* Filter parameters replicated into every byte lane. */
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;
  /* Outputs of the narrow 4-tap filter (filter1_dspr2), 4 bytes each. */
  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
  /* Each register holds 4 packed pixels of one row. */
  uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
  /* Rows unpacked to 16-bit lanes: _l = left two pixels, _r = right two,
     as produced by PACK_LEFT_0TO3 / PACK_RIGHT_0TO3 for mbfilter_dspr2. */
  uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: broadcast each parameter byte into all 4 lanes */
  __asm__ __volatile__(
      "replv.qb %[thresh_vec], %[uthresh] \n\t"
      "replv.qb %[flimit_vec], %[uflimit] \n\t"
      "replv.qb %[limit_vec], %[ulimit] \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);

  for (i = 0; i < 2; i++) {
    /* Set up row pointers: sp3 is 4 rows above the edge, sq0 is s itself. */
    sp3 = s - (pitch << 2);
    sp2 = sp3 + pitch;
    sp1 = sp2 + pitch;
    sp0 = sp1 + pitch;
    sq0 = s;
    sq1 = s + pitch;
    sq2 = sq1 + pitch;
    sq3 = sq2 + pitch;

    /* Load 4 pixels from each of the 8 rows (one 32-bit word per row). */
    __asm__ __volatile__(
        "lw %[p3], (%[sp3]) \n\t"
        "lw %[p2], (%[sp2]) \n\t"
        "lw %[p1], (%[sp1]) \n\t"
        "lw %[p0], (%[sp0]) \n\t"
        "lw %[q0], (%[sq0]) \n\t"
        "lw %[q1], (%[sq1]) \n\t"
        "lw %[q2], (%[sq2]) \n\t"
        "lw %[q3], (%[sq3]) \n\t"

        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
        : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
          [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));

    /* Per-byte-lane masks: mask = "filter this column", hev = "high edge
       variance", flat = "region is flat, use the wide filter". */
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);

    if ((flat == 0) && (mask != 0)) {
      /* No flat columns: the narrow 4-tap filter is enough everywhere;
         store all four columns with full-word writes. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__(
          "sw %[p1_f0], (%[sp1]) \n\t"
          "sw %[p0_f0], (%[sp0]) \n\t"
          "sw %[q0_f0], (%[sq0]) \n\t"
          "sw %[q1_f0], (%[sq1]) \n\t"

          :
          : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
            [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
            [sq1] "r"(sq1));
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* Every column is both active and flat: run the wide 8-tap mbfilter
         on all four columns and store with full-word writes. */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      /* Re-pack the 16-bit left/right halves back into byte rows p2..q2. */
      COMBINE_LEFT_RIGHT_0TO2()

      __asm__ __volatile__(
          "sw %[p2], (%[sp2]) \n\t"
          "sw %[p1], (%[sp1]) \n\t"
          "sw %[p0], (%[sp0]) \n\t"
          "sw %[q0], (%[sq0]) \n\t"
          "sw %[q1], (%[sq1]) \n\t"
          "sw %[q2], (%[sq2]) \n\t"

          :
          : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
            [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
            [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
    } else if ((flat != 0) && (mask != 0)) {
      /* Mixed case: some columns flat, some not. Compute BOTH filters and
         then, per byte lane, store either the wide (mbfilter) or narrow
         (f0) result with single-byte writes, selected by mask/flat. */
      /* filtering */
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      /* Column 0 (lowest byte lane): wide result if flat, else narrow. */
      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
            "sb %[p2_r], (%[sp2]) \n\t"
            "sb %[p1_r], (%[sp1]) \n\t"
            "sb %[p0_r], (%[sp0]) \n\t"
            "sb %[q0_r], (%[sq0]) \n\t"
            "sb %[q1_r], (%[sq1]) \n\t"
            "sb %[q2_r], (%[sq2]) \n\t"

            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb %[p1_f0], (%[sp1]) \n\t"
            "sb %[p0_f0], (%[sp0]) \n\t"
            "sb %[q0_f0], (%[sq0]) \n\t"
            "sb %[q1_f0], (%[sq1]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Shift column 1 into the low lane: _r values hold 16-bit lanes
         (shift by 16), _f0 values hold byte lanes (shift by 8). */
      __asm__ __volatile__(
          "srl %[p2_r], %[p2_r], 16 \n\t"
          "srl %[p1_r], %[p1_r], 16 \n\t"
          "srl %[p0_r], %[p0_r], 16 \n\t"
          "srl %[q0_r], %[q0_r], 16 \n\t"
          "srl %[q1_r], %[q1_r], 16 \n\t"
          "srl %[q2_r], %[q2_r], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Column 1 (byte lane 1), stored at offset +1. */
      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
            "sb %[p2_r], +1(%[sp2]) \n\t"
            "sb %[p1_r], +1(%[sp1]) \n\t"
            "sb %[p0_r], +1(%[sp0]) \n\t"
            "sb %[q0_r], +1(%[sq0]) \n\t"
            "sb %[q1_r], +1(%[sq1]) \n\t"
            "sb %[q2_r], +1(%[sq2]) \n\t"

            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb %[p1_f0], +1(%[sp1]) \n\t"
            "sb %[p0_f0], +1(%[sp0]) \n\t"
            "sb %[q0_f0], +1(%[sq0]) \n\t"
            "sb %[q1_f0], +1(%[sq1]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Shift narrow-filter bytes for column 2; columns 2/3 use the _l
         (left-half) mbfilter results below.
         NOTE(review): p2..q2 appear as "+r" operands here but are not
         referenced by the asm text — presumably leftover constraints;
         harmless but worth confirming against upstream. */
      __asm__ __volatile__(
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
            [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
            [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
          :);

      /* Column 2 (byte lane 2), stored at offset +2. */
      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
            "sb %[p2_l], +2(%[sp2]) \n\t"
            "sb %[p1_l], +2(%[sp1]) \n\t"
            "sb %[p0_l], +2(%[sp0]) \n\t"
            "sb %[q0_l], +2(%[sq0]) \n\t"
            "sb %[q1_l], +2(%[sq1]) \n\t"
            "sb %[q2_l], +2(%[sq2]) \n\t"

            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb %[p1_f0], +2(%[sp1]) \n\t"
            "sb %[p0_f0], +2(%[sp0]) \n\t"
            "sb %[q0_f0], +2(%[sq0]) \n\t"
            "sb %[q1_f0], +2(%[sq1]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Shift column 3 into the low lane (left halves by 16, f0 by 8). */
      __asm__ __volatile__(
          "srl %[p2_l], %[p2_l], 16 \n\t"
          "srl %[p1_l], %[p1_l], 16 \n\t"
          "srl %[p0_l], %[p0_l], 16 \n\t"
          "srl %[q0_l], %[q0_l], 16 \n\t"
          "srl %[q1_l], %[q1_l], 16 \n\t"
          "srl %[q2_l], %[q2_l], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Column 3 (byte lane 3), stored at offset +3. */
      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
            "sb %[p2_l], +3(%[sp2]) \n\t"
            "sb %[p1_l], +3(%[sp1]) \n\t"
            "sb %[p0_l], +3(%[sp0]) \n\t"
            "sb %[q0_l], +3(%[sq0]) \n\t"
            "sb %[q1_l], +3(%[sq1]) \n\t"
            "sb %[q2_l], +3(%[sq2]) \n\t"

            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb %[p1_f0], +3(%[sp1]) \n\t"
            "sb %[p0_f0], +3(%[sp0]) \n\t"
            "sb %[q0_f0], +3(%[sq0]) \n\t"
            "sb %[q1_f0], +3(%[sq1]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }
    }

    /* Advance to the next 4 columns of the 8-pixel edge. */
    s = s + 4;
  }
}
288
/* 8-tap ("mb") loop filter applied across a vertical edge, DSPr2 version.
 *
 * s      points at the first pixel to the RIGHT of the edge (column q0);
 *        the p-side pixels are at negative byte offsets within each row.
 * pitch  byte stride between rows.
 * blimit/limit/thresh  single-byte filter parameters, byte-replicated so
 *        four rows are filtered per pass.
 *
 * The edge is 8 rows tall; each loop iteration loads a 4-row slice,
 * transposes the 4x4 p-side and q-side blocks into row-per-register
 * form (so the horizontal-edge filter kernels can be reused), filters,
 * and writes results back per pixel.
 */
void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  /* Per-byte-lane control masks (lane k corresponds to row s(4-k)). */
  uint32_t mask, hev, flat;
  /* The four rows of the current 4-row slice (s1 topmost). */
  uint8_t *s1, *s2, *s3, *s4;
  /* Scratch registers for the DSPr2 4x4 byte transpose. */
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  /* Filter parameters replicated into every byte lane. */
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;
  /* After transpose: one register per pixel column (4 rows packed). */
  uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
  /* Outputs of the narrow 4-tap filter (filter1_dspr2). */
  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
  /* Columns unpacked to 16-bit lanes (_l left/upper half, _r right/lower)
     for mbfilter_dspr2, via PACK_LEFT_0TO3 / PACK_RIGHT_0TO3. */
  uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: broadcast each parameter byte into all 4 lanes */
  __asm__ __volatile__(
      "replv.qb %[thresh_vec], %[uthresh] \n\t"
      "replv.qb %[flimit_vec], %[uflimit] \n\t"
      "replv.qb %[limit_vec], %[ulimit] \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch the store target for the slice */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch; /* advance to the next 4-row slice for next pass */

    /* Load the 4x4 blocks on each side of the edge, one word per row.
       Note the deliberate register naming: after the transposes below,
       p0..p3 / q0..q3 hold pixel COLUMNS, not rows. */
    __asm__ __volatile__(
        "lw %[p0], -4(%[s1]) \n\t"
        "lw %[p1], -4(%[s2]) \n\t"
        "lw %[p2], -4(%[s3]) \n\t"
        "lw %[p3], -4(%[s4]) \n\t"
        "lw %[q3], (%[s1]) \n\t"
        "lw %[q2], (%[s2]) \n\t"
        "lw %[q1], (%[s3]) \n\t"
        "lw %[q0], (%[s4]) \n\t"

        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* transpose p3, p2, p1, p0
       original (when loaded from memory)
       register       -4    -3    -2    -1
         p0         p0_0  p0_1  p0_2  p0_3
         p1         p1_0  p1_1  p1_2  p1_3
         p2         p2_0  p2_1  p2_2  p2_3
         p3         p3_0  p3_1  p3_2  p3_3

       after transpose
       register
         p0         p3_3  p2_3  p1_3  p0_3
         p1         p3_2  p2_2  p1_2  p0_2
         p2         p3_1  p2_1  p1_1  p0_1
         p3         p3_0  p2_0  p1_0  p0_0
    */
    __asm__ __volatile__(
        "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
        "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
        "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
        "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"

        "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
        "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"

        "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
        "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
        "append %[p1], %[sec3], 16 \n\t"
        "append %[p3], %[sec4], 16 \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose q0, q1, q2, q3
       original (when loaded from memory)
       register       +1    +2    +3    +4
         q3         q3_0  q3_1  q3_2  q3_3
         q2         q2_0  q2_1  q2_2  q2_3
         q1         q1_0  q1_1  q1_2  q1_3
         q0         q0_0  q0_1  q0_2  q0_3

       after transpose
       register
         q3         q0_3  q1_3  q2_3  q3_3
         q2         q0_2  q1_2  q2_2  q3_2
         q1         q0_1  q1_1  q2_1  q3_1
         q0         q0_0  q1_0  q2_0  q3_0
    */
    __asm__ __volatile__(
        "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
        "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
        "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
        "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"

        "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
        "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"

        "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
        "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
        "append %[q2], %[sec3], 16 \n\t"
        "append %[q0], %[sec4], 16 \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
          [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* Per-lane masks over the 4 transposed rows. */
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);

    if ((flat == 0) && (mask != 0)) {
      /* No flat rows: narrow 4-tap filter only; STORE_F0 scatters the
         f0 bytes back to the four rows. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      STORE_F0()
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* All four rows active and flat: wide 8-tap mbfilter everywhere;
         STORE_F1 scatters the mbfilter results back. */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      STORE_F1()
    } else if ((flat != 0) && (mask != 0)) {
      /* Mixed case: compute BOTH filters and select per row, storing
         byte-by-byte. Byte lane 0 maps to row s4, lane 3 to row s1. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      /* Row s4 (lane 0): wide result if flat, else narrow. */
      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
            "sb %[p2_r], -3(%[s4]) \n\t"
            "sb %[p1_r], -2(%[s4]) \n\t"
            "sb %[p0_r], -1(%[s4]) \n\t"
            "sb %[q0_r], (%[s4]) \n\t"
            "sb %[q1_r], +1(%[s4]) \n\t"
            "sb %[q2_r], +2(%[s4]) \n\t"

            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [s4] "r"(s4));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb %[p1_f0], -2(%[s4]) \n\t"
            "sb %[p0_f0], -1(%[s4]) \n\t"
            "sb %[q0_f0], (%[s4]) \n\t"
            "sb %[q1_f0], +1(%[s4]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
      }

      /* Shift row s3 into the low lane (_r by 16 bits, _f0 by 8). */
      __asm__ __volatile__(
          "srl %[p2_r], %[p2_r], 16 \n\t"
          "srl %[p1_r], %[p1_r], 16 \n\t"
          "srl %[p0_r], %[p0_r], 16 \n\t"
          "srl %[q0_r], %[q0_r], 16 \n\t"
          "srl %[q1_r], %[q1_r], 16 \n\t"
          "srl %[q2_r], %[q2_r], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Row s3 (lane 1). */
      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
            "sb %[p2_r], -3(%[s3]) \n\t"
            "sb %[p1_r], -2(%[s3]) \n\t"
            "sb %[p0_r], -1(%[s3]) \n\t"
            "sb %[q0_r], (%[s3]) \n\t"
            "sb %[q1_r], +1(%[s3]) \n\t"
            "sb %[q2_r], +2(%[s3]) \n\t"

            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [s3] "r"(s3));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb %[p1_f0], -2(%[s3]) \n\t"
            "sb %[p0_f0], -1(%[s3]) \n\t"
            "sb %[q0_f0], (%[s3]) \n\t"
            "sb %[q1_f0], +1(%[s3]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
      }

      /* Shift narrow-filter bytes for row s2; rows s2/s1 use the _l
         (left-half) mbfilter results below.
         NOTE(review): p2..q2 appear as "+r" operands here but are not
         referenced by the asm text — presumably leftover constraints;
         harmless but worth confirming against upstream. */
      __asm__ __volatile__(
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
            [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
            [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
          :);

      /* Row s2 (lane 2). */
      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
            "sb %[p2_l], -3(%[s2]) \n\t"
            "sb %[p1_l], -2(%[s2]) \n\t"
            "sb %[p0_l], -1(%[s2]) \n\t"
            "sb %[q0_l], (%[s2]) \n\t"
            "sb %[q1_l], +1(%[s2]) \n\t"
            "sb %[q2_l], +2(%[s2]) \n\t"

            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [s2] "r"(s2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb %[p1_f0], -2(%[s2]) \n\t"
            "sb %[p0_f0], -1(%[s2]) \n\t"
            "sb %[q0_f0], (%[s2]) \n\t"
            "sb %[q1_f0], +1(%[s2]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
      }

      /* Shift row s1 into the low lane (_l by 16 bits, _f0 by 8). */
      __asm__ __volatile__(
          "srl %[p2_l], %[p2_l], 16 \n\t"
          "srl %[p1_l], %[p1_l], 16 \n\t"
          "srl %[p0_l], %[p0_l], 16 \n\t"
          "srl %[q0_l], %[q0_l], 16 \n\t"
          "srl %[q1_l], %[q1_l], 16 \n\t"
          "srl %[q2_l], %[q2_l], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Row s1 (lane 3). */
      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
            "sb %[p2_l], -3(%[s1]) \n\t"
            "sb %[p1_l], -2(%[s1]) \n\t"
            "sb %[p0_l], -1(%[s1]) \n\t"
            "sb %[q0_l], (%[s1]) \n\t"
            "sb %[q1_l], +1(%[s1]) \n\t"
            "sb %[q2_l], +2(%[s1]) \n\t"

            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [s1] "r"(s1));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb %[p1_f0], -2(%[s1]) \n\t"
            "sb %[p0_f0], -1(%[s1]) \n\t"
            "sb %[q0_f0], (%[s1]) \n\t"
            "sb %[q1_f0], +1(%[s1]) \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
      }
    }
  }
}
588 #endif // #if HAVE_DSPR2
589