1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stdlib.h>
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx/vpx_integer.h"
15 #include "vpx_dsp/mips/common_dspr2.h"
16 #include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
17 #include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
18 #include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
19 #include "vpx_mem/vpx_mem.h"
20
21 #if HAVE_DSPR2
/*
 * vpx_lpf_vertical_16_dspr2: MIPS DSPr2 loop filter applied across the column
 * boundary located at `s` (pixels at negative offsets are the "p" side, pixels
 * at offsets >= 0 are the "q" side).
 *
 * s      - pointer into the frame at the first pixel on the q side of the edge
 *          in the first row. Each row is read with word loads at byte offsets
 *          -8, -4, 0 and +4 and is modified in place with byte stores.
 *          NOTE(review): the `lw` loads presumably require `s` (and `pitch`)
 *          to keep every row 4-byte aligned - confirm against callers.
 * pitch  - distance in bytes between two consecutive rows.
 * blimit, limit, thresh
 *        - pointers to filter parameters; only the first byte of each is read
 *          and it is replicated into all four byte lanes of a 32-bit word.
 *
 * The loop below runs twice and handles four rows per pass, so eight rows
 * (s, s + pitch, ..., s + 7 * pitch) are processed per call.
 *
 * Per pass: load a 4x16 pixel tile, transpose it so that every pN/qN word
 * holds one pixel column for the four rows, compute the hev/mask/flat/flat2
 * words, then choose between the results of filter1_dspr2 ("f0"),
 * mbfilter*_dspr2 ("f1") and wide_mbfilter_dspr2 ("f2") and store them.
 * The helpers and the STORE_ and PACK_ macros are defined outside this file.
 */
vpx_lpf_vertical_16_dspr2(uint8_t * s,int pitch,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)22 void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
23 const uint8_t *limit, const uint8_t *thresh) {
24 uint8_t i;
25 uint32_t mask, hev, flat, flat2;
26 uint8_t *s1, *s2, *s3, *s4;
27 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
28 uint32_t thresh_vec, flimit_vec, limit_vec;
29 uint32_t uflimit, ulimit, uthresh;
30 uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
31 uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
32 uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
33 uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
34 uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
35 uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
36 uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
37 uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
38
/* Only the first byte behind each parameter pointer is used. */
39 uflimit = *blimit;
40 ulimit = *limit;
41 uthresh = *thresh;
42
43 /* create quad-byte: replicate each parameter byte into all four lanes */
44 __asm__ __volatile__(
45 "replv.qb %[thresh_vec], %[uthresh] \n\t"
46 "replv.qb %[flimit_vec], %[uflimit] \n\t"
47 "replv.qb %[limit_vec], %[ulimit] \n\t"
48
49 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
50 [limit_vec] "=r"(limit_vec)
51 : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
52
/* Helper defined elsewhere; presumably a cache prefetch hint for the second
   row - confirm in common_dspr2.h. */
53 prefetch_store(s + pitch);
54
/* Two passes of four rows each. */
55 for (i = 0; i < 2; i++) {
/* s1..s4 address the four rows of this pass; s is advanced to the first row
   of the next pass. */
56 s1 = s;
57 s2 = s + pitch;
58 s3 = s2 + pitch;
59 s4 = s3 + pitch;
60 s = s4 + pitch;
61
/* Load the eight bytes on the p side of every row. At this point each
   register holds four consecutive pixels of ONE ROW (p0..p3: offset -4 of
   rows s1..s4, p4..p7: offset -8 of rows s1..s4); the names only become
   pixel-column names after the transposes below. */
62 __asm__ __volatile__(
63 "lw %[p0], -4(%[s1]) \n\t"
64 "lw %[p1], -4(%[s2]) \n\t"
65 "lw %[p2], -4(%[s3]) \n\t"
66 "lw %[p3], -4(%[s4]) \n\t"
67 "lw %[p4], -8(%[s1]) \n\t"
68 "lw %[p5], -8(%[s2]) \n\t"
69 "lw %[p6], -8(%[s3]) \n\t"
70 "lw %[p7], -8(%[s4]) \n\t"
71
72 : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
73 [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
74 : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
75
/* Load the eight bytes on the q side of every row (q3..q0: offset 0 of rows
   s1..s4, q7..q4: offset +4 of rows s1..s4). */
76 __asm__ __volatile__(
77 "lw %[q3], (%[s1]) \n\t"
78 "lw %[q2], (%[s2]) \n\t"
79 "lw %[q1], (%[s3]) \n\t"
80 "lw %[q0], (%[s4]) \n\t"
81 "lw %[q7], +4(%[s1]) \n\t"
82 "lw %[q6], +4(%[s2]) \n\t"
83 "lw %[q5], +4(%[s3]) \n\t"
84 "lw %[q4], +4(%[s4]) \n\t"
85
86 : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
87 [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
88 : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
89
90 /* transpose p3, p2, p1, p0
91 original (when loaded from memory)
92 register -4 -3 -2 -1
93 p0 p0_0 p0_1 p0_2 p0_3
94 p1 p1_0 p1_1 p1_2 p1_3
95 p2 p2_0 p2_1 p2_2 p2_3
96 p3 p3_0 p3_1 p3_2 p3_3
97
98 after transpose
99 register
100 p0 p3_3 p2_3 p1_3 p0_3
101 p1 p3_2 p2_2 p1_2 p0_2
102 p2 p3_1 p2_1 p1_1 p0_1
103 p3 p3_0 p2_0 p1_0 p0_0
104 */
/* prim1..prim4, sec3 and sec4 are scratch registers; p0..p3 are transposed
   in place. The same instruction pattern is reused for the other three
   4x4 tiles below. */
105 __asm__ __volatile__(
106 "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
107 "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
108 "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
109 "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
110
111 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
112 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
113 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
114 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
115
116 "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
117 "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
118 "append %[p1], %[sec3], 16 \n\t"
119 "append %[p3], %[sec4], 16 \n\t"
120
121 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
122 [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
123 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
124 :);
125
126 /* transpose q0, q1, q2, q3
127 original (when loaded from memory)
128 register +1 +2 +3 +4
129 q3 q3_0 q3_1 q3_2 q3_3
130 q2 q2_0 q2_1 q2_2 q2_3
131 q1 q1_0 q1_1 q1_2 q1_3
132 q0 q0_0 q0_1 q0_2 q0_3
133
134 after transpose
135 register
136 q3 q0_3 q1_3 q2_3 q3_3
137 q2 q0_2 q1_2 q2_2 q3_2
138 q1 q0_1 q1_1 q2_1 q3_1
139 q0 q0_0 q1_0 q2_0 q3_0
140 */
141 __asm__ __volatile__(
142 "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
143 "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
144 "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
145 "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
146
147 "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
148 "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
149 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
150 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
151
152 "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
153 "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
154 "append %[q2], %[sec3], 16 \n\t"
155 "append %[q0], %[sec4], 16 \n\t"
156
157 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
158 [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
159 [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
160 :);
161
162 /* transpose p7, p6, p5, p4
163 original (when loaded from memory)
164 register -8 -7 -6 -5
165 p4 p4_0 p4_1 p4_2 p4_3
166 p5 p5_0 p5_1 p5_2 p5_3
167 p6 p6_0 p6_1 p6_2 p6_3
168 p7 p7_0 p7_1 p7_2 p7_3
169
170 after transpose
171 register
172 p4 p7_3 p6_3 p5_3 p4_3
173 p5 p7_2 p6_2 p5_2 p4_2
174 p6 p7_1 p6_1 p5_1 p4_1
175 p7 p7_0 p6_0 p5_0 p4_0
176 */
177 __asm__ __volatile__(
178 "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
179 "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
180 "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
181 "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
182
183 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
184 "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
185 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
186 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
187
188 "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
189 "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
190 "append %[p5], %[sec3], 16 \n\t"
191 "append %[p7], %[sec4], 16 \n\t"
192
193 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
194 [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
195 [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
196 :);
197
198 /* transpose q4, q5, q6, q7
199 original (when loaded from memory)
200 register +5 +6 +7 +8
201 q7 q7_0 q7_1 q7_2 q7_3
202 q6 q6_0 q6_1 q6_2 q6_3
203 q5 q5_0 q5_1 q5_2 q5_3
204 q4 q4_0 q4_1 q4_2 q4_3
205
206 after transpose
207 register
208 q7 q4_3 q5_3 q6_3 q7_3
209 q6 q4_2 q5_2 q6_2 q7_2
210 q5 q4_1 q5_1 q6_1 q7_1
211 q4 q4_0 q5_0 q6_0 q7_0
212 */
213 __asm__ __volatile__(
214 "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
215 "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
216 "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
217 "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
218
219 "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
220 "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
221 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
222 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
223
224 "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
225 "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
226 "append %[q6], %[sec3], 16 \n\t"
227 "append %[q4], %[sec4], 16 \n\t"
228
229 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
230 [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
231 [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
232 :);
233
/* From here on every pN/qN word holds one pixel column, one row per byte
   lane. The stores further down pair the lanes with rows as follows:
   bits 7..0 -> s4, bits 15..8 -> s3, bits 23..16 -> s2, bits 31..24 -> s1.
   The helper below writes hev, mask and flat through the pointers passed to
   it (helper defined elsewhere). */
234 filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
235 p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
236
/* flat2 is produced from the outer columns p7..p4 / q4..q7 together with
   p0 and q0 (helper defined elsewhere). */
237 flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
238
239 /* f0 */
/* The two alternatives differ only in the flat2 test, so together they are
   equivalent to (flat == 0) && (mask != 0): only the filter1_dspr2 result
   is used. If mask is zero no branch of this chain is taken and the four
   rows are left untouched. */
240 if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
241 ((flat2 != 0) && (flat == 0) && (mask != 0))) {
242 filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* Macro defined elsewhere; presumably writes the *_f0 values back to rows
   s1..s4 - confirm in loopfilter_macros_dspr2.h. */
243 STORE_F0()
244 } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
245 (mask == 0xFFFFFFFF)) {
246 /* f2 */
/* All lanes of all three masks are set, so only the wide filter is run.
   PACK_LEFT_ / PACK_RIGHT_ are macros defined elsewhere; presumably they
   split the column words into the *_l / *_r working variables - confirm. */
247 PACK_LEFT_0TO3()
248 PACK_LEFT_4TO7()
249 wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
250 &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
251 &q6_l, &q7_l);
252
253 PACK_RIGHT_0TO3()
254 PACK_RIGHT_4TO7()
255 wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
256 &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
257 &q6_r, &q7_r);
258
259 STORE_F2()
260 } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
261 /* f1 */
/* flat and mask are set in every lane and flat2 in none: only
   mbfilter_dspr2 is run, updating the *_l / *_r values in place. */
262 PACK_LEFT_0TO3()
263 mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
264
265 PACK_RIGHT_0TO3()
266 mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
267
268 STORE_F1()
269 } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
270 /* f0 + f1 */
/* Mixed case: both filters are computed for all four rows and the result
   to store is chosen row by row from the matching byte lane of mask/flat. */
271 filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
272
273 /* left 2 element operation */
274 PACK_LEFT_0TO3()
275 mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
276
277 /* right 2 element operation */
278 PACK_RIGHT_0TO3()
279 mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
280
/* Row s4 (lane bits 7..0): six bytes p2..q2 from the f1 result, otherwise
   four bytes p1..q1 from the f0 result, otherwise nothing. */
281 if (mask & flat & 0x000000FF) {
282 __asm__ __volatile__(
283 "sb %[p2_r], -3(%[s4]) \n\t"
284 "sb %[p1_r], -2(%[s4]) \n\t"
285 "sb %[p0_r], -1(%[s4]) \n\t"
286 "sb %[q0_r], (%[s4]) \n\t"
287 "sb %[q1_r], +1(%[s4]) \n\t"
288 "sb %[q2_r], +2(%[s4]) \n\t"
289
290 :
291 : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
292 [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
293 [s4] "r"(s4));
294 } else if (mask & 0x000000FF) {
295 __asm__ __volatile__(
296 "sb %[p1_f0], -2(%[s4]) \n\t"
297 "sb %[p0_f0], -1(%[s4]) \n\t"
298 "sb %[q0_f0], (%[s4]) \n\t"
299 "sb %[q1_f0], +1(%[s4]) \n\t"
300
301 :
302 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
303 [q1_f0] "r"(q1_f0), [s4] "r"(s4));
304 }
305
/* sb writes the least significant byte of its source register, so the value
   for the next row is shifted down first: the *_r words carry two rows in
   16-bit steps, the *_f0 words carry four rows in 8-bit steps. */
306 __asm__ __volatile__(
307 "srl %[p2_r], %[p2_r], 16 \n\t"
308 "srl %[p1_r], %[p1_r], 16 \n\t"
309 "srl %[p0_r], %[p0_r], 16 \n\t"
310 "srl %[q0_r], %[q0_r], 16 \n\t"
311 "srl %[q1_r], %[q1_r], 16 \n\t"
312 "srl %[q2_r], %[q2_r], 16 \n\t"
313 "srl %[p1_f0], %[p1_f0], 8 \n\t"
314 "srl %[p0_f0], %[p0_f0], 8 \n\t"
315 "srl %[q0_f0], %[q0_f0], 8 \n\t"
316 "srl %[q1_f0], %[q1_f0], 8 \n\t"
317
318 : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
319 [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
320 [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
321 [q1_f0] "+r"(q1_f0)
322 :);
323
/* Row s3 (lane bits 15..8). */
324 if (mask & flat & 0x0000FF00) {
325 __asm__ __volatile__(
326 "sb %[p2_r], -3(%[s3]) \n\t"
327 "sb %[p1_r], -2(%[s3]) \n\t"
328 "sb %[p0_r], -1(%[s3]) \n\t"
329 "sb %[q0_r], (%[s3]) \n\t"
330 "sb %[q1_r], +1(%[s3]) \n\t"
331 "sb %[q2_r], +2(%[s3]) \n\t"
332
333 :
334 : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
335 [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
336 [s3] "r"(s3));
337 } else if (mask & 0x0000FF00) {
338 __asm__ __volatile__(
339 "sb %[p1_f0], -2(%[s3]) \n\t"
340 "sb %[p0_f0], -1(%[s3]) \n\t"
341 "sb %[q0_f0], (%[s3]) \n\t"
342 "sb %[q1_f0], +1(%[s3]) \n\t"
343
344 :
345 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
346 [q1_f0] "r"(q1_f0), [s3] "r"(s3));
347 }
348
/* The next two rows take their f1 values from the *_l words, which have not
   been shifted yet, so only the *_f0 words advance here. */
349 __asm__ __volatile__(
350 "srl %[p1_f0], %[p1_f0], 8 \n\t"
351 "srl %[p0_f0], %[p0_f0], 8 \n\t"
352 "srl %[q0_f0], %[q0_f0], 8 \n\t"
353 "srl %[q1_f0], %[q1_f0], 8 \n\t"
354
355 : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
356 [q1_f0] "+r"(q1_f0)
357 :);
358
/* Row s2 (lane bits 23..16). */
359 if (mask & flat & 0x00FF0000) {
360 __asm__ __volatile__(
361 "sb %[p2_l], -3(%[s2]) \n\t"
362 "sb %[p1_l], -2(%[s2]) \n\t"
363 "sb %[p0_l], -1(%[s2]) \n\t"
364 "sb %[q0_l], (%[s2]) \n\t"
365 "sb %[q1_l], +1(%[s2]) \n\t"
366 "sb %[q2_l], +2(%[s2]) \n\t"
367
368 :
369 : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
370 [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
371 [s2] "r"(s2));
372 } else if (mask & 0x00FF0000) {
373 __asm__ __volatile__(
374 "sb %[p1_f0], -2(%[s2]) \n\t"
375 "sb %[p0_f0], -1(%[s2]) \n\t"
376 "sb %[q0_f0], (%[s2]) \n\t"
377 "sb %[q1_f0], +1(%[s2]) \n\t"
378
379 :
380 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
381 [q1_f0] "r"(q1_f0), [s2] "r"(s2));
382 }
383
/* Advance the *_l words (16 bits) and the *_f0 words (8 bits) to row s1. */
384 __asm__ __volatile__(
385 "srl %[p2_l], %[p2_l], 16 \n\t"
386 "srl %[p1_l], %[p1_l], 16 \n\t"
387 "srl %[p0_l], %[p0_l], 16 \n\t"
388 "srl %[q0_l], %[q0_l], 16 \n\t"
389 "srl %[q1_l], %[q1_l], 16 \n\t"
390 "srl %[q2_l], %[q2_l], 16 \n\t"
391 "srl %[p1_f0], %[p1_f0], 8 \n\t"
392 "srl %[p0_f0], %[p0_f0], 8 \n\t"
393 "srl %[q0_f0], %[q0_f0], 8 \n\t"
394 "srl %[q1_f0], %[q1_f0], 8 \n\t"
395
396 : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
397 [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
398 [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
399 [q1_f0] "+r"(q1_f0)
400 :);
401
/* Row s1 (lane bits 31..24). */
402 if (mask & flat & 0xFF000000) {
403 __asm__ __volatile__(
404 "sb %[p2_l], -3(%[s1]) \n\t"
405 "sb %[p1_l], -2(%[s1]) \n\t"
406 "sb %[p0_l], -1(%[s1]) \n\t"
407 "sb %[q0_l], (%[s1]) \n\t"
408 "sb %[q1_l], +1(%[s1]) \n\t"
409 "sb %[q2_l], +2(%[s1]) \n\t"
410
411 :
412 : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
413 [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
414 [s1] "r"(s1));
415 } else if (mask & 0xFF000000) {
416 __asm__ __volatile__(
417 "sb %[p1_f0], -2(%[s1]) \n\t"
418 "sb %[p0_f0], -1(%[s1]) \n\t"
419 "sb %[q0_f0], (%[s1]) \n\t"
420 "sb %[q1_f0], +1(%[s1]) \n\t"
421
422 :
423 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
424 [q1_f0] "r"(q1_f0), [s1] "r"(s1));
425 }
426 } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
427 /* f0+f1+f2 */
/* Fully mixed case: all three filters are computed and the result is picked
   per row. mbfilter1_dspr2 writes its output to the separate *_f1
   variables, leaving the *_l / *_r inputs available for
   wide_mbfilter_dspr2, which updates them in place. */
428 filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
429
430 PACK_LEFT_0TO3()
431 mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
432 &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
433
434 PACK_RIGHT_0TO3()
435 mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
436 &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
437
438 PACK_LEFT_4TO7()
439 wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
440 &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
441 &q6_l, &q7_l);
442
443 PACK_RIGHT_4TO7()
444 wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
445 &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
446 &q6_r, &q7_r);
447
/* Row s4 (lane bits 7..0): fourteen bytes p6..q6 from the f2 result (p7 and
   q7 are never written), else six bytes p2..q2 from the f1 result, else
   four bytes p1..q1 from the f0 result, else nothing. */
448 if (mask & flat & flat2 & 0x000000FF) {
449 __asm__ __volatile__(
450 "sb %[p6_r], -7(%[s4]) \n\t"
451 "sb %[p5_r], -6(%[s4]) \n\t"
452 "sb %[p4_r], -5(%[s4]) \n\t"
453 "sb %[p3_r], -4(%[s4]) \n\t"
454 "sb %[p2_r], -3(%[s4]) \n\t"
455 "sb %[p1_r], -2(%[s4]) \n\t"
456 "sb %[p0_r], -1(%[s4]) \n\t"
457
458 :
459 : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
460 [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
461 [p0_r] "r"(p0_r), [s4] "r"(s4));
462
463 __asm__ __volatile__(
464 "sb %[q0_r], (%[s4]) \n\t"
465 "sb %[q1_r], +1(%[s4]) \n\t"
466 "sb %[q2_r], +2(%[s4]) \n\t"
467 "sb %[q3_r], +3(%[s4]) \n\t"
468 "sb %[q4_r], +4(%[s4]) \n\t"
469 "sb %[q5_r], +5(%[s4]) \n\t"
470 "sb %[q6_r], +6(%[s4]) \n\t"
471
472 :
473 : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
474 [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
475 [q6_r] "r"(q6_r), [s4] "r"(s4));
476 } else if (mask & flat & 0x000000FF) {
477 __asm__ __volatile__(
478 "sb %[p2_r_f1], -3(%[s4]) \n\t"
479 "sb %[p1_r_f1], -2(%[s4]) \n\t"
480 "sb %[p0_r_f1], -1(%[s4]) \n\t"
481 "sb %[q0_r_f1], (%[s4]) \n\t"
482 "sb %[q1_r_f1], +1(%[s4]) \n\t"
483 "sb %[q2_r_f1], +2(%[s4]) \n\t"
484
485 :
486 : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
487 [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
488 [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
489 } else if (mask & 0x000000FF) {
490 __asm__ __volatile__(
491 "sb %[p1_f0], -2(%[s4]) \n\t"
492 "sb %[p0_f0], -1(%[s4]) \n\t"
493 "sb %[q0_f0], (%[s4]) \n\t"
494 "sb %[q1_f0], +1(%[s4]) \n\t"
495
496 :
497 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
498 [q1_f0] "r"(q1_f0), [s4] "r"(s4));
499 }
500
/* Advance the f2 values in the *_r words to row s3 (16-bit steps). */
501 __asm__ __volatile__(
502 "srl %[p6_r], %[p6_r], 16 \n\t"
503 "srl %[p5_r], %[p5_r], 16 \n\t"
504 "srl %[p4_r], %[p4_r], 16 \n\t"
505 "srl %[p3_r], %[p3_r], 16 \n\t"
506 "srl %[p2_r], %[p2_r], 16 \n\t"
507 "srl %[p1_r], %[p1_r], 16 \n\t"
508 "srl %[p0_r], %[p0_r], 16 \n\t"
509 "srl %[q0_r], %[q0_r], 16 \n\t"
510 "srl %[q1_r], %[q1_r], 16 \n\t"
511 "srl %[q2_r], %[q2_r], 16 \n\t"
512 "srl %[q3_r], %[q3_r], 16 \n\t"
513 "srl %[q4_r], %[q4_r], 16 \n\t"
514 "srl %[q5_r], %[q5_r], 16 \n\t"
515 "srl %[q6_r], %[q6_r], 16 \n\t"
516
517 : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
518 [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
519 [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
520 [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
521 [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
522 :);
523
/* Same for the f1 values (16-bit steps) and the f0 values (8-bit steps). */
524 __asm__ __volatile__(
525 "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
526 "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
527 "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
528 "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
529 "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
530 "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
531 "srl %[p1_f0], %[p1_f0], 8 \n\t"
532 "srl %[p0_f0], %[p0_f0], 8 \n\t"
533 "srl %[q0_f0], %[q0_f0], 8 \n\t"
534 "srl %[q1_f0], %[q1_f0], 8 \n\t"
535
536 : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
537 [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
538 [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
539 [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
540 [q1_f0] "+r"(q1_f0)
541 :);
542
/* Row s3 (lane bits 15..8). */
543 if (mask & flat & flat2 & 0x0000FF00) {
544 __asm__ __volatile__(
545 "sb %[p6_r], -7(%[s3]) \n\t"
546 "sb %[p5_r], -6(%[s3]) \n\t"
547 "sb %[p4_r], -5(%[s3]) \n\t"
548 "sb %[p3_r], -4(%[s3]) \n\t"
549 "sb %[p2_r], -3(%[s3]) \n\t"
550 "sb %[p1_r], -2(%[s3]) \n\t"
551 "sb %[p0_r], -1(%[s3]) \n\t"
552
553 :
554 : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
555 [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
556 [p0_r] "r"(p0_r), [s3] "r"(s3));
557
558 __asm__ __volatile__(
559 "sb %[q0_r], (%[s3]) \n\t"
560 "sb %[q1_r], +1(%[s3]) \n\t"
561 "sb %[q2_r], +2(%[s3]) \n\t"
562 "sb %[q3_r], +3(%[s3]) \n\t"
563 "sb %[q4_r], +4(%[s3]) \n\t"
564 "sb %[q5_r], +5(%[s3]) \n\t"
565 "sb %[q6_r], +6(%[s3]) \n\t"
566
567 :
568 : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
569 [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
570 [q6_r] "r"(q6_r), [s3] "r"(s3));
571 } else if (mask & flat & 0x0000FF00) {
572 __asm__ __volatile__(
573 "sb %[p2_r_f1], -3(%[s3]) \n\t"
574 "sb %[p1_r_f1], -2(%[s3]) \n\t"
575 "sb %[p0_r_f1], -1(%[s3]) \n\t"
576 "sb %[q0_r_f1], (%[s3]) \n\t"
577 "sb %[q1_r_f1], +1(%[s3]) \n\t"
578 "sb %[q2_r_f1], +2(%[s3]) \n\t"
579
580 :
581 : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
582 [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
583 [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
584 } else if (mask & 0x0000FF00) {
585 __asm__ __volatile__(
586 "sb %[p1_f0], -2(%[s3]) \n\t"
587 "sb %[p0_f0], -1(%[s3]) \n\t"
588 "sb %[q0_f0], (%[s3]) \n\t"
589 "sb %[q1_f0], +1(%[s3]) \n\t"
590
591 :
592 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
593 [q1_f0] "r"(q1_f0), [s3] "r"(s3));
594 }
595
/* Rows s2 and s1 read their f1/f2 values from the still unshifted *_l
   words, so only the *_f0 words advance here. */
596 __asm__ __volatile__(
597 "srl %[p1_f0], %[p1_f0], 8 \n\t"
598 "srl %[p0_f0], %[p0_f0], 8 \n\t"
599 "srl %[q0_f0], %[q0_f0], 8 \n\t"
600 "srl %[q1_f0], %[q1_f0], 8 \n\t"
601
602 : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
603 [q1_f0] "+r"(q1_f0)
604 :);
605
/* Row s2 (lane bits 23..16). */
606 if (mask & flat & flat2 & 0x00FF0000) {
607 __asm__ __volatile__(
608 "sb %[p6_l], -7(%[s2]) \n\t"
609 "sb %[p5_l], -6(%[s2]) \n\t"
610 "sb %[p4_l], -5(%[s2]) \n\t"
611 "sb %[p3_l], -4(%[s2]) \n\t"
612 "sb %[p2_l], -3(%[s2]) \n\t"
613 "sb %[p1_l], -2(%[s2]) \n\t"
614 "sb %[p0_l], -1(%[s2]) \n\t"
615
616 :
617 : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
618 [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
619 [p0_l] "r"(p0_l), [s2] "r"(s2));
620
621 __asm__ __volatile__(
622 "sb %[q0_l], (%[s2]) \n\t"
623 "sb %[q1_l], +1(%[s2]) \n\t"
624 "sb %[q2_l], +2(%[s2]) \n\t"
625 "sb %[q3_l], +3(%[s2]) \n\t"
626 "sb %[q4_l], +4(%[s2]) \n\t"
627 "sb %[q5_l], +5(%[s2]) \n\t"
628 "sb %[q6_l], +6(%[s2]) \n\t"
629
630 :
631 : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
632 [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
633 [q6_l] "r"(q6_l), [s2] "r"(s2));
634 } else if (mask & flat & 0x00FF0000) {
635 __asm__ __volatile__(
636 "sb %[p2_l_f1], -3(%[s2]) \n\t"
637 "sb %[p1_l_f1], -2(%[s2]) \n\t"
638 "sb %[p0_l_f1], -1(%[s2]) \n\t"
639 "sb %[q0_l_f1], (%[s2]) \n\t"
640 "sb %[q1_l_f1], +1(%[s2]) \n\t"
641 "sb %[q2_l_f1], +2(%[s2]) \n\t"
642
643 :
644 : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
645 [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
646 [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
647 } else if (mask & 0x00FF0000) {
648 __asm__ __volatile__(
649 "sb %[p1_f0], -2(%[s2]) \n\t"
650 "sb %[p0_f0], -1(%[s2]) \n\t"
651 "sb %[q0_f0], (%[s2]) \n\t"
652 "sb %[q1_f0], +1(%[s2]) \n\t"
653
654 :
655 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
656 [q1_f0] "r"(q1_f0), [s2] "r"(s2));
657 }
658
/* Advance the f2 values in the *_l words to row s1 (16-bit steps). */
659 __asm__ __volatile__(
660 "srl %[p6_l], %[p6_l], 16 \n\t"
661 "srl %[p5_l], %[p5_l], 16 \n\t"
662 "srl %[p4_l], %[p4_l], 16 \n\t"
663 "srl %[p3_l], %[p3_l], 16 \n\t"
664 "srl %[p2_l], %[p2_l], 16 \n\t"
665 "srl %[p1_l], %[p1_l], 16 \n\t"
666 "srl %[p0_l], %[p0_l], 16 \n\t"
667 "srl %[q0_l], %[q0_l], 16 \n\t"
668 "srl %[q1_l], %[q1_l], 16 \n\t"
669 "srl %[q2_l], %[q2_l], 16 \n\t"
670 "srl %[q3_l], %[q3_l], 16 \n\t"
671 "srl %[q4_l], %[q4_l], 16 \n\t"
672 "srl %[q5_l], %[q5_l], 16 \n\t"
673 "srl %[q6_l], %[q6_l], 16 \n\t"
674
675 : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
676 [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
677 [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
678 [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
679 [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
680 :);
681
/* Same for the f1 values (16-bit steps) and the f0 values (8-bit steps). */
682 __asm__ __volatile__(
683 "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
684 "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
685 "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
686 "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
687 "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
688 "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
689 "srl %[p1_f0], %[p1_f0], 8 \n\t"
690 "srl %[p0_f0], %[p0_f0], 8 \n\t"
691 "srl %[q0_f0], %[q0_f0], 8 \n\t"
692 "srl %[q1_f0], %[q1_f0], 8 \n\t"
693
694 : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
695 [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
696 [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
697 [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
698 [q1_f0] "+r"(q1_f0)
699 :);
700
/* Row s1 (lane bits 31..24). */
701 if (mask & flat & flat2 & 0xFF000000) {
702 __asm__ __volatile__(
703 "sb %[p6_l], -7(%[s1]) \n\t"
704 "sb %[p5_l], -6(%[s1]) \n\t"
705 "sb %[p4_l], -5(%[s1]) \n\t"
706 "sb %[p3_l], -4(%[s1]) \n\t"
707 "sb %[p2_l], -3(%[s1]) \n\t"
708 "sb %[p1_l], -2(%[s1]) \n\t"
709 "sb %[p0_l], -1(%[s1]) \n\t"
710
711 :
712 : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
713 [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
714 [p0_l] "r"(p0_l), [s1] "r"(s1));
715
716 __asm__ __volatile__(
717 "sb %[q0_l], (%[s1]) \n\t"
718 "sb %[q1_l], 1(%[s1]) \n\t"
719 "sb %[q2_l], 2(%[s1]) \n\t"
720 "sb %[q3_l], 3(%[s1]) \n\t"
721 "sb %[q4_l], 4(%[s1]) \n\t"
722 "sb %[q5_l], 5(%[s1]) \n\t"
723 "sb %[q6_l], 6(%[s1]) \n\t"
724
725 :
726 : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
727 [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
728 [q6_l] "r"(q6_l), [s1] "r"(s1));
729 } else if (mask & flat & 0xFF000000) {
730 __asm__ __volatile__(
731 "sb %[p2_l_f1], -3(%[s1]) \n\t"
732 "sb %[p1_l_f1], -2(%[s1]) \n\t"
733 "sb %[p0_l_f1], -1(%[s1]) \n\t"
734 "sb %[q0_l_f1], (%[s1]) \n\t"
735 "sb %[q1_l_f1], +1(%[s1]) \n\t"
736 "sb %[q2_l_f1], +2(%[s1]) \n\t"
737
738 :
739 : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
740 [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
741 [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
742 } else if (mask & 0xFF000000) {
743 __asm__ __volatile__(
744 "sb %[p1_f0], -2(%[s1]) \n\t"
745 "sb %[p0_f0], -1(%[s1]) \n\t"
746 "sb %[q0_f0], (%[s1]) \n\t"
747 "sb %[q1_f0], +1(%[s1]) \n\t"
748
749 :
750 : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
751 [q1_f0] "r"(q1_f0), [s1] "r"(s1));
752 }
753 }
754 }
755 }
756 #endif // #if HAVE_DSPR2
757