1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_bi_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_transposed_dspr2(
22 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
23 const int16_t *filter_x0, int32_t h) {
24 int32_t y;
25 uint8_t *cm = vpx_ff_cropTbl;
26 uint8_t *dst_ptr;
27 int32_t Temp1, Temp2;
28 uint32_t vector4a = 64;
29 uint32_t tp1, tp2;
30 uint32_t p1, p2;
31 const int16_t *filter = &filter_x0[3];
32 uint32_t filter45;
33
34 filter45 = ((const int32_t *)filter)[0];
35
36 for (y = h; y--;) {
37 dst_ptr = dst;
38 /* prefetch data to cache memory */
39 prefetch_load(src + src_stride);
40 prefetch_load(src + src_stride + 32);
41
42 __asm__ __volatile__(
43 "ulw %[tp1], 0(%[src]) \n\t"
44 "ulw %[tp2], 4(%[src]) \n\t"
45
46 /* even 1. pixel */
47 "mtlo %[vector4a], $ac3 \n\t"
48 "mthi $zero, $ac3 \n\t"
49 "preceu.ph.qbr %[p1], %[tp1] \n\t"
50 "preceu.ph.qbl %[p2], %[tp1] \n\t"
51 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
52 "extp %[Temp1], $ac3, 31 \n\t"
53
54 /* even 2. pixel */
55 "mtlo %[vector4a], $ac2 \n\t"
56 "mthi $zero, $ac2 \n\t"
57 "balign %[tp2], %[tp1], 3 \n\t"
58 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
59 "extp %[Temp2], $ac2, 31 \n\t"
60
61 /* odd 1. pixel */
62 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
63 "mtlo %[vector4a], $ac3 \n\t"
64 "mthi $zero, $ac3 \n\t"
65 "preceu.ph.qbr %[p1], %[tp2] \n\t"
66 "preceu.ph.qbl %[p2], %[tp2] \n\t"
67 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
68 "extp %[Temp1], $ac3, 31 \n\t"
69
70 /* odd 2. pixel */
71 "lbux %[tp2], %[Temp2](%[cm]) \n\t"
72 "mtlo %[vector4a], $ac2 \n\t"
73 "mthi $zero, $ac2 \n\t"
74 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
75 "extp %[Temp2], $ac2, 31 \n\t"
76
77 /* clamp */
78 "lbux %[p1], %[Temp1](%[cm]) \n\t"
79 "lbux %[p2], %[Temp2](%[cm]) \n\t"
80
81 /* store bytes */
82 "sb %[tp1], 0(%[dst_ptr]) \n\t"
83 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
84
85 "sb %[p1], 0(%[dst_ptr]) \n\t"
86 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
87
88 "sb %[tp2], 0(%[dst_ptr]) \n\t"
89 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
90
91 "sb %[p2], 0(%[dst_ptr]) \n\t"
92 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
93
94 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
95 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
96 : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
97 [src] "r"(src), [dst_stride] "r"(dst_stride));
98
99 /* Next row... */
100 src += src_stride;
101 dst += 1;
102 }
103 }
104
convolve_bi_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)105 static void convolve_bi_horiz_8_transposed_dspr2(
106 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
107 const int16_t *filter_x0, int32_t h) {
108 int32_t y;
109 uint8_t *cm = vpx_ff_cropTbl;
110 uint8_t *dst_ptr;
111 uint32_t vector4a = 64;
112 int32_t Temp1, Temp2, Temp3;
113 uint32_t tp1, tp2, tp3;
114 uint32_t p1, p2, p3, p4;
115 uint8_t *odd_dst;
116 uint32_t dst_pitch_2 = (dst_stride << 1);
117 const int16_t *filter = &filter_x0[3];
118 uint32_t filter45;
119
120 filter45 = ((const int32_t *)filter)[0];
121
122 for (y = h; y--;) {
123 /* prefetch data to cache memory */
124 prefetch_load(src + src_stride);
125 prefetch_load(src + src_stride + 32);
126
127 dst_ptr = dst;
128 odd_dst = (dst_ptr + dst_stride);
129
130 __asm__ __volatile__(
131 "ulw %[tp1], 0(%[src]) \n\t"
132 "ulw %[tp2], 4(%[src]) \n\t"
133
134 /* even 1. pixel */
135 "mtlo %[vector4a], $ac3 \n\t"
136 "mthi $zero, $ac3 \n\t"
137 "mtlo %[vector4a], $ac2 \n\t"
138 "mthi $zero, $ac2 \n\t"
139 "preceu.ph.qbr %[p1], %[tp1] \n\t"
140 "preceu.ph.qbl %[p2], %[tp1] \n\t"
141 "preceu.ph.qbr %[p3], %[tp2] \n\t"
142 "preceu.ph.qbl %[p4], %[tp2] \n\t"
143 "ulw %[tp3], 8(%[src]) \n\t"
144 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
145 "extp %[Temp1], $ac3, 31 \n\t"
146
147 /* even 2. pixel */
148 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
149 "extp %[Temp3], $ac2, 31 \n\t"
150
151 /* even 3. pixel */
152 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
153 "mtlo %[vector4a], $ac1 \n\t"
154 "mthi $zero, $ac1 \n\t"
155 "balign %[tp3], %[tp2], 3 \n\t"
156 "balign %[tp2], %[tp1], 3 \n\t"
157 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
158 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
159 "extp %[p3], $ac1, 31 \n\t"
160
161 /* even 4. pixel */
162 "mtlo %[vector4a], $ac2 \n\t"
163 "mthi $zero, $ac2 \n\t"
164 "mtlo %[vector4a], $ac3 \n\t"
165 "mthi $zero, $ac3 \n\t"
166 "sb %[Temp2], 0(%[dst_ptr]) \n\t"
167 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
168 "sb %[tp1], 0(%[dst_ptr]) \n\t"
169 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
170
171 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
172 "extp %[Temp3], $ac2, 31 \n\t"
173
174 "lbux %[Temp1], %[p3](%[cm]) "
175 "\n\t"
176
177 /* odd 1. pixel */
178 "mtlo %[vector4a], $ac1 \n\t"
179 "mthi $zero, $ac1 \n\t"
180 "preceu.ph.qbr %[p1], %[tp2] \n\t"
181 "preceu.ph.qbl %[p2], %[tp2] \n\t"
182 "preceu.ph.qbr %[p3], %[tp3] \n\t"
183 "preceu.ph.qbl %[p4], %[tp3] \n\t"
184 "sb %[Temp1], 0(%[dst_ptr]) \n\t"
185 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
186
187 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
188 "extp %[Temp2], $ac3, 31 \n\t"
189
190 /* odd 2. pixel */
191 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
192 "mtlo %[vector4a], $ac3 \n\t"
193 "mthi $zero, $ac3 \n\t"
194 "mtlo %[vector4a], $ac2 \n\t"
195 "mthi $zero, $ac2 \n\t"
196 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
197 "sb %[tp1], 0(%[dst_ptr]) \n\t"
198 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
199 "extp %[Temp3], $ac1, 31 \n\t"
200
201 /* odd 3. pixel */
202 "lbux %[tp3], %[Temp2](%[cm]) \n\t"
203 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
204 "extp %[Temp2], $ac3, 31 \n\t"
205
206 /* odd 4. pixel */
207 "sb %[tp3], 0(%[odd_dst]) \n\t"
208 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
209 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
210 "extp %[Temp1], $ac2, 31 \n\t"
211
212 /* clamp */
213 "lbux %[p4], %[Temp3](%[cm]) \n\t"
214 "lbux %[p2], %[Temp2](%[cm]) \n\t"
215 "lbux %[p1], %[Temp1](%[cm]) \n\t"
216
217 /* store bytes */
218 "sb %[p4], 0(%[odd_dst]) \n\t"
219 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
220
221 "sb %[p2], 0(%[odd_dst]) \n\t"
222 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
223
224 "sb %[p1], 0(%[odd_dst]) \n\t"
225
226 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
227 [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
228 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
229 [odd_dst] "+r"(odd_dst)
230 : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
231 [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
232
233 /* Next row... */
234 src += src_stride;
235 dst += 1;
236 }
237 }
238
convolve_bi_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)239 static void convolve_bi_horiz_16_transposed_dspr2(
240 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
241 int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
242 int32_t c, y;
243 const uint8_t *src;
244 uint8_t *dst;
245 uint8_t *cm = vpx_ff_cropTbl;
246 uint32_t vector_64 = 64;
247 int32_t Temp1, Temp2, Temp3;
248 uint32_t qload1, qload2;
249 uint32_t p1, p2, p3, p4, p5;
250 uint32_t st1, st2, st3;
251 uint32_t dst_pitch_2 = (dst_stride << 1);
252 uint8_t *odd_dst;
253 const int16_t *filter = &filter_x0[3];
254 uint32_t filter45;
255
256 filter45 = ((const int32_t *)filter)[0];
257
258 for (y = h; y--;) {
259 /* prefetch data to cache memory */
260 prefetch_load(src_ptr + src_stride);
261 prefetch_load(src_ptr + src_stride + 32);
262
263 src = src_ptr;
264 dst = dst_ptr;
265
266 odd_dst = (dst + dst_stride);
267
268 for (c = 0; c < count; c++) {
269 __asm__ __volatile__(
270 "ulw %[qload1], 0(%[src]) "
271 "\n\t"
272 "ulw %[qload2], 4(%[src]) "
273 "\n\t"
274
275 /* even 1. pixel */
276 "mtlo %[vector_64], $ac1 "
277 "\n\t" /* even 1 */
278 "mthi $zero, $ac1 "
279 "\n\t"
280 "mtlo %[vector_64], $ac2 "
281 "\n\t" /* even 2 */
282 "mthi $zero, $ac2 "
283 "\n\t"
284 "preceu.ph.qbr %[p1], %[qload1] "
285 "\n\t"
286 "preceu.ph.qbl %[p2], %[qload1] "
287 "\n\t"
288 "preceu.ph.qbr %[p3], %[qload2] "
289 "\n\t"
290 "preceu.ph.qbl %[p4], %[qload2] "
291 "\n\t"
292 "ulw %[qload1], 8(%[src]) "
293 "\n\t"
294 "dpa.w.ph $ac1, %[p1], %[filter45] "
295 "\n\t" /* even 1 */
296 "extp %[Temp1], $ac1, 31 "
297 "\n\t" /* even 1 */
298
299 /* even 2. pixel */
300 "mtlo %[vector_64], $ac3 "
301 "\n\t" /* even 3 */
302 "mthi $zero, $ac3 "
303 "\n\t"
304 "preceu.ph.qbr %[p1], %[qload1] "
305 "\n\t"
306 "preceu.ph.qbl %[p5], %[qload1] "
307 "\n\t"
308 "ulw %[qload2], 12(%[src]) "
309 "\n\t"
310 "dpa.w.ph $ac2, %[p2], %[filter45] "
311 "\n\t" /* even 1 */
312 "lbux %[st1], %[Temp1](%[cm]) "
313 "\n\t" /* even 1 */
314 "extp %[Temp2], $ac2, 31 "
315 "\n\t" /* even 1 */
316
317 /* even 3. pixel */
318 "mtlo %[vector_64], $ac1 "
319 "\n\t" /* even 4 */
320 "mthi $zero, $ac1 "
321 "\n\t"
322 "preceu.ph.qbr %[p2], %[qload2] "
323 "\n\t"
324 "sb %[st1], 0(%[dst]) "
325 "\n\t" /* even 1 */
326 "addu %[dst], %[dst], %[dst_pitch_2] "
327 " \n\t"
328 "dpa.w.ph $ac3, %[p3], %[filter45] "
329 "\n\t" /* even 3 */
330 "extp %[Temp3], $ac3, 31 "
331 "\n\t" /* even 3 */
332 "lbux %[st2], %[Temp2](%[cm]) "
333 "\n\t" /* even 1 */
334
335 /* even 4. pixel */
336 "mtlo %[vector_64], $ac2 "
337 "\n\t" /* even 5 */
338 "mthi $zero, $ac2 "
339 "\n\t"
340 "preceu.ph.qbl %[p3], %[qload2] "
341 "\n\t"
342 "sb %[st2], 0(%[dst]) "
343 "\n\t" /* even 2 */
344 "addu %[dst], %[dst], %[dst_pitch_2] "
345 "\n\t"
346 "dpa.w.ph $ac1, %[p4], %[filter45] "
347 "\n\t" /* even 4 */
348 "extp %[Temp1], $ac1, 31 "
349 "\n\t" /* even 4 */
350 "lbux %[st3], %[Temp3](%[cm]) "
351 "\n\t" /* even 3 */
352
353 /* even 5. pixel */
354 "mtlo %[vector_64], $ac3 "
355 "\n\t" /* even 6 */
356 "mthi $zero, $ac3 "
357 "\n\t"
358 "sb %[st3], 0(%[dst]) "
359 "\n\t" /* even 3 */
360 "addu %[dst], %[dst], %[dst_pitch_2] "
361 "\n\t"
362 "dpa.w.ph $ac2, %[p1], %[filter45] "
363 "\n\t" /* even 5 */
364 "extp %[Temp2], $ac2, 31 "
365 "\n\t" /* even 5 */
366 "lbux %[st1], %[Temp1](%[cm]) "
367 "\n\t" /* even 4 */
368
369 /* even 6. pixel */
370 "mtlo %[vector_64], $ac1 "
371 "\n\t" /* even 7 */
372 "mthi $zero, $ac1 "
373 "\n\t"
374 "sb %[st1], 0(%[dst]) "
375 "\n\t" /* even 4 */
376 "addu %[dst], %[dst], %[dst_pitch_2] "
377 "\n\t"
378 "ulw %[qload1], 20(%[src]) "
379 "\n\t"
380 "dpa.w.ph $ac3, %[p5], %[filter45] "
381 "\n\t" /* even 6 */
382 "extp %[Temp3], $ac3, 31 "
383 "\n\t" /* even 6 */
384 "lbux %[st2], %[Temp2](%[cm]) "
385 "\n\t" /* even 5 */
386
387 /* even 7. pixel */
388 "mtlo %[vector_64], $ac2 "
389 "\n\t" /* even 8 */
390 "mthi $zero, $ac2 "
391 "\n\t"
392 "preceu.ph.qbr %[p5], %[qload1] "
393 "\n\t"
394 "sb %[st2], 0(%[dst]) "
395 "\n\t" /* even 5 */
396 "addu %[dst], %[dst], %[dst_pitch_2] "
397 "\n\t"
398 "dpa.w.ph $ac1, %[p2], %[filter45] "
399 "\n\t" /* even 7 */
400 "extp %[Temp1], $ac1, 31 "
401 "\n\t" /* even 7 */
402 "lbux %[st3], %[Temp3](%[cm]) "
403 "\n\t" /* even 6 */
404
405 /* even 8. pixel */
406 "mtlo %[vector_64], $ac3 "
407 "\n\t" /* odd 1 */
408 "mthi $zero, $ac3 "
409 "\n\t"
410 "dpa.w.ph $ac2, %[p3], %[filter45] "
411 "\n\t" /* even 8 */
412 "sb %[st3], 0(%[dst]) "
413 "\n\t" /* even 6 */
414 "addu %[dst], %[dst], %[dst_pitch_2] "
415 "\n\t"
416 "extp %[Temp2], $ac2, 31 "
417 "\n\t" /* even 8 */
418 "lbux %[st1], %[Temp1](%[cm]) "
419 "\n\t" /* even 7 */
420
421 /* ODD pixels */
422 "ulw %[qload1], 1(%[src]) "
423 "\n\t"
424 "ulw %[qload2], 5(%[src]) "
425 "\n\t"
426
427 /* odd 1. pixel */
428 "mtlo %[vector_64], $ac1 "
429 "\n\t" /* odd 2 */
430 "mthi $zero, $ac1 "
431 "\n\t"
432 "preceu.ph.qbr %[p1], %[qload1] "
433 "\n\t"
434 "preceu.ph.qbl %[p2], %[qload1] "
435 "\n\t"
436 "preceu.ph.qbr %[p3], %[qload2] "
437 "\n\t"
438 "preceu.ph.qbl %[p4], %[qload2] "
439 "\n\t"
440 "sb %[st1], 0(%[dst]) "
441 "\n\t" /* even 7 */
442 "addu %[dst], %[dst], %[dst_pitch_2] "
443 "\n\t"
444 "ulw %[qload2], 9(%[src]) "
445 "\n\t"
446 "dpa.w.ph $ac3, %[p1], %[filter45] "
447 "\n\t" /* odd 1 */
448 "extp %[Temp3], $ac3, 31 "
449 "\n\t" /* odd 1 */
450 "lbux %[st2], %[Temp2](%[cm]) "
451 "\n\t" /* even 8 */
452
453 /* odd 2. pixel */
454 "mtlo %[vector_64], $ac2 "
455 "\n\t" /* odd 3 */
456 "mthi $zero, $ac2 "
457 "\n\t"
458 "preceu.ph.qbr %[p1], %[qload2] "
459 "\n\t"
460 "preceu.ph.qbl %[p5], %[qload2] "
461 "\n\t"
462 "sb %[st2], 0(%[dst]) "
463 "\n\t" /* even 8 */
464 "ulw %[qload1], 13(%[src]) "
465 "\n\t"
466 "dpa.w.ph $ac1, %[p2], %[filter45] "
467 "\n\t" /* odd 2 */
468 "extp %[Temp1], $ac1, 31 "
469 "\n\t" /* odd 2 */
470 "lbux %[st3], %[Temp3](%[cm]) "
471 "\n\t" /* odd 1 */
472
473 /* odd 3. pixel */
474 "mtlo %[vector_64], $ac3 "
475 "\n\t" /* odd 4 */
476 "mthi $zero, $ac3 "
477 "\n\t"
478 "preceu.ph.qbr %[p2], %[qload1] "
479 "\n\t"
480 "sb %[st3], 0(%[odd_dst]) "
481 "\n\t" /* odd 1 */
482 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
483 "\n\t"
484 "dpa.w.ph $ac2, %[p3], %[filter45] "
485 "\n\t" /* odd 3 */
486 "extp %[Temp2], $ac2, 31 "
487 "\n\t" /* odd 3 */
488 "lbux %[st1], %[Temp1](%[cm]) "
489 "\n\t" /* odd 2 */
490
491 /* odd 4. pixel */
492 "mtlo %[vector_64], $ac1 "
493 "\n\t" /* odd 5 */
494 "mthi $zero, $ac1 "
495 "\n\t"
496 "preceu.ph.qbl %[p3], %[qload1] "
497 "\n\t"
498 "sb %[st1], 0(%[odd_dst]) "
499 "\n\t" /* odd 2 */
500 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
501 "\n\t"
502 "dpa.w.ph $ac3, %[p4], %[filter45] "
503 "\n\t" /* odd 4 */
504 "extp %[Temp3], $ac3, 31 "
505 "\n\t" /* odd 4 */
506 "lbux %[st2], %[Temp2](%[cm]) "
507 "\n\t" /* odd 3 */
508
509 /* odd 5. pixel */
510 "mtlo %[vector_64], $ac2 "
511 "\n\t" /* odd 6 */
512 "mthi $zero, $ac2 "
513 "\n\t"
514 "sb %[st2], 0(%[odd_dst]) "
515 "\n\t" /* odd 3 */
516 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
517 "\n\t"
518 "dpa.w.ph $ac1, %[p1], %[filter45] "
519 "\n\t" /* odd 5 */
520 "extp %[Temp1], $ac1, 31 "
521 "\n\t" /* odd 5 */
522 "lbux %[st3], %[Temp3](%[cm]) "
523 "\n\t" /* odd 4 */
524
525 /* odd 6. pixel */
526 "mtlo %[vector_64], $ac3 "
527 "\n\t" /* odd 7 */
528 "mthi $zero, $ac3 "
529 "\n\t"
530 "sb %[st3], 0(%[odd_dst]) "
531 "\n\t" /* odd 4 */
532 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
533 "\n\t"
534 "ulw %[qload1], 21(%[src]) "
535 "\n\t"
536 "dpa.w.ph $ac2, %[p5], %[filter45] "
537 "\n\t" /* odd 6 */
538 "extp %[Temp2], $ac2, 31 "
539 "\n\t" /* odd 6 */
540 "lbux %[st1], %[Temp1](%[cm]) "
541 "\n\t" /* odd 5 */
542
543 /* odd 7. pixel */
544 "mtlo %[vector_64], $ac1 "
545 "\n\t" /* odd 8 */
546 "mthi $zero, $ac1 "
547 "\n\t"
548 "preceu.ph.qbr %[p5], %[qload1] "
549 "\n\t"
550 "sb %[st1], 0(%[odd_dst]) "
551 "\n\t" /* odd 5 */
552 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
553 "\n\t"
554 "dpa.w.ph $ac3, %[p2], %[filter45] "
555 "\n\t" /* odd 7 */
556 "extp %[Temp3], $ac3, 31 "
557 "\n\t" /* odd 7 */
558
559 /* odd 8. pixel */
560 "dpa.w.ph $ac1, %[p3], %[filter45] "
561 "\n\t" /* odd 8 */
562 "extp %[Temp1], $ac1, 31 "
563 "\n\t" /* odd 8 */
564
565 "lbux %[st2], %[Temp2](%[cm]) "
566 "\n\t" /* odd 6 */
567 "lbux %[st3], %[Temp3](%[cm]) "
568 "\n\t" /* odd 7 */
569 "lbux %[st1], %[Temp1](%[cm]) "
570 "\n\t" /* odd 8 */
571
572 "sb %[st2], 0(%[odd_dst]) "
573 "\n\t" /* odd 6 */
574 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
575 "\n\t"
576
577 "sb %[st3], 0(%[odd_dst]) "
578 "\n\t" /* odd 7 */
579 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
580 "\n\t"
581
582 "sb %[st1], 0(%[odd_dst]) "
583 "\n\t" /* odd 8 */
584
585 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
586 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
587 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
588 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
589 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
590 : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
591 [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
592
593 src += 16;
594 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
595 odd_dst = (dst + dst_stride);
596 }
597
598 /* Next row... */
599 src_ptr += src_stride;
600 dst_ptr += 1;
601 }
602 }
603
convolve_bi_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)604 static void convolve_bi_horiz_64_transposed_dspr2(
605 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
606 int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
607 int32_t c, y;
608 const uint8_t *src;
609 uint8_t *dst;
610 uint8_t *cm = vpx_ff_cropTbl;
611 uint32_t vector_64 = 64;
612 int32_t Temp1, Temp2, Temp3;
613 uint32_t qload1, qload2;
614 uint32_t p1, p2, p3, p4, p5;
615 uint32_t st1, st2, st3;
616 uint32_t dst_pitch_2 = (dst_stride << 1);
617 uint8_t *odd_dst;
618 const int16_t *filter = &filter_x0[3];
619 uint32_t filter45;
620
621 filter45 = ((const int32_t *)filter)[0];
622
623 for (y = h; y--;) {
624 /* prefetch data to cache memory */
625 prefetch_load(src_ptr + src_stride);
626 prefetch_load(src_ptr + src_stride + 32);
627 prefetch_load(src_ptr + src_stride + 64);
628
629 src = src_ptr;
630 dst = dst_ptr;
631
632 odd_dst = (dst + dst_stride);
633
634 for (c = 0; c < 4; c++) {
635 __asm__ __volatile__(
636 "ulw %[qload1], 0(%[src]) "
637 "\n\t"
638 "ulw %[qload2], 4(%[src]) "
639 "\n\t"
640
641 /* even 1. pixel */
642 "mtlo %[vector_64], $ac1 "
643 "\n\t" /* even 1 */
644 "mthi $zero, $ac1 "
645 "\n\t"
646 "mtlo %[vector_64], $ac2 "
647 "\n\t" /* even 2 */
648 "mthi $zero, $ac2 "
649 "\n\t"
650 "preceu.ph.qbr %[p1], %[qload1] "
651 "\n\t"
652 "preceu.ph.qbl %[p2], %[qload1] "
653 "\n\t"
654 "preceu.ph.qbr %[p3], %[qload2] "
655 "\n\t"
656 "preceu.ph.qbl %[p4], %[qload2] "
657 "\n\t"
658 "ulw %[qload1], 8(%[src]) "
659 "\n\t"
660 "dpa.w.ph $ac1, %[p1], %[filter45] "
661 "\n\t" /* even 1 */
662 "extp %[Temp1], $ac1, 31 "
663 "\n\t" /* even 1 */
664
665 /* even 2. pixel */
666 "mtlo %[vector_64], $ac3 "
667 "\n\t" /* even 3 */
668 "mthi $zero, $ac3 "
669 "\n\t"
670 "preceu.ph.qbr %[p1], %[qload1] "
671 "\n\t"
672 "preceu.ph.qbl %[p5], %[qload1] "
673 "\n\t"
674 "ulw %[qload2], 12(%[src]) "
675 "\n\t"
676 "dpa.w.ph $ac2, %[p2], %[filter45] "
677 "\n\t" /* even 1 */
678 "lbux %[st1], %[Temp1](%[cm]) "
679 "\n\t" /* even 1 */
680 "extp %[Temp2], $ac2, 31 "
681 "\n\t" /* even 1 */
682
683 /* even 3. pixel */
684 "mtlo %[vector_64], $ac1 "
685 "\n\t" /* even 4 */
686 "mthi $zero, $ac1 "
687 "\n\t"
688 "preceu.ph.qbr %[p2], %[qload2] "
689 "\n\t"
690 "sb %[st1], 0(%[dst]) "
691 "\n\t" /* even 1 */
692 "addu %[dst], %[dst], %[dst_pitch_2] "
693 " \n\t"
694 "dpa.w.ph $ac3, %[p3], %[filter45] "
695 "\n\t" /* even 3 */
696 "extp %[Temp3], $ac3, 31 "
697 "\n\t" /* even 3 */
698 "lbux %[st2], %[Temp2](%[cm]) "
699 "\n\t" /* even 1 */
700
701 /* even 4. pixel */
702 "mtlo %[vector_64], $ac2 "
703 "\n\t" /* even 5 */
704 "mthi $zero, $ac2 "
705 "\n\t"
706 "preceu.ph.qbl %[p3], %[qload2] "
707 "\n\t"
708 "sb %[st2], 0(%[dst]) "
709 "\n\t" /* even 2 */
710 "addu %[dst], %[dst], %[dst_pitch_2] "
711 "\n\t"
712 "dpa.w.ph $ac1, %[p4], %[filter45] "
713 "\n\t" /* even 4 */
714 "extp %[Temp1], $ac1, 31 "
715 "\n\t" /* even 4 */
716 "lbux %[st3], %[Temp3](%[cm]) "
717 "\n\t" /* even 3 */
718
719 /* even 5. pixel */
720 "mtlo %[vector_64], $ac3 "
721 "\n\t" /* even 6 */
722 "mthi $zero, $ac3 "
723 "\n\t"
724 "sb %[st3], 0(%[dst]) "
725 "\n\t" /* even 3 */
726 "addu %[dst], %[dst], %[dst_pitch_2] "
727 "\n\t"
728 "dpa.w.ph $ac2, %[p1], %[filter45] "
729 "\n\t" /* even 5 */
730 "extp %[Temp2], $ac2, 31 "
731 "\n\t" /* even 5 */
732 "lbux %[st1], %[Temp1](%[cm]) "
733 "\n\t" /* even 4 */
734
735 /* even 6. pixel */
736 "mtlo %[vector_64], $ac1 "
737 "\n\t" /* even 7 */
738 "mthi $zero, $ac1 "
739 "\n\t"
740 "sb %[st1], 0(%[dst]) "
741 "\n\t" /* even 4 */
742 "addu %[dst], %[dst], %[dst_pitch_2] "
743 "\n\t"
744 "ulw %[qload1], 20(%[src]) "
745 "\n\t"
746 "dpa.w.ph $ac3, %[p5], %[filter45] "
747 "\n\t" /* even 6 */
748 "extp %[Temp3], $ac3, 31 "
749 "\n\t" /* even 6 */
750 "lbux %[st2], %[Temp2](%[cm]) "
751 "\n\t" /* even 5 */
752
753 /* even 7. pixel */
754 "mtlo %[vector_64], $ac2 "
755 "\n\t" /* even 8 */
756 "mthi $zero, $ac2 "
757 "\n\t"
758 "preceu.ph.qbr %[p5], %[qload1] "
759 "\n\t"
760 "sb %[st2], 0(%[dst]) "
761 "\n\t" /* even 5 */
762 "addu %[dst], %[dst], %[dst_pitch_2] "
763 "\n\t"
764 "dpa.w.ph $ac1, %[p2], %[filter45] "
765 "\n\t" /* even 7 */
766 "extp %[Temp1], $ac1, 31 "
767 "\n\t" /* even 7 */
768 "lbux %[st3], %[Temp3](%[cm]) "
769 "\n\t" /* even 6 */
770
771 /* even 8. pixel */
772 "mtlo %[vector_64], $ac3 "
773 "\n\t" /* odd 1 */
774 "mthi $zero, $ac3 "
775 "\n\t"
776 "dpa.w.ph $ac2, %[p3], %[filter45] "
777 "\n\t" /* even 8 */
778 "sb %[st3], 0(%[dst]) "
779 "\n\t" /* even 6 */
780 "addu %[dst], %[dst], %[dst_pitch_2] "
781 "\n\t"
782 "extp %[Temp2], $ac2, 31 "
783 "\n\t" /* even 8 */
784 "lbux %[st1], %[Temp1](%[cm]) "
785 "\n\t" /* even 7 */
786
787 /* ODD pixels */
788 "ulw %[qload1], 1(%[src]) "
789 "\n\t"
790 "ulw %[qload2], 5(%[src]) "
791 "\n\t"
792
793 /* odd 1. pixel */
794 "mtlo %[vector_64], $ac1 "
795 "\n\t" /* odd 2 */
796 "mthi $zero, $ac1 "
797 "\n\t"
798 "preceu.ph.qbr %[p1], %[qload1] "
799 "\n\t"
800 "preceu.ph.qbl %[p2], %[qload1] "
801 "\n\t"
802 "preceu.ph.qbr %[p3], %[qload2] "
803 "\n\t"
804 "preceu.ph.qbl %[p4], %[qload2] "
805 "\n\t"
806 "sb %[st1], 0(%[dst]) "
807 "\n\t" /* even 7 */
808 "addu %[dst], %[dst], %[dst_pitch_2] "
809 "\n\t"
810 "ulw %[qload2], 9(%[src]) "
811 "\n\t"
812 "dpa.w.ph $ac3, %[p1], %[filter45] "
813 "\n\t" /* odd 1 */
814 "extp %[Temp3], $ac3, 31 "
815 "\n\t" /* odd 1 */
816 "lbux %[st2], %[Temp2](%[cm]) "
817 "\n\t" /* even 8 */
818
819 /* odd 2. pixel */
820 "mtlo %[vector_64], $ac2 "
821 "\n\t" /* odd 3 */
822 "mthi $zero, $ac2 "
823 "\n\t"
824 "preceu.ph.qbr %[p1], %[qload2] "
825 "\n\t"
826 "preceu.ph.qbl %[p5], %[qload2] "
827 "\n\t"
828 "sb %[st2], 0(%[dst]) "
829 "\n\t" /* even 8 */
830 "ulw %[qload1], 13(%[src]) "
831 "\n\t"
832 "dpa.w.ph $ac1, %[p2], %[filter45] "
833 "\n\t" /* odd 2 */
834 "extp %[Temp1], $ac1, 31 "
835 "\n\t" /* odd 2 */
836 "lbux %[st3], %[Temp3](%[cm]) "
837 "\n\t" /* odd 1 */
838
839 /* odd 3. pixel */
840 "mtlo %[vector_64], $ac3 "
841 "\n\t" /* odd 4 */
842 "mthi $zero, $ac3 "
843 "\n\t"
844 "preceu.ph.qbr %[p2], %[qload1] "
845 "\n\t"
846 "sb %[st3], 0(%[odd_dst]) "
847 "\n\t" /* odd 1 */
848 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
849 "\n\t"
850 "dpa.w.ph $ac2, %[p3], %[filter45] "
851 "\n\t" /* odd 3 */
852 "extp %[Temp2], $ac2, 31 "
853 "\n\t" /* odd 3 */
854 "lbux %[st1], %[Temp1](%[cm]) "
855 "\n\t" /* odd 2 */
856
857 /* odd 4. pixel */
858 "mtlo %[vector_64], $ac1 "
859 "\n\t" /* odd 5 */
860 "mthi $zero, $ac1 "
861 "\n\t"
862 "preceu.ph.qbl %[p3], %[qload1] "
863 "\n\t"
864 "sb %[st1], 0(%[odd_dst]) "
865 "\n\t" /* odd 2 */
866 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
867 "\n\t"
868 "dpa.w.ph $ac3, %[p4], %[filter45] "
869 "\n\t" /* odd 4 */
870 "extp %[Temp3], $ac3, 31 "
871 "\n\t" /* odd 4 */
872 "lbux %[st2], %[Temp2](%[cm]) "
873 "\n\t" /* odd 3 */
874
875 /* odd 5. pixel */
876 "mtlo %[vector_64], $ac2 "
877 "\n\t" /* odd 6 */
878 "mthi $zero, $ac2 "
879 "\n\t"
880 "sb %[st2], 0(%[odd_dst]) "
881 "\n\t" /* odd 3 */
882 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
883 "\n\t"
884 "dpa.w.ph $ac1, %[p1], %[filter45] "
885 "\n\t" /* odd 5 */
886 "extp %[Temp1], $ac1, 31 "
887 "\n\t" /* odd 5 */
888 "lbux %[st3], %[Temp3](%[cm]) "
889 "\n\t" /* odd 4 */
890
891 /* odd 6. pixel */
892 "mtlo %[vector_64], $ac3 "
893 "\n\t" /* odd 7 */
894 "mthi $zero, $ac3 "
895 "\n\t"
896 "sb %[st3], 0(%[odd_dst]) "
897 "\n\t" /* odd 4 */
898 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
899 "\n\t"
900 "ulw %[qload1], 21(%[src]) "
901 "\n\t"
902 "dpa.w.ph $ac2, %[p5], %[filter45] "
903 "\n\t" /* odd 6 */
904 "extp %[Temp2], $ac2, 31 "
905 "\n\t" /* odd 6 */
906 "lbux %[st1], %[Temp1](%[cm]) "
907 "\n\t" /* odd 5 */
908
909 /* odd 7. pixel */
910 "mtlo %[vector_64], $ac1 "
911 "\n\t" /* odd 8 */
912 "mthi $zero, $ac1 "
913 "\n\t"
914 "preceu.ph.qbr %[p5], %[qload1] "
915 "\n\t"
916 "sb %[st1], 0(%[odd_dst]) "
917 "\n\t" /* odd 5 */
918 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
919 "\n\t"
920 "dpa.w.ph $ac3, %[p2], %[filter45] "
921 "\n\t" /* odd 7 */
922 "extp %[Temp3], $ac3, 31 "
923 "\n\t" /* odd 7 */
924
925 /* odd 8. pixel */
926 "dpa.w.ph $ac1, %[p3], %[filter45] "
927 "\n\t" /* odd 8 */
928 "extp %[Temp1], $ac1, 31 "
929 "\n\t" /* odd 8 */
930
931 "lbux %[st2], %[Temp2](%[cm]) "
932 "\n\t" /* odd 6 */
933 "lbux %[st3], %[Temp3](%[cm]) "
934 "\n\t" /* odd 7 */
935 "lbux %[st1], %[Temp1](%[cm]) "
936 "\n\t" /* odd 8 */
937
938 "sb %[st2], 0(%[odd_dst]) "
939 "\n\t" /* odd 6 */
940 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
941 "\n\t"
942
943 "sb %[st3], 0(%[odd_dst]) "
944 "\n\t" /* odd 7 */
945 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
946 "\n\t"
947
948 "sb %[st1], 0(%[odd_dst]) "
949 "\n\t" /* odd 8 */
950
951 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
952 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
953 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
954 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
955 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
956 : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
957 [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
958
959 src += 16;
960 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
961 odd_dst = (dst + dst_stride);
962 }
963
964 /* Next row... */
965 src_ptr += src_stride;
966 dst_ptr += 1;
967 }
968 }
969
convolve_bi_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)970 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
971 uint8_t *dst, ptrdiff_t dst_stride,
972 const int16_t *filter, int w, int h) {
973 int x, y;
974
975 for (y = 0; y < h; ++y) {
976 for (x = 0; x < w; ++x) {
977 int sum = 0;
978
979 sum += src[x] * filter[3];
980 sum += src[x + 1] * filter[4];
981
982 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
983 }
984
985 src += src_stride;
986 dst += 1;
987 }
988 }
989
/*
 * 2-tap horizontal convolve with transposed output, DSPr2 entry point.
 *
 * Sets the DSP control "pos" field used by the `extp` instructions in the
 * width-specific kernels, warms the cache for the first source row, and then
 * dispatches on the block width:
 *   w == 4, 8    -> dedicated 4-/8-wide kernels
 *   w == 16, 32  -> 16-wide kernel run (w / 16) times per row
 *   w == 64      -> 64-wide kernel
 *   otherwise    -> generic C fallback convolve_bi_horiz_transposed()
 *
 *   src, src_stride - source pixels and byte distance between source rows
 *   dst, dst_stride - destination and byte distance between destination rows;
 *                     each source row is written as a destination column
 *   filter          - filter kernel; the kernels read elements [3] and [4]
 *   w, h            - block width and height in pixels
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter, int w,
                         int h) {
  uint32_t pos = 38;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
                                            filter, h, (w / 16));
      break;
    case 64:
      /* src and src + 32 were prefetched above; a 64-wide row also touches
       * src + 64, matching the per-row prefetches in the 64-wide kernel. */
      prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
                                   h);
      break;
  }
}
1029 #endif
1030