/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
convolve_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
22 int32_t src_stride, uint8_t *dst,
23 int32_t dst_stride,
24 const int16_t *filter_x0,
25 int32_t h) {
26 int32_t y;
27 uint8_t *cm = vpx_ff_cropTbl;
28 uint8_t *dst_ptr;
29 int32_t vector1b, vector2b, vector3b, vector4b;
30 int32_t Temp1, Temp2, Temp3, Temp4;
31 uint32_t vector4a = 64;
32 uint32_t tp1, tp2;
33 uint32_t p1, p2, p3, p4;
34 uint32_t tn1, tn2;
35
36 vector1b = ((const int32_t *)filter_x0)[0];
37 vector2b = ((const int32_t *)filter_x0)[1];
38 vector3b = ((const int32_t *)filter_x0)[2];
39 vector4b = ((const int32_t *)filter_x0)[3];
40
41 for (y = h; y--;) {
42 dst_ptr = dst;
43 /* prefetch data to cache memory */
44 prefetch_load(src + src_stride);
45 prefetch_load(src + src_stride + 32);
46
47 __asm__ __volatile__(
48 "ulw %[tp1], 0(%[src]) \n\t"
49 "ulw %[tp2], 4(%[src]) \n\t"
50
51 /* even 1. pixel */
52 "mtlo %[vector4a], $ac3 \n\t"
53 "mthi $zero, $ac3 \n\t"
54 "preceu.ph.qbr %[p1], %[tp1] \n\t"
55 "preceu.ph.qbl %[p2], %[tp1] \n\t"
56 "preceu.ph.qbr %[p3], %[tp2] \n\t"
57 "preceu.ph.qbl %[p4], %[tp2] \n\t"
58 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
59 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
60 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
61 "ulw %[tn2], 8(%[src]) \n\t"
62 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
63 "extp %[Temp1], $ac3, 31 \n\t"
64
65 /* even 2. pixel */
66 "mtlo %[vector4a], $ac2 \n\t"
67 "mthi $zero, $ac2 \n\t"
68 "preceu.ph.qbr %[p1], %[tn2] \n\t"
69 "balign %[tn1], %[tn2], 3 \n\t"
70 "balign %[tn2], %[tp2], 3 \n\t"
71 "balign %[tp2], %[tp1], 3 \n\t"
72 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
73 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
74 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
75 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
76 "extp %[Temp3], $ac2, 31 \n\t"
77
78 /* odd 1. pixel */
79 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
80 "mtlo %[vector4a], $ac3 \n\t"
81 "mthi $zero, $ac3 \n\t"
82 "preceu.ph.qbr %[p1], %[tp2] \n\t"
83 "preceu.ph.qbl %[p2], %[tp2] \n\t"
84 "preceu.ph.qbr %[p3], %[tn2] \n\t"
85 "preceu.ph.qbl %[p4], %[tn2] \n\t"
86 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
87 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
88 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
89 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
90 "extp %[Temp2], $ac3, 31 \n\t"
91
92 /* odd 2. pixel */
93 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
94 "mtlo %[vector4a], $ac2 \n\t"
95 "mthi $zero, $ac2 \n\t"
96 "preceu.ph.qbr %[p1], %[tn1] \n\t"
97 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
98 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
99 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
100 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
101 "extp %[Temp4], $ac2, 31 \n\t"
102
103 /* clamp */
104 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
105 "lbux %[p2], %[Temp4](%[cm]) \n\t"
106
107 /* store bytes */
108 "sb %[tp1], 0(%[dst_ptr]) \n\t"
109 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
110
111 "sb %[tn1], 0(%[dst_ptr]) \n\t"
112 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
113
114 "sb %[tp2], 0(%[dst_ptr]) \n\t"
115 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
116
117 "sb %[p2], 0(%[dst_ptr]) \n\t"
118 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
119
120 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
121 [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
122 [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
123 [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
124 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
125 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
126 [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
127 [dst_stride] "r"(dst_stride));
128
129 /* Next row... */
130 src += src_stride;
131 dst += 1;
132 }
133 }
convolve_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)135 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
136 int32_t src_stride, uint8_t *dst,
137 int32_t dst_stride,
138 const int16_t *filter_x0,
139 int32_t h) {
140 int32_t y;
141 uint8_t *cm = vpx_ff_cropTbl;
142 uint8_t *dst_ptr;
143 uint32_t vector4a = 64;
144 int32_t vector1b, vector2b, vector3b, vector4b;
145 int32_t Temp1, Temp2, Temp3;
146 uint32_t tp1, tp2, tp3;
147 uint32_t p1, p2, p3, p4, n1;
148 uint8_t *odd_dst;
149 uint32_t dst_pitch_2 = (dst_stride << 1);
150
151 vector1b = ((const int32_t *)filter_x0)[0];
152 vector2b = ((const int32_t *)filter_x0)[1];
153 vector3b = ((const int32_t *)filter_x0)[2];
154 vector4b = ((const int32_t *)filter_x0)[3];
155
156 for (y = h; y--;) {
157 /* prefetch data to cache memory */
158 prefetch_load(src + src_stride);
159 prefetch_load(src + src_stride + 32);
160
161 dst_ptr = dst;
162 odd_dst = (dst_ptr + dst_stride);
163
164 __asm__ __volatile__(
165 "ulw %[tp2], 0(%[src]) \n\t"
166 "ulw %[tp1], 4(%[src]) \n\t"
167
168 /* even 1. pixel */
169 "mtlo %[vector4a], $ac3 \n\t"
170 "mthi $zero, $ac3 \n\t"
171 "mtlo %[vector4a], $ac2 \n\t"
172 "mthi $zero, $ac2 \n\t"
173 "preceu.ph.qbr %[p1], %[tp2] \n\t"
174 "preceu.ph.qbl %[p2], %[tp2] \n\t"
175 "preceu.ph.qbr %[p3], %[tp1] \n\t"
176 "preceu.ph.qbl %[p4], %[tp1] \n\t"
177 "ulw %[tp3], 8(%[src]) \n\t"
178 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
179 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
180 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
181 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
182 "extp %[Temp1], $ac3, 31 \n\t"
183
184 /* even 2. pixel */
185 "preceu.ph.qbr %[p1], %[tp3] \n\t"
186 "preceu.ph.qbl %[n1], %[tp3] \n\t"
187 "ulw %[tp2], 12(%[src]) \n\t"
188 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
189 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
190 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
191 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
192 "extp %[Temp3], $ac2, 31 \n\t"
193
194 /* even 3. pixel */
195 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
196 "mtlo %[vector4a], $ac1 \n\t"
197 "mthi $zero, $ac1 \n\t"
198 "preceu.ph.qbr %[p2], %[tp2] \n\t"
199 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
200 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
201 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
202 "lbux %[tp3], %[Temp3](%[cm]) \n\t"
203 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
204 "extp %[p3], $ac1, 31 \n\t"
205
206 /* even 4. pixel */
207 "mtlo %[vector4a], $ac2 \n\t"
208 "mthi $zero, $ac2 \n\t"
209 "mtlo %[vector4a], $ac3 \n\t"
210 "mthi $zero, $ac3 \n\t"
211 "sb %[Temp2], 0(%[dst_ptr]) \n\t"
212 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
213 "sb %[tp3], 0(%[dst_ptr]) \n\t"
214 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
215
216 "ulw %[tp1], 1(%[src]) \n\t"
217 "ulw %[tp3], 5(%[src]) \n\t"
218
219 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
220 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
221 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
222 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
223 "extp %[Temp3], $ac2, 31 \n\t"
224
225 "lbux %[tp2], %[p3](%[cm]) \n\t"
226
227 /* odd 1. pixel */
228 "mtlo %[vector4a], $ac1 \n\t"
229 "mthi $zero, $ac1 \n\t"
230 "preceu.ph.qbr %[p1], %[tp1] \n\t"
231 "preceu.ph.qbl %[p2], %[tp1] \n\t"
232 "preceu.ph.qbr %[p3], %[tp3] \n\t"
233 "preceu.ph.qbl %[p4], %[tp3] \n\t"
234 "sb %[tp2], 0(%[dst_ptr]) \n\t"
235 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
236 "ulw %[tp2], 9(%[src]) \n\t"
237
238 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
239 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
240 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
241 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
242 "extp %[Temp2], $ac3, 31 \n\t"
243
244 /* odd 2. pixel */
245 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
246 "mtlo %[vector4a], $ac3 \n\t"
247 "mthi $zero, $ac3 \n\t"
248 "mtlo %[vector4a], $ac2 \n\t"
249 "mthi $zero, $ac2 \n\t"
250 "preceu.ph.qbr %[p1], %[tp2] \n\t"
251 "preceu.ph.qbl %[n1], %[tp2] \n\t"
252 "ulw %[Temp1], 13(%[src]) \n\t"
253 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
254 "sb %[tp1], 0(%[dst_ptr]) \n\t"
255 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
256 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
257 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
258 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
259 "extp %[Temp3], $ac1, 31 \n\t"
260
261 /* odd 3. pixel */
262 "lbux %[tp3], %[Temp2](%[cm]) \n\t"
263 "preceu.ph.qbr %[p2], %[Temp1] \n\t"
264 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
265 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
266 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
267 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
268 "extp %[Temp2], $ac3, 31 \n\t"
269
270 /* odd 4. pixel */
271 "sb %[tp3], 0(%[odd_dst]) \n\t"
272 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
273 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
274 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
275 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
276 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
277 "extp %[Temp1], $ac2, 31 \n\t"
278
279 /* clamp */
280 "lbux %[p4], %[Temp3](%[cm]) \n\t"
281 "lbux %[p2], %[Temp2](%[cm]) \n\t"
282 "lbux %[n1], %[Temp1](%[cm]) \n\t"
283
284 /* store bytes */
285 "sb %[p4], 0(%[odd_dst]) \n\t"
286 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
287
288 "sb %[p2], 0(%[odd_dst]) \n\t"
289 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
290
291 "sb %[n1], 0(%[odd_dst]) \n\t"
292
293 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
294 [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
295 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
296 [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
297 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
298 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
299 [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
300 [dst_pitch_2] "r"(dst_pitch_2));
301
302 /* Next row... */
303 src += src_stride;
304 dst += 1;
305 }
306 }
convolve_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)308 static void convolve_horiz_16_transposed_dspr2(
309 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
310 int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
311 int32_t c, y;
312 const uint8_t *src;
313 uint8_t *dst;
314 uint8_t *cm = vpx_ff_cropTbl;
315 uint32_t vector_64 = 64;
316 int32_t filter12, filter34, filter56, filter78;
317 int32_t Temp1, Temp2, Temp3;
318 uint32_t qload1, qload2;
319 uint32_t p1, p2, p3, p4, p5;
320 uint32_t st1, st2, st3;
321 uint32_t dst_pitch_2 = (dst_stride << 1);
322 uint8_t *odd_dst;
323
324 filter12 = ((const int32_t *)filter_x0)[0];
325 filter34 = ((const int32_t *)filter_x0)[1];
326 filter56 = ((const int32_t *)filter_x0)[2];
327 filter78 = ((const int32_t *)filter_x0)[3];
328
329 for (y = h; y--;) {
330 /* prefetch data to cache memory */
331 prefetch_load(src_ptr + src_stride);
332 prefetch_load(src_ptr + src_stride + 32);
333
334 src = src_ptr;
335 dst = dst_ptr;
336
337 odd_dst = (dst + dst_stride);
338
339 for (c = 0; c < count; c++) {
340 __asm__ __volatile__(
341 "ulw %[qload1], 0(%[src]) "
342 "\n\t"
343 "ulw %[qload2], 4(%[src]) "
344 "\n\t"
345
346 /* even 1. pixel */
347 "mtlo %[vector_64], $ac1 "
348 "\n\t" /* even 1 */
349 "mthi $zero, $ac1 "
350 "\n\t"
351 "mtlo %[vector_64], $ac2 "
352 "\n\t" /* even 2 */
353 "mthi $zero, $ac2 "
354 "\n\t"
355 "preceu.ph.qbr %[p3], %[qload2] "
356 "\n\t"
357 "preceu.ph.qbl %[p4], %[qload2] "
358 "\n\t"
359 "preceu.ph.qbr %[p1], %[qload1] "
360 "\n\t"
361 "preceu.ph.qbl %[p2], %[qload1] "
362 "\n\t"
363 "ulw %[qload2], 8(%[src]) "
364 "\n\t"
365 "dpa.w.ph $ac1, %[p1], %[filter12] "
366 "\n\t" /* even 1 */
367 "dpa.w.ph $ac1, %[p2], %[filter34] "
368 "\n\t" /* even 1 */
369 "dpa.w.ph $ac1, %[p3], %[filter56] "
370 "\n\t" /* even 1 */
371 "dpa.w.ph $ac1, %[p4], %[filter78] "
372 "\n\t" /* even 1 */
373 "extp %[Temp1], $ac1, 31 "
374 "\n\t" /* even 1 */
375
376 /* even 2. pixel */
377 "mtlo %[vector_64], $ac3 "
378 "\n\t" /* even 3 */
379 "mthi $zero, $ac3 "
380 "\n\t"
381 "preceu.ph.qbr %[p1], %[qload2] "
382 "\n\t"
383 "preceu.ph.qbl %[p5], %[qload2] "
384 "\n\t"
385 "ulw %[qload1], 12(%[src]) "
386 "\n\t"
387 "dpa.w.ph $ac2, %[p2], %[filter12] "
388 "\n\t" /* even 1 */
389 "dpa.w.ph $ac2, %[p3], %[filter34] "
390 "\n\t" /* even 1 */
391 "dpa.w.ph $ac2, %[p4], %[filter56] "
392 "\n\t" /* even 1 */
393 "dpa.w.ph $ac2, %[p1], %[filter78] "
394 "\n\t" /* even 1 */
395 "lbux %[st1], %[Temp1](%[cm]) "
396 "\n\t" /* even 1 */
397 "extp %[Temp2], $ac2, 31 "
398 "\n\t" /* even 1 */
399
400 /* even 3. pixel */
401 "mtlo %[vector_64], $ac1 "
402 "\n\t" /* even 4 */
403 "mthi $zero, $ac1 "
404 "\n\t"
405 "preceu.ph.qbr %[p2], %[qload1] "
406 "\n\t"
407 "sb %[st1], 0(%[dst]) "
408 "\n\t" /* even 1 */
409 "addu %[dst], %[dst], %[dst_pitch_2] "
410 " \n\t"
411 "dpa.w.ph $ac3, %[p3], %[filter12] "
412 "\n\t" /* even 3 */
413 "dpa.w.ph $ac3, %[p4], %[filter34] "
414 "\n\t" /* even 3 */
415 "dpa.w.ph $ac3, %[p1], %[filter56] "
416 "\n\t" /* even 3 */
417 "dpa.w.ph $ac3, %[p5], %[filter78] "
418 "\n\t" /* even 3 */
419 "extp %[Temp3], $ac3, 31 "
420 "\n\t" /* even 3 */
421 "lbux %[st2], %[Temp2](%[cm]) "
422 "\n\t" /* even 1 */
423
424 /* even 4. pixel */
425 "mtlo %[vector_64], $ac2 "
426 "\n\t" /* even 5 */
427 "mthi $zero, $ac2 "
428 "\n\t"
429 "preceu.ph.qbl %[p3], %[qload1] "
430 "\n\t"
431 "sb %[st2], 0(%[dst]) "
432 "\n\t" /* even 2 */
433 "addu %[dst], %[dst], %[dst_pitch_2] "
434 "\n\t"
435 "ulw %[qload2], 16(%[src]) "
436 "\n\t"
437 "dpa.w.ph $ac1, %[p4], %[filter12] "
438 "\n\t" /* even 4 */
439 "dpa.w.ph $ac1, %[p1], %[filter34] "
440 "\n\t" /* even 4 */
441 "dpa.w.ph $ac1, %[p5], %[filter56] "
442 "\n\t" /* even 4 */
443 "dpa.w.ph $ac1, %[p2], %[filter78] "
444 "\n\t" /* even 4 */
445 "extp %[Temp1], $ac1, 31 "
446 "\n\t" /* even 4 */
447 "lbux %[st3], %[Temp3](%[cm]) "
448 "\n\t" /* even 3 */
449
450 /* even 5. pixel */
451 "mtlo %[vector_64], $ac3 "
452 "\n\t" /* even 6 */
453 "mthi $zero, $ac3 "
454 "\n\t"
455 "preceu.ph.qbr %[p4], %[qload2] "
456 "\n\t"
457 "sb %[st3], 0(%[dst]) "
458 "\n\t" /* even 3 */
459 "addu %[dst], %[dst], %[dst_pitch_2] "
460 "\n\t"
461 "dpa.w.ph $ac2, %[p1], %[filter12] "
462 "\n\t" /* even 5 */
463 "dpa.w.ph $ac2, %[p5], %[filter34] "
464 "\n\t" /* even 5 */
465 "dpa.w.ph $ac2, %[p2], %[filter56] "
466 "\n\t" /* even 5 */
467 "dpa.w.ph $ac2, %[p3], %[filter78] "
468 "\n\t" /* even 5 */
469 "extp %[Temp2], $ac2, 31 "
470 "\n\t" /* even 5 */
471 "lbux %[st1], %[Temp1](%[cm]) "
472 "\n\t" /* even 4 */
473
474 /* even 6. pixel */
475 "mtlo %[vector_64], $ac1 "
476 "\n\t" /* even 7 */
477 "mthi $zero, $ac1 "
478 "\n\t"
479 "preceu.ph.qbl %[p1], %[qload2] "
480 "\n\t"
481 "sb %[st1], 0(%[dst]) "
482 "\n\t" /* even 4 */
483 "addu %[dst], %[dst], %[dst_pitch_2] "
484 "\n\t"
485 "ulw %[qload1], 20(%[src]) "
486 "\n\t"
487 "dpa.w.ph $ac3, %[p5], %[filter12] "
488 "\n\t" /* even 6 */
489 "dpa.w.ph $ac3, %[p2], %[filter34] "
490 "\n\t" /* even 6 */
491 "dpa.w.ph $ac3, %[p3], %[filter56] "
492 "\n\t" /* even 6 */
493 "dpa.w.ph $ac3, %[p4], %[filter78] "
494 "\n\t" /* even 6 */
495 "extp %[Temp3], $ac3, 31 "
496 "\n\t" /* even 6 */
497 "lbux %[st2], %[Temp2](%[cm]) "
498 "\n\t" /* even 5 */
499
500 /* even 7. pixel */
501 "mtlo %[vector_64], $ac2 "
502 "\n\t" /* even 8 */
503 "mthi $zero, $ac2 "
504 "\n\t"
505 "preceu.ph.qbr %[p5], %[qload1] "
506 "\n\t"
507 "sb %[st2], 0(%[dst]) "
508 "\n\t" /* even 5 */
509 "addu %[dst], %[dst], %[dst_pitch_2] "
510 "\n\t"
511 "dpa.w.ph $ac1, %[p2], %[filter12] "
512 "\n\t" /* even 7 */
513 "dpa.w.ph $ac1, %[p3], %[filter34] "
514 "\n\t" /* even 7 */
515 "dpa.w.ph $ac1, %[p4], %[filter56] "
516 "\n\t" /* even 7 */
517 "dpa.w.ph $ac1, %[p1], %[filter78] "
518 "\n\t" /* even 7 */
519 "extp %[Temp1], $ac1, 31 "
520 "\n\t" /* even 7 */
521 "lbux %[st3], %[Temp3](%[cm]) "
522 "\n\t" /* even 6 */
523
524 /* even 8. pixel */
525 "mtlo %[vector_64], $ac3 "
526 "\n\t" /* odd 1 */
527 "mthi $zero, $ac3 "
528 "\n\t"
529 "dpa.w.ph $ac2, %[p3], %[filter12] "
530 "\n\t" /* even 8 */
531 "dpa.w.ph $ac2, %[p4], %[filter34] "
532 "\n\t" /* even 8 */
533 "sb %[st3], 0(%[dst]) "
534 "\n\t" /* even 6 */
535 "addu %[dst], %[dst], %[dst_pitch_2] "
536 "\n\t"
537 "dpa.w.ph $ac2, %[p1], %[filter56] "
538 "\n\t" /* even 8 */
539 "dpa.w.ph $ac2, %[p5], %[filter78] "
540 "\n\t" /* even 8 */
541 "extp %[Temp2], $ac2, 31 "
542 "\n\t" /* even 8 */
543 "lbux %[st1], %[Temp1](%[cm]) "
544 "\n\t" /* even 7 */
545
546 /* ODD pixels */
547 "ulw %[qload1], 1(%[src]) "
548 "\n\t"
549 "ulw %[qload2], 5(%[src]) "
550 "\n\t"
551
552 /* odd 1. pixel */
553 "mtlo %[vector_64], $ac1 "
554 "\n\t" /* odd 2 */
555 "mthi $zero, $ac1 "
556 "\n\t"
557 "preceu.ph.qbr %[p1], %[qload1] "
558 "\n\t"
559 "preceu.ph.qbl %[p2], %[qload1] "
560 "\n\t"
561 "preceu.ph.qbr %[p3], %[qload2] "
562 "\n\t"
563 "preceu.ph.qbl %[p4], %[qload2] "
564 "\n\t"
565 "sb %[st1], 0(%[dst]) "
566 "\n\t" /* even 7 */
567 "addu %[dst], %[dst], %[dst_pitch_2] "
568 "\n\t"
569 "ulw %[qload2], 9(%[src]) "
570 "\n\t"
571 "dpa.w.ph $ac3, %[p1], %[filter12] "
572 "\n\t" /* odd 1 */
573 "dpa.w.ph $ac3, %[p2], %[filter34] "
574 "\n\t" /* odd 1 */
575 "dpa.w.ph $ac3, %[p3], %[filter56] "
576 "\n\t" /* odd 1 */
577 "dpa.w.ph $ac3, %[p4], %[filter78] "
578 "\n\t" /* odd 1 */
579 "extp %[Temp3], $ac3, 31 "
580 "\n\t" /* odd 1 */
581 "lbux %[st2], %[Temp2](%[cm]) "
582 "\n\t" /* even 8 */
583
584 /* odd 2. pixel */
585 "mtlo %[vector_64], $ac2 "
586 "\n\t" /* odd 3 */
587 "mthi $zero, $ac2 "
588 "\n\t"
589 "preceu.ph.qbr %[p1], %[qload2] "
590 "\n\t"
591 "preceu.ph.qbl %[p5], %[qload2] "
592 "\n\t"
593 "sb %[st2], 0(%[dst]) "
594 "\n\t" /* even 8 */
595 "ulw %[qload1], 13(%[src]) "
596 "\n\t"
597 "dpa.w.ph $ac1, %[p2], %[filter12] "
598 "\n\t" /* odd 2 */
599 "dpa.w.ph $ac1, %[p3], %[filter34] "
600 "\n\t" /* odd 2 */
601 "dpa.w.ph $ac1, %[p4], %[filter56] "
602 "\n\t" /* odd 2 */
603 "dpa.w.ph $ac1, %[p1], %[filter78] "
604 "\n\t" /* odd 2 */
605 "extp %[Temp1], $ac1, 31 "
606 "\n\t" /* odd 2 */
607 "lbux %[st3], %[Temp3](%[cm]) "
608 "\n\t" /* odd 1 */
609
610 /* odd 3. pixel */
611 "mtlo %[vector_64], $ac3 "
612 "\n\t" /* odd 4 */
613 "mthi $zero, $ac3 "
614 "\n\t"
615 "preceu.ph.qbr %[p2], %[qload1] "
616 "\n\t"
617 "sb %[st3], 0(%[odd_dst]) "
618 "\n\t" /* odd 1 */
619 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
620 "\n\t"
621 "dpa.w.ph $ac2, %[p3], %[filter12] "
622 "\n\t" /* odd 3 */
623 "dpa.w.ph $ac2, %[p4], %[filter34] "
624 "\n\t" /* odd 3 */
625 "dpa.w.ph $ac2, %[p1], %[filter56] "
626 "\n\t" /* odd 3 */
627 "dpa.w.ph $ac2, %[p5], %[filter78] "
628 "\n\t" /* odd 3 */
629 "extp %[Temp2], $ac2, 31 "
630 "\n\t" /* odd 3 */
631 "lbux %[st1], %[Temp1](%[cm]) "
632 "\n\t" /* odd 2 */
633
634 /* odd 4. pixel */
635 "mtlo %[vector_64], $ac1 "
636 "\n\t" /* odd 5 */
637 "mthi $zero, $ac1 "
638 "\n\t"
639 "preceu.ph.qbl %[p3], %[qload1] "
640 "\n\t"
641 "sb %[st1], 0(%[odd_dst]) "
642 "\n\t" /* odd 2 */
643 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
644 "\n\t"
645 "ulw %[qload2], 17(%[src]) "
646 "\n\t"
647 "dpa.w.ph $ac3, %[p4], %[filter12] "
648 "\n\t" /* odd 4 */
649 "dpa.w.ph $ac3, %[p1], %[filter34] "
650 "\n\t" /* odd 4 */
651 "dpa.w.ph $ac3, %[p5], %[filter56] "
652 "\n\t" /* odd 4 */
653 "dpa.w.ph $ac3, %[p2], %[filter78] "
654 "\n\t" /* odd 4 */
655 "extp %[Temp3], $ac3, 31 "
656 "\n\t" /* odd 4 */
657 "lbux %[st2], %[Temp2](%[cm]) "
658 "\n\t" /* odd 3 */
659
660 /* odd 5. pixel */
661 "mtlo %[vector_64], $ac2 "
662 "\n\t" /* odd 6 */
663 "mthi $zero, $ac2 "
664 "\n\t"
665 "preceu.ph.qbr %[p4], %[qload2] "
666 "\n\t"
667 "sb %[st2], 0(%[odd_dst]) "
668 "\n\t" /* odd 3 */
669 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
670 "\n\t"
671 "dpa.w.ph $ac1, %[p1], %[filter12] "
672 "\n\t" /* odd 5 */
673 "dpa.w.ph $ac1, %[p5], %[filter34] "
674 "\n\t" /* odd 5 */
675 "dpa.w.ph $ac1, %[p2], %[filter56] "
676 "\n\t" /* odd 5 */
677 "dpa.w.ph $ac1, %[p3], %[filter78] "
678 "\n\t" /* odd 5 */
679 "extp %[Temp1], $ac1, 31 "
680 "\n\t" /* odd 5 */
681 "lbux %[st3], %[Temp3](%[cm]) "
682 "\n\t" /* odd 4 */
683
684 /* odd 6. pixel */
685 "mtlo %[vector_64], $ac3 "
686 "\n\t" /* odd 7 */
687 "mthi $zero, $ac3 "
688 "\n\t"
689 "preceu.ph.qbl %[p1], %[qload2] "
690 "\n\t"
691 "sb %[st3], 0(%[odd_dst]) "
692 "\n\t" /* odd 4 */
693 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
694 "\n\t"
695 "ulw %[qload1], 21(%[src]) "
696 "\n\t"
697 "dpa.w.ph $ac2, %[p5], %[filter12] "
698 "\n\t" /* odd 6 */
699 "dpa.w.ph $ac2, %[p2], %[filter34] "
700 "\n\t" /* odd 6 */
701 "dpa.w.ph $ac2, %[p3], %[filter56] "
702 "\n\t" /* odd 6 */
703 "dpa.w.ph $ac2, %[p4], %[filter78] "
704 "\n\t" /* odd 6 */
705 "extp %[Temp2], $ac2, 31 "
706 "\n\t" /* odd 6 */
707 "lbux %[st1], %[Temp1](%[cm]) "
708 "\n\t" /* odd 5 */
709
710 /* odd 7. pixel */
711 "mtlo %[vector_64], $ac1 "
712 "\n\t" /* odd 8 */
713 "mthi $zero, $ac1 "
714 "\n\t"
715 "preceu.ph.qbr %[p5], %[qload1] "
716 "\n\t"
717 "sb %[st1], 0(%[odd_dst]) "
718 "\n\t" /* odd 5 */
719 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
720 "\n\t"
721 "dpa.w.ph $ac3, %[p2], %[filter12] "
722 "\n\t" /* odd 7 */
723 "dpa.w.ph $ac3, %[p3], %[filter34] "
724 "\n\t" /* odd 7 */
725 "dpa.w.ph $ac3, %[p4], %[filter56] "
726 "\n\t" /* odd 7 */
727 "dpa.w.ph $ac3, %[p1], %[filter78] "
728 "\n\t" /* odd 7 */
729 "extp %[Temp3], $ac3, 31 "
730 "\n\t" /* odd 7 */
731
732 /* odd 8. pixel */
733 "dpa.w.ph $ac1, %[p3], %[filter12] "
734 "\n\t" /* odd 8 */
735 "dpa.w.ph $ac1, %[p4], %[filter34] "
736 "\n\t" /* odd 8 */
737 "dpa.w.ph $ac1, %[p1], %[filter56] "
738 "\n\t" /* odd 8 */
739 "dpa.w.ph $ac1, %[p5], %[filter78] "
740 "\n\t" /* odd 8 */
741 "extp %[Temp1], $ac1, 31 "
742 "\n\t" /* odd 8 */
743
744 "lbux %[st2], %[Temp2](%[cm]) "
745 "\n\t" /* odd 6 */
746 "lbux %[st3], %[Temp3](%[cm]) "
747 "\n\t" /* odd 7 */
748 "lbux %[st1], %[Temp1](%[cm]) "
749 "\n\t" /* odd 8 */
750
751 "sb %[st2], 0(%[odd_dst]) "
752 "\n\t" /* odd 6 */
753 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
754 "\n\t"
755
756 "sb %[st3], 0(%[odd_dst]) "
757 "\n\t" /* odd 7 */
758 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
759 "\n\t"
760
761 "sb %[st1], 0(%[odd_dst]) "
762 "\n\t" /* odd 8 */
763
764 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
765 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
766 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
767 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
768 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
769 : [filter12] "r"(filter12), [filter34] "r"(filter34),
770 [filter56] "r"(filter56), [filter78] "r"(filter78),
771 [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
772 [dst_pitch_2] "r"(dst_pitch_2));
773
774 src += 16;
775 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
776 odd_dst = (dst + dst_stride);
777 }
778
779 /* Next row... */
780 src_ptr += src_stride;
781
782 dst_ptr += 1;
783 }
784 }
convolve_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)786 static void convolve_horiz_64_transposed_dspr2(
787 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
788 int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
789 int32_t c, y;
790 const uint8_t *src;
791 uint8_t *dst;
792 uint8_t *cm = vpx_ff_cropTbl;
793 uint32_t vector_64 = 64;
794 int32_t filter12, filter34, filter56, filter78;
795 int32_t Temp1, Temp2, Temp3;
796 uint32_t qload1, qload2;
797 uint32_t p1, p2, p3, p4, p5;
798 uint32_t st1, st2, st3;
799 uint32_t dst_pitch_2 = (dst_stride << 1);
800 uint8_t *odd_dst;
801
802 filter12 = ((const int32_t *)filter_x0)[0];
803 filter34 = ((const int32_t *)filter_x0)[1];
804 filter56 = ((const int32_t *)filter_x0)[2];
805 filter78 = ((const int32_t *)filter_x0)[3];
806
807 for (y = h; y--;) {
808 /* prefetch data to cache memory */
809 prefetch_load(src_ptr + src_stride);
810 prefetch_load(src_ptr + src_stride + 32);
811 prefetch_load(src_ptr + src_stride + 64);
812
813 src = src_ptr;
814 dst = dst_ptr;
815
816 odd_dst = (dst + dst_stride);
817
818 for (c = 0; c < 4; c++) {
819 __asm__ __volatile__(
820 "ulw %[qload1], 0(%[src]) "
821 "\n\t"
822 "ulw %[qload2], 4(%[src]) "
823 "\n\t"
824
825 /* even 1. pixel */
826 "mtlo %[vector_64], $ac1 "
827 "\n\t" /* even 1 */
828 "mthi $zero, $ac1 "
829 "\n\t"
830 "mtlo %[vector_64], $ac2 "
831 "\n\t" /* even 2 */
832 "mthi $zero, $ac2 "
833 "\n\t"
834 "preceu.ph.qbr %[p3], %[qload2] "
835 "\n\t"
836 "preceu.ph.qbl %[p4], %[qload2] "
837 "\n\t"
838 "preceu.ph.qbr %[p1], %[qload1] "
839 "\n\t"
840 "preceu.ph.qbl %[p2], %[qload1] "
841 "\n\t"
842 "ulw %[qload2], 8(%[src]) "
843 "\n\t"
844 "dpa.w.ph $ac1, %[p1], %[filter12] "
845 "\n\t" /* even 1 */
846 "dpa.w.ph $ac1, %[p2], %[filter34] "
847 "\n\t" /* even 1 */
848 "dpa.w.ph $ac1, %[p3], %[filter56] "
849 "\n\t" /* even 1 */
850 "dpa.w.ph $ac1, %[p4], %[filter78] "
851 "\n\t" /* even 1 */
852 "extp %[Temp1], $ac1, 31 "
853 "\n\t" /* even 1 */
854
855 /* even 2. pixel */
856 "mtlo %[vector_64], $ac3 "
857 "\n\t" /* even 3 */
858 "mthi $zero, $ac3 "
859 "\n\t"
860 "preceu.ph.qbr %[p1], %[qload2] "
861 "\n\t"
862 "preceu.ph.qbl %[p5], %[qload2] "
863 "\n\t"
864 "ulw %[qload1], 12(%[src]) "
865 "\n\t"
866 "dpa.w.ph $ac2, %[p2], %[filter12] "
867 "\n\t" /* even 1 */
868 "dpa.w.ph $ac2, %[p3], %[filter34] "
869 "\n\t" /* even 1 */
870 "dpa.w.ph $ac2, %[p4], %[filter56] "
871 "\n\t" /* even 1 */
872 "dpa.w.ph $ac2, %[p1], %[filter78] "
873 "\n\t" /* even 1 */
874 "lbux %[st1], %[Temp1](%[cm]) "
875 "\n\t" /* even 1 */
876 "extp %[Temp2], $ac2, 31 "
877 "\n\t" /* even 1 */
878
879 /* even 3. pixel */
880 "mtlo %[vector_64], $ac1 "
881 "\n\t" /* even 4 */
882 "mthi $zero, $ac1 "
883 "\n\t"
884 "preceu.ph.qbr %[p2], %[qload1] "
885 "\n\t"
886 "sb %[st1], 0(%[dst]) "
887 "\n\t" /* even 1 */
888 "addu %[dst], %[dst], %[dst_pitch_2] "
889 " \n\t"
890 "dpa.w.ph $ac3, %[p3], %[filter12] "
891 "\n\t" /* even 3 */
892 "dpa.w.ph $ac3, %[p4], %[filter34] "
893 "\n\t" /* even 3 */
894 "dpa.w.ph $ac3, %[p1], %[filter56] "
895 "\n\t" /* even 3 */
896 "dpa.w.ph $ac3, %[p5], %[filter78] "
897 "\n\t" /* even 3 */
898 "extp %[Temp3], $ac3, 31 "
899 "\n\t" /* even 3 */
900 "lbux %[st2], %[Temp2](%[cm]) "
901 "\n\t" /* even 1 */
902
903 /* even 4. pixel */
904 "mtlo %[vector_64], $ac2 "
905 "\n\t" /* even 5 */
906 "mthi $zero, $ac2 "
907 "\n\t"
908 "preceu.ph.qbl %[p3], %[qload1] "
909 "\n\t"
910 "sb %[st2], 0(%[dst]) "
911 "\n\t" /* even 2 */
912 "addu %[dst], %[dst], %[dst_pitch_2] "
913 "\n\t"
914 "ulw %[qload2], 16(%[src]) "
915 "\n\t"
916 "dpa.w.ph $ac1, %[p4], %[filter12] "
917 "\n\t" /* even 4 */
918 "dpa.w.ph $ac1, %[p1], %[filter34] "
919 "\n\t" /* even 4 */
920 "dpa.w.ph $ac1, %[p5], %[filter56] "
921 "\n\t" /* even 4 */
922 "dpa.w.ph $ac1, %[p2], %[filter78] "
923 "\n\t" /* even 4 */
924 "extp %[Temp1], $ac1, 31 "
925 "\n\t" /* even 4 */
926 "lbux %[st3], %[Temp3](%[cm]) "
927 "\n\t" /* even 3 */
928
929 /* even 5. pixel */
930 "mtlo %[vector_64], $ac3 "
931 "\n\t" /* even 6 */
932 "mthi $zero, $ac3 "
933 "\n\t"
934 "preceu.ph.qbr %[p4], %[qload2] "
935 "\n\t"
936 "sb %[st3], 0(%[dst]) "
937 "\n\t" /* even 3 */
938 "addu %[dst], %[dst], %[dst_pitch_2] "
939 "\n\t"
940 "dpa.w.ph $ac2, %[p1], %[filter12] "
941 "\n\t" /* even 5 */
942 "dpa.w.ph $ac2, %[p5], %[filter34] "
943 "\n\t" /* even 5 */
944 "dpa.w.ph $ac2, %[p2], %[filter56] "
945 "\n\t" /* even 5 */
946 "dpa.w.ph $ac2, %[p3], %[filter78] "
947 "\n\t" /* even 5 */
948 "extp %[Temp2], $ac2, 31 "
949 "\n\t" /* even 5 */
950 "lbux %[st1], %[Temp1](%[cm]) "
951 "\n\t" /* even 4 */
952
953 /* even 6. pixel */
954 "mtlo %[vector_64], $ac1 "
955 "\n\t" /* even 7 */
956 "mthi $zero, $ac1 "
957 "\n\t"
958 "preceu.ph.qbl %[p1], %[qload2] "
959 "\n\t"
960 "sb %[st1], 0(%[dst]) "
961 "\n\t" /* even 4 */
962 "addu %[dst], %[dst], %[dst_pitch_2] "
963 "\n\t"
964 "ulw %[qload1], 20(%[src]) "
965 "\n\t"
966 "dpa.w.ph $ac3, %[p5], %[filter12] "
967 "\n\t" /* even 6 */
968 "dpa.w.ph $ac3, %[p2], %[filter34] "
969 "\n\t" /* even 6 */
970 "dpa.w.ph $ac3, %[p3], %[filter56] "
971 "\n\t" /* even 6 */
972 "dpa.w.ph $ac3, %[p4], %[filter78] "
973 "\n\t" /* even 6 */
974 "extp %[Temp3], $ac3, 31 "
975 "\n\t" /* even 6 */
976 "lbux %[st2], %[Temp2](%[cm]) "
977 "\n\t" /* even 5 */
978
979 /* even 7. pixel */
980 "mtlo %[vector_64], $ac2 "
981 "\n\t" /* even 8 */
982 "mthi $zero, $ac2 "
983 "\n\t"
984 "preceu.ph.qbr %[p5], %[qload1] "
985 "\n\t"
986 "sb %[st2], 0(%[dst]) "
987 "\n\t" /* even 5 */
988 "addu %[dst], %[dst], %[dst_pitch_2] "
989 "\n\t"
990 "dpa.w.ph $ac1, %[p2], %[filter12] "
991 "\n\t" /* even 7 */
992 "dpa.w.ph $ac1, %[p3], %[filter34] "
993 "\n\t" /* even 7 */
994 "dpa.w.ph $ac1, %[p4], %[filter56] "
995 "\n\t" /* even 7 */
996 "dpa.w.ph $ac1, %[p1], %[filter78] "
997 "\n\t" /* even 7 */
998 "extp %[Temp1], $ac1, 31 "
999 "\n\t" /* even 7 */
1000 "lbux %[st3], %[Temp3](%[cm]) "
1001 "\n\t" /* even 6 */
1002
1003 /* even 8. pixel */
1004 "mtlo %[vector_64], $ac3 "
1005 "\n\t" /* odd 1 */
1006 "mthi $zero, $ac3 "
1007 "\n\t"
1008 "dpa.w.ph $ac2, %[p3], %[filter12] "
1009 "\n\t" /* even 8 */
1010 "dpa.w.ph $ac2, %[p4], %[filter34] "
1011 "\n\t" /* even 8 */
1012 "sb %[st3], 0(%[dst]) "
1013 "\n\t" /* even 6 */
1014 "addu %[dst], %[dst], %[dst_pitch_2] "
1015 "\n\t"
1016 "dpa.w.ph $ac2, %[p1], %[filter56] "
1017 "\n\t" /* even 8 */
1018 "dpa.w.ph $ac2, %[p5], %[filter78] "
1019 "\n\t" /* even 8 */
1020 "extp %[Temp2], $ac2, 31 "
1021 "\n\t" /* even 8 */
1022 "lbux %[st1], %[Temp1](%[cm]) "
1023 "\n\t" /* even 7 */
1024
1025 /* ODD pixels */
1026 "ulw %[qload1], 1(%[src]) "
1027 "\n\t"
1028 "ulw %[qload2], 5(%[src]) "
1029 "\n\t"
1030
1031 /* odd 1. pixel */
1032 "mtlo %[vector_64], $ac1 "
1033 "\n\t" /* odd 2 */
1034 "mthi $zero, $ac1 "
1035 "\n\t"
1036 "preceu.ph.qbr %[p1], %[qload1] "
1037 "\n\t"
1038 "preceu.ph.qbl %[p2], %[qload1] "
1039 "\n\t"
1040 "preceu.ph.qbr %[p3], %[qload2] "
1041 "\n\t"
1042 "preceu.ph.qbl %[p4], %[qload2] "
1043 "\n\t"
1044 "sb %[st1], 0(%[dst]) "
1045 "\n\t" /* even 7 */
1046 "addu %[dst], %[dst], %[dst_pitch_2] "
1047 "\n\t"
1048 "ulw %[qload2], 9(%[src]) "
1049 "\n\t"
1050 "dpa.w.ph $ac3, %[p1], %[filter12] "
1051 "\n\t" /* odd 1 */
1052 "dpa.w.ph $ac3, %[p2], %[filter34] "
1053 "\n\t" /* odd 1 */
1054 "dpa.w.ph $ac3, %[p3], %[filter56] "
1055 "\n\t" /* odd 1 */
1056 "dpa.w.ph $ac3, %[p4], %[filter78] "
1057 "\n\t" /* odd 1 */
1058 "extp %[Temp3], $ac3, 31 "
1059 "\n\t" /* odd 1 */
1060 "lbux %[st2], %[Temp2](%[cm]) "
1061 "\n\t" /* even 8 */
1062
1063 /* odd 2. pixel */
1064 "mtlo %[vector_64], $ac2 "
1065 "\n\t" /* odd 3 */
1066 "mthi $zero, $ac2 "
1067 "\n\t"
1068 "preceu.ph.qbr %[p1], %[qload2] "
1069 "\n\t"
1070 "preceu.ph.qbl %[p5], %[qload2] "
1071 "\n\t"
1072 "sb %[st2], 0(%[dst]) "
1073 "\n\t" /* even 8 */
1074 "ulw %[qload1], 13(%[src]) "
1075 "\n\t"
1076 "dpa.w.ph $ac1, %[p2], %[filter12] "
1077 "\n\t" /* odd 2 */
1078 "dpa.w.ph $ac1, %[p3], %[filter34] "
1079 "\n\t" /* odd 2 */
1080 "dpa.w.ph $ac1, %[p4], %[filter56] "
1081 "\n\t" /* odd 2 */
1082 "dpa.w.ph $ac1, %[p1], %[filter78] "
1083 "\n\t" /* odd 2 */
1084 "extp %[Temp1], $ac1, 31 "
1085 "\n\t" /* odd 2 */
1086 "lbux %[st3], %[Temp3](%[cm]) "
1087 "\n\t" /* odd 1 */
1088
1089 /* odd 3. pixel */
1090 "mtlo %[vector_64], $ac3 "
1091 "\n\t" /* odd 4 */
1092 "mthi $zero, $ac3 "
1093 "\n\t"
1094 "preceu.ph.qbr %[p2], %[qload1] "
1095 "\n\t"
1096 "sb %[st3], 0(%[odd_dst]) "
1097 "\n\t" /* odd 1 */
1098 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1099 "\n\t"
1100 "dpa.w.ph $ac2, %[p3], %[filter12] "
1101 "\n\t" /* odd 3 */
1102 "dpa.w.ph $ac2, %[p4], %[filter34] "
1103 "\n\t" /* odd 3 */
1104 "dpa.w.ph $ac2, %[p1], %[filter56] "
1105 "\n\t" /* odd 3 */
1106 "dpa.w.ph $ac2, %[p5], %[filter78] "
1107 "\n\t" /* odd 3 */
1108 "extp %[Temp2], $ac2, 31 "
1109 "\n\t" /* odd 3 */
1110 "lbux %[st1], %[Temp1](%[cm]) "
1111 "\n\t" /* odd 2 */
1112
1113 /* odd 4. pixel */
1114 "mtlo %[vector_64], $ac1 "
1115 "\n\t" /* odd 5 */
1116 "mthi $zero, $ac1 "
1117 "\n\t"
1118 "preceu.ph.qbl %[p3], %[qload1] "
1119 "\n\t"
1120 "sb %[st1], 0(%[odd_dst]) "
1121 "\n\t" /* odd 2 */
1122 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1123 "\n\t"
1124 "ulw %[qload2], 17(%[src]) "
1125 "\n\t"
1126 "dpa.w.ph $ac3, %[p4], %[filter12] "
1127 "\n\t" /* odd 4 */
1128 "dpa.w.ph $ac3, %[p1], %[filter34] "
1129 "\n\t" /* odd 4 */
1130 "dpa.w.ph $ac3, %[p5], %[filter56] "
1131 "\n\t" /* odd 4 */
1132 "dpa.w.ph $ac3, %[p2], %[filter78] "
1133 "\n\t" /* odd 4 */
1134 "extp %[Temp3], $ac3, 31 "
1135 "\n\t" /* odd 4 */
1136 "lbux %[st2], %[Temp2](%[cm]) "
1137 "\n\t" /* odd 3 */
1138
1139 /* odd 5. pixel */
1140 "mtlo %[vector_64], $ac2 "
1141 "\n\t" /* odd 6 */
1142 "mthi $zero, $ac2 "
1143 "\n\t"
1144 "preceu.ph.qbr %[p4], %[qload2] "
1145 "\n\t"
1146 "sb %[st2], 0(%[odd_dst]) "
1147 "\n\t" /* odd 3 */
1148 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1149 "\n\t"
1150 "dpa.w.ph $ac1, %[p1], %[filter12] "
1151 "\n\t" /* odd 5 */
1152 "dpa.w.ph $ac1, %[p5], %[filter34] "
1153 "\n\t" /* odd 5 */
1154 "dpa.w.ph $ac1, %[p2], %[filter56] "
1155 "\n\t" /* odd 5 */
1156 "dpa.w.ph $ac1, %[p3], %[filter78] "
1157 "\n\t" /* odd 5 */
1158 "extp %[Temp1], $ac1, 31 "
1159 "\n\t" /* odd 5 */
1160 "lbux %[st3], %[Temp3](%[cm]) "
1161 "\n\t" /* odd 4 */
1162
1163 /* odd 6. pixel */
1164 "mtlo %[vector_64], $ac3 "
1165 "\n\t" /* odd 7 */
1166 "mthi $zero, $ac3 "
1167 "\n\t"
1168 "preceu.ph.qbl %[p1], %[qload2] "
1169 "\n\t"
1170 "sb %[st3], 0(%[odd_dst]) "
1171 "\n\t" /* odd 4 */
1172 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1173 "\n\t"
1174 "ulw %[qload1], 21(%[src]) "
1175 "\n\t"
1176 "dpa.w.ph $ac2, %[p5], %[filter12] "
1177 "\n\t" /* odd 6 */
1178 "dpa.w.ph $ac2, %[p2], %[filter34] "
1179 "\n\t" /* odd 6 */
1180 "dpa.w.ph $ac2, %[p3], %[filter56] "
1181 "\n\t" /* odd 6 */
1182 "dpa.w.ph $ac2, %[p4], %[filter78] "
1183 "\n\t" /* odd 6 */
1184 "extp %[Temp2], $ac2, 31 "
1185 "\n\t" /* odd 6 */
1186 "lbux %[st1], %[Temp1](%[cm]) "
1187 "\n\t" /* odd 5 */
1188
1189 /* odd 7. pixel */
1190 "mtlo %[vector_64], $ac1 "
1191 "\n\t" /* odd 8 */
1192 "mthi $zero, $ac1 "
1193 "\n\t"
1194 "preceu.ph.qbr %[p5], %[qload1] "
1195 "\n\t"
1196 "sb %[st1], 0(%[odd_dst]) "
1197 "\n\t" /* odd 5 */
1198 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1199 "\n\t"
1200 "dpa.w.ph $ac3, %[p2], %[filter12] "
1201 "\n\t" /* odd 7 */
1202 "dpa.w.ph $ac3, %[p3], %[filter34] "
1203 "\n\t" /* odd 7 */
1204 "dpa.w.ph $ac3, %[p4], %[filter56] "
1205 "\n\t" /* odd 7 */
1206 "dpa.w.ph $ac3, %[p1], %[filter78] "
1207 "\n\t" /* odd 7 */
1208 "extp %[Temp3], $ac3, 31 "
1209 "\n\t" /* odd 7 */
1210
1211 /* odd 8. pixel */
1212 "dpa.w.ph $ac1, %[p3], %[filter12] "
1213 "\n\t" /* odd 8 */
1214 "dpa.w.ph $ac1, %[p4], %[filter34] "
1215 "\n\t" /* odd 8 */
1216 "dpa.w.ph $ac1, %[p1], %[filter56] "
1217 "\n\t" /* odd 8 */
1218 "dpa.w.ph $ac1, %[p5], %[filter78] "
1219 "\n\t" /* odd 8 */
1220 "extp %[Temp1], $ac1, 31 "
1221 "\n\t" /* odd 8 */
1222
1223 "lbux %[st2], %[Temp2](%[cm]) "
1224 "\n\t" /* odd 6 */
1225 "lbux %[st3], %[Temp3](%[cm]) "
1226 "\n\t" /* odd 7 */
1227 "lbux %[st1], %[Temp1](%[cm]) "
1228 "\n\t" /* odd 8 */
1229
1230 "sb %[st2], 0(%[odd_dst]) "
1231 "\n\t" /* odd 6 */
1232 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1233 "\n\t"
1234
1235 "sb %[st3], 0(%[odd_dst]) "
1236 "\n\t" /* odd 7 */
1237 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
1238 "\n\t"
1239
1240 "sb %[st1], 0(%[odd_dst]) "
1241 "\n\t" /* odd 8 */
1242
1243 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
1244 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
1245 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
1246 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1247 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
1248 : [filter12] "r"(filter12), [filter34] "r"(filter34),
1249 [filter56] "r"(filter56), [filter78] "r"(filter78),
1250 [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
1251 [dst_pitch_2] "r"(dst_pitch_2));
1252
1253 src += 16;
1254 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
1255 odd_dst = (dst + dst_stride);
1256 }
1257
1258 /* Next row... */
1259 src_ptr += src_stride;
1260
1261 dst_ptr += 1;
1262 }
1263 }
1264
convolve_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)1265 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
1266 uint8_t *dst, ptrdiff_t dst_stride,
1267 const int16_t *filter, int w, int h) {
1268 int x, y, k;
1269
1270 for (y = 0; y < h; ++y) {
1271 for (x = 0; x < w; ++x) {
1272 int sum = 0;
1273
1274 for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
1275
1276 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
1277 }
1278
1279 src += src_stride;
1280 dst += 1;
1281 }
1282 }
1283
/* Copy a w x h block while transposing it: input row `row` becomes
 * output column `row` of dst. No filtering is applied.
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int row, col;

  for (row = 0; row < h; ++row) {
    uint8_t *d = dst;

    for (col = 0; col < w; ++col) {
      *d = src[col];
      d += dst_stride;
    }

    src += src_stride;
    dst += 1;
  }
}
1297
/* 8-tap separable convolution, DSPR2 version. Two passes through an
 * intermediate buffer: a horizontal pass whose output is written
 * transposed into `temp`, then a second "horizontal" pass over `temp`
 * (which, because of the transpose, filters vertically) writing the
 * final, correctly oriented result to dst.
 */
void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Transposed intermediate: up to 64 columns by 135 rows
     (64 output rows plus context for the 8-tap second pass). */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  /* h output rows plus 7 extra rows of filter context. */
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

  /* Only unit steps (no scaling) are supported here. */
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* NOTE(review): rejects one specific packed-int32 pattern of taps
     2..3 -- presumably a filter shape this kernel cannot handle;
     confirm against the filter tables used by callers. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);
  (void)x_step_q4;

  /* bit position for extract from acc (extp) in the DSP control reg */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (intermediate_height < h) intermediate_height = h;

  /* First pass: horizontal filter, written transposed into temp. */
  if (filter_x[3] == 0x80) {
    /* Center tap is 128, i.e. a unit filter: plain transposed copy. */
    copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
                          intermediate_height, w, intermediate_height);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* First four taps are all zero: use the cheaper 2-tap path. */
    vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
                        intermediate_height, filter_x, w, intermediate_height);
  } else {
    /* Back up 3 rows and 3 columns for the 8-tap filter context. */
    src -= (src_stride * 3 + 3);

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);

    switch (w) {
      case 4:
        convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
                                          intermediate_height, filter_x,
                                          intermediate_height);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
                                          intermediate_height, filter_x,
                                          intermediate_height);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
                                           intermediate_height, filter_x,
                                           intermediate_height, (w / 16));
        break;
      case 64:
        prefetch_load(src + 32);
        convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
                                           intermediate_height, filter_x,
                                           intermediate_height);
        break;
      default:
        /* Widths with no dedicated kernel: C reference fallback. */
        convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
                                  filter_x, w, intermediate_height);
        break;
    }
  }

  /* Second pass over the transposed temp (w and h swap roles);
     temp + 3 skips the 3 leading context rows. */
  if (filter_y[3] == 0x80) {
    copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
                        filter_y, h, w);
  } else {
    switch (h) {
      case 4:
        convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
                                          dst_stride, filter_y, w);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
                                          dst_stride, filter_y, w);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
                                           dst_stride, filter_y, w, (h / 16));
        break;
      case 64:
        convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
                                           dst_stride, filter_y, w);
        break;
      default:
        convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
                                  filter_y, h, w);
        break;
    }
  }
}
1395
/* Straight block copy of a w x h region; all four filter arguments are
 * ignored. Rows are copied with unaligned word loads (ulw) and word
 * stores (sw) in groups sized to the block width; widths other than
 * 4/8/16/32/64 fall back to a byte-at-a-time C loop.
 * NOTE(review): the `sw` stores presume 4-byte-aligned dst for the
 * fast widths -- confirm against callers' dst alignment guarantees.
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage per row */
      for (y = h; y--;) {
        /* Prefetch the next row while copying this one. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage per row */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage per row */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage per row */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* Wider rows: prefetch the second cache line ahead of the loop. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage per row, in two 8-word asm groups */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    default:
      /* Arbitrary width: portable byte copy. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1599 #endif
1600