1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_convolve.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
/* Bilinear (2-tap) horizontal convolution of a 4-pixel-wide block using
 * MIPS DSPr2 inline assembly.
 *
 * Processes one row of 4 output pixels per loop iteration.  The two
 * non-zero bilinear taps (filter_x0[3] and filter_x0[4]) are packed into
 * one 32-bit word so a single dpa.w.ph produces one output pixel; each
 * accumulator is pre-seeded with 64 (the rounding constant,
 * 1 << (FILTER_BITS - 1)) and the result is clamped to [0, 255] through
 * the vpx_ff_cropTbl lookup table via lbux.
 *
 * NOTE(review): depends on the caller (vpx_convolve2_horiz_dspr2) having
 * already executed wrdsp with pos = 38, so that each extp here extracts
 * the accumulator shifted down by FILTER_BITS — confirm before reusing
 * this helper from another entry point.
 *
 * src/dst          : input/output pixel rows
 * src_stride, dst_stride : strides in bytes
 * filter_x0        : 8-tap kernel of which only taps [3] and [4] are used
 * h                : number of rows to filter
 */
static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-byte lookup table */
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64; /* rounding value seeded into each accumulator */
  uint32_t tp1, tp2;
  uint32_t p1, p2;
  const int16_t *filter = &filter_x0[3]; /* the two bilinear taps */
  uint32_t filter45;

  /* Pack taps 3 and 4 into one word for dpa.w.ph.
   * NOTE(review): type-puns int16_t[2] as int32_t (a strict-aliasing
   * violation in ISO C); left unchanged because the whole DSPr2 port
   * uses this idiom and the supported toolchains accept it. */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        /* unaligned load of 8 source bytes */
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p1],          %[Temp2](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"

        /* store bytes */
        "sb               %[tp1],         0(%[dst])                      \n\t"
        "sb               %[p1],          1(%[dst])                      \n\t"
        "sb               %[tp2],         2(%[dst])                      \n\t"
        "sb               %[p2],          3(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
97
/* Bilinear (2-tap) horizontal convolution of an 8-pixel-wide block using
 * MIPS DSPr2 inline assembly.
 *
 * Same scheme as the 4-wide variant, but each row iteration produces 8
 * output pixels and interleaves the three DSP accumulators ($ac1..$ac3)
 * to hide latency.  Accumulators are seeded with 64 (rounding), results
 * are shifted out with extp and clamped via the vpx_ff_cropTbl table.
 *
 * NOTE(review): like its siblings, this relies on the caller having set
 * the DSPControl extract position (wrdsp pos = 38) beforehand.
 */
static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector4a = 64;       /* rounding value for the accumulators */
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3]; /* the two bilinear taps */
  uint32_t filter45;

  /* Pack taps 3 and 4 into one word (see NOTE in the 4-wide variant
   * about the int16_t -> int32_t type pun). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tp3],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac1,           31             \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "sb               %[st0],         0(%[dst])                      \n\t"
        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"

        /* shift source words by one byte to form the odd-pixel inputs */
        "balign           %[tp3],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "sb               %[st1],         2(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
        "sb               %[st0],         4(%[dst])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 4. pixel */
        "sb               %[st1],         1(%[dst])                      \n\t"
        "sb               %[st0],         6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"

        /* store bytes */
        "sb               %[p4],          3(%[dst])                      \n\t"
        "sb               %[p2],          5(%[dst])                      \n\t"
        "sb               %[p1],          7(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
217
/* Bilinear (2-tap) horizontal convolution of a (16 * count)-pixel-wide
 * block using MIPS DSPr2 inline assembly.
 *
 * Each inner-loop iteration filters 16 output pixels: the even output
 * positions are computed from aligned-ish loads at src + 0/4/8/12 and
 * the odd positions from byte-shifted loads at src + 1/5/9/13, with the
 * three DSP accumulators rotated software-pipeline style so a multiply,
 * an extract, a clamp lookup, and a store from different pixels overlap.
 * Accumulators are seeded with 64 (rounding); results go through
 * vpx_ff_cropTbl for clamping.
 *
 * count: number of 16-pixel groups per row (1 for w == 16, 2 for w == 32).
 *
 * NOTE(review): relies on the caller having set the extp extract
 * position via wrdsp (pos = 38) before calling.
 */
static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride, uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h,
                                       int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector_64 = 64;      /* rounding value for the accumulators */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3]; /* the two bilinear taps */
  uint32_t filter45;

  /* Pack taps 3 and 4 into one word (see NOTE in the 4-wide variant
   * about the int16_t -> int32_t type pun). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])                    \n\t"
          "ulw              %[qload2],    4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
          "mthi             $zero,        $ac1                         \n\t"
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "ulw              %[qload3],    8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "ulw              %[qload1],    12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
          "mthi             $zero,        $ac3                         \n\t"
          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
          "mthi             $zero,        $ac1                         \n\t"
          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
          "mthi             $zero,        $ac2                         \n\t"
          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                         \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels: reload shifted by one byte */
          "ulw              %[qload1],    1(%[src])                    \n\t"
          "ulw              %[qload2],    5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],    9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                         \n\t"
          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                         \n\t"
          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                         \n\t"
          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
422
/* Bilinear (2-tap) horizontal convolution of a 64-pixel-wide block using
 * MIPS DSPr2 inline assembly.
 *
 * Identical pipeline to convolve_bi_horiz_16_dspr2 but with the group
 * count fixed at 4 (4 * 16 = 64 pixels per row) and extra prefetching
 * sized for the wider rows.
 *
 * NOTE(review): relies on the caller having set the extp extract
 * position via wrdsp (pos = 38) before calling.
 */
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride, uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector_64 = 64;      /* rounding value for the accumulators */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3]; /* the two bilinear taps */
  uint32_t filter45;

  /* Pack taps 3 and 4 into one word (see NOTE in the 4-wide variant
   * about the int16_t -> int32_t type pun). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])                    \n\t"
          "ulw              %[qload2],    4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
          "mthi             $zero,        $ac1                         \n\t"
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "ulw              %[qload3],    8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "ulw              %[qload1],    12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
          "mthi             $zero,        $ac3                         \n\t"
          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
          "mthi             $zero,        $ac1                         \n\t"
          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
          "mthi             $zero,        $ac2                         \n\t"
          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                         \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels: reload shifted by one byte */
          "ulw              %[qload1],    1(%[src])                    \n\t"
          "ulw              %[qload2],    5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],    9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                         \n\t"
          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                         \n\t"
          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                         \n\t"
          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
628
/* Public entry point: 2-tap horizontal convolution, DSPr2-accelerated.
 *
 * Selects the kernel for the x-phase given by x0_q4, programs the DSP
 * control register so extp extracts at bit position 38 in the helper
 * routines, warms the caches, and dispatches on the block width.  Widths
 * other than 4/8/16/32/64 fall back to the portable C implementation.
 */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int32_t x_step_q4, int y0_q4, int y_step_q4,
                               int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[extract_pos],     1           \n\t"
                       :
                       : [extract_pos] "r"(extract_pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  if (w == 4) {
    convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
                              (int32_t)dst_stride, filter_x, (int32_t)h);
  } else if (w == 8) {
    convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
                              (int32_t)dst_stride, filter_x, (int32_t)h);
  } else if (w == 16) {
    convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h, 1);
  } else if (w == 32) {
    /* 32-wide rows reuse the 16-wide kernel with two groups per row. */
    convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h, 2);
  } else if (w == 64) {
    /* wider rows need extra prefetching before the kernel starts */
    prefetch_load(src + 64);
    prefetch_store(dst + 32);

    convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
  } else {
    vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                          x_step_q4, y0_q4, y_step_q4, w, h);
  }
}
681 #endif
682