/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
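/* The helpers below implement the 2-tap (bilinear) horizontal convolution
 * with a rounding average against the destination, specialized for block
 * widths 4/8/16/32/64.  As a scalar sketch (assuming FILTER_BITS == 7 and
 * clip_pixel() clamping to [0, 255]), each output pixel is:
 *
 *   tmp    = clip_pixel((src[x] * filter_x0[3] +
 *                        src[x + 1] * filter_x0[4] + 64) >> 7);
 *   dst[x] = (dst[x] + tmp + 1) >> 1;   (rounding average, addqh_r.w)
 *
 * The rounding constant 64 is preloaded into each accumulator with mtlo,
 * the ">> 7" is performed by extp (see the wrdsp setup in
 * vpx_convolve2_avg_horiz_dspr2), and the clamp is a vpx_ff_cropTbl lookup.
 */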
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3;
  uint32_t tn1, tn2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

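  /* The bilinear filter's only nonzero taps are filter_x0[3] and
     filter_x0[4]; one 32-bit load packs both 16-bit coefficients for the
     dpa.w.ph dual multiply-accumulate. */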
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
        "lbux             %[p3],          %[Temp4](%[cm])                \n\t" /* odd 2 */
        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */

        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */

        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t" /* average odd 2 */
        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3, tp4;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tp3],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"
        "lbu              %[Temp2],       0(%[dst])                      \n\t"
        "lbu              %[tp4],         2(%[dst])                      \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac1,           31             \n\t"

        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
        "sb               %[Temp2],       0(%[dst])                      \n\t"
        "sb               %[tp4],         2(%[dst])                      \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"

        "balign           %[tp3],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"

        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "lbu              %[Temp2],       4(%[dst])                      \n\t"
        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "sb               %[Temp2],       4(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tp1],         6(%[dst])                      \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        "lbu              %[tp2],         1(%[dst])                      \n\t"
        "lbu              %[tp3],         3(%[dst])                      \n\t"
        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tp4],         5(%[dst])                      \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],         1(%[dst])                      \n\t"
        "sb               %[tp1],         6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        "lbu              %[tp1],         7(%[dst])                      \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"

        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"

        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"

        /* store bytes */
        "sb               %[tp3],         3(%[dst])                      \n\t"
        "sb               %[tp4],         5(%[dst])                      \n\t"
        "sb               %[tp1],         7(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride, uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0, int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

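    /* count is 1 for w == 16 and 2 for w == 32 (see the dispatch in
       vpx_convolve2_avg_horiz_dspr2); each pass below filters and averages
       16 output pixels. */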
    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],      0(%[src])                    \n\t"
          "ulw              %[qload2],      4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 1 */
          "mthi             $zero,          $ac1                         \n\t"
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 2 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "ulw              %[qload3],      8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 1 */
          "lbu              %[st2],         0(%[dst])                    \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 3 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "ulw              %[qload1],      12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p2],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 1 */

          "lbu              %[qload3],      2(%[dst])                    \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 4 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[st2],         %[st2],         %[st1]       \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st2],         0(%[dst])                    \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,           %[p3],          %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 5 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st2]       \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[qload3],      2(%[dst])                    \n\t" /* store even 2 to dst */
          "lbu              %[qload3],      4(%[dst])                    \n\t" /* load even 3 from dst */
          "lbu              %[qload1],      6(%[dst])                    \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,           %[p4],          %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 6 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st3]       \n\t" /* average even 3 */
          "sb               %[qload3],      4(%[dst])                    \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,           %[p1],          %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 7 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[qload1],      %[qload1],      %[st1]       \n\t" /* average even 4 */
          "sb               %[qload1],      6(%[dst])                    \n\t" /* store even 4 to dst */
          "dpa.w.ph         $ac3,           %[p5],          %[filter45]  \n\t" /* even 6 */
          "lbu              %[qload2],      8(%[dst])                    \n\t" /* load even 5 from dst */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 8 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload2],      %[qload2],      %[st2]       \n\t" /* average even 5 */
          "sb               %[qload2],      8(%[dst])                    \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,           %[p2],          %[filter45]  \n\t" /* even 7 */
          "lbu              %[qload3],      10(%[dst])                   \n\t" /* load even 6 from dst */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 6 */

          "lbu              %[st2],         12(%[dst])                   \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st3]       \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter45]  \n\t" /* even 8 */
          "sb               %[qload3],      10(%[dst])                   \n\t" /* store even 6 to dst */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                    \n\t"
          "ulw              %[qload2],      5(%[src])                    \n\t"

          "addqh_r.w        %[st2],         %[st2],         %[st1]       \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "sb               %[st2],         12(%[dst])                   \n\t" /* store even 7 to dst */
          "ulw              %[qload3],      9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,           %[p1],          %[filter45]  \n\t" /* odd 1 */
          "lbu              %[qload2],      14(%[dst])                   \n\t" /* load even 8 from dst */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 8 */

          "lbu              %[st1],         1(%[dst])                    \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload2],      %[qload2],      %[st2]       \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "sb               %[qload2],      14(%[dst])                   \n\t" /* store even 8 to dst */
          "ulw              %[qload1],      13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter45]  \n\t" /* odd 2 */
          "lbu              %[qload3],      3(%[dst])                    \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[st3],         %[st3],         %[st1]       \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter45]  \n\t" /* odd 3 */
          "sb               %[st3],         1(%[dst])                    \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st1]       \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[qload3],      3(%[dst])                    \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],      5(%[dst])                    \n\t" /* load odd 3 from dst */
          "dpa.w.ph         $ac3,           %[p4],          %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 3 */

          "lbu              %[st1],         7(%[dst])                    \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload1],      %[qload1],      %[st2]       \n\t" /* average odd 3 */
          "sb               %[qload1],      5(%[dst])                    \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,           %[p1],          %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 4 */

          "lbu              %[qload1],      9(%[dst])                    \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[st1],         %[st1],         %[st3]       \n\t" /* average odd 4 */
          "sb               %[st1],         7(%[dst])                    \n\t" /* store odd 4 to dst */
          "dpa.w.ph         $ac2,           %[p5],          %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[qload1],      %[qload1],      %[st1]       \n\t" /* average odd 5 */
          "sb               %[qload1],      9(%[dst])                    \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],      11(%[dst])                   \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,           %[p2],          %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 7 */

          "lbu              %[qload3],      13(%[dst])                   \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],          %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 8 */

          "lbu              %[qload1],      15(%[dst])                   \n\t" /* load odd 8 from dst */

          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],      %[qload2],      %[st2]       \n\t" /* average odd 6 */

          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],      %[qload3],      %[st3]       \n\t" /* average odd 7 */

          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],      %[qload1],      %[st1]       \n\t" /* average odd 8 */

          "sb               %[qload2],      11(%[dst])                   \n\t" /* store odd 6 to dst */
          "sb               %[qload3],      13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],      15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride, uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

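    /* Four 16-pixel passes cover the 64-pixel row. */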
    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],      0(%[src])                    \n\t"
          "ulw              %[qload2],      4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 1 */
          "mthi             $zero,          $ac1                         \n\t"
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 2 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "ulw              %[qload3],      8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 1 */
          "lbu              %[st2],         0(%[dst])                    \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 3 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "ulw              %[qload1],      12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p2],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 1 */

          "lbu              %[qload3],      2(%[dst])                    \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 4 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[st2],         %[st2],         %[st1]       \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st2],         0(%[dst])                    \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,           %[p3],          %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 5 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st2]       \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[qload3],      2(%[dst])                    \n\t" /* store even 2 to dst */
          "lbu              %[qload3],      4(%[dst])                    \n\t" /* load even 3 from dst */
          "lbu              %[qload1],      6(%[dst])                    \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,           %[p4],          %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 6 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st3]       \n\t" /* average even 3 */
          "sb               %[qload3],      4(%[dst])                    \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,           %[p1],          %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 7 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[qload1],      %[qload1],      %[st1]       \n\t" /* average even 4 */
          "sb               %[qload1],      6(%[dst])                    \n\t" /* store even 4 to dst */
          "dpa.w.ph         $ac3,           %[p5],          %[filter45]  \n\t" /* even 6 */
          "lbu              %[qload2],      8(%[dst])                    \n\t" /* load even 5 from dst */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 8 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload2],      %[qload2],      %[st2]       \n\t" /* average even 5 */
          "sb               %[qload2],      8(%[dst])                    \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,           %[p2],          %[filter45]  \n\t" /* even 7 */
          "lbu              %[qload3],      10(%[dst])                   \n\t" /* load even 6 from dst */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 6 */

          "lbu              %[st2],         12(%[dst])                   \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st3]       \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter45]  \n\t" /* even 8 */
          "sb               %[qload3],      10(%[dst])                   \n\t" /* store even 6 to dst */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                    \n\t"
          "ulw              %[qload2],      5(%[src])                    \n\t"

          "addqh_r.w        %[st2],         %[st2],         %[st1]       \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "sb               %[st2],         12(%[dst])                   \n\t" /* store even 7 to dst */
          "ulw              %[qload3],      9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,           %[p1],          %[filter45]  \n\t" /* odd 1 */
          "lbu              %[qload2],      14(%[dst])                   \n\t" /* load even 8 from dst */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 8 */

          "lbu              %[st1],         1(%[dst])                    \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload2],      %[qload2],      %[st2]       \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "sb               %[qload2],      14(%[dst])                   \n\t" /* store even 8 to dst */
          "ulw              %[qload1],      13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter45]  \n\t" /* odd 2 */
          "lbu              %[qload3],      3(%[dst])                    \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[st3],         %[st3],         %[st1]       \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter45]  \n\t" /* odd 3 */
          "sb               %[st3],         1(%[dst])                    \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[qload3],      %[qload3],      %[st1]       \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[qload3],      3(%[dst])                    \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],      5(%[dst])                    \n\t" /* load odd 3 from dst */
          "dpa.w.ph         $ac3,           %[p4],          %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 3 */

          "lbu              %[st1],         7(%[dst])                    \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                         \n\t"
          "addqh_r.w        %[qload1],      %[qload1],      %[st2]       \n\t" /* average odd 3 */
          "sb               %[qload1],      5(%[dst])                    \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,           %[p1],          %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 4 */

          "lbu              %[qload1],      9(%[dst])                    \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                         \n\t"
          "addqh_r.w        %[st1],         %[st1],         %[st3]       \n\t" /* average odd 4 */
          "sb               %[st1],         7(%[dst])                    \n\t" /* store odd 4 to dst */
          "dpa.w.ph         $ac2,           %[p5],          %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                         \n\t"
          "addqh_r.w        %[qload1],      %[qload1],      %[st1]       \n\t" /* average odd 5 */
          "sb               %[qload1],      9(%[dst])                    \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],      11(%[dst])                   \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,           %[p2],          %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 7 */

          "lbu              %[qload3],      13(%[dst])                   \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],          %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 8 */

          "lbu              %[qload1],      15(%[dst])                   \n\t" /* load odd 8 from dst */

          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],      %[qload2],      %[st2]       \n\t" /* average odd 6 */

          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],      %[qload3],      %[st3]       \n\t" /* average odd 7 */

          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],      %[qload1],      %[st1]       \n\t" /* average odd 8 */

          "sb               %[qload2],      11(%[dst])                   \n\t" /* store odd 6 to dst */
          "sb               %[qload3],      13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],      15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));
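  /* With pos == 38, each extp in the helpers above (size argument 31)
     extracts accumulator bits 38..7, i.e. the filtered sum shifted right by
     FILTER_BITS (7). */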

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 8:
      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 16:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, 1);
      break;
    case 32:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h);
      break;
    default:
      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                x_step_q4, filter_y, y_step_q4, w, h);
      break;
  }
}
#endif