/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
22                                    uint8_t *dst, int32_t dst_stride,
23                                    const int16_t *filter_x0, int32_t h) {
24   int32_t y;
25   uint8_t *cm = vpx_ff_cropTbl;
26   int32_t vector1b, vector2b, vector3b, vector4b;
27   int32_t Temp1, Temp2, Temp3, Temp4;
28   uint32_t vector4a = 64;
29   uint32_t tp1, tp2;
30   uint32_t p1, p2, p3, p4;
31   uint32_t n1, n2, n3, n4;
32   uint32_t tn1, tn2;
33 
34   vector1b = ((const int32_t *)filter_x0)[0];
35   vector2b = ((const int32_t *)filter_x0)[1];
36   vector3b = ((const int32_t *)filter_x0)[2];
37   vector4b = ((const int32_t *)filter_x0)[3];
38 
39   for (y = h; y--;) {
40     /* prefetch data to cache memory */
41     prefetch_load(src + src_stride);
42     prefetch_load(src + src_stride + 32);
43     prefetch_store(dst + dst_stride);
44 
45     __asm__ __volatile__(
46         "ulw              %[tp1],      0(%[src])                      \n\t"
47         "ulw              %[tp2],      4(%[src])                      \n\t"
48 
49         /* even 1. pixel */
50         "mtlo             %[vector4a], $ac3                           \n\t"
51         "mthi             $zero,       $ac3                           \n\t"
52         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
53         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
54         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
55         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
56         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
57         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
58         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
59         "ulw              %[tn2],      8(%[src])                      \n\t"
60         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
61         "extp             %[Temp1],    $ac3,           31             \n\t"
62 
63         /* even 2. pixel */
64         "mtlo             %[vector4a], $ac2                           \n\t"
65         "mthi             $zero,       $ac2                           \n\t"
66         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
67         "balign           %[tn1],      %[tn2],         3              \n\t"
68         "balign           %[tn2],      %[tp2],         3              \n\t"
69         "balign           %[tp2],      %[tp1],         3              \n\t"
70         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
71         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
72         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
73         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
74         "extp             %[Temp3],    $ac2,           31             \n\t"
75 
76         /* odd 1. pixel */
77         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
78         "mtlo             %[vector4a], $ac3                           \n\t"
79         "mthi             $zero,       $ac3                           \n\t"
80         "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
81         "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
82         "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
83         "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
84         "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
85         "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
86         "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
87         "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
88         "extp             %[Temp2],    $ac3,           31             \n\t"
89 
90         /* odd 2. pixel */
91         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
92         "mtlo             %[vector4a], $ac2                           \n\t"
93         "mthi             $zero,       $ac2                           \n\t"
94         "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
95         "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
96         "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
97         "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
98         "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
99         "extp             %[Temp4],    $ac2,           31             \n\t"
100 
101         /* clamp */
102         "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
103         "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
104 
105         /* store bytes */
106         "sb               %[tp1],      0(%[dst])                      \n\t"
107         "sb               %[tn1],      1(%[dst])                      \n\t"
108         "sb               %[tp2],      2(%[dst])                      \n\t"
109         "sb               %[n2],       3(%[dst])                      \n\t"
110 
111         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
112           [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
113           [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
114           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
115           [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
116         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
117           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
118           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
119           [src] "r"(src));
120 
121     /* Next row... */
122     src += src_stride;
123     dst += dst_stride;
124   }
125 }
126 
convolve_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)127 static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
128                                    uint8_t *dst, int32_t dst_stride,
129                                    const int16_t *filter_x0, int32_t h) {
130   int32_t y;
131   uint8_t *cm = vpx_ff_cropTbl;
132   uint32_t vector4a = 64;
133   int32_t vector1b, vector2b, vector3b, vector4b;
134   int32_t Temp1, Temp2, Temp3;
135   uint32_t tp1, tp2;
136   uint32_t p1, p2, p3, p4, n1;
137   uint32_t tn1, tn2, tn3;
138   uint32_t st0, st1;
139 
140   vector1b = ((const int32_t *)filter_x0)[0];
141   vector2b = ((const int32_t *)filter_x0)[1];
142   vector3b = ((const int32_t *)filter_x0)[2];
143   vector4b = ((const int32_t *)filter_x0)[3];
144 
145   for (y = h; y--;) {
146     /* prefetch data to cache memory */
147     prefetch_load(src + src_stride);
148     prefetch_load(src + src_stride + 32);
149     prefetch_store(dst + dst_stride);
150 
151     __asm__ __volatile__(
152         "ulw              %[tp1],      0(%[src])                      \n\t"
153         "ulw              %[tp2],      4(%[src])                      \n\t"
154 
155         /* even 1. pixel */
156         "mtlo             %[vector4a], $ac3                           \n\t"
157         "mthi             $zero,       $ac3                           \n\t"
158         "mtlo             %[vector4a], $ac2                           \n\t"
159         "mthi             $zero,       $ac2                           \n\t"
160         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
161         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
162         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
163         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
164         "ulw              %[tn2],      8(%[src])                      \n\t"
165         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
166         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
167         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
168         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
169         "extp             %[Temp1],    $ac3,           31             \n\t"
170 
171         /* even 2. pixel */
172         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
173         "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
174         "ulw              %[tn1],      12(%[src])                     \n\t"
175         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
176         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
177         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
178         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
179         "extp             %[Temp3],    $ac2,           31             \n\t"
180 
181         /* even 3. pixel */
182         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
183         "mtlo             %[vector4a], $ac1                           \n\t"
184         "mthi             $zero,       $ac1                           \n\t"
185         "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
186         "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
187         "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
188         "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
189         "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
190         "extp             %[Temp1],    $ac1,           31             \n\t"
191 
192         /* even 4. pixel */
193         "mtlo             %[vector4a], $ac2                           \n\t"
194         "mthi             $zero,       $ac2                           \n\t"
195         "mtlo             %[vector4a], $ac3                           \n\t"
196         "mthi             $zero,       $ac3                           \n\t"
197         "sb               %[st0],      0(%[dst])                      \n\t"
198         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
199 
200         "balign           %[tn3],      %[tn1],         3              \n\t"
201         "balign           %[tn1],      %[tn2],         3              \n\t"
202         "balign           %[tn2],      %[tp2],         3              \n\t"
203         "balign           %[tp2],      %[tp1],         3              \n\t"
204 
205         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
206         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
207         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
208         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
209         "extp             %[Temp3],    $ac2,           31             \n\t"
210 
211         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
212 
213         /* odd 1. pixel */
214         "mtlo             %[vector4a], $ac1                           \n\t"
215         "mthi             $zero,       $ac1                           \n\t"
216         "sb               %[st1],      2(%[dst])                      \n\t"
217         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
218         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
219         "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
220         "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
221         "sb               %[st0],      4(%[dst])                      \n\t"
222         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
223         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
224         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
225         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
226         "extp             %[Temp2],    $ac3,           31             \n\t"
227 
228         /* odd 2. pixel */
229         "mtlo             %[vector4a], $ac3                           \n\t"
230         "mthi             $zero,       $ac3                           \n\t"
231         "mtlo             %[vector4a], $ac2                           \n\t"
232         "mthi             $zero,       $ac2                           \n\t"
233         "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
234         "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
235         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
236         "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
237         "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
238         "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
239         "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
240         "extp             %[Temp3],    $ac1,           31             \n\t"
241 
242         /* odd 3. pixel */
243         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
244         "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
245         "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
246         "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
247         "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
248         "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
249         "extp             %[Temp2],    $ac3,           31             \n\t"
250 
251         /* odd 4. pixel */
252         "sb               %[st1],      1(%[dst])                      \n\t"
253         "sb               %[st0],      6(%[dst])                      \n\t"
254         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
255         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
256         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
257         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
258         "extp             %[Temp1],    $ac2,           31             \n\t"
259 
260         /* clamp */
261         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
262         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
263         "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
264 
265         /* store bytes */
266         "sb               %[p4],       3(%[dst])                      \n\t"
267         "sb               %[p2],       5(%[dst])                      \n\t"
268         "sb               %[n1],       7(%[dst])                      \n\t"
269 
270         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
271           [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
272           [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
273           [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
274           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
275         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
276           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
277           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
278           [src] "r"(src));
279 
280     /* Next row... */
281     src += src_stride;
282     dst += dst_stride;
283   }
284 }
285 
convolve_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)286 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
287                                     uint8_t *dst_ptr, int32_t dst_stride,
288                                     const int16_t *filter_x0, int32_t h,
289                                     int32_t count) {
290   int32_t y, c;
291   const uint8_t *src;
292   uint8_t *dst;
293   uint8_t *cm = vpx_ff_cropTbl;
294   uint32_t vector_64 = 64;
295   int32_t filter12, filter34, filter56, filter78;
296   int32_t Temp1, Temp2, Temp3;
297   uint32_t qload1, qload2, qload3;
298   uint32_t p1, p2, p3, p4, p5;
299   uint32_t st1, st2, st3;
300 
301   filter12 = ((const int32_t *)filter_x0)[0];
302   filter34 = ((const int32_t *)filter_x0)[1];
303   filter56 = ((const int32_t *)filter_x0)[2];
304   filter78 = ((const int32_t *)filter_x0)[3];
305 
306   for (y = h; y--;) {
307     src = src_ptr;
308     dst = dst_ptr;
309 
310     /* prefetch data to cache memory */
311     prefetch_load(src_ptr + src_stride);
312     prefetch_load(src_ptr + src_stride + 32);
313     prefetch_store(dst_ptr + dst_stride);
314 
315     for (c = 0; c < count; c++) {
316       __asm__ __volatile__(
317           "ulw              %[qload1],    0(%[src])                    \n\t"
318           "ulw              %[qload2],    4(%[src])                    \n\t"
319 
320           /* even 1. pixel */
321           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
322           "mthi             $zero,        $ac1                         \n\t"
323           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
324           "mthi             $zero,        $ac2                         \n\t"
325           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
326           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
327           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
328           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
329           "ulw              %[qload3],    8(%[src])                    \n\t"
330           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
331           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
332           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
333           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
334           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
335 
336           /* even 2. pixel */
337           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
338           "mthi             $zero,        $ac3                         \n\t"
339           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
340           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
341           "ulw              %[qload1],    12(%[src])                   \n\t"
342           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
343           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
344           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
345           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
346           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
347           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
348 
349           /* even 3. pixel */
350           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
351           "mthi             $zero,        $ac1                         \n\t"
352           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
353           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
354           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
355           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
356           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
357           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
358           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
359           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
360 
361           /* even 4. pixel */
362           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
363           "mthi             $zero,        $ac2                         \n\t"
364           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
365           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
366           "ulw              %[qload2],    16(%[src])                   \n\t"
367           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
368           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
369           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
370           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
371           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
372           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
373 
374           /* even 5. pixel */
375           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
376           "mthi             $zero,        $ac3                         \n\t"
377           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
378           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
379           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
380           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
381           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
382           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
383           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
384           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
385 
386           /* even 6. pixel */
387           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
388           "mthi             $zero,        $ac1                         \n\t"
389           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
390           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
391           "ulw              %[qload3],    20(%[src])                   \n\t"
392           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
393           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
394           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
395           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
396           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
397           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
398 
399           /* even 7. pixel */
400           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
401           "mthi             $zero,        $ac2                         \n\t"
402           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
403           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
404           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
405           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
406           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
407           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
408           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
409           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
410 
411           /* even 8. pixel */
412           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
413           "mthi             $zero,        $ac3                         \n\t"
414           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
415           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
416           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
417           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
418           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
419           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
420           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
421 
422           /* ODD pixels */
423           "ulw              %[qload1],    1(%[src])                    \n\t"
424           "ulw              %[qload2],    5(%[src])                    \n\t"
425 
426           /* odd 1. pixel */
427           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
428           "mthi             $zero,        $ac1                         \n\t"
429           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
430           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
431           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
432           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
433           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
434           "ulw              %[qload3],    9(%[src])                    \n\t"
435           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
436           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
437           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
438           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
439           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
440           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
441 
442           /* odd 2. pixel */
443           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
444           "mthi             $zero,        $ac2                         \n\t"
445           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
446           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
447           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
448           "ulw              %[qload1],    13(%[src])                   \n\t"
449           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
450           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
451           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
452           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
453           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
454           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
455 
456           /* odd 3. pixel */
457           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
458           "mthi             $zero,        $ac3                         \n\t"
459           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
460           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
461           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
462           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
463           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
464           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
465           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
466           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
467 
468           /* odd 4. pixel */
469           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
470           "mthi             $zero,        $ac1                         \n\t"
471           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
472           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
473           "ulw              %[qload2],    17(%[src])                   \n\t"
474           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
475           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
476           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
477           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
478           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
479           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
480 
481           /* odd 5. pixel */
482           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
483           "mthi             $zero,        $ac2                         \n\t"
484           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
485           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
486           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
487           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
488           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
489           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
490           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
491           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
492 
493           /* odd 6. pixel */
494           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
495           "mthi             $zero,        $ac3                         \n\t"
496           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
497           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
498           "ulw              %[qload3],    21(%[src])                   \n\t"
499           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
500           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
501           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
502           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
503           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
504           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
505 
506           /* odd 7. pixel */
507           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
508           "mthi             $zero,        $ac1                         \n\t"
509           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
510           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
511           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
512           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
513           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
514           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
515           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
516 
517           /* odd 8. pixel */
518           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
519           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
520           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
521           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
522           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
523 
524           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
525           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
526           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
527 
528           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
529           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
530           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
531 
532           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
533             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
534             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
535             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
536             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
537           : [filter12] "r"(filter12), [filter34] "r"(filter34),
538             [filter56] "r"(filter56), [filter78] "r"(filter78),
539             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
540             [src] "r"(src));
541 
542       src += 16;
543       dst += 16;
544     }
545 
546     /* Next row... */
547     src_ptr += src_stride;
548     dst_ptr += dst_stride;
549   }
550 }
551 
convolve_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)552 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
553                                     uint8_t *dst_ptr, int32_t dst_stride,
554                                     const int16_t *filter_x0, int32_t h) {
555   int32_t y, c;
556   const uint8_t *src;
557   uint8_t *dst;
558   uint8_t *cm = vpx_ff_cropTbl;
559   uint32_t vector_64 = 64;
560   int32_t filter12, filter34, filter56, filter78;
561   int32_t Temp1, Temp2, Temp3;
562   uint32_t qload1, qload2, qload3;
563   uint32_t p1, p2, p3, p4, p5;
564   uint32_t st1, st2, st3;
565 
566   filter12 = ((const int32_t *)filter_x0)[0];
567   filter34 = ((const int32_t *)filter_x0)[1];
568   filter56 = ((const int32_t *)filter_x0)[2];
569   filter78 = ((const int32_t *)filter_x0)[3];
570 
571   for (y = h; y--;) {
572     src = src_ptr;
573     dst = dst_ptr;
574 
575     /* prefetch data to cache memory */
576     prefetch_load(src_ptr + src_stride);
577     prefetch_load(src_ptr + src_stride + 32);
578     prefetch_load(src_ptr + src_stride + 64);
579     prefetch_store(dst_ptr + dst_stride);
580     prefetch_store(dst_ptr + dst_stride + 32);
581 
582     for (c = 0; c < 4; c++) {
583       __asm__ __volatile__(
584           "ulw              %[qload1],    0(%[src])                    \n\t"
585           "ulw              %[qload2],    4(%[src])                    \n\t"
586 
587           /* even 1. pixel */
588           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
589           "mthi             $zero,        $ac1                         \n\t"
590           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
591           "mthi             $zero,        $ac2                         \n\t"
592           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
593           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
594           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
595           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
596           "ulw              %[qload3],    8(%[src])                    \n\t"
597           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
598           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
599           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
600           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
601           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
602 
603           /* even 2. pixel */
604           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
605           "mthi             $zero,        $ac3                         \n\t"
606           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
607           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
608           "ulw              %[qload1],    12(%[src])                   \n\t"
609           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
610           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
611           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
612           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
613           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
614           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
615 
616           /* even 3. pixel */
617           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
618           "mthi             $zero,        $ac1                         \n\t"
619           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
620           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
621           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
622           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
623           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
624           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
625           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
626           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
627 
628           /* even 4. pixel */
629           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
630           "mthi             $zero,        $ac2                         \n\t"
631           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
632           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
633           "ulw              %[qload2],    16(%[src])                   \n\t"
634           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
635           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
636           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
637           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
638           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
639           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
640 
641           /* even 5. pixel */
642           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
643           "mthi             $zero,        $ac3                         \n\t"
644           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
645           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
646           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
647           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
648           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
649           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
650           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
651           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
652 
653           /* even 6. pixel */
654           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
655           "mthi             $zero,        $ac1                         \n\t"
656           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
657           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
658           "ulw              %[qload3],    20(%[src])                   \n\t"
659           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
660           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
661           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
662           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
663           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
664           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
665 
666           /* even 7. pixel */
667           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
668           "mthi             $zero,        $ac2                         \n\t"
669           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
670           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
671           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
672           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
673           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
674           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
675           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
676           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
677 
678           /* even 8. pixel */
679           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
680           "mthi             $zero,        $ac3                         \n\t"
681           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
682           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
683           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
684           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
685           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
686           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
687           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
688 
689           /* ODD pixels */
690           "ulw              %[qload1],    1(%[src])                    \n\t"
691           "ulw              %[qload2],    5(%[src])                    \n\t"
692 
693           /* odd 1. pixel */
694           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
695           "mthi             $zero,        $ac1                         \n\t"
696           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
697           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
698           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
699           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
700           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
701           "ulw              %[qload3],    9(%[src])                    \n\t"
702           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
703           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
704           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
705           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
706           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
707           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
708 
709           /* odd 2. pixel */
710           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
711           "mthi             $zero,        $ac2                         \n\t"
712           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
713           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
714           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
715           "ulw              %[qload1],    13(%[src])                   \n\t"
716           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
717           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
718           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
719           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
720           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
721           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
722 
723           /* odd 3. pixel */
724           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
725           "mthi             $zero,        $ac3                         \n\t"
726           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
727           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
728           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
729           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
730           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
731           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
732           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
733           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
734 
735           /* odd 4. pixel */
736           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
737           "mthi             $zero,        $ac1                         \n\t"
738           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
739           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
740           "ulw              %[qload2],    17(%[src])                   \n\t"
741           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
742           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
743           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
744           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
745           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
746           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
747 
748           /* odd 5. pixel */
749           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
750           "mthi             $zero,        $ac2                         \n\t"
751           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
752           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
753           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
754           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
755           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
756           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
757           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
758           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
759 
760           /* odd 6. pixel */
761           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
762           "mthi             $zero,        $ac3                         \n\t"
763           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
764           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
765           "ulw              %[qload3],    21(%[src])                   \n\t"
766           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
767           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
768           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
769           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
770           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
771           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
772 
773           /* odd 7. pixel */
774           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
775           "mthi             $zero,        $ac1                         \n\t"
776           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
777           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
778           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
779           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
780           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
781           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
782           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
783 
784           /* odd 8. pixel */
785           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
786           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
787           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
788           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
789           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
790 
791           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
792           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
793           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
794 
795           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
796           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
797           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
798 
799           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
800             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
801             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
802             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
803             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
804           : [filter12] "r"(filter12), [filter34] "r"(filter34),
805             [filter56] "r"(filter56), [filter78] "r"(filter78),
806             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
807             [src] "r"(src));
808 
809       src += 16;
810       dst += 16;
811     }
812 
813     /* Next row... */
814     src_ptr += src_stride;
815     dst_ptr += dst_stride;
816   }
817 }
818 
/*
 * Horizontal convolution entry point for MIPS DSPr2.
 *
 * src, src_stride - source pixels and the byte distance between source rows.
 * dst, dst_stride - destination pixels and the byte distance between rows.
 * filter          - kernel table; the row selected by x0_q4 is the horizontal
 *                   kernel used here.
 * x_step_q4       - must be 16 (asserted); scaling is not handled here.
 * y0_q4, y_step_q4 - not used by this function itself, only forwarded to the
 *                   2-tap and C fallback routines.
 * w, h            - block width and height in pixels.
 *
 * When vpx_get_filter_taps() reports a 2-tap kernel the work is delegated to
 * vpx_convolve2_horiz_dspr2(). Otherwise widths 4, 8, 16, 32 and 64 go to the
 * hand-written DSPr2 routines, and any other width falls back to
 * vpx_convolve8_horiz_c().
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h) {
  const int16_t *const filter_x = filter[x0_q4];
  /* The DSPr2 helpers take 32-bit strides and a 32-bit row count. */
  const int32_t src_pitch = (int32_t)src_stride;
  const int32_t dst_pitch = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  /* Value written to the DSPControl "pos" field, consumed by extp. */
  const uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* 2-tap kernels have their own dedicated implementation. */
  if (vpx_get_filter_taps(filter_x) == 2) {
    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
    return;
  }

  prefetch_load((const uint8_t *)filter_x);

  /* The DSPr2 helpers expect src to point 3 bytes before the output pixel. */
  src -= 3;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(extract_pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_horiz_4_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
      break;
    case 8:
      convolve_horiz_8_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
      break;
    case 16:
    case 32:
      /* The last argument is 1 for w == 16 and 2 for w == 32. */
      convolve_horiz_16_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows,
                              w / 16);
      break;
    case 64:
      /* A 64-wide row spans more cache lines; prefetch the extra ones. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_horiz_64_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
      break;
    default:
      /* Undo the 3-byte rewind: the C routine takes the unshifted pointer. */
      vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
                            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
}
878 #endif
879