/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_bi_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
22                                       uint8_t *dst, int32_t dst_stride,
23                                       const int16_t *filter_x0, int32_t h) {
24   int32_t y;
25   uint8_t *cm = vpx_ff_cropTbl;
26   int32_t Temp1, Temp2, Temp3, Temp4;
27   uint32_t vector4a = 64;
28   uint32_t tp1, tp2;
29   uint32_t p1, p2;
30   const int16_t *filter = &filter_x0[3];
31   uint32_t filter45;
32 
33   filter45 = ((const int32_t *)filter)[0];
34 
35   for (y = h; y--;) {
36     /* prefetch data to cache memory */
37     prefetch_load(src + src_stride);
38     prefetch_load(src + src_stride + 32);
39     prefetch_store(dst + dst_stride);
40 
41     __asm__ __volatile__(
42         "ulw              %[tp1],      0(%[src])                      \n\t"
43         "ulw              %[tp2],      4(%[src])                      \n\t"
44 
45         /* even 1. pixel */
46         "mtlo             %[vector4a], $ac3                           \n\t"
47         "mthi             $zero,       $ac3                           \n\t"
48         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
49         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
50         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
51         "extp             %[Temp1],    $ac3,           31             \n\t"
52 
53         /* even 2. pixel */
54         "mtlo             %[vector4a], $ac2                           \n\t"
55         "mthi             $zero,       $ac2                           \n\t"
56         "balign           %[tp2],      %[tp1],         3              \n\t"
57         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
58         "extp             %[Temp3],    $ac2,           31             \n\t"
59 
60         /* odd 1. pixel */
61         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
62         "mtlo             %[vector4a], $ac3                           \n\t"
63         "mthi             $zero,       $ac3                           \n\t"
64         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
65         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
66         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
67         "extp             %[Temp2],    $ac3,           31             \n\t"
68 
69         /* odd 2. pixel */
70         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
71         "mtlo             %[vector4a], $ac2                           \n\t"
72         "mthi             $zero,       $ac2                           \n\t"
73         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
74         "extp             %[Temp4],    $ac2,           31             \n\t"
75 
76         /* clamp */
77         "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
78         "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
79 
80         /* store bytes */
81         "sb               %[tp1],      0(%[dst])                      \n\t"
82         "sb               %[p1],       1(%[dst])                      \n\t"
83         "sb               %[tp2],      2(%[dst])                      \n\t"
84         "sb               %[p2],       3(%[dst])                      \n\t"
85 
86         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
87           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
88           [Temp4] "=&r"(Temp4)
89         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
90           [dst] "r"(dst), [src] "r"(src));
91 
92     /* Next row... */
93     src += src_stride;
94     dst += dst_stride;
95   }
96 }
97 
convolve_bi_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)98 static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
99                                       uint8_t *dst, int32_t dst_stride,
100                                       const int16_t *filter_x0, int32_t h) {
101   int32_t y;
102   uint8_t *cm = vpx_ff_cropTbl;
103   uint32_t vector4a = 64;
104   int32_t Temp1, Temp2, Temp3;
105   uint32_t tp1, tp2, tp3;
106   uint32_t p1, p2, p3, p4;
107   uint32_t st0, st1;
108   const int16_t *filter = &filter_x0[3];
109   uint32_t filter45;
110 
111   filter45 = ((const int32_t *)filter)[0];
112 
113   for (y = h; y--;) {
114     /* prefetch data to cache memory */
115     prefetch_load(src + src_stride);
116     prefetch_load(src + src_stride + 32);
117     prefetch_store(dst + dst_stride);
118 
119     __asm__ __volatile__(
120         "ulw              %[tp1],      0(%[src])                      \n\t"
121         "ulw              %[tp2],      4(%[src])                      \n\t"
122 
123         /* even 1. pixel */
124         "mtlo             %[vector4a], $ac3                           \n\t"
125         "mthi             $zero,       $ac3                           \n\t"
126         "mtlo             %[vector4a], $ac2                           \n\t"
127         "mthi             $zero,       $ac2                           \n\t"
128         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
129         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
130         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
131         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
132         "ulw              %[tp3],      8(%[src])                      \n\t"
133         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
134         "extp             %[Temp1],    $ac3,           31             \n\t"
135 
136         /* even 2. pixel */
137         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
138         "extp             %[Temp3],    $ac2,           31             \n\t"
139 
140         /* even 3. pixel */
141         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
142         "mtlo             %[vector4a], $ac1                           \n\t"
143         "mthi             $zero,       $ac1                           \n\t"
144         "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
145         "extp             %[Temp1],    $ac1,           31             \n\t"
146 
147         /* even 4. pixel */
148         "mtlo             %[vector4a], $ac2                           \n\t"
149         "mthi             $zero,       $ac2                           \n\t"
150         "mtlo             %[vector4a], $ac3                           \n\t"
151         "mthi             $zero,       $ac3                           \n\t"
152         "sb               %[st0],      0(%[dst])                      \n\t"
153         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
154 
155         "balign           %[tp3],      %[tp2],         3              \n\t"
156         "balign           %[tp2],      %[tp1],         3              \n\t"
157 
158         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
159         "extp             %[Temp3],    $ac2,           31             \n\t"
160 
161         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
162 
163         /* odd 1. pixel */
164         "mtlo             %[vector4a], $ac1                           \n\t"
165         "mthi             $zero,       $ac1                           \n\t"
166         "sb               %[st1],      2(%[dst])                      \n\t"
167         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
168         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
169         "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
170         "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
171         "sb               %[st0],      4(%[dst])                      \n\t"
172         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
173         "extp             %[Temp2],    $ac3,           31             \n\t"
174 
175         /* odd 2. pixel */
176         "mtlo             %[vector4a], $ac3                           \n\t"
177         "mthi             $zero,       $ac3                           \n\t"
178         "mtlo             %[vector4a], $ac2                           \n\t"
179         "mthi             $zero,       $ac2                           \n\t"
180         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
181         "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
182         "extp             %[Temp3],    $ac1,           31             \n\t"
183 
184         /* odd 3. pixel */
185         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
186         "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
187         "extp             %[Temp2],    $ac3,           31             \n\t"
188 
189         /* odd 4. pixel */
190         "sb               %[st1],      1(%[dst])                      \n\t"
191         "sb               %[st0],      6(%[dst])                      \n\t"
192         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
193         "extp             %[Temp1],    $ac2,           31             \n\t"
194 
195         /* clamp */
196         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
197         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
198         "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
199 
200         /* store bytes */
201         "sb               %[p4],       3(%[dst])                      \n\t"
202         "sb               %[p2],       5(%[dst])                      \n\t"
203         "sb               %[p1],       7(%[dst])                      \n\t"
204 
205         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
206           [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
207           [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
208           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
209         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
210           [dst] "r"(dst), [src] "r"(src));
211 
212     /* Next row... */
213     src += src_stride;
214     dst += dst_stride;
215   }
216 }
217 
convolve_bi_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)218 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
219                                        int32_t src_stride, uint8_t *dst_ptr,
220                                        int32_t dst_stride,
221                                        const int16_t *filter_x0, int32_t h,
222                                        int32_t count) {
223   int32_t y, c;
224   const uint8_t *src;
225   uint8_t *dst;
226   uint8_t *cm = vpx_ff_cropTbl;
227   uint32_t vector_64 = 64;
228   int32_t Temp1, Temp2, Temp3;
229   uint32_t qload1, qload2, qload3;
230   uint32_t p1, p2, p3, p4, p5;
231   uint32_t st1, st2, st3;
232   const int16_t *filter = &filter_x0[3];
233   uint32_t filter45;
234 
235   filter45 = ((const int32_t *)filter)[0];
236 
237   for (y = h; y--;) {
238     src = src_ptr;
239     dst = dst_ptr;
240 
241     /* prefetch data to cache memory */
242     prefetch_load(src_ptr + src_stride);
243     prefetch_load(src_ptr + src_stride + 32);
244     prefetch_store(dst_ptr + dst_stride);
245 
246     for (c = 0; c < count; c++) {
247       __asm__ __volatile__(
248           "ulw              %[qload1],    0(%[src])                    \n\t"
249           "ulw              %[qload2],    4(%[src])                    \n\t"
250 
251           /* even 1. pixel */
252           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
253           "mthi             $zero,        $ac1                         \n\t"
254           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
255           "mthi             $zero,        $ac2                         \n\t"
256           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
257           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
258           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
259           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
260           "ulw              %[qload3],    8(%[src])                    \n\t"
261           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
262           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
263 
264           /* even 2. pixel */
265           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
266           "mthi             $zero,        $ac3                         \n\t"
267           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
268           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
269           "ulw              %[qload1],    12(%[src])                   \n\t"
270           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
271           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
272           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
273 
274           /* even 3. pixel */
275           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
276           "mthi             $zero,        $ac1                         \n\t"
277           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
278           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
279           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
280           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
281           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
282 
283           /* even 4. pixel */
284           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
285           "mthi             $zero,        $ac2                         \n\t"
286           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
287           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
288           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
289           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
290           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
291 
292           /* even 5. pixel */
293           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
294           "mthi             $zero,        $ac3                         \n\t"
295           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
296           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
297           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
298           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
299 
300           /* even 6. pixel */
301           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
302           "mthi             $zero,        $ac1                         \n\t"
303           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
304           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
305           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
306           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
307 
308           /* even 7. pixel */
309           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
310           "mthi             $zero,        $ac2                         \n\t"
311           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
312           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
313           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
314           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
315 
316           /* even 8. pixel */
317           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
318           "mthi             $zero,        $ac3                         \n\t"
319           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
320           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
321           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
322           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
323 
324           /* ODD pixels */
325           "ulw              %[qload1],    1(%[src])                    \n\t"
326           "ulw              %[qload2],    5(%[src])                    \n\t"
327 
328           /* odd 1. pixel */
329           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
330           "mthi             $zero,        $ac1                         \n\t"
331           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
332           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
333           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
334           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
335           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
336           "ulw              %[qload3],    9(%[src])                    \n\t"
337           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
338           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
339           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
340 
341           /* odd 2. pixel */
342           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
343           "mthi             $zero,        $ac2                         \n\t"
344           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
345           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
346           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
347           "ulw              %[qload1],    13(%[src])                   \n\t"
348           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
349           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
350           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
351 
352           /* odd 3. pixel */
353           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
354           "mthi             $zero,        $ac3                         \n\t"
355           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
356           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
357           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
358           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
359           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
360 
361           /* odd 4. pixel */
362           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
363           "mthi             $zero,        $ac1                         \n\t"
364           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
365           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
366           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
367           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
368           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
369 
370           /* odd 5. pixel */
371           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
372           "mthi             $zero,        $ac2                         \n\t"
373           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
374           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
375           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
376           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
377 
378           /* odd 6. pixel */
379           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
380           "mthi             $zero,        $ac3                         \n\t"
381           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
382           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
383           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
384           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
385 
386           /* odd 7. pixel */
387           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
388           "mthi             $zero,        $ac1                         \n\t"
389           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
390           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
391           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
392 
393           /* odd 8. pixel */
394           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
395           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
396 
397           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
398           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
399           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
400 
401           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
402           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
403           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
404 
405           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
406             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
407             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
408             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
409             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
410           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
411             [dst] "r"(dst), [src] "r"(src));
412 
413       src += 16;
414       dst += 16;
415     }
416 
417     /* Next row... */
418     src_ptr += src_stride;
419     dst_ptr += dst_stride;
420   }
421 }
422 
convolve_bi_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)423 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
424                                        int32_t src_stride, uint8_t *dst_ptr,
425                                        int32_t dst_stride,
426                                        const int16_t *filter_x0, int32_t h) {
427   int32_t y, c;
428   const uint8_t *src;
429   uint8_t *dst;
430   uint8_t *cm = vpx_ff_cropTbl;
431   uint32_t vector_64 = 64;
432   int32_t Temp1, Temp2, Temp3;
433   uint32_t qload1, qload2, qload3;
434   uint32_t p1, p2, p3, p4, p5;
435   uint32_t st1, st2, st3;
436   const int16_t *filter = &filter_x0[3];
437   uint32_t filter45;
438 
439   filter45 = ((const int32_t *)filter)[0];
440 
441   for (y = h; y--;) {
442     src = src_ptr;
443     dst = dst_ptr;
444 
445     /* prefetch data to cache memory */
446     prefetch_load(src_ptr + src_stride);
447     prefetch_load(src_ptr + src_stride + 32);
448     prefetch_load(src_ptr + src_stride + 64);
449     prefetch_store(dst_ptr + dst_stride);
450     prefetch_store(dst_ptr + dst_stride + 32);
451 
452     for (c = 0; c < 4; c++) {
453       __asm__ __volatile__(
454           "ulw              %[qload1],    0(%[src])                    \n\t"
455           "ulw              %[qload2],    4(%[src])                    \n\t"
456 
457           /* even 1. pixel */
458           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
459           "mthi             $zero,        $ac1                         \n\t"
460           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
461           "mthi             $zero,        $ac2                         \n\t"
462           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
463           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
464           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
465           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
466           "ulw              %[qload3],    8(%[src])                    \n\t"
467           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
468           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
469 
470           /* even 2. pixel */
471           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
472           "mthi             $zero,        $ac3                         \n\t"
473           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
474           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
475           "ulw              %[qload1],    12(%[src])                   \n\t"
476           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
477           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
478           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
479 
480           /* even 3. pixel */
481           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
482           "mthi             $zero,        $ac1                         \n\t"
483           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
484           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
485           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
486           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
487           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
488 
489           /* even 4. pixel */
490           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
491           "mthi             $zero,        $ac2                         \n\t"
492           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
493           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
494           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
495           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
496           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
497 
498           /* even 5. pixel */
499           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
500           "mthi             $zero,        $ac3                         \n\t"
501           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
502           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
503           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
504           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
505 
506           /* even 6. pixel */
507           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
508           "mthi             $zero,        $ac1                         \n\t"
509           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
510           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
511           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
512           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
513 
514           /* even 7. pixel */
515           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
516           "mthi             $zero,        $ac2                         \n\t"
517           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
518           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
519           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
520           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
521 
522           /* even 8. pixel */
523           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
524           "mthi             $zero,        $ac3                         \n\t"
525           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
526           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
527           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
528           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
529 
530           /* ODD pixels */
531           "ulw              %[qload1],    1(%[src])                    \n\t"
532           "ulw              %[qload2],    5(%[src])                    \n\t"
533 
534           /* odd 1. pixel */
535           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
536           "mthi             $zero,        $ac1                         \n\t"
537           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
538           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
539           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
540           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
541           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
542           "ulw              %[qload3],    9(%[src])                    \n\t"
543           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
544           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
545           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
546 
547           /* odd 2. pixel */
548           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
549           "mthi             $zero,        $ac2                         \n\t"
550           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
551           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
552           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
553           "ulw              %[qload1],    13(%[src])                   \n\t"
554           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
555           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
556           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
557 
558           /* odd 3. pixel */
559           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
560           "mthi             $zero,        $ac3                         \n\t"
561           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
562           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
563           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
564           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
565           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
566 
567           /* odd 4. pixel */
568           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
569           "mthi             $zero,        $ac1                         \n\t"
570           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
571           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
572           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
573           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
574           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
575 
576           /* odd 5. pixel */
577           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
578           "mthi             $zero,        $ac2                         \n\t"
579           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
580           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
581           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
582           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
583 
584           /* odd 6. pixel */
585           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
586           "mthi             $zero,        $ac3                         \n\t"
587           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
588           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
589           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
590           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
591 
592           /* odd 7. pixel */
593           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
594           "mthi             $zero,        $ac1                         \n\t"
595           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
596           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
597           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
598 
599           /* odd 8. pixel */
600           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
601           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
602 
603           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
604           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
605           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
606 
607           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
608           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
609           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
610 
611           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
612             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
613             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
614             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
615             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
616           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
617             [dst] "r"(dst), [src] "r"(src));
618 
619       src += 16;
620       dst += 16;
621     }
622 
623     /* Next row... */
624     src_ptr += src_stride;
625     dst_ptr += dst_stride;
626   }
627 }
628 
/*
 * Horizontal convolution entry point for the DSPr2 "bi" kernels.
 *
 * Selects the kernel row for x0_q4, sets up the DSP control register for
 * accumulator extraction, warms the cache, and then hands off to the
 * width-specific helper. Widths other than 4, 8, 16, 32 and 64 are forwarded
 * unchanged to the generic C routine vpx_convolve8_horiz_c().
 *
 * Only unscaled filtering is supported: x_step_q4 must be 16 (asserted).
 * y0_q4 and y_step_q4 are used only when falling back to the C routine.
 */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int32_t x_step_q4, int y0_q4, int y_step_q4,
                               int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  /* The DSPr2 helpers take 32-bit strides and row counts. */
  const int32_t src_pitch = (int32_t)src_stride;
  const int32_t dst_pitch = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* Bit position used when extracting results from the accumulators; it has
   * to be programmed before any of the kernels below run. */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(extract_pos));

  /* Pull the first source/destination lines into the cache. */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  if (w == 4) {
    convolve_bi_horiz_4_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
  } else if (w == 8) {
    convolve_bi_horiz_8_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
  } else if (w == 16 || w == 32) {
    /* Last argument is 1 for w == 16 and 2 for w == 32 (presumably the
     * number of 16-pixel groups per row -- confirm against the helper). */
    convolve_bi_horiz_16_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows,
                               w / 16);
  } else if (w == 64) {
    /* Wider rows: prefetch further ahead before running the kernel. */
    prefetch_load(src + 64);
    prefetch_store(dst + 32);

    convolve_bi_horiz_64_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
  } else {
    /* No specialised kernel for this width; use the portable version. */
    vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                          x_step_q4, y0_q4, y_step_q4, w, h);
  }
}
681 #endif
682