1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_bi_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_transposed_dspr2(
22     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
23     const int16_t *filter_x0, int32_t h) {
24   int32_t y;
25   uint8_t *cm = vpx_ff_cropTbl;
26   uint8_t *dst_ptr;
27   int32_t Temp1, Temp2;
28   uint32_t vector4a = 64;
29   uint32_t tp1, tp2;
30   uint32_t p1, p2;
31   const int16_t *filter = &filter_x0[3];
32   uint32_t filter45;
33 
34   filter45 = ((const int32_t *)filter)[0];
35 
36   for (y = h; y--;) {
37     dst_ptr = dst;
38     /* prefetch data to cache memory */
39     prefetch_load(src + src_stride);
40     prefetch_load(src + src_stride + 32);
41 
42     __asm__ __volatile__(
43         "ulw              %[tp1],         0(%[src])                      \n\t"
44         "ulw              %[tp2],         4(%[src])                      \n\t"
45 
46         /* even 1. pixel */
47         "mtlo             %[vector4a],    $ac3                           \n\t"
48         "mthi             $zero,          $ac3                           \n\t"
49         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
50         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
51         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
52         "extp             %[Temp1],       $ac3,           31             \n\t"
53 
54         /* even 2. pixel */
55         "mtlo             %[vector4a],    $ac2                           \n\t"
56         "mthi             $zero,          $ac2                           \n\t"
57         "balign           %[tp2],         %[tp1],         3              \n\t"
58         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
59         "extp             %[Temp2],       $ac2,           31             \n\t"
60 
61         /* odd 1. pixel */
62         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
63         "mtlo             %[vector4a],    $ac3                           \n\t"
64         "mthi             $zero,          $ac3                           \n\t"
65         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
66         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
67         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
68         "extp             %[Temp1],       $ac3,           31             \n\t"
69 
70         /* odd 2. pixel */
71         "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
72         "mtlo             %[vector4a],    $ac2                           \n\t"
73         "mthi             $zero,          $ac2                           \n\t"
74         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
75         "extp             %[Temp2],       $ac2,           31             \n\t"
76 
77         /* clamp */
78         "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
79         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
80 
81         /* store bytes */
82         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
83         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
84 
85         "sb               %[p1],          0(%[dst_ptr])                  \n\t"
86         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
87 
88         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
89         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
90 
91         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
92         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
93 
94         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
95           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
96         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
97           [src] "r"(src), [dst_stride] "r"(dst_stride));
98 
99     /* Next row... */
100     src += src_stride;
101     dst += 1;
102   }
103 }
104 
convolve_bi_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)105 static void convolve_bi_horiz_8_transposed_dspr2(
106     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
107     const int16_t *filter_x0, int32_t h) {
108   int32_t y;
109   uint8_t *cm = vpx_ff_cropTbl;
110   uint8_t *dst_ptr;
111   uint32_t vector4a = 64;
112   int32_t Temp1, Temp2, Temp3;
113   uint32_t tp1, tp2, tp3;
114   uint32_t p1, p2, p3, p4;
115   uint8_t *odd_dst;
116   uint32_t dst_pitch_2 = (dst_stride << 1);
117   const int16_t *filter = &filter_x0[3];
118   uint32_t filter45;
119 
120   filter45 = ((const int32_t *)filter)[0];
121 
122   for (y = h; y--;) {
123     /* prefetch data to cache memory */
124     prefetch_load(src + src_stride);
125     prefetch_load(src + src_stride + 32);
126 
127     dst_ptr = dst;
128     odd_dst = (dst_ptr + dst_stride);
129 
130     __asm__ __volatile__(
131         "ulw              %[tp1],         0(%[src])                       \n\t"
132         "ulw              %[tp2],         4(%[src])                       \n\t"
133 
134         /* even 1. pixel */
135         "mtlo             %[vector4a],    $ac3                            \n\t"
136         "mthi             $zero,          $ac3                            \n\t"
137         "mtlo             %[vector4a],    $ac2                            \n\t"
138         "mthi             $zero,          $ac2                            \n\t"
139         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
140         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
141         "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
142         "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
143         "ulw              %[tp3],         8(%[src])                       \n\t"
144         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
145         "extp             %[Temp1],       $ac3,           31              \n\t"
146 
147         /* even 2. pixel */
148         "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
149         "extp             %[Temp3],       $ac2,           31              \n\t"
150 
151         /* even 3. pixel */
152         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
153         "mtlo             %[vector4a],    $ac1                            \n\t"
154         "mthi             $zero,          $ac1                            \n\t"
155         "balign           %[tp3],         %[tp2],         3              \n\t"
156         "balign           %[tp2],         %[tp1],         3              \n\t"
157         "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
158         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
159         "extp             %[p3],          $ac1,           31              \n\t"
160 
161         /* even 4. pixel */
162         "mtlo             %[vector4a],    $ac2                            \n\t"
163         "mthi             $zero,          $ac2                            \n\t"
164         "mtlo             %[vector4a],    $ac3                            \n\t"
165         "mthi             $zero,          $ac3                            \n\t"
166         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
167         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
168         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
169         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
170 
171         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
172         "extp             %[Temp3],       $ac2,           31              \n\t"
173 
174         "lbux             %[Temp1],         %[p3](%[cm])                    "
175         "\n\t"
176 
177         /* odd 1. pixel */
178         "mtlo             %[vector4a],    $ac1                            \n\t"
179         "mthi             $zero,          $ac1                            \n\t"
180         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
181         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
182         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
183         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
184         "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
185         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
186 
187         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
188         "extp             %[Temp2],       $ac3,           31              \n\t"
189 
190         /* odd 2. pixel */
191         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
192         "mtlo             %[vector4a],    $ac3                            \n\t"
193         "mthi             $zero,          $ac3                            \n\t"
194         "mtlo             %[vector4a],    $ac2                            \n\t"
195         "mthi             $zero,          $ac2                            \n\t"
196         "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
197         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
198         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
199         "extp             %[Temp3],       $ac1,           31              \n\t"
200 
201         /* odd 3. pixel */
202         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
203         "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
204         "extp             %[Temp2],       $ac3,           31              \n\t"
205 
206         /* odd 4. pixel */
207         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
208         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
209         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
210         "extp             %[Temp1],       $ac2,           31              \n\t"
211 
212         /* clamp */
213         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
214         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
215         "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
216 
217         /* store bytes */
218         "sb               %[p4],          0(%[odd_dst])                   \n\t"
219         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
220 
221         "sb               %[p2],          0(%[odd_dst])                   \n\t"
222         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
223 
224         "sb               %[p1],          0(%[odd_dst])                   \n\t"
225 
226         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
227           [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
228           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
229           [odd_dst] "+r"(odd_dst)
230         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
231           [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
232 
233     /* Next row... */
234     src += src_stride;
235     dst += 1;
236   }
237 }
238 
convolve_bi_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)239 static void convolve_bi_horiz_16_transposed_dspr2(
240     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
241     int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
242   int32_t c, y;
243   const uint8_t *src;
244   uint8_t *dst;
245   uint8_t *cm = vpx_ff_cropTbl;
246   uint32_t vector_64 = 64;
247   int32_t Temp1, Temp2, Temp3;
248   uint32_t qload1, qload2;
249   uint32_t p1, p2, p3, p4, p5;
250   uint32_t st1, st2, st3;
251   uint32_t dst_pitch_2 = (dst_stride << 1);
252   uint8_t *odd_dst;
253   const int16_t *filter = &filter_x0[3];
254   uint32_t filter45;
255 
256   filter45 = ((const int32_t *)filter)[0];
257 
258   for (y = h; y--;) {
259     /* prefetch data to cache memory */
260     prefetch_load(src_ptr + src_stride);
261     prefetch_load(src_ptr + src_stride + 32);
262 
263     src = src_ptr;
264     dst = dst_ptr;
265 
266     odd_dst = (dst + dst_stride);
267 
268     for (c = 0; c < count; c++) {
269       __asm__ __volatile__(
270           "ulw              %[qload1],        0(%[src])                       "
271           "\n\t"
272           "ulw              %[qload2],        4(%[src])                       "
273           "\n\t"
274 
275           /* even 1. pixel */
276           "mtlo             %[vector_64],     $ac1                            "
277           "\n\t" /* even 1 */
278           "mthi             $zero,            $ac1                            "
279           "\n\t"
280           "mtlo             %[vector_64],     $ac2                            "
281           "\n\t" /* even 2 */
282           "mthi             $zero,            $ac2                            "
283           "\n\t"
284           "preceu.ph.qbr    %[p1],            %[qload1]                       "
285           "\n\t"
286           "preceu.ph.qbl    %[p2],            %[qload1]                       "
287           "\n\t"
288           "preceu.ph.qbr    %[p3],            %[qload2]                       "
289           "\n\t"
290           "preceu.ph.qbl    %[p4],            %[qload2]                       "
291           "\n\t"
292           "ulw              %[qload1],        8(%[src])                       "
293           "\n\t"
294           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
295           "\n\t" /* even 1 */
296           "extp             %[Temp1],         $ac1,           31              "
297           "\n\t" /* even 1 */
298 
299           /* even 2. pixel */
300           "mtlo             %[vector_64],     $ac3                            "
301           "\n\t" /* even 3 */
302           "mthi             $zero,            $ac3                            "
303           "\n\t"
304           "preceu.ph.qbr    %[p1],            %[qload1]                       "
305           "\n\t"
306           "preceu.ph.qbl    %[p5],            %[qload1]                       "
307           "\n\t"
308           "ulw              %[qload2],        12(%[src])                      "
309           "\n\t"
310           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
311           "\n\t" /* even 1 */
312           "lbux             %[st1],           %[Temp1](%[cm])                 "
313           "\n\t" /* even 1 */
314           "extp             %[Temp2],         $ac2,           31              "
315           "\n\t" /* even 1 */
316 
317           /* even 3. pixel */
318           "mtlo             %[vector_64],     $ac1                            "
319           "\n\t" /* even 4 */
320           "mthi             $zero,            $ac1                            "
321           "\n\t"
322           "preceu.ph.qbr    %[p2],            %[qload2]                       "
323           "\n\t"
324           "sb               %[st1],           0(%[dst])                       "
325           "\n\t" /* even 1 */
326           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
327           "          \n\t"
328           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
329           "\n\t" /* even 3 */
330           "extp             %[Temp3],         $ac3,           31              "
331           "\n\t" /* even 3 */
332           "lbux             %[st2],           %[Temp2](%[cm])                 "
333           "\n\t" /* even 1 */
334 
335           /* even 4. pixel */
336           "mtlo             %[vector_64],     $ac2                            "
337           "\n\t" /* even 5 */
338           "mthi             $zero,            $ac2                            "
339           "\n\t"
340           "preceu.ph.qbl    %[p3],            %[qload2]                       "
341           "\n\t"
342           "sb               %[st2],           0(%[dst])                       "
343           "\n\t" /* even 2 */
344           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
345           "\n\t"
346           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
347           "\n\t" /* even 4 */
348           "extp             %[Temp1],         $ac1,           31              "
349           "\n\t" /* even 4 */
350           "lbux             %[st3],           %[Temp3](%[cm])                 "
351           "\n\t" /* even 3 */
352 
353           /* even 5. pixel */
354           "mtlo             %[vector_64],     $ac3                            "
355           "\n\t" /* even 6 */
356           "mthi             $zero,            $ac3                            "
357           "\n\t"
358           "sb               %[st3],           0(%[dst])                       "
359           "\n\t" /* even 3 */
360           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
361           "\n\t"
362           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
363           "\n\t" /* even 5 */
364           "extp             %[Temp2],         $ac2,           31              "
365           "\n\t" /* even 5 */
366           "lbux             %[st1],           %[Temp1](%[cm])                 "
367           "\n\t" /* even 4 */
368 
369           /* even 6. pixel */
370           "mtlo             %[vector_64],     $ac1                            "
371           "\n\t" /* even 7 */
372           "mthi             $zero,            $ac1                            "
373           "\n\t"
374           "sb               %[st1],           0(%[dst])                       "
375           "\n\t" /* even 4 */
376           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
377           "\n\t"
378           "ulw              %[qload1],        20(%[src])                      "
379           "\n\t"
380           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
381           "\n\t" /* even 6 */
382           "extp             %[Temp3],         $ac3,           31              "
383           "\n\t" /* even 6 */
384           "lbux             %[st2],           %[Temp2](%[cm])                 "
385           "\n\t" /* even 5 */
386 
387           /* even 7. pixel */
388           "mtlo             %[vector_64],     $ac2                            "
389           "\n\t" /* even 8 */
390           "mthi             $zero,            $ac2                            "
391           "\n\t"
392           "preceu.ph.qbr    %[p5],            %[qload1]                       "
393           "\n\t"
394           "sb               %[st2],           0(%[dst])                       "
395           "\n\t" /* even 5 */
396           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
397           "\n\t"
398           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
399           "\n\t" /* even 7 */
400           "extp             %[Temp1],         $ac1,           31              "
401           "\n\t" /* even 7 */
402           "lbux             %[st3],           %[Temp3](%[cm])                 "
403           "\n\t" /* even 6 */
404 
405           /* even 8. pixel */
406           "mtlo             %[vector_64],     $ac3                            "
407           "\n\t" /* odd 1 */
408           "mthi             $zero,            $ac3                            "
409           "\n\t"
410           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
411           "\n\t" /* even 8 */
412           "sb               %[st3],           0(%[dst])                       "
413           "\n\t" /* even 6 */
414           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
415           "\n\t"
416           "extp             %[Temp2],         $ac2,           31              "
417           "\n\t" /* even 8 */
418           "lbux             %[st1],           %[Temp1](%[cm])                 "
419           "\n\t" /* even 7 */
420 
421           /* ODD pixels */
422           "ulw              %[qload1],        1(%[src])                       "
423           "\n\t"
424           "ulw              %[qload2],        5(%[src])                       "
425           "\n\t"
426 
427           /* odd 1. pixel */
428           "mtlo             %[vector_64],     $ac1                            "
429           "\n\t" /* odd 2 */
430           "mthi             $zero,            $ac1                            "
431           "\n\t"
432           "preceu.ph.qbr    %[p1],            %[qload1]                       "
433           "\n\t"
434           "preceu.ph.qbl    %[p2],            %[qload1]                       "
435           "\n\t"
436           "preceu.ph.qbr    %[p3],            %[qload2]                       "
437           "\n\t"
438           "preceu.ph.qbl    %[p4],            %[qload2]                       "
439           "\n\t"
440           "sb               %[st1],           0(%[dst])                       "
441           "\n\t" /* even 7 */
442           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
443           "\n\t"
444           "ulw              %[qload2],        9(%[src])                       "
445           "\n\t"
446           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
447           "\n\t" /* odd 1 */
448           "extp             %[Temp3],         $ac3,           31              "
449           "\n\t" /* odd 1 */
450           "lbux             %[st2],           %[Temp2](%[cm])                 "
451           "\n\t" /* even 8 */
452 
453           /* odd 2. pixel */
454           "mtlo             %[vector_64],     $ac2                            "
455           "\n\t" /* odd 3 */
456           "mthi             $zero,            $ac2                            "
457           "\n\t"
458           "preceu.ph.qbr    %[p1],            %[qload2]                       "
459           "\n\t"
460           "preceu.ph.qbl    %[p5],            %[qload2]                       "
461           "\n\t"
462           "sb               %[st2],           0(%[dst])                       "
463           "\n\t" /* even 8 */
464           "ulw              %[qload1],        13(%[src])                      "
465           "\n\t"
466           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
467           "\n\t" /* odd 2 */
468           "extp             %[Temp1],         $ac1,           31              "
469           "\n\t" /* odd 2 */
470           "lbux             %[st3],           %[Temp3](%[cm])                 "
471           "\n\t" /* odd 1 */
472 
473           /* odd 3. pixel */
474           "mtlo             %[vector_64],     $ac3                            "
475           "\n\t" /* odd 4 */
476           "mthi             $zero,            $ac3                            "
477           "\n\t"
478           "preceu.ph.qbr    %[p2],            %[qload1]                       "
479           "\n\t"
480           "sb               %[st3],           0(%[odd_dst])                   "
481           "\n\t" /* odd 1 */
482           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
483           "\n\t"
484           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
485           "\n\t" /* odd 3 */
486           "extp             %[Temp2],         $ac2,           31              "
487           "\n\t" /* odd 3 */
488           "lbux             %[st1],           %[Temp1](%[cm])                 "
489           "\n\t" /* odd 2 */
490 
491           /* odd 4. pixel */
492           "mtlo             %[vector_64],     $ac1                            "
493           "\n\t" /* odd 5 */
494           "mthi             $zero,            $ac1                            "
495           "\n\t"
496           "preceu.ph.qbl    %[p3],            %[qload1]                       "
497           "\n\t"
498           "sb               %[st1],           0(%[odd_dst])                   "
499           "\n\t" /* odd 2 */
500           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
501           "\n\t"
502           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
503           "\n\t" /* odd 4 */
504           "extp             %[Temp3],         $ac3,           31              "
505           "\n\t" /* odd 4 */
506           "lbux             %[st2],           %[Temp2](%[cm])                 "
507           "\n\t" /* odd 3 */
508 
509           /* odd 5. pixel */
510           "mtlo             %[vector_64],     $ac2                            "
511           "\n\t" /* odd 6 */
512           "mthi             $zero,            $ac2                            "
513           "\n\t"
514           "sb               %[st2],           0(%[odd_dst])                   "
515           "\n\t" /* odd 3 */
516           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
517           "\n\t"
518           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
519           "\n\t" /* odd 5 */
520           "extp             %[Temp1],         $ac1,           31              "
521           "\n\t" /* odd 5 */
522           "lbux             %[st3],           %[Temp3](%[cm])                 "
523           "\n\t" /* odd 4 */
524 
525           /* odd 6. pixel */
526           "mtlo             %[vector_64],     $ac3                            "
527           "\n\t" /* odd 7 */
528           "mthi             $zero,            $ac3                            "
529           "\n\t"
530           "sb               %[st3],           0(%[odd_dst])                   "
531           "\n\t" /* odd 4 */
532           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
533           "\n\t"
534           "ulw              %[qload1],        21(%[src])                      "
535           "\n\t"
536           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
537           "\n\t" /* odd 6 */
538           "extp             %[Temp2],         $ac2,           31              "
539           "\n\t" /* odd 6 */
540           "lbux             %[st1],           %[Temp1](%[cm])                 "
541           "\n\t" /* odd 5 */
542 
543           /* odd 7. pixel */
544           "mtlo             %[vector_64],     $ac1                            "
545           "\n\t" /* odd 8 */
546           "mthi             $zero,            $ac1                            "
547           "\n\t"
548           "preceu.ph.qbr    %[p5],            %[qload1]                       "
549           "\n\t"
550           "sb               %[st1],           0(%[odd_dst])                   "
551           "\n\t" /* odd 5 */
552           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
553           "\n\t"
554           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
555           "\n\t" /* odd 7 */
556           "extp             %[Temp3],         $ac3,           31              "
557           "\n\t" /* odd 7 */
558 
559           /* odd 8. pixel */
560           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
561           "\n\t" /* odd 8 */
562           "extp             %[Temp1],         $ac1,           31              "
563           "\n\t" /* odd 8 */
564 
565           "lbux             %[st2],           %[Temp2](%[cm])                 "
566           "\n\t" /* odd 6 */
567           "lbux             %[st3],           %[Temp3](%[cm])                 "
568           "\n\t" /* odd 7 */
569           "lbux             %[st1],           %[Temp1](%[cm])                 "
570           "\n\t" /* odd 8 */
571 
572           "sb               %[st2],           0(%[odd_dst])                   "
573           "\n\t" /* odd 6 */
574           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
575           "\n\t"
576 
577           "sb               %[st3],           0(%[odd_dst])                   "
578           "\n\t" /* odd 7 */
579           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
580           "\n\t"
581 
582           "sb               %[st1],           0(%[odd_dst])                   "
583           "\n\t" /* odd 8 */
584 
585           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
586             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
587             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
588             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
589             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
590           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
591             [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
592 
593       src += 16;
594       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
595       odd_dst = (dst + dst_stride);
596     }
597 
598     /* Next row... */
599     src_ptr += src_stride;
600     dst_ptr += 1;
601   }
602 }
603 
convolve_bi_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)604 static void convolve_bi_horiz_64_transposed_dspr2(
605     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
606     int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
607   int32_t c, y;
608   const uint8_t *src;
609   uint8_t *dst;
610   uint8_t *cm = vpx_ff_cropTbl;
611   uint32_t vector_64 = 64;
612   int32_t Temp1, Temp2, Temp3;
613   uint32_t qload1, qload2;
614   uint32_t p1, p2, p3, p4, p5;
615   uint32_t st1, st2, st3;
616   uint32_t dst_pitch_2 = (dst_stride << 1);
617   uint8_t *odd_dst;
618   const int16_t *filter = &filter_x0[3];
619   uint32_t filter45;
620 
621   filter45 = ((const int32_t *)filter)[0];
622 
623   for (y = h; y--;) {
624     /* prefetch data to cache memory */
625     prefetch_load(src_ptr + src_stride);
626     prefetch_load(src_ptr + src_stride + 32);
627     prefetch_load(src_ptr + src_stride + 64);
628 
629     src = src_ptr;
630     dst = dst_ptr;
631 
632     odd_dst = (dst + dst_stride);
633 
634     for (c = 0; c < 4; c++) {
635       __asm__ __volatile__(
636           "ulw              %[qload1],        0(%[src])                       "
637           "\n\t"
638           "ulw              %[qload2],        4(%[src])                       "
639           "\n\t"
640 
641           /* even 1. pixel */
642           "mtlo             %[vector_64],     $ac1                            "
643           "\n\t" /* even 1 */
644           "mthi             $zero,            $ac1                            "
645           "\n\t"
646           "mtlo             %[vector_64],     $ac2                            "
647           "\n\t" /* even 2 */
648           "mthi             $zero,            $ac2                            "
649           "\n\t"
650           "preceu.ph.qbr    %[p1],            %[qload1]                       "
651           "\n\t"
652           "preceu.ph.qbl    %[p2],            %[qload1]                       "
653           "\n\t"
654           "preceu.ph.qbr    %[p3],            %[qload2]                       "
655           "\n\t"
656           "preceu.ph.qbl    %[p4],            %[qload2]                       "
657           "\n\t"
658           "ulw              %[qload1],        8(%[src])                       "
659           "\n\t"
660           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
661           "\n\t" /* even 1 */
662           "extp             %[Temp1],         $ac1,           31              "
663           "\n\t" /* even 1 */
664 
665           /* even 2. pixel */
666           "mtlo             %[vector_64],     $ac3                            "
667           "\n\t" /* even 3 */
668           "mthi             $zero,            $ac3                            "
669           "\n\t"
670           "preceu.ph.qbr    %[p1],            %[qload1]                       "
671           "\n\t"
672           "preceu.ph.qbl    %[p5],            %[qload1]                       "
673           "\n\t"
674           "ulw              %[qload2],        12(%[src])                      "
675           "\n\t"
676           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
677           "\n\t" /* even 1 */
678           "lbux             %[st1],           %[Temp1](%[cm])                 "
679           "\n\t" /* even 1 */
680           "extp             %[Temp2],         $ac2,           31              "
681           "\n\t" /* even 1 */
682 
683           /* even 3. pixel */
684           "mtlo             %[vector_64],     $ac1                            "
685           "\n\t" /* even 4 */
686           "mthi             $zero,            $ac1                            "
687           "\n\t"
688           "preceu.ph.qbr    %[p2],            %[qload2]                       "
689           "\n\t"
690           "sb               %[st1],           0(%[dst])                       "
691           "\n\t" /* even 1 */
692           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
693           "          \n\t"
694           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
695           "\n\t" /* even 3 */
696           "extp             %[Temp3],         $ac3,           31              "
697           "\n\t" /* even 3 */
698           "lbux             %[st2],           %[Temp2](%[cm])                 "
699           "\n\t" /* even 1 */
700 
701           /* even 4. pixel */
702           "mtlo             %[vector_64],     $ac2                            "
703           "\n\t" /* even 5 */
704           "mthi             $zero,            $ac2                            "
705           "\n\t"
706           "preceu.ph.qbl    %[p3],            %[qload2]                       "
707           "\n\t"
708           "sb               %[st2],           0(%[dst])                       "
709           "\n\t" /* even 2 */
710           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
711           "\n\t"
712           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
713           "\n\t" /* even 4 */
714           "extp             %[Temp1],         $ac1,           31              "
715           "\n\t" /* even 4 */
716           "lbux             %[st3],           %[Temp3](%[cm])                 "
717           "\n\t" /* even 3 */
718 
719           /* even 5. pixel */
720           "mtlo             %[vector_64],     $ac3                            "
721           "\n\t" /* even 6 */
722           "mthi             $zero,            $ac3                            "
723           "\n\t"
724           "sb               %[st3],           0(%[dst])                       "
725           "\n\t" /* even 3 */
726           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
727           "\n\t"
728           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
729           "\n\t" /* even 5 */
730           "extp             %[Temp2],         $ac2,           31              "
731           "\n\t" /* even 5 */
732           "lbux             %[st1],           %[Temp1](%[cm])                 "
733           "\n\t" /* even 4 */
734 
735           /* even 6. pixel */
736           "mtlo             %[vector_64],     $ac1                            "
737           "\n\t" /* even 7 */
738           "mthi             $zero,            $ac1                            "
739           "\n\t"
740           "sb               %[st1],           0(%[dst])                       "
741           "\n\t" /* even 4 */
742           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
743           "\n\t"
744           "ulw              %[qload1],        20(%[src])                      "
745           "\n\t"
746           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
747           "\n\t" /* even 6 */
748           "extp             %[Temp3],         $ac3,           31              "
749           "\n\t" /* even 6 */
750           "lbux             %[st2],           %[Temp2](%[cm])                 "
751           "\n\t" /* even 5 */
752 
753           /* even 7. pixel */
754           "mtlo             %[vector_64],     $ac2                            "
755           "\n\t" /* even 8 */
756           "mthi             $zero,            $ac2                            "
757           "\n\t"
758           "preceu.ph.qbr    %[p5],            %[qload1]                       "
759           "\n\t"
760           "sb               %[st2],           0(%[dst])                       "
761           "\n\t" /* even 5 */
762           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
763           "\n\t"
764           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
765           "\n\t" /* even 7 */
766           "extp             %[Temp1],         $ac1,           31              "
767           "\n\t" /* even 7 */
768           "lbux             %[st3],           %[Temp3](%[cm])                 "
769           "\n\t" /* even 6 */
770 
771           /* even 8. pixel */
772           "mtlo             %[vector_64],     $ac3                            "
773           "\n\t" /* odd 1 */
774           "mthi             $zero,            $ac3                            "
775           "\n\t"
776           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
777           "\n\t" /* even 8 */
778           "sb               %[st3],           0(%[dst])                       "
779           "\n\t" /* even 6 */
780           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
781           "\n\t"
782           "extp             %[Temp2],         $ac2,           31              "
783           "\n\t" /* even 8 */
784           "lbux             %[st1],           %[Temp1](%[cm])                 "
785           "\n\t" /* even 7 */
786 
787           /* ODD pixels */
788           "ulw              %[qload1],        1(%[src])                       "
789           "\n\t"
790           "ulw              %[qload2],        5(%[src])                       "
791           "\n\t"
792 
793           /* odd 1. pixel */
794           "mtlo             %[vector_64],     $ac1                            "
795           "\n\t" /* odd 2 */
796           "mthi             $zero,            $ac1                            "
797           "\n\t"
798           "preceu.ph.qbr    %[p1],            %[qload1]                       "
799           "\n\t"
800           "preceu.ph.qbl    %[p2],            %[qload1]                       "
801           "\n\t"
802           "preceu.ph.qbr    %[p3],            %[qload2]                       "
803           "\n\t"
804           "preceu.ph.qbl    %[p4],            %[qload2]                       "
805           "\n\t"
806           "sb               %[st1],           0(%[dst])                       "
807           "\n\t" /* even 7 */
808           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
809           "\n\t"
810           "ulw              %[qload2],        9(%[src])                       "
811           "\n\t"
812           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
813           "\n\t" /* odd 1 */
814           "extp             %[Temp3],         $ac3,           31              "
815           "\n\t" /* odd 1 */
816           "lbux             %[st2],           %[Temp2](%[cm])                 "
817           "\n\t" /* even 8 */
818 
819           /* odd 2. pixel */
820           "mtlo             %[vector_64],     $ac2                            "
821           "\n\t" /* odd 3 */
822           "mthi             $zero,            $ac2                            "
823           "\n\t"
824           "preceu.ph.qbr    %[p1],            %[qload2]                       "
825           "\n\t"
826           "preceu.ph.qbl    %[p5],            %[qload2]                       "
827           "\n\t"
828           "sb               %[st2],           0(%[dst])                       "
829           "\n\t" /* even 8 */
830           "ulw              %[qload1],        13(%[src])                      "
831           "\n\t"
832           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
833           "\n\t" /* odd 2 */
834           "extp             %[Temp1],         $ac1,           31              "
835           "\n\t" /* odd 2 */
836           "lbux             %[st3],           %[Temp3](%[cm])                 "
837           "\n\t" /* odd 1 */
838 
839           /* odd 3. pixel */
840           "mtlo             %[vector_64],     $ac3                            "
841           "\n\t" /* odd 4 */
842           "mthi             $zero,            $ac3                            "
843           "\n\t"
844           "preceu.ph.qbr    %[p2],            %[qload1]                       "
845           "\n\t"
846           "sb               %[st3],           0(%[odd_dst])                   "
847           "\n\t" /* odd 1 */
848           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
849           "\n\t"
850           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
851           "\n\t" /* odd 3 */
852           "extp             %[Temp2],         $ac2,           31              "
853           "\n\t" /* odd 3 */
854           "lbux             %[st1],           %[Temp1](%[cm])                 "
855           "\n\t" /* odd 2 */
856 
857           /* odd 4. pixel */
858           "mtlo             %[vector_64],     $ac1                            "
859           "\n\t" /* odd 5 */
860           "mthi             $zero,            $ac1                            "
861           "\n\t"
862           "preceu.ph.qbl    %[p3],            %[qload1]                       "
863           "\n\t"
864           "sb               %[st1],           0(%[odd_dst])                   "
865           "\n\t" /* odd 2 */
866           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
867           "\n\t"
868           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
869           "\n\t" /* odd 4 */
870           "extp             %[Temp3],         $ac3,           31              "
871           "\n\t" /* odd 4 */
872           "lbux             %[st2],           %[Temp2](%[cm])                 "
873           "\n\t" /* odd 3 */
874 
875           /* odd 5. pixel */
876           "mtlo             %[vector_64],     $ac2                            "
877           "\n\t" /* odd 6 */
878           "mthi             $zero,            $ac2                            "
879           "\n\t"
880           "sb               %[st2],           0(%[odd_dst])                   "
881           "\n\t" /* odd 3 */
882           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
883           "\n\t"
884           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
885           "\n\t" /* odd 5 */
886           "extp             %[Temp1],         $ac1,           31              "
887           "\n\t" /* odd 5 */
888           "lbux             %[st3],           %[Temp3](%[cm])                 "
889           "\n\t" /* odd 4 */
890 
891           /* odd 6. pixel */
892           "mtlo             %[vector_64],     $ac3                            "
893           "\n\t" /* odd 7 */
894           "mthi             $zero,            $ac3                            "
895           "\n\t"
896           "sb               %[st3],           0(%[odd_dst])                   "
897           "\n\t" /* odd 4 */
898           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
899           "\n\t"
900           "ulw              %[qload1],        21(%[src])                      "
901           "\n\t"
902           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
903           "\n\t" /* odd 6 */
904           "extp             %[Temp2],         $ac2,           31              "
905           "\n\t" /* odd 6 */
906           "lbux             %[st1],           %[Temp1](%[cm])                 "
907           "\n\t" /* odd 5 */
908 
909           /* odd 7. pixel */
910           "mtlo             %[vector_64],     $ac1                            "
911           "\n\t" /* odd 8 */
912           "mthi             $zero,            $ac1                            "
913           "\n\t"
914           "preceu.ph.qbr    %[p5],            %[qload1]                       "
915           "\n\t"
916           "sb               %[st1],           0(%[odd_dst])                   "
917           "\n\t" /* odd 5 */
918           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
919           "\n\t"
920           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
921           "\n\t" /* odd 7 */
922           "extp             %[Temp3],         $ac3,           31              "
923           "\n\t" /* odd 7 */
924 
925           /* odd 8. pixel */
926           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
927           "\n\t" /* odd 8 */
928           "extp             %[Temp1],         $ac1,           31              "
929           "\n\t" /* odd 8 */
930 
931           "lbux             %[st2],           %[Temp2](%[cm])                 "
932           "\n\t" /* odd 6 */
933           "lbux             %[st3],           %[Temp3](%[cm])                 "
934           "\n\t" /* odd 7 */
935           "lbux             %[st1],           %[Temp1](%[cm])                 "
936           "\n\t" /* odd 8 */
937 
938           "sb               %[st2],           0(%[odd_dst])                   "
939           "\n\t" /* odd 6 */
940           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
941           "\n\t"
942 
943           "sb               %[st3],           0(%[odd_dst])                   "
944           "\n\t" /* odd 7 */
945           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
946           "\n\t"
947 
948           "sb               %[st1],           0(%[odd_dst])                   "
949           "\n\t" /* odd 8 */
950 
951           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
952             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
953             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
954             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
955             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
956           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
957             [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
958 
959       src += 16;
960       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
961       odd_dst = (dst + dst_stride);
962     }
963 
964     /* Next row... */
965     src_ptr += src_stride;
966     dst_ptr += 1;
967   }
968 }
969 
convolve_bi_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)970 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
971                                   uint8_t *dst, ptrdiff_t dst_stride,
972                                   const int16_t *filter, int w, int h) {
973   int x, y;
974 
975   for (y = 0; y < h; ++y) {
976     for (x = 0; x < w; ++x) {
977       int sum = 0;
978 
979       sum += src[x] * filter[3];
980       sum += src[x + 1] * filter[4];
981 
982       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
983     }
984 
985     src += src_stride;
986     dst += 1;
987   }
988 }
989 
/*
 * DSPr2 entry point for the two-tap horizontal convolution with transposed
 * output.
 *
 * Programs the DSP control register with the extract position used by the
 * "extp" instructions in the width-specific routines, prefetches the start
 * of the first source row, and dispatches on the block width.  Widths 4, 8,
 * 16, 32 and 64 go to the DSPr2 assembly routines; any other width falls
 * back to the plain C implementation.
 *
 *   src, src_stride  source pixels and distance between source rows
 *   dst, dst_stride  destination and distance between destination rows
 *   filter           kernel handed on unchanged to the selected routine
 *   w, h             width and height of the source block
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter, int w,
                         int h) {
  uint32_t pos = 38;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      /* the last argument is the number of 16-pixel chunks per row */
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
                                            filter, h, (w / 16));
      break;
    case 64:
      /* src and src + 32 were already requested above.  A 64-pixel row also
       * reads the byte at src + 64, and the 64-wide routine only prefetches
       * that offset for the rows after the first, so request it here instead
       * of repeating src + 32. */
      prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
                                   h);
      break;
  }
}
1029 #endif
1030