/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
22                                               int32_t src_stride, uint8_t *dst,
23                                               int32_t dst_stride,
24                                               const int16_t *filter_x0,
25                                               int32_t h) {
26   int32_t y;
27   uint8_t *cm = vpx_ff_cropTbl;
28   uint8_t *dst_ptr;
29   int32_t vector1b, vector2b, vector3b, vector4b;
30   int32_t Temp1, Temp2, Temp3, Temp4;
31   uint32_t vector4a = 64;
32   uint32_t tp1, tp2;
33   uint32_t p1, p2, p3, p4;
34   uint32_t tn1, tn2;
35 
36   vector1b = ((const int32_t *)filter_x0)[0];
37   vector2b = ((const int32_t *)filter_x0)[1];
38   vector3b = ((const int32_t *)filter_x0)[2];
39   vector4b = ((const int32_t *)filter_x0)[3];
40 
41   for (y = h; y--;) {
42     dst_ptr = dst;
43     /* prefetch data to cache memory */
44     prefetch_load(src + src_stride);
45     prefetch_load(src + src_stride + 32);
46 
47     __asm__ __volatile__(
48         "ulw              %[tp1],         0(%[src])                      \n\t"
49         "ulw              %[tp2],         4(%[src])                      \n\t"
50 
51         /* even 1. pixel */
52         "mtlo             %[vector4a],    $ac3                           \n\t"
53         "mthi             $zero,          $ac3                           \n\t"
54         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
55         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
56         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
57         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
58         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
59         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
60         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
61         "ulw              %[tn2],         8(%[src])                      \n\t"
62         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
63         "extp             %[Temp1],       $ac3,           31             \n\t"
64 
65         /* even 2. pixel */
66         "mtlo             %[vector4a],    $ac2                           \n\t"
67         "mthi             $zero,          $ac2                           \n\t"
68         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
69         "balign           %[tn1],         %[tn2],         3              \n\t"
70         "balign           %[tn2],         %[tp2],         3              \n\t"
71         "balign           %[tp2],         %[tp1],         3              \n\t"
72         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
73         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
74         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
75         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
76         "extp             %[Temp3],       $ac2,           31             \n\t"
77 
78         /* odd 1. pixel */
79         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
80         "mtlo             %[vector4a],    $ac3                           \n\t"
81         "mthi             $zero,          $ac3                           \n\t"
82         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
83         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
84         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
85         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
86         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
87         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
88         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
89         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
90         "extp             %[Temp2],       $ac3,           31             \n\t"
91 
92         /* odd 2. pixel */
93         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
94         "mtlo             %[vector4a],    $ac2                           \n\t"
95         "mthi             $zero,          $ac2                           \n\t"
96         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
97         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
98         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
99         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
100         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
101         "extp             %[Temp4],       $ac2,           31             \n\t"
102 
103         /* clamp */
104         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
105         "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
106 
107         /* store bytes */
108         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
109         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
110 
111         "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
112         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
113 
114         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
115         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
116 
117         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
118         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
119 
120         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
121           [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
122           [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
123           [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
124         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
125           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
126           [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
127           [dst_stride] "r"(dst_stride));
128 
129     /* Next row... */
130     src += src_stride;
131     dst += 1;
132   }
133 }
134 
convolve_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)135 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
136                                               int32_t src_stride, uint8_t *dst,
137                                               int32_t dst_stride,
138                                               const int16_t *filter_x0,
139                                               int32_t h) {
140   int32_t y;
141   uint8_t *cm = vpx_ff_cropTbl;
142   uint8_t *dst_ptr;
143   uint32_t vector4a = 64;
144   int32_t vector1b, vector2b, vector3b, vector4b;
145   int32_t Temp1, Temp2, Temp3;
146   uint32_t tp1, tp2, tp3;
147   uint32_t p1, p2, p3, p4, n1;
148   uint8_t *odd_dst;
149   uint32_t dst_pitch_2 = (dst_stride << 1);
150 
151   vector1b = ((const int32_t *)filter_x0)[0];
152   vector2b = ((const int32_t *)filter_x0)[1];
153   vector3b = ((const int32_t *)filter_x0)[2];
154   vector4b = ((const int32_t *)filter_x0)[3];
155 
156   for (y = h; y--;) {
157     /* prefetch data to cache memory */
158     prefetch_load(src + src_stride);
159     prefetch_load(src + src_stride + 32);
160 
161     dst_ptr = dst;
162     odd_dst = (dst_ptr + dst_stride);
163 
164     __asm__ __volatile__(
165         "ulw              %[tp2],         0(%[src])                       \n\t"
166         "ulw              %[tp1],         4(%[src])                       \n\t"
167 
168         /* even 1. pixel */
169         "mtlo             %[vector4a],    $ac3                            \n\t"
170         "mthi             $zero,          $ac3                            \n\t"
171         "mtlo             %[vector4a],    $ac2                            \n\t"
172         "mthi             $zero,          $ac2                            \n\t"
173         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
174         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
175         "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
176         "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
177         "ulw              %[tp3],         8(%[src])                       \n\t"
178         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
179         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
180         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
181         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
182         "extp             %[Temp1],       $ac3,           31              \n\t"
183 
184         /* even 2. pixel */
185         "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
186         "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
187         "ulw              %[tp2],         12(%[src])                      \n\t"
188         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
189         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
190         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
191         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
192         "extp             %[Temp3],       $ac2,           31              \n\t"
193 
194         /* even 3. pixel */
195         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
196         "mtlo             %[vector4a],    $ac1                            \n\t"
197         "mthi             $zero,          $ac1                            \n\t"
198         "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
199         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
200         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
201         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
202         "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
203         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
204         "extp             %[p3],          $ac1,           31              \n\t"
205 
206         /* even 4. pixel */
207         "mtlo             %[vector4a],    $ac2                            \n\t"
208         "mthi             $zero,          $ac2                            \n\t"
209         "mtlo             %[vector4a],    $ac3                            \n\t"
210         "mthi             $zero,          $ac3                            \n\t"
211         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
212         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
213         "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
214         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
215 
216         "ulw              %[tp1],         1(%[src])                       \n\t"
217         "ulw              %[tp3],         5(%[src])                       \n\t"
218 
219         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
220         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
221         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
222         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
223         "extp             %[Temp3],       $ac2,           31              \n\t"
224 
225         "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
226 
227         /* odd 1. pixel */
228         "mtlo             %[vector4a],    $ac1                            \n\t"
229         "mthi             $zero,          $ac1                            \n\t"
230         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
231         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
232         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
233         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
234         "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
235         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
236         "ulw              %[tp2],         9(%[src])                       \n\t"
237 
238         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
239         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
240         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
241         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
242         "extp             %[Temp2],       $ac3,           31              \n\t"
243 
244         /* odd 2. pixel */
245         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
246         "mtlo             %[vector4a],    $ac3                            \n\t"
247         "mthi             $zero,          $ac3                            \n\t"
248         "mtlo             %[vector4a],    $ac2                            \n\t"
249         "mthi             $zero,          $ac2                            \n\t"
250         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
251         "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
252         "ulw              %[Temp1],       13(%[src])                      \n\t"
253         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
254         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
255         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
256         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
257         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
258         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
259         "extp             %[Temp3],       $ac1,           31              \n\t"
260 
261         /* odd 3. pixel */
262         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
263         "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
264         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
265         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
266         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
267         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
268         "extp             %[Temp2],       $ac3,           31              \n\t"
269 
270         /* odd 4. pixel */
271         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
272         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
273         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
274         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
275         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
276         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
277         "extp             %[Temp1],       $ac2,           31              \n\t"
278 
279         /* clamp */
280         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
281         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
282         "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
283 
284         /* store bytes */
285         "sb               %[p4],          0(%[odd_dst])                   \n\t"
286         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
287 
288         "sb               %[p2],          0(%[odd_dst])                   \n\t"
289         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
290 
291         "sb               %[n1],          0(%[odd_dst])                   \n\t"
292 
293         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
294           [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
295           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
296           [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
297         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
298           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
299           [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
300           [dst_pitch_2] "r"(dst_pitch_2));
301 
302     /* Next row... */
303     src += src_stride;
304     dst += 1;
305   }
306 }
307 
convolve_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)308 static void convolve_horiz_16_transposed_dspr2(
309     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
310     int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
311   int32_t c, y;
312   const uint8_t *src;
313   uint8_t *dst;
314   uint8_t *cm = vpx_ff_cropTbl;
315   uint32_t vector_64 = 64;
316   int32_t filter12, filter34, filter56, filter78;
317   int32_t Temp1, Temp2, Temp3;
318   uint32_t qload1, qload2;
319   uint32_t p1, p2, p3, p4, p5;
320   uint32_t st1, st2, st3;
321   uint32_t dst_pitch_2 = (dst_stride << 1);
322   uint8_t *odd_dst;
323 
324   filter12 = ((const int32_t *)filter_x0)[0];
325   filter34 = ((const int32_t *)filter_x0)[1];
326   filter56 = ((const int32_t *)filter_x0)[2];
327   filter78 = ((const int32_t *)filter_x0)[3];
328 
329   for (y = h; y--;) {
330     /* prefetch data to cache memory */
331     prefetch_load(src_ptr + src_stride);
332     prefetch_load(src_ptr + src_stride + 32);
333 
334     src = src_ptr;
335     dst = dst_ptr;
336 
337     odd_dst = (dst + dst_stride);
338 
339     for (c = 0; c < count; c++) {
340       __asm__ __volatile__(
341           "ulw              %[qload1],        0(%[src])                       "
342           "\n\t"
343           "ulw              %[qload2],        4(%[src])                       "
344           "\n\t"
345 
346           /* even 1. pixel */
347           "mtlo             %[vector_64],     $ac1                            "
348           "\n\t" /* even 1 */
349           "mthi             $zero,            $ac1                            "
350           "\n\t"
351           "mtlo             %[vector_64],     $ac2                            "
352           "\n\t" /* even 2 */
353           "mthi             $zero,            $ac2                            "
354           "\n\t"
355           "preceu.ph.qbr    %[p3],            %[qload2]                       "
356           "\n\t"
357           "preceu.ph.qbl    %[p4],            %[qload2]                       "
358           "\n\t"
359           "preceu.ph.qbr    %[p1],            %[qload1]                       "
360           "\n\t"
361           "preceu.ph.qbl    %[p2],            %[qload1]                       "
362           "\n\t"
363           "ulw              %[qload2],        8(%[src])                       "
364           "\n\t"
365           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
366           "\n\t" /* even 1 */
367           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
368           "\n\t" /* even 1 */
369           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
370           "\n\t" /* even 1 */
371           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
372           "\n\t" /* even 1 */
373           "extp             %[Temp1],         $ac1,           31              "
374           "\n\t" /* even 1 */
375 
376           /* even 2. pixel */
377           "mtlo             %[vector_64],     $ac3                            "
378           "\n\t" /* even 3 */
379           "mthi             $zero,            $ac3                            "
380           "\n\t"
381           "preceu.ph.qbr    %[p1],            %[qload2]                       "
382           "\n\t"
383           "preceu.ph.qbl    %[p5],            %[qload2]                       "
384           "\n\t"
385           "ulw              %[qload1],        12(%[src])                      "
386           "\n\t"
387           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
388           "\n\t" /* even 1 */
389           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
390           "\n\t" /* even 1 */
391           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
392           "\n\t" /* even 1 */
393           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
394           "\n\t" /* even 1 */
395           "lbux             %[st1],           %[Temp1](%[cm])                 "
396           "\n\t" /* even 1 */
397           "extp             %[Temp2],         $ac2,           31              "
398           "\n\t" /* even 1 */
399 
400           /* even 3. pixel */
401           "mtlo             %[vector_64],     $ac1                            "
402           "\n\t" /* even 4 */
403           "mthi             $zero,            $ac1                            "
404           "\n\t"
405           "preceu.ph.qbr    %[p2],            %[qload1]                       "
406           "\n\t"
407           "sb               %[st1],           0(%[dst])                       "
408           "\n\t" /* even 1 */
409           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
410           "          \n\t"
411           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
412           "\n\t" /* even 3 */
413           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
414           "\n\t" /* even 3 */
415           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
416           "\n\t" /* even 3 */
417           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
418           "\n\t" /* even 3 */
419           "extp             %[Temp3],         $ac3,           31              "
420           "\n\t" /* even 3 */
421           "lbux             %[st2],           %[Temp2](%[cm])                 "
422           "\n\t" /* even 1 */
423 
424           /* even 4. pixel */
425           "mtlo             %[vector_64],     $ac2                            "
426           "\n\t" /* even 5 */
427           "mthi             $zero,            $ac2                            "
428           "\n\t"
429           "preceu.ph.qbl    %[p3],            %[qload1]                       "
430           "\n\t"
431           "sb               %[st2],           0(%[dst])                       "
432           "\n\t" /* even 2 */
433           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
434           "\n\t"
435           "ulw              %[qload2],        16(%[src])                      "
436           "\n\t"
437           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
438           "\n\t" /* even 4 */
439           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
440           "\n\t" /* even 4 */
441           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
442           "\n\t" /* even 4 */
443           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
444           "\n\t" /* even 4 */
445           "extp             %[Temp1],         $ac1,           31              "
446           "\n\t" /* even 4 */
447           "lbux             %[st3],           %[Temp3](%[cm])                 "
448           "\n\t" /* even 3 */
449 
450           /* even 5. pixel */
451           "mtlo             %[vector_64],     $ac3                            "
452           "\n\t" /* even 6 */
453           "mthi             $zero,            $ac3                            "
454           "\n\t"
455           "preceu.ph.qbr    %[p4],            %[qload2]                       "
456           "\n\t"
457           "sb               %[st3],           0(%[dst])                       "
458           "\n\t" /* even 3 */
459           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
460           "\n\t"
461           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
462           "\n\t" /* even 5 */
463           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
464           "\n\t" /* even 5 */
465           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
466           "\n\t" /* even 5 */
467           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
468           "\n\t" /* even 5 */
469           "extp             %[Temp2],         $ac2,           31              "
470           "\n\t" /* even 5 */
471           "lbux             %[st1],           %[Temp1](%[cm])                 "
472           "\n\t" /* even 4 */
473 
474           /* even 6. pixel */
475           "mtlo             %[vector_64],     $ac1                            "
476           "\n\t" /* even 7 */
477           "mthi             $zero,            $ac1                            "
478           "\n\t"
479           "preceu.ph.qbl    %[p1],            %[qload2]                       "
480           "\n\t"
481           "sb               %[st1],           0(%[dst])                       "
482           "\n\t" /* even 4 */
483           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
484           "\n\t"
485           "ulw              %[qload1],        20(%[src])                      "
486           "\n\t"
487           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
488           "\n\t" /* even 6 */
489           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
490           "\n\t" /* even 6 */
491           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
492           "\n\t" /* even 6 */
493           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
494           "\n\t" /* even 6 */
495           "extp             %[Temp3],         $ac3,           31              "
496           "\n\t" /* even 6 */
497           "lbux             %[st2],           %[Temp2](%[cm])                 "
498           "\n\t" /* even 5 */
499 
500           /* even 7. pixel */
501           "mtlo             %[vector_64],     $ac2                            "
502           "\n\t" /* even 8 */
503           "mthi             $zero,            $ac2                            "
504           "\n\t"
505           "preceu.ph.qbr    %[p5],            %[qload1]                       "
506           "\n\t"
507           "sb               %[st2],           0(%[dst])                       "
508           "\n\t" /* even 5 */
509           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
510           "\n\t"
511           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
512           "\n\t" /* even 7 */
513           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
514           "\n\t" /* even 7 */
515           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
516           "\n\t" /* even 7 */
517           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
518           "\n\t" /* even 7 */
519           "extp             %[Temp1],         $ac1,           31              "
520           "\n\t" /* even 7 */
521           "lbux             %[st3],           %[Temp3](%[cm])                 "
522           "\n\t" /* even 6 */
523 
524           /* even 8. pixel */
525           "mtlo             %[vector_64],     $ac3                            "
526           "\n\t" /* odd 1 */
527           "mthi             $zero,            $ac3                            "
528           "\n\t"
529           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
530           "\n\t" /* even 8 */
531           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
532           "\n\t" /* even 8 */
533           "sb               %[st3],           0(%[dst])                       "
534           "\n\t" /* even 6 */
535           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
536           "\n\t"
537           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
538           "\n\t" /* even 8 */
539           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
540           "\n\t" /* even 8 */
541           "extp             %[Temp2],         $ac2,           31              "
542           "\n\t" /* even 8 */
543           "lbux             %[st1],           %[Temp1](%[cm])                 "
544           "\n\t" /* even 7 */
545 
546           /* ODD pixels */
547           "ulw              %[qload1],        1(%[src])                       "
548           "\n\t"
549           "ulw              %[qload2],        5(%[src])                       "
550           "\n\t"
551 
552           /* odd 1. pixel */
553           "mtlo             %[vector_64],     $ac1                            "
554           "\n\t" /* odd 2 */
555           "mthi             $zero,            $ac1                            "
556           "\n\t"
557           "preceu.ph.qbr    %[p1],            %[qload1]                       "
558           "\n\t"
559           "preceu.ph.qbl    %[p2],            %[qload1]                       "
560           "\n\t"
561           "preceu.ph.qbr    %[p3],            %[qload2]                       "
562           "\n\t"
563           "preceu.ph.qbl    %[p4],            %[qload2]                       "
564           "\n\t"
565           "sb               %[st1],           0(%[dst])                       "
566           "\n\t" /* even 7 */
567           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
568           "\n\t"
569           "ulw              %[qload2],        9(%[src])                       "
570           "\n\t"
571           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
572           "\n\t" /* odd 1 */
573           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
574           "\n\t" /* odd 1 */
575           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
576           "\n\t" /* odd 1 */
577           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
578           "\n\t" /* odd 1 */
579           "extp             %[Temp3],         $ac3,           31              "
580           "\n\t" /* odd 1 */
581           "lbux             %[st2],           %[Temp2](%[cm])                 "
582           "\n\t" /* even 8 */
583 
584           /* odd 2. pixel */
585           "mtlo             %[vector_64],     $ac2                            "
586           "\n\t" /* odd 3 */
587           "mthi             $zero,            $ac2                            "
588           "\n\t"
589           "preceu.ph.qbr    %[p1],            %[qload2]                       "
590           "\n\t"
591           "preceu.ph.qbl    %[p5],            %[qload2]                       "
592           "\n\t"
593           "sb               %[st2],           0(%[dst])                       "
594           "\n\t" /* even 8 */
595           "ulw              %[qload1],        13(%[src])                      "
596           "\n\t"
597           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
598           "\n\t" /* odd 2 */
599           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
600           "\n\t" /* odd 2 */
601           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
602           "\n\t" /* odd 2 */
603           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
604           "\n\t" /* odd 2 */
605           "extp             %[Temp1],         $ac1,           31              "
606           "\n\t" /* odd 2 */
607           "lbux             %[st3],           %[Temp3](%[cm])                 "
608           "\n\t" /* odd 1 */
609 
610           /* odd 3. pixel */
611           "mtlo             %[vector_64],     $ac3                            "
612           "\n\t" /* odd 4 */
613           "mthi             $zero,            $ac3                            "
614           "\n\t"
615           "preceu.ph.qbr    %[p2],            %[qload1]                       "
616           "\n\t"
617           "sb               %[st3],           0(%[odd_dst])                   "
618           "\n\t" /* odd 1 */
619           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
620           "\n\t"
621           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
622           "\n\t" /* odd 3 */
623           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
624           "\n\t" /* odd 3 */
625           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
626           "\n\t" /* odd 3 */
627           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
628           "\n\t" /* odd 3 */
629           "extp             %[Temp2],         $ac2,           31              "
630           "\n\t" /* odd 3 */
631           "lbux             %[st1],           %[Temp1](%[cm])                 "
632           "\n\t" /* odd 2 */
633 
634           /* odd 4. pixel */
635           "mtlo             %[vector_64],     $ac1                            "
636           "\n\t" /* odd 5 */
637           "mthi             $zero,            $ac1                            "
638           "\n\t"
639           "preceu.ph.qbl    %[p3],            %[qload1]                       "
640           "\n\t"
641           "sb               %[st1],           0(%[odd_dst])                   "
642           "\n\t" /* odd 2 */
643           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
644           "\n\t"
645           "ulw              %[qload2],        17(%[src])                      "
646           "\n\t"
647           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
648           "\n\t" /* odd 4 */
649           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
650           "\n\t" /* odd 4 */
651           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
652           "\n\t" /* odd 4 */
653           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
654           "\n\t" /* odd 4 */
655           "extp             %[Temp3],         $ac3,           31              "
656           "\n\t" /* odd 4 */
657           "lbux             %[st2],           %[Temp2](%[cm])                 "
658           "\n\t" /* odd 3 */
659 
660           /* odd 5. pixel */
661           "mtlo             %[vector_64],     $ac2                            "
662           "\n\t" /* odd 6 */
663           "mthi             $zero,            $ac2                            "
664           "\n\t"
665           "preceu.ph.qbr    %[p4],            %[qload2]                       "
666           "\n\t"
667           "sb               %[st2],           0(%[odd_dst])                   "
668           "\n\t" /* odd 3 */
669           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
670           "\n\t"
671           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
672           "\n\t" /* odd 5 */
673           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
674           "\n\t" /* odd 5 */
675           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
676           "\n\t" /* odd 5 */
677           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
678           "\n\t" /* odd 5 */
679           "extp             %[Temp1],         $ac1,           31              "
680           "\n\t" /* odd 5 */
681           "lbux             %[st3],           %[Temp3](%[cm])                 "
682           "\n\t" /* odd 4 */
683 
684           /* odd 6. pixel */
685           "mtlo             %[vector_64],     $ac3                            "
686           "\n\t" /* odd 7 */
687           "mthi             $zero,            $ac3                            "
688           "\n\t"
689           "preceu.ph.qbl    %[p1],            %[qload2]                       "
690           "\n\t"
691           "sb               %[st3],           0(%[odd_dst])                   "
692           "\n\t" /* odd 4 */
693           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
694           "\n\t"
695           "ulw              %[qload1],        21(%[src])                      "
696           "\n\t"
697           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
698           "\n\t" /* odd 6 */
699           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
700           "\n\t" /* odd 6 */
701           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
702           "\n\t" /* odd 6 */
703           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
704           "\n\t" /* odd 6 */
705           "extp             %[Temp2],         $ac2,           31              "
706           "\n\t" /* odd 6 */
707           "lbux             %[st1],           %[Temp1](%[cm])                 "
708           "\n\t" /* odd 5 */
709 
710           /* odd 7. pixel */
711           "mtlo             %[vector_64],     $ac1                            "
712           "\n\t" /* odd 8 */
713           "mthi             $zero,            $ac1                            "
714           "\n\t"
715           "preceu.ph.qbr    %[p5],            %[qload1]                       "
716           "\n\t"
717           "sb               %[st1],           0(%[odd_dst])                   "
718           "\n\t" /* odd 5 */
719           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
720           "\n\t"
721           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
722           "\n\t" /* odd 7 */
723           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
724           "\n\t" /* odd 7 */
725           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
726           "\n\t" /* odd 7 */
727           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
728           "\n\t" /* odd 7 */
729           "extp             %[Temp3],         $ac3,           31              "
730           "\n\t" /* odd 7 */
731 
732           /* odd 8. pixel */
733           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
734           "\n\t" /* odd 8 */
735           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
736           "\n\t" /* odd 8 */
737           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
738           "\n\t" /* odd 8 */
739           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
740           "\n\t" /* odd 8 */
741           "extp             %[Temp1],         $ac1,           31              "
742           "\n\t" /* odd 8 */
743 
744           "lbux             %[st2],           %[Temp2](%[cm])                 "
745           "\n\t" /* odd 6 */
746           "lbux             %[st3],           %[Temp3](%[cm])                 "
747           "\n\t" /* odd 7 */
748           "lbux             %[st1],           %[Temp1](%[cm])                 "
749           "\n\t" /* odd 8 */
750 
751           "sb               %[st2],           0(%[odd_dst])                   "
752           "\n\t" /* odd 6 */
753           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
754           "\n\t"
755 
756           "sb               %[st3],           0(%[odd_dst])                   "
757           "\n\t" /* odd 7 */
758           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
759           "\n\t"
760 
761           "sb               %[st1],           0(%[odd_dst])                   "
762           "\n\t" /* odd 8 */
763 
764           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
765             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
766             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
767             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
768             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
769           : [filter12] "r"(filter12), [filter34] "r"(filter34),
770             [filter56] "r"(filter56), [filter78] "r"(filter78),
771             [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
772             [dst_pitch_2] "r"(dst_pitch_2));
773 
774       src += 16;
775       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
776       odd_dst = (dst + dst_stride);
777     }
778 
779     /* Next row... */
780     src_ptr += src_stride;
781 
782     dst_ptr += 1;
783   }
784 }
785 
convolve_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)786 static void convolve_horiz_64_transposed_dspr2(
787     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
788     int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
789   int32_t c, y;
790   const uint8_t *src;
791   uint8_t *dst;
792   uint8_t *cm = vpx_ff_cropTbl;
793   uint32_t vector_64 = 64;
794   int32_t filter12, filter34, filter56, filter78;
795   int32_t Temp1, Temp2, Temp3;
796   uint32_t qload1, qload2;
797   uint32_t p1, p2, p3, p4, p5;
798   uint32_t st1, st2, st3;
799   uint32_t dst_pitch_2 = (dst_stride << 1);
800   uint8_t *odd_dst;
801 
802   filter12 = ((const int32_t *)filter_x0)[0];
803   filter34 = ((const int32_t *)filter_x0)[1];
804   filter56 = ((const int32_t *)filter_x0)[2];
805   filter78 = ((const int32_t *)filter_x0)[3];
806 
807   for (y = h; y--;) {
808     /* prefetch data to cache memory */
809     prefetch_load(src_ptr + src_stride);
810     prefetch_load(src_ptr + src_stride + 32);
811     prefetch_load(src_ptr + src_stride + 64);
812 
813     src = src_ptr;
814     dst = dst_ptr;
815 
816     odd_dst = (dst + dst_stride);
817 
818     for (c = 0; c < 4; c++) {
819       __asm__ __volatile__(
820           "ulw              %[qload1],        0(%[src])                       "
821           "\n\t"
822           "ulw              %[qload2],        4(%[src])                       "
823           "\n\t"
824 
825           /* even 1. pixel */
826           "mtlo             %[vector_64],     $ac1                            "
827           "\n\t" /* even 1 */
828           "mthi             $zero,            $ac1                            "
829           "\n\t"
830           "mtlo             %[vector_64],     $ac2                            "
831           "\n\t" /* even 2 */
832           "mthi             $zero,            $ac2                            "
833           "\n\t"
834           "preceu.ph.qbr    %[p3],            %[qload2]                       "
835           "\n\t"
836           "preceu.ph.qbl    %[p4],            %[qload2]                       "
837           "\n\t"
838           "preceu.ph.qbr    %[p1],            %[qload1]                       "
839           "\n\t"
840           "preceu.ph.qbl    %[p2],            %[qload1]                       "
841           "\n\t"
842           "ulw              %[qload2],        8(%[src])                       "
843           "\n\t"
844           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
845           "\n\t" /* even 1 */
846           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
847           "\n\t" /* even 1 */
848           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
849           "\n\t" /* even 1 */
850           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
851           "\n\t" /* even 1 */
852           "extp             %[Temp1],         $ac1,           31              "
853           "\n\t" /* even 1 */
854 
855           /* even 2. pixel */
856           "mtlo             %[vector_64],     $ac3                            "
857           "\n\t" /* even 3 */
858           "mthi             $zero,            $ac3                            "
859           "\n\t"
860           "preceu.ph.qbr    %[p1],            %[qload2]                       "
861           "\n\t"
862           "preceu.ph.qbl    %[p5],            %[qload2]                       "
863           "\n\t"
864           "ulw              %[qload1],        12(%[src])                      "
865           "\n\t"
866           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
867           "\n\t" /* even 1 */
868           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
869           "\n\t" /* even 1 */
870           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
871           "\n\t" /* even 1 */
872           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
873           "\n\t" /* even 1 */
874           "lbux             %[st1],           %[Temp1](%[cm])                 "
875           "\n\t" /* even 1 */
876           "extp             %[Temp2],         $ac2,           31              "
877           "\n\t" /* even 1 */
878 
879           /* even 3. pixel */
880           "mtlo             %[vector_64],     $ac1                            "
881           "\n\t" /* even 4 */
882           "mthi             $zero,            $ac1                            "
883           "\n\t"
884           "preceu.ph.qbr    %[p2],            %[qload1]                       "
885           "\n\t"
886           "sb               %[st1],           0(%[dst])                       "
887           "\n\t" /* even 1 */
888           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
889           "          \n\t"
890           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
891           "\n\t" /* even 3 */
892           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
893           "\n\t" /* even 3 */
894           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
895           "\n\t" /* even 3 */
896           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
897           "\n\t" /* even 3 */
898           "extp             %[Temp3],         $ac3,           31              "
899           "\n\t" /* even 3 */
900           "lbux             %[st2],           %[Temp2](%[cm])                 "
901           "\n\t" /* even 1 */
902 
903           /* even 4. pixel */
904           "mtlo             %[vector_64],     $ac2                            "
905           "\n\t" /* even 5 */
906           "mthi             $zero,            $ac2                            "
907           "\n\t"
908           "preceu.ph.qbl    %[p3],            %[qload1]                       "
909           "\n\t"
910           "sb               %[st2],           0(%[dst])                       "
911           "\n\t" /* even 2 */
912           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
913           "\n\t"
914           "ulw              %[qload2],        16(%[src])                      "
915           "\n\t"
916           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
917           "\n\t" /* even 4 */
918           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
919           "\n\t" /* even 4 */
920           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
921           "\n\t" /* even 4 */
922           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
923           "\n\t" /* even 4 */
924           "extp             %[Temp1],         $ac1,           31              "
925           "\n\t" /* even 4 */
926           "lbux             %[st3],           %[Temp3](%[cm])                 "
927           "\n\t" /* even 3 */
928 
929           /* even 5. pixel */
930           "mtlo             %[vector_64],     $ac3                            "
931           "\n\t" /* even 6 */
932           "mthi             $zero,            $ac3                            "
933           "\n\t"
934           "preceu.ph.qbr    %[p4],            %[qload2]                       "
935           "\n\t"
936           "sb               %[st3],           0(%[dst])                       "
937           "\n\t" /* even 3 */
938           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
939           "\n\t"
940           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
941           "\n\t" /* even 5 */
942           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
943           "\n\t" /* even 5 */
944           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
945           "\n\t" /* even 5 */
946           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
947           "\n\t" /* even 5 */
948           "extp             %[Temp2],         $ac2,           31              "
949           "\n\t" /* even 5 */
950           "lbux             %[st1],           %[Temp1](%[cm])                 "
951           "\n\t" /* even 4 */
952 
953           /* even 6. pixel */
954           "mtlo             %[vector_64],     $ac1                            "
955           "\n\t" /* even 7 */
956           "mthi             $zero,            $ac1                            "
957           "\n\t"
958           "preceu.ph.qbl    %[p1],            %[qload2]                       "
959           "\n\t"
960           "sb               %[st1],           0(%[dst])                       "
961           "\n\t" /* even 4 */
962           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
963           "\n\t"
964           "ulw              %[qload1],        20(%[src])                      "
965           "\n\t"
966           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
967           "\n\t" /* even 6 */
968           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
969           "\n\t" /* even 6 */
970           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
971           "\n\t" /* even 6 */
972           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
973           "\n\t" /* even 6 */
974           "extp             %[Temp3],         $ac3,           31              "
975           "\n\t" /* even 6 */
976           "lbux             %[st2],           %[Temp2](%[cm])                 "
977           "\n\t" /* even 5 */
978 
979           /* even 7. pixel */
980           "mtlo             %[vector_64],     $ac2                            "
981           "\n\t" /* even 8 */
982           "mthi             $zero,            $ac2                            "
983           "\n\t"
984           "preceu.ph.qbr    %[p5],            %[qload1]                       "
985           "\n\t"
986           "sb               %[st2],           0(%[dst])                       "
987           "\n\t" /* even 5 */
988           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
989           "\n\t"
990           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
991           "\n\t" /* even 7 */
992           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
993           "\n\t" /* even 7 */
994           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
995           "\n\t" /* even 7 */
996           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
997           "\n\t" /* even 7 */
998           "extp             %[Temp1],         $ac1,           31              "
999           "\n\t" /* even 7 */
1000           "lbux             %[st3],           %[Temp3](%[cm])                 "
1001           "\n\t" /* even 6 */
1002 
1003           /* even 8. pixel */
1004           "mtlo             %[vector_64],     $ac3                            "
1005           "\n\t" /* odd 1 */
1006           "mthi             $zero,            $ac3                            "
1007           "\n\t"
1008           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
1009           "\n\t" /* even 8 */
1010           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
1011           "\n\t" /* even 8 */
1012           "sb               %[st3],           0(%[dst])                       "
1013           "\n\t" /* even 6 */
1014           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
1015           "\n\t"
1016           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
1017           "\n\t" /* even 8 */
1018           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
1019           "\n\t" /* even 8 */
1020           "extp             %[Temp2],         $ac2,           31              "
1021           "\n\t" /* even 8 */
1022           "lbux             %[st1],           %[Temp1](%[cm])                 "
1023           "\n\t" /* even 7 */
1024 
1025           /* ODD pixels */
1026           "ulw              %[qload1],        1(%[src])                       "
1027           "\n\t"
1028           "ulw              %[qload2],        5(%[src])                       "
1029           "\n\t"
1030 
1031           /* odd 1. pixel */
1032           "mtlo             %[vector_64],     $ac1                            "
1033           "\n\t" /* odd 2 */
1034           "mthi             $zero,            $ac1                            "
1035           "\n\t"
1036           "preceu.ph.qbr    %[p1],            %[qload1]                       "
1037           "\n\t"
1038           "preceu.ph.qbl    %[p2],            %[qload1]                       "
1039           "\n\t"
1040           "preceu.ph.qbr    %[p3],            %[qload2]                       "
1041           "\n\t"
1042           "preceu.ph.qbl    %[p4],            %[qload2]                       "
1043           "\n\t"
1044           "sb               %[st1],           0(%[dst])                       "
1045           "\n\t" /* even 7 */
1046           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
1047           "\n\t"
1048           "ulw              %[qload2],        9(%[src])                       "
1049           "\n\t"
1050           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
1051           "\n\t" /* odd 1 */
1052           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
1053           "\n\t" /* odd 1 */
1054           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
1055           "\n\t" /* odd 1 */
1056           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
1057           "\n\t" /* odd 1 */
1058           "extp             %[Temp3],         $ac3,           31              "
1059           "\n\t" /* odd 1 */
1060           "lbux             %[st2],           %[Temp2](%[cm])                 "
1061           "\n\t" /* even 8 */
1062 
1063           /* odd 2. pixel */
1064           "mtlo             %[vector_64],     $ac2                            "
1065           "\n\t" /* odd 3 */
1066           "mthi             $zero,            $ac2                            "
1067           "\n\t"
1068           "preceu.ph.qbr    %[p1],            %[qload2]                       "
1069           "\n\t"
1070           "preceu.ph.qbl    %[p5],            %[qload2]                       "
1071           "\n\t"
1072           "sb               %[st2],           0(%[dst])                       "
1073           "\n\t" /* even 8 */
1074           "ulw              %[qload1],        13(%[src])                      "
1075           "\n\t"
1076           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
1077           "\n\t" /* odd 2 */
1078           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
1079           "\n\t" /* odd 2 */
1080           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
1081           "\n\t" /* odd 2 */
1082           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
1083           "\n\t" /* odd 2 */
1084           "extp             %[Temp1],         $ac1,           31              "
1085           "\n\t" /* odd 2 */
1086           "lbux             %[st3],           %[Temp3](%[cm])                 "
1087           "\n\t" /* odd 1 */
1088 
1089           /* odd 3. pixel */
1090           "mtlo             %[vector_64],     $ac3                            "
1091           "\n\t" /* odd 4 */
1092           "mthi             $zero,            $ac3                            "
1093           "\n\t"
1094           "preceu.ph.qbr    %[p2],            %[qload1]                       "
1095           "\n\t"
1096           "sb               %[st3],           0(%[odd_dst])                   "
1097           "\n\t" /* odd 1 */
1098           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1099           "\n\t"
1100           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
1101           "\n\t" /* odd 3 */
1102           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
1103           "\n\t" /* odd 3 */
1104           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
1105           "\n\t" /* odd 3 */
1106           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
1107           "\n\t" /* odd 3 */
1108           "extp             %[Temp2],         $ac2,           31              "
1109           "\n\t" /* odd 3 */
1110           "lbux             %[st1],           %[Temp1](%[cm])                 "
1111           "\n\t" /* odd 2 */
1112 
1113           /* odd 4. pixel */
1114           "mtlo             %[vector_64],     $ac1                            "
1115           "\n\t" /* odd 5 */
1116           "mthi             $zero,            $ac1                            "
1117           "\n\t"
1118           "preceu.ph.qbl    %[p3],            %[qload1]                       "
1119           "\n\t"
1120           "sb               %[st1],           0(%[odd_dst])                   "
1121           "\n\t" /* odd 2 */
1122           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1123           "\n\t"
1124           "ulw              %[qload2],        17(%[src])                      "
1125           "\n\t"
1126           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
1127           "\n\t" /* odd 4 */
1128           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
1129           "\n\t" /* odd 4 */
1130           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
1131           "\n\t" /* odd 4 */
1132           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
1133           "\n\t" /* odd 4 */
1134           "extp             %[Temp3],         $ac3,           31              "
1135           "\n\t" /* odd 4 */
1136           "lbux             %[st2],           %[Temp2](%[cm])                 "
1137           "\n\t" /* odd 3 */
1138 
1139           /* odd 5. pixel */
1140           "mtlo             %[vector_64],     $ac2                            "
1141           "\n\t" /* odd 6 */
1142           "mthi             $zero,            $ac2                            "
1143           "\n\t"
1144           "preceu.ph.qbr    %[p4],            %[qload2]                       "
1145           "\n\t"
1146           "sb               %[st2],           0(%[odd_dst])                   "
1147           "\n\t" /* odd 3 */
1148           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1149           "\n\t"
1150           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
1151           "\n\t" /* odd 5 */
1152           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
1153           "\n\t" /* odd 5 */
1154           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
1155           "\n\t" /* odd 5 */
1156           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
1157           "\n\t" /* odd 5 */
1158           "extp             %[Temp1],         $ac1,           31              "
1159           "\n\t" /* odd 5 */
1160           "lbux             %[st3],           %[Temp3](%[cm])                 "
1161           "\n\t" /* odd 4 */
1162 
1163           /* odd 6. pixel */
1164           "mtlo             %[vector_64],     $ac3                            "
1165           "\n\t" /* odd 7 */
1166           "mthi             $zero,            $ac3                            "
1167           "\n\t"
1168           "preceu.ph.qbl    %[p1],            %[qload2]                       "
1169           "\n\t"
1170           "sb               %[st3],           0(%[odd_dst])                   "
1171           "\n\t" /* odd 4 */
1172           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1173           "\n\t"
1174           "ulw              %[qload1],        21(%[src])                      "
1175           "\n\t"
1176           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
1177           "\n\t" /* odd 6 */
1178           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
1179           "\n\t" /* odd 6 */
1180           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
1181           "\n\t" /* odd 6 */
1182           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
1183           "\n\t" /* odd 6 */
1184           "extp             %[Temp2],         $ac2,           31              "
1185           "\n\t" /* odd 6 */
1186           "lbux             %[st1],           %[Temp1](%[cm])                 "
1187           "\n\t" /* odd 5 */
1188 
1189           /* odd 7. pixel */
1190           "mtlo             %[vector_64],     $ac1                            "
1191           "\n\t" /* odd 8 */
1192           "mthi             $zero,            $ac1                            "
1193           "\n\t"
1194           "preceu.ph.qbr    %[p5],            %[qload1]                       "
1195           "\n\t"
1196           "sb               %[st1],           0(%[odd_dst])                   "
1197           "\n\t" /* odd 5 */
1198           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1199           "\n\t"
1200           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
1201           "\n\t" /* odd 7 */
1202           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
1203           "\n\t" /* odd 7 */
1204           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
1205           "\n\t" /* odd 7 */
1206           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
1207           "\n\t" /* odd 7 */
1208           "extp             %[Temp3],         $ac3,           31              "
1209           "\n\t" /* odd 7 */
1210 
1211           /* odd 8. pixel */
1212           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
1213           "\n\t" /* odd 8 */
1214           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
1215           "\n\t" /* odd 8 */
1216           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
1217           "\n\t" /* odd 8 */
1218           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
1219           "\n\t" /* odd 8 */
1220           "extp             %[Temp1],         $ac1,           31              "
1221           "\n\t" /* odd 8 */
1222 
1223           "lbux             %[st2],           %[Temp2](%[cm])                 "
1224           "\n\t" /* odd 6 */
1225           "lbux             %[st3],           %[Temp3](%[cm])                 "
1226           "\n\t" /* odd 7 */
1227           "lbux             %[st1],           %[Temp1](%[cm])                 "
1228           "\n\t" /* odd 8 */
1229 
1230           "sb               %[st2],           0(%[odd_dst])                   "
1231           "\n\t" /* odd 6 */
1232           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1233           "\n\t"
1234 
1235           "sb               %[st3],           0(%[odd_dst])                   "
1236           "\n\t" /* odd 7 */
1237           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1238           "\n\t"
1239 
1240           "sb               %[st1],           0(%[odd_dst])                   "
1241           "\n\t" /* odd 8 */
1242 
1243           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
1244             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
1245             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
1246             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1247             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
1248           : [filter12] "r"(filter12), [filter34] "r"(filter34),
1249             [filter56] "r"(filter56), [filter78] "r"(filter78),
1250             [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
1251             [dst_pitch_2] "r"(dst_pitch_2));
1252 
1253       src += 16;
1254       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
1255       odd_dst = (dst + dst_stride);
1256     }
1257 
1258     /* Next row... */
1259     src_ptr += src_stride;
1260 
1261     dst_ptr += 1;
1262   }
1263 }
1264 
convolve_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)1265 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
1266                                uint8_t *dst, ptrdiff_t dst_stride,
1267                                const int16_t *filter, int w, int h) {
1268   int x, y, k;
1269 
1270   for (y = 0; y < h; ++y) {
1271     for (x = 0; x < w; ++x) {
1272       int sum = 0;
1273 
1274       for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
1275 
1276       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
1277     }
1278 
1279     src += src_stride;
1280     dst += 1;
1281   }
1282 }
1283 
/*
 * Copies a w x h block from src to dst while transposing it: the byte at
 * column x of input row y is written to dst[x * dst_stride + y].
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int row;

  for (row = 0; row < h; ++row) {
    /* Input row `row` becomes output column `row`. */
    const uint8_t *in_row = src + row * src_stride;
    uint8_t *out_col = dst + row;
    int col;

    for (col = 0; col < w; ++col) out_col[col * dst_stride] = in_row[col];
  }
}
1297 
vpx_convolve8_dspr2(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)1298 void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1299                          ptrdiff_t dst_stride, const int16_t *filter_x,
1300                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
1301                          int w, int h) {
1302   DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
1303   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
1304   uint32_t pos = 38;
1305 
1306   assert(x_step_q4 == 16);
1307   assert(y_step_q4 == 16);
1308   assert(((const int32_t *)filter_x)[1] != 0x800000);
1309   assert(((const int32_t *)filter_y)[1] != 0x800000);
1310   (void)x_step_q4;
1311 
1312   /* bit positon for extract from acc */
1313   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
1314                        :
1315                        : [pos] "r"(pos));
1316 
1317   if (intermediate_height < h) intermediate_height = h;
1318 
1319   /* copy the src to dst */
1320   if (filter_x[3] == 0x80) {
1321     copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
1322                           intermediate_height, w, intermediate_height);
1323   } else if (((const int32_t *)filter_x)[0] == 0) {
1324     vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
1325                         intermediate_height, filter_x, w, intermediate_height);
1326   } else {
1327     src -= (src_stride * 3 + 3);
1328 
1329     /* prefetch data to cache memory */
1330     prefetch_load(src);
1331     prefetch_load(src + 32);
1332 
1333     switch (w) {
1334       case 4:
1335         convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
1336                                           intermediate_height, filter_x,
1337                                           intermediate_height);
1338         break;
1339       case 8:
1340         convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
1341                                           intermediate_height, filter_x,
1342                                           intermediate_height);
1343         break;
1344       case 16:
1345       case 32:
1346         convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
1347                                            intermediate_height, filter_x,
1348                                            intermediate_height, (w / 16));
1349         break;
1350       case 64:
1351         prefetch_load(src + 32);
1352         convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
1353                                            intermediate_height, filter_x,
1354                                            intermediate_height);
1355         break;
1356       default:
1357         convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
1358                                   filter_x, w, intermediate_height);
1359         break;
1360     }
1361   }
1362 
1363   /* copy the src to dst */
1364   if (filter_y[3] == 0x80) {
1365     copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
1366   } else if (((const int32_t *)filter_y)[0] == 0) {
1367     vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
1368                         filter_y, h, w);
1369   } else {
1370     switch (h) {
1371       case 4:
1372         convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
1373                                           dst_stride, filter_y, w);
1374         break;
1375       case 8:
1376         convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
1377                                           dst_stride, filter_y, w);
1378         break;
1379       case 16:
1380       case 32:
1381         convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
1382                                            dst_stride, filter_y, w, (h / 16));
1383         break;
1384       case 64:
1385         convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
1386                                            dst_stride, filter_y, w);
1387         break;
1388       default:
1389         convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
1390                                   filter_y, h, w);
1391         break;
1392     }
1393   }
1394 }
1395 
/*
 * Copies a w x h block of bytes from src to dst.
 *
 * The filter and filter-stride arguments are ignored; they are present only
 * so the function matches the signature shared by the convolve routines.
 *
 * Widths 4, 8, 16, 32 and 64 copy each row with unaligned word loads (ulw)
 * followed by word stores (sw); every other width uses a byte loop.
 * NOTE(review): sw is a plain word store, so dst rows are presumably 4-byte
 * aligned for the word-copy widths - confirm against callers.
 *
 * Every asm statement carries a "memory" clobber: the loads and stores go
 * through pointers that are passed only as register inputs, so without the
 * clobber the compiler is not told that memory is read and written and may
 * move or cache surrounding memory accesses across the statement.
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage, done as two groups of 8 words that reuse the same
       * eight temporaries */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    default:
      /* Generic width: byte-by-byte copy of each row. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1599 #endif
1600