1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <stdio.h>
14 
15 #include "./aom_dsp_rtcd.h"
16 #include "aom_dsp/mips/convolve_common_dspr2.h"
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_dsp/aom_filter.h"
19 #include "aom_ports/mem.h"
20 
21 #if HAVE_DSPR2
/* 8-tap vertical convolution for widths that are multiples of 4, using the
 * MIPS DSPr2 vector multiply-accumulate units ($ac0..$ac3) to produce four
 * output pixels per inner-loop pass.
 *
 * src, dst     - source/destination pixel buffers with the given strides.
 * filter_y     - 8 int16 taps, read below as four packed int32 words, each
 *                word holding a pair of adjacent taps (the operand layout
 *                dpa.w.ph expects).
 * w, h         - output width (assumed a multiple of 4 by the x += 4 loop)
 *                and height in pixels.
 *
 * NOTE(review): relies on the DSP control register having been programmed
 * (wrdsp, extract position used by extp) by the caller before entry -- see
 * aom_convolve8_vert_dspr2.  Results are clamped to [0,255] by indexing the
 * aom_ff_cropTbl table with lbux. */
static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int16_t *filter_y, int32_t w,
                                  int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = aom_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector4a = 64;       /* value seeded into each accumulator before
                                 * the tap products are added (rounding bias) */
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  /* Each int32 word packs two adjacent int16 taps (taps 0..7 in order). */
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  /* The 8-tap window spans 8 source rows; start 3 rows above the output. */
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      /* Load 8 consecutive source rows (4 bytes each, unaligned ulw),
       * expand bytes to 16-bit halves (preceu), regroup them into
       * per-output-pixel pairs (precrq/append), and accumulate the tap
       * products into $ac0..$ac3 -- one accumulator per output pixel.
       * extp extracts the rounded result at the wrdsp-programmed bit
       * position and lbux clamps it through cm before the sb stores. */
      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
170 
/* 8-tap vertical convolution specialized for a fixed 64-pixel width.
 * Identical inner kernel to convolve_vert_4_dspr2 (four output pixels per
 * pass via $ac0..$ac3) but with a hard-coded x loop bound of 64 and an
 * extra prefetch covering the second half of the 64-byte output row.
 *
 * src, dst     - source/destination pixel buffers with the given strides.
 * filter_y     - 8 int16 taps, read below as four packed int32 words, each
 *                word holding a pair of adjacent taps.
 * h            - output height in pixels (width is fixed at 64).
 *
 * NOTE(review): relies on the DSP control register having been programmed
 * (wrdsp, extract position used by extp) by the caller before entry -- see
 * aom_convolve8_vert_dspr2. */
static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = aom_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector4a = 64;       /* value seeded into each accumulator before
                                 * the tap products are added (rounding bias) */
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  /* Each int32 word packs two adjacent int16 taps (taps 0..7 in order). */
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  /* The 8-tap window spans 8 source rows; start 3 rows above the output. */
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      /* Load 8 consecutive source rows (4 bytes each, unaligned ulw),
       * expand bytes to 16-bit halves (preceu), regroup them into
       * per-output-pixel pairs (precrq/append), and accumulate the tap
       * products into $ac0..$ac3 -- one accumulator per output pixel.
       * extp extracts the rounded result at the wrdsp-programmed bit
       * position and lbux clamps it through cm before the sb stores. */
      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
319 
/* Public entry point for the DSPr2 8-tap vertical convolution.
 * Dispatches on the filter contents and the block width:
 *   - a zero first tap pair routes to the cheaper 2-tap kernel;
 *   - widths 4/8/16/32 share the generic multiple-of-4 kernel;
 *   - width 64 uses the specialized kernel with extra prefetching;
 *   - anything else falls back to the C reference implementation.
 * The interface matches aom_convolve8_vert_c. */
void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  /* First packed tap pair is zero: the short-filter kernel handles it. */
  if (((const int32_t *)filter_y)[0] == 0) {
    aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* Program the bit position from which extp extracts accumulator bits. */
  uint32_t pos = 38;
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  prefetch_store(dst);

  if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
  } else if (w == 64) {
    prefetch_store(dst + 32);
    convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
  } else {
    aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                         filter_y, y_step_q4, w, h);
  }
}
359 
360 #endif
361