1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_bi_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)21 static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
22                                      uint8_t *dst, int32_t dst_stride,
23                                      const int16_t *filter_y, int32_t w,
24                                      int32_t h) {
25   int32_t x, y;
26   const uint8_t *src_ptr;
27   uint8_t *dst_ptr;
28   uint8_t *cm = vpx_ff_cropTbl;
29   uint32_t vector4a = 64;
30   uint32_t load1, load2;
31   uint32_t p1, p2;
32   uint32_t scratch1;
33   uint32_t store1, store2;
34   int32_t Temp1, Temp2;
35   const int16_t *filter = &filter_y[3];
36   uint32_t filter45;
37 
38   filter45 = ((const int32_t *)filter)[0];
39 
40   for (y = h; y--;) {
41     /* prefetch data to cache memory */
42     prefetch_store(dst + dst_stride);
43 
44     for (x = 0; x < w; x += 4) {
45       src_ptr = src + x;
46       dst_ptr = dst + x;
47 
48       __asm__ __volatile__(
49           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
50           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
51           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
52 
53           "mtlo             %[vector4a],  $ac0                            \n\t"
54           "mtlo             %[vector4a],  $ac1                            \n\t"
55           "mtlo             %[vector4a],  $ac2                            \n\t"
56           "mtlo             %[vector4a],  $ac3                            \n\t"
57           "mthi             $zero,        $ac0                            \n\t"
58           "mthi             $zero,        $ac1                            \n\t"
59           "mthi             $zero,        $ac2                            \n\t"
60           "mthi             $zero,        $ac3                            \n\t"
61 
62           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
63           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
64 
65           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
66           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
67 
68           "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
69           "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
70 
71           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
72           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
73 
74           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
75           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
76 
77           "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
78           "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
79 
80           "extp             %[Temp1],     $ac0,           31              \n\t"
81           "extp             %[Temp2],     $ac1,           31              \n\t"
82 
83           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
84           "extp             %[Temp1],     $ac2,           31              \n\t"
85 
86           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
87           "extp             %[Temp2],     $ac3,           31              \n\t"
88 
89           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
90           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
91 
92           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
93           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
94 
95           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
96           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
97 
98           : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
99             [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
100             [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
101             [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
102           : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
103             [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
104     }
105 
106     /* Next row... */
107     src += src_stride;
108     dst += dst_stride;
109   }
110 }
111 
convolve_bi_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)112 static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
113                                       uint8_t *dst, int32_t dst_stride,
114                                       const int16_t *filter_y, int32_t h) {
115   int32_t x, y;
116   const uint8_t *src_ptr;
117   uint8_t *dst_ptr;
118   uint8_t *cm = vpx_ff_cropTbl;
119   uint32_t vector4a = 64;
120   uint32_t load1, load2;
121   uint32_t p1, p2;
122   uint32_t scratch1;
123   uint32_t store1, store2;
124   int32_t Temp1, Temp2;
125   const int16_t *filter = &filter_y[3];
126   uint32_t filter45;
127 
128   filter45 = ((const int32_t *)filter)[0];
129 
130   for (y = h; y--;) {
131     /* prefetch data to cache memory */
132     prefetch_store(dst + dst_stride);
133 
134     for (x = 0; x < 64; x += 4) {
135       src_ptr = src + x;
136       dst_ptr = dst + x;
137 
138       __asm__ __volatile__(
139           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
140           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
141           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
142 
143           "mtlo             %[vector4a],  $ac0                            \n\t"
144           "mtlo             %[vector4a],  $ac1                            \n\t"
145           "mtlo             %[vector4a],  $ac2                            \n\t"
146           "mtlo             %[vector4a],  $ac3                            \n\t"
147           "mthi             $zero,        $ac0                            \n\t"
148           "mthi             $zero,        $ac1                            \n\t"
149           "mthi             $zero,        $ac2                            \n\t"
150           "mthi             $zero,        $ac3                            \n\t"
151 
152           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
153           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
154 
155           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
156           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
157 
158           "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
159           "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
160 
161           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
162           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
163 
164           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
165           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
166 
167           "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
168           "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
169 
170           "extp             %[Temp1],     $ac0,           31              \n\t"
171           "extp             %[Temp2],     $ac1,           31              \n\t"
172 
173           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
174           "extp             %[Temp1],     $ac2,           31              \n\t"
175 
176           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
177           "extp             %[Temp2],     $ac3,           31              \n\t"
178 
179           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
180           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
181 
182           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
183           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
184 
185           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
186           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
187 
188           : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
189             [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
190             [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
191             [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
192           : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
193             [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
194     }
195 
196     /* Next row... */
197     src += src_stride;
198     dst += dst_stride;
199   }
200 }
201 
/*
 * Entry point for the 2-tap vertical convolution on DSPr2.
 *
 * Programs the accumulator extract position, then picks an implementation by
 * block width: 64 goes to the dedicated 64-wide routine, 4/8/16/32 go to the
 * generic 4-pixels-per-step routine, and every other width is handed to
 * vpx_convolve8_vert_c. Only y_step_q4 == 16 is supported (checked with
 * assert). filter_x and x_step_q4 are only forwarded to the C fallback.
 */
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  uint32_t extract_pos = 38;

  assert(y_step_q4 == 16);

  /* bit position for the extract-from-accumulator (extp) instructions */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(extract_pos));

  prefetch_store(dst);

  if (w == 64) {
    /* also prefetch 32 bytes further into the first destination row */
    prefetch_store(dst + 32);
    convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
                             h);
  } else {
    /* width without a DSPr2 routine: fall back to the C implementation */
    vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
                         x_step_q4, filter_y, y_step_q4, w, h);
  }
}
236 #endif
237