1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
14 #include "vpx_dsp/txfm_common.h"
15 
16 #if HAVE_DSPR2
/*
 * Row pass of the 16-point inverse transform (idct16), written with MIPS DSP
 * inline assembly.
 *
 * For each of the no_rows rows, the 16 int16_t coefficients input[0..15] are
 * loaded with `lh`, run through the butterfly stages below, and the 16 results
 * are stored with `sh` at byte offsets 0, 32, ..., 480 from `output`, i.e. at
 * output[k * 16] for k = 0..15. After each row `input` advances by 16 elements
 * and `output` by 1, so consecutive rows are written to consecutive columns of
 * an array with a stride of 16 int16_t (the result is stored transposed).
 *
 *   input   - coefficients, 16 per row; only read.
 *   output  - destination; 16 halfwords with a 32-byte stride are written per
 *             row, starting one element further for every row.
 *   no_rows - number of rows to process (the loop runs exactly no_rows times).
 *
 * Every multiply-accumulate sequence seeds an accumulator with lo = 8192
 * (const_2_power_13) and hi = 0, accumulates products with madd/msub and reads
 * the result back with `extp ..., 31`. In the comments below R(x) denotes the
 * value obtained that way for an accumulated sum x.
 * NOTE(review): extp takes its extraction position from the DSPControl
 * register, which this function never writes - presumably the caller sets it
 * up so that R(x) equals (x + 2^13) >> 14; confirm against the caller.
 * NOTE(review): none of the asm statements declares a clobber list although
 * they write $ac0-$ac3 and the last two store to memory through `output`.
 */
idct16_rows_dspr2(const int16_t * input,int16_t * output,uint32_t no_rows)17 void idct16_rows_dspr2(const int16_t *input, int16_t *output,
18                        uint32_t no_rows) {
19   int i;
20   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
21   int step1_10, step1_11, step1_12, step1_13;
22   int step2_0, step2_1, step2_2, step2_3;
23   int step2_8, step2_9, step2_10, step2_11;
24   int step2_12, step2_13, step2_14, step2_15;
25   int load1, load2, load3, load4, load5, load6, load7, load8;
26   int result1, result2, result3, result4;
27   const int const_2_power_13 = 8192;
28 
  /* Counts down from no_rows; the body runs once per row. */
29   for (i = no_rows; i--;) {
30     /* prefetch row */
31     prefetch_load((const uint8_t *)(input + 16));
32 
    /*
     * Even part, inputs 0, 8, 4, 12 (byte offsets 0, 16, 8, 24):
     *   step2_0 = R((in0 + in8) * cospi_16_64)
     *   step2_1 = R((in0 - in8) * cospi_16_64)
     *   step2_2 = R(in4 * cospi_24_64 - in12 * cospi_8_64)
     *   step2_3 = R(in4 * cospi_8_64 + in12 * cospi_24_64)
     *   step1_0 = step2_0 + step2_3,  step1_1 = step2_1 + step2_2
     *   step1_2 = step2_1 - step2_2,  step1_3 = step2_0 - step2_3
     */
33     __asm__ __volatile__(
34         "lh       %[load1],              0(%[input])                    \n\t"
35         "lh       %[load2],             16(%[input])                    \n\t"
36         "lh       %[load3],              8(%[input])                    \n\t"
37         "lh       %[load4],             24(%[input])                    \n\t"
38 
39         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
40         "mthi     $zero,                $ac1                            \n\t"
41         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
42         "mthi     $zero,                $ac2                            \n\t"
43         "add      %[result1],           %[load1],       %[load2]        \n\t"
44         "sub      %[result2],           %[load1],       %[load2]        \n\t"
45         "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
46         "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
47         "extp     %[step2_0],           $ac1,           31              \n\t"
48         "extp     %[step2_1],           $ac2,           31              \n\t"
49 
50         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
51         "mthi     $zero,                $ac3                            \n\t"
52         "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
53         "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
54         "extp     %[step2_2],           $ac3,           31              \n\t"
55 
56         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
57         "mthi     $zero,                $ac1                            \n\t"
58         "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
59         "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
60         "extp     %[step2_3],           $ac1,           31              \n\t"
61 
62         "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
63         "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
64         "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
65         "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
66 
67         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
68           [load4] "=&r"(load4), [result1] "=&r"(result1),
69           [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
70           [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
71           [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
72           [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
73           [step1_3] "=r"(step1_3)
74         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
75           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
76           [cospi_16_64] "r"(cospi_16_64));
77 
    /*
     * Odd part, inputs 1, 15, 9, 7 (byte offsets 2, 30, 18, 14):
     *   result1 = R(in1 * cospi_30_64 - in15 * cospi_2_64)
     *   result2 = R(in9 * cospi_14_64 - in7 * cospi_18_64)
     *   result3 = R(in9 * cospi_18_64 + in7 * cospi_14_64)
     *   result4 = R(in1 * cospi_2_64 + in15 * cospi_30_64)
     * load5/load6 are then reused as temporaries:
     *   load5 = result1 - result2,  load6 = result4 - result3
     *   step2_9  = R(load6 * cospi_24_64 - load5 * cospi_8_64)
     *   step2_14 = R(load5 * cospi_24_64 + load6 * cospi_8_64)
     *   step2_8  = result1 + result2,  step2_15 = result4 + result3
     */
78     __asm__ __volatile__(
79         "lh       %[load5],             2(%[input])                     \n\t"
80         "lh       %[load6],             30(%[input])                    \n\t"
81         "lh       %[load7],             18(%[input])                    \n\t"
82         "lh       %[load8],             14(%[input])                    \n\t"
83 
84         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
85         "mthi     $zero,                $ac1                            \n\t"
86         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
87         "mthi     $zero,                $ac3                            \n\t"
88 
89         "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
90         "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
91         "extp     %[result1],           $ac1,           31              \n\t"
92 
93         "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
94         "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
95         "extp     %[result2],           $ac3,           31              \n\t"
96 
97         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
98         "mthi     $zero,                $ac1                            \n\t"
99         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
100         "mthi     $zero,                $ac2                            \n\t"
101 
102         "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
103         "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
104         "extp     %[result3],           $ac1,           31              \n\t"
105 
106         "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
107         "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
108         "extp     %[result4],           $ac2,           31              \n\t"
109 
110         "sub      %[load5],             %[result1],     %[result2]      \n\t"
111         "sub      %[load6],             %[result4],     %[result3]      \n\t"
112 
113         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
114         "mthi     $zero,                $ac1                            \n\t"
115         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
116         "mthi     $zero,                $ac3                            \n\t"
117 
118         "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
119         "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
120         "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
121         "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
122 
123         "extp     %[step2_9],           $ac1,           31              \n\t"
124         "extp     %[step2_14],          $ac3,           31              \n\t"
125         "add      %[step2_8],           %[result1],     %[result2]      \n\t"
126         "add      %[step2_15],          %[result4],     %[result3]      \n\t"
127 
128         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
129           [load8] "=&r"(load8), [result1] "=&r"(result1),
130           [result2] "=&r"(result2), [result3] "=&r"(result3),
131           [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
132           [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
133           [step2_14] "=r"(step2_14)
134         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
135           [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
136           [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
137           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
138 
    /*
     * Odd part, inputs 5, 11, 13, 3 (byte offsets 10, 22, 26, 6):
     *   result1 = R(in5 * cospi_22_64 - in11 * cospi_10_64)
     *   result2 = R(in13 * cospi_6_64 - in3 * cospi_26_64)
     *   result3 = R(in5 * cospi_10_64 + in11 * cospi_22_64)
     *   result4 = R(in13 * cospi_26_64 + in3 * cospi_6_64)
     * load1/load2 are then reused as temporaries:
     *   load1 = result2 - result1,  load2 = result4 - result3
     *   step2_10 = R(-load1 * cospi_24_64 - load2 * cospi_8_64)
     *   step2_13 = R(load2 * cospi_24_64 - load1 * cospi_8_64)
     *   step2_11 = result1 + result2,  step2_12 = result4 + result3
     */
139     __asm__ __volatile__(
140         "lh       %[load1],             10(%[input])                    \n\t"
141         "lh       %[load2],             22(%[input])                    \n\t"
142         "lh       %[load3],             26(%[input])                    \n\t"
143         "lh       %[load4],             6(%[input])                     \n\t"
144 
145         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
146         "mthi     $zero,                $ac1                            \n\t"
147         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
148         "mthi     $zero,                $ac3                            \n\t"
149 
150         "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
151         "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
152         "extp     %[result1],           $ac1,           31              \n\t"
153 
154         "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
155         "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
156         "extp     %[result2],           $ac3,           31              \n\t"
157 
158         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
159         "mthi     $zero,                $ac1                            \n\t"
160         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
161         "mthi     $zero,                $ac2                            \n\t"
162 
163         "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
164         "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
165         "extp     %[result3],           $ac1,           31              \n\t"
166 
167         "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
168         "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
169         "extp     %[result4],           $ac2,           31              \n\t"
170 
171         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
172         "mthi     $zero,                $ac1                            \n\t"
173         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
174         "mthi     $zero,                $ac3                            \n\t"
175 
176         "sub      %[load1],             %[result2],     %[result1]      \n\t"
177         "sub      %[load2],             %[result4],     %[result3]      \n\t"
178 
179         "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
180         "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
181         "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
182         "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
183 
184         "extp     %[step2_10],          $ac1,           31              \n\t"
185         "extp     %[step2_13],          $ac3,           31              \n\t"
186         "add      %[step2_11],          %[result1],     %[result2]      \n\t"
187         "add      %[step2_12],          %[result4],     %[result3]      \n\t"
188 
189         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
190           [load4] "=&r"(load4), [result1] "=&r"(result1),
191           [result2] "=&r"(result2), [result3] "=&r"(result3),
192           [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
193           [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
194           [step2_13] "=r"(step2_13)
195         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
196           [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
197           [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
198           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
199 
    /*
     * Inputs 2, 14, 10, 6 (byte offsets 4, 28, 20, 12):
     *   result1 = R(in2 * cospi_28_64 - in14 * cospi_4_64)
     *   result2 = R(in10 * cospi_12_64 - in6 * cospi_20_64)
     *   result3 = R(in10 * cospi_20_64 + in6 * cospi_12_64)
     *   result4 = R(in2 * cospi_4_64 + in14 * cospi_28_64)
     * load5/load6 are then reused as temporaries:
     *   load5 = result4 - result3 - result1 + result2
     *   load6 = result1 - result2 - result3 + result4
     *   step1_5 = R(load5 * cospi_16_64),  step1_6 = R(load6 * cospi_16_64)
     *   step1_4 = result1 + result2,       step1_7 = result4 + result3
     */
200     __asm__ __volatile__(
201         "lh       %[load5],             4(%[input])                     \n\t"
202         "lh       %[load6],             28(%[input])                    \n\t"
203         "lh       %[load7],             20(%[input])                    \n\t"
204         "lh       %[load8],             12(%[input])                    \n\t"
205 
206         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
207         "mthi     $zero,                $ac1                            \n\t"
208         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
209         "mthi     $zero,                $ac3                            \n\t"
210 
211         "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
212         "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
213         "extp     %[result1],           $ac1,           31              \n\t"
214 
215         "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
216         "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
217         "extp     %[result2],           $ac3,           31              \n\t"
218 
219         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
220         "mthi     $zero,                $ac1                            \n\t"
221         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
222         "mthi     $zero,                $ac2                            \n\t"
223 
224         "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
225         "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
226         "extp     %[result3],           $ac1,           31              \n\t"
227 
228         "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
229         "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
230         "extp     %[result4],           $ac2,           31              \n\t"
231 
232         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
233         "mthi     $zero,                $ac1                            \n\t"
234         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
235         "mthi     $zero,                $ac3                            \n\t"
236 
237         "sub      %[load5],             %[result4],     %[result3]      \n\t"
238         "sub      %[load5],             %[load5],       %[result1]      \n\t"
239         "add      %[load5],             %[load5],       %[result2]      \n\t"
240 
241         "sub      %[load6],             %[result1],     %[result2]      \n\t"
242         "sub      %[load6],             %[load6],       %[result3]      \n\t"
243         "add      %[load6],             %[load6],       %[result4]      \n\t"
244 
245         "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
246         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
247 
248         "extp     %[step1_5],           $ac1,           31              \n\t"
249         "extp     %[step1_6],           $ac3,           31              \n\t"
250         "add      %[step1_4],           %[result1],     %[result2]      \n\t"
251         "add      %[step1_7],           %[result4],     %[result3]      \n\t"
252 
253         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
254           [load8] "=&r"(load8), [result1] "=&r"(result1),
255           [result2] "=&r"(result2), [result3] "=&r"(result3),
256           [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
257           [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
258           [step1_7] "=r"(step1_7)
259         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
260           [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
261           [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
262           [cospi_16_64] "r"(cospi_16_64));
263 
    /*
     * Combine the step2_8..step2_15 terms; this is the only statement that
     * uses all four accumulators $ac0..$ac3 at once:
     *   step1_10 = R((step2_14 - step2_13 - step2_9 + step2_10) * cospi_16_64)
     *   step1_13 = R((step2_14 - step2_13 - step2_10 + step2_9) * cospi_16_64)
     *   step1_11 = R((step2_15 - step2_12 - step2_8 + step2_11) * cospi_16_64)
     *   step1_12 = R((step2_15 - step2_12 - step2_11 + step2_8) * cospi_16_64)
     */
264     __asm__ __volatile__(
265         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
266         "mthi     $zero,                $ac0                            \n\t"
267         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
268         "mthi     $zero,                $ac1                            \n\t"
269 
270         "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
271         "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
272         "add      %[load5],             %[load5],       %[step2_10]     \n\t"
273 
274         "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
275 
276         "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
277         "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
278         "add      %[load6],             %[load6],       %[step2_9]      \n\t"
279 
280         "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
281 
282         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
283         "mthi     $zero,                $ac2                            \n\t"
284         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
285         "mthi     $zero,                $ac3                            \n\t"
286 
287         "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
288         "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
289         "add      %[load5],             %[load5],       %[step2_11]     \n\t"
290 
291         "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
292 
293         "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
294         "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
295         "add      %[load6],             %[load6],       %[step2_8]      \n\t"
296 
297         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
298 
299         "extp     %[step1_10],          $ac0,           31              \n\t"
300         "extp     %[step1_13],          $ac1,           31              \n\t"
301         "extp     %[step1_11],          $ac2,           31              \n\t"
302         "extp     %[step1_12],          $ac3,           31              \n\t"
303 
304         : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
305           [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
306           [step1_13] "=r"(step1_13)
307         : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
308           [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
309           [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
310           [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
311           [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
312 
    /*
     * Final sums for outputs 0, 1, 6, 7, 8, 9, 14, 15; out[k] below is the
     * halfword stored at byte offset 32 * k from `output` (sh keeps only the
     * low 16 bits of each sum):
     *   out[0]  = step1_0 + step1_7 + step2_12 + step2_15
     *   out[1]  = step1_1 + step1_6 + step2_13 + step2_14
     *   out[6]  = step1_1 - step1_6 + step2_9  + step2_10
     *   out[7]  = step1_0 - step1_7 + step2_8  + step2_11
     *   out[8]  = step1_0 - step1_7 - step2_8  - step2_11
     *   out[9]  = step1_1 - step1_6 - step2_9  - step2_10
     *   out[14] = step1_1 + step1_6 - step2_13 - step2_14
     *   out[15] = step1_0 + step1_7 - step2_12 - step2_15
     */
313     __asm__ __volatile__(
314         "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
315         "add      %[load5],             %[load5],       %[step2_12]     \n\t"
316         "add      %[load5],             %[load5],       %[step2_15]     \n\t"
317         "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
318         "add      %[load6],             %[load6],       %[step2_13]     \n\t"
319         "add      %[load6],             %[load6],       %[step2_14]     \n\t"
320         "sh       %[load5],             0(%[output])                    \n\t"
321         "sh       %[load6],             32(%[output])                   \n\t"
322         "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
323         "add      %[load5],             %[load5],       %[step2_9]      \n\t"
324         "add      %[load5],             %[load5],       %[step2_10]     \n\t"
325         "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
326         "add      %[load6],             %[load6],       %[step2_8]      \n\t"
327         "add      %[load6],             %[load6],       %[step2_11]     \n\t"
328         "sh       %[load5],             192(%[output])                  \n\t"
329         "sh       %[load6],             224(%[output])                  \n\t"
330         "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
331         "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
332         "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
333         "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
334         "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
335         "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
336         "sh       %[load5],             256(%[output])                  \n\t"
337         "sh       %[load6],             288(%[output])                  \n\t"
338         "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
339         "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
340         "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
341         "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
342         "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
343         "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
344         "sh       %[load5],             448(%[output])                  \n\t"
345         "sh       %[load6],             480(%[output])                  \n\t"
346 
347         : [load5] "=&r"(load5), [load6] "=&r"(load6)
348         : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
349           [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
350           [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
351           [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
352           [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
353           [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
354 
    /*
     * Final sums for the remaining outputs 2, 3, 4, 5, 10, 11, 12, 13 (same
     * out[k] convention, byte offset 32 * k):
     *   out[2]  = step1_2 + step1_5 + step1_13
     *   out[3]  = step1_3 + step1_4 + step1_12
     *   out[4]  = step1_3 - step1_4 + step1_11
     *   out[5]  = step1_2 - step1_5 + step1_10
     *   out[10] = step1_2 - step1_5 - step1_10
     *   out[11] = step1_3 - step1_4 - step1_11
     *   out[12] = step1_3 + step1_4 - step1_12
     *   out[13] = step1_2 + step1_5 - step1_13
     */
355     __asm__ __volatile__(
356         "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
357         "add      %[load5],             %[load5],       %[step1_13]     \n\t"
358         "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
359         "add      %[load6],             %[load6],       %[step1_12]     \n\t"
360         "sh       %[load5],             64(%[output])                   \n\t"
361         "sh       %[load6],             96(%[output])                   \n\t"
362         "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
363         "add      %[load5],             %[load5],       %[step1_11]     \n\t"
364         "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
365         "add      %[load6],             %[load6],       %[step1_10]     \n\t"
366         "sh       %[load5],             128(%[output])                  \n\t"
367         "sh       %[load6],             160(%[output])                  \n\t"
368         "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
369         "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
370         "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
371         "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
372         "sh       %[load5],             320(%[output])                  \n\t"
373         "sh       %[load6],             352(%[output])                  \n\t"
374         "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
375         "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
376         "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
377         "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
378         "sh       %[load5],             384(%[output])                  \n\t"
379         "sh       %[load6],             416(%[output])                  \n\t"
380 
381         : [load5] "=&r"(load5), [load6] "=&r"(load6)
382         : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
383           [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
384           [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
385           [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
386 
    /* Next input row (16 coefficients on); next output column (1 element on). */
387     input += 16;
388     output += 1;
389   }
390 }
391 
idct16_cols_add_blk_dspr2(int16_t * input,uint8_t * dest,int stride)392 void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
393   int i;
394   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
395   int step1_8, step1_9, step1_10, step1_11;
396   int step1_12, step1_13, step1_14, step1_15;
397   int step2_0, step2_1, step2_2, step2_3;
398   int step2_8, step2_9, step2_10, step2_11;
399   int step2_12, step2_13, step2_14, step2_15;
400   int load1, load2, load3, load4, load5, load6, load7, load8;
401   int result1, result2, result3, result4;
402   const int const_2_power_13 = 8192;
403   uint8_t *dest_pix;
404   uint8_t *cm = vpx_ff_cropTbl;
405 
406   /* prefetch vpx_ff_cropTbl */
407   prefetch_load(vpx_ff_cropTbl);
408   prefetch_load(vpx_ff_cropTbl + 32);
409   prefetch_load(vpx_ff_cropTbl + 64);
410   prefetch_load(vpx_ff_cropTbl + 96);
411   prefetch_load(vpx_ff_cropTbl + 128);
412   prefetch_load(vpx_ff_cropTbl + 160);
413   prefetch_load(vpx_ff_cropTbl + 192);
414   prefetch_load(vpx_ff_cropTbl + 224);
415 
416   for (i = 0; i < 16; ++i) {
417     dest_pix = (dest + i);
418     __asm__ __volatile__(
419         "lh       %[load1],              0(%[input])                    \n\t"
420         "lh       %[load2],             16(%[input])                    \n\t"
421         "lh       %[load3],              8(%[input])                    \n\t"
422         "lh       %[load4],             24(%[input])                    \n\t"
423 
424         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
425         "mthi     $zero,                $ac1                            \n\t"
426         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
427         "mthi     $zero,                $ac2                            \n\t"
428         "add      %[result1],           %[load1],       %[load2]        \n\t"
429         "sub      %[result2],           %[load1],       %[load2]        \n\t"
430         "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
431         "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
432         "extp     %[step2_0],           $ac1,           31              \n\t"
433         "extp     %[step2_1],           $ac2,           31              \n\t"
434 
435         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
436         "mthi     $zero,                $ac3                            \n\t"
437         "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
438         "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
439         "extp     %[step2_2],           $ac3,           31              \n\t"
440 
441         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
442         "mthi     $zero,                $ac1                            \n\t"
443         "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
444         "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
445         "extp     %[step2_3],           $ac1,           31              \n\t"
446 
447         "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
448         "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
449         "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
450         "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
451 
452         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
453           [load4] "=&r"(load4), [result1] "=&r"(result1),
454           [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
455           [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
456           [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
457           [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
458           [step1_3] "=r"(step1_3)
459         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
460           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
461           [cospi_16_64] "r"(cospi_16_64));
462 
463     __asm__ __volatile__(
464         "lh       %[load5],             2(%[input])                     \n\t"
465         "lh       %[load6],             30(%[input])                    \n\t"
466         "lh       %[load7],             18(%[input])                    \n\t"
467         "lh       %[load8],             14(%[input])                    \n\t"
468 
469         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
470         "mthi     $zero,                $ac1                            \n\t"
471         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
472         "mthi     $zero,                $ac3                            \n\t"
473 
474         "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
475         "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
476         "extp     %[result1],           $ac1,           31              \n\t"
477 
478         "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
479         "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
480         "extp     %[result2],           $ac3,           31              \n\t"
481 
482         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
483         "mthi     $zero,                $ac1                            \n\t"
484         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
485         "mthi     $zero,                $ac2                            \n\t"
486 
487         "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
488         "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
489         "extp     %[result3],           $ac1,           31              \n\t"
490 
491         "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
492         "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
493         "extp     %[result4],           $ac2,            31             \n\t"
494 
495         "sub      %[load5],             %[result1],     %[result2]      \n\t"
496         "sub      %[load6],             %[result4],     %[result3]      \n\t"
497 
498         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
499         "mthi     $zero,                $ac1                            \n\t"
500         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
501         "mthi     $zero,                $ac3                            \n\t"
502 
503         "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
504         "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
505         "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
506         "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
507 
508         "extp     %[step2_9],           $ac1,           31              \n\t"
509         "extp     %[step2_14],          $ac3,           31              \n\t"
510         "add      %[step2_8],           %[result1],     %[result2]      \n\t"
511         "add      %[step2_15],          %[result4],     %[result3]      \n\t"
512 
513         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
514           [load8] "=&r"(load8), [result1] "=&r"(result1),
515           [result2] "=&r"(result2), [result3] "=&r"(result3),
516           [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
517           [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
518           [step2_14] "=r"(step2_14)
519         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
520           [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
521           [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
522           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
523 
524     __asm__ __volatile__(
525         "lh       %[load1],             10(%[input])                    \n\t"
526         "lh       %[load2],             22(%[input])                    \n\t"
527         "lh       %[load3],             26(%[input])                    \n\t"
528         "lh       %[load4],             6(%[input])                     \n\t"
529 
530         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
531         "mthi     $zero,                $ac1                            \n\t"
532         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
533         "mthi     $zero,                $ac3                            \n\t"
534 
535         "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
536         "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
537         "extp     %[result1],           $ac1,        31                 \n\t"
538 
539         "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
540         "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
541         "extp     %[result2],           $ac3,        31                 \n\t"
542 
543         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
544         "mthi     $zero,                $ac1                            \n\t"
545         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
546         "mthi     $zero,                $ac2                            \n\t"
547 
548         "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
549         "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
550         "extp     %[result3],           $ac1,        31                 \n\t"
551 
552         "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
553         "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
554         "extp     %[result4],           $ac2,        31                 \n\t"
555 
556         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
557         "mthi     $zero,                $ac1                            \n\t"
558         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
559         "mthi     $zero,                $ac3                            \n\t"
560 
561         "sub      %[load1],             %[result2],     %[result1]      \n\t"
562         "sub      %[load2],             %[result4],     %[result3]      \n\t"
563 
564         "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
565         "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
566         "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
567         "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
568 
569         "extp     %[step2_10],          $ac1,           31              \n\t"
570         "extp     %[step2_13],          $ac3,           31              \n\t"
571         "add      %[step2_11],          %[result1],     %[result2]      \n\t"
572         "add      %[step2_12],          %[result4],     %[result3]      \n\t"
573 
574         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
575           [load4] "=&r"(load4), [result1] "=&r"(result1),
576           [result2] "=&r"(result2), [result3] "=&r"(result3),
577           [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
578           [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
579           [step2_13] "=r"(step2_13)
580         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
581           [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
582           [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
583           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
584 
585     __asm__ __volatile__(
586         "lh       %[load5],             4(%[input])                   \n\t"
587         "lh       %[load6],             28(%[input])                  \n\t"
588         "lh       %[load7],             20(%[input])                  \n\t"
589         "lh       %[load8],             12(%[input])                  \n\t"
590 
591         "mtlo     %[const_2_power_13],  $ac1                          \n\t"
592         "mthi     $zero,                $ac1                          \n\t"
593         "mtlo     %[const_2_power_13],  $ac3                          \n\t"
594         "mthi     $zero,                $ac3                          \n\t"
595 
596         "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
597         "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
598         "extp     %[result1],           $ac1,        31               \n\t"
599 
600         "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
601         "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
602         "extp     %[result2],           $ac3,        31               \n\t"
603 
604         "mtlo     %[const_2_power_13],  $ac1                          \n\t"
605         "mthi     $zero,                $ac1                          \n\t"
606         "mtlo     %[const_2_power_13],  $ac2                          \n\t"
607         "mthi     $zero,                $ac2                          \n\t"
608 
609         "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
610         "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
611         "extp     %[result3],           $ac1,        31               \n\t"
612 
613         "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
614         "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
615         "extp     %[result4],           $ac2,        31               \n\t"
616 
617         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
618         "mthi     $zero,                $ac1                            \n\t"
619         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
620         "mthi     $zero,                $ac3                            \n\t"
621 
622         "sub      %[load5],             %[result4],     %[result3]      \n\t"
623         "sub      %[load5],             %[load5],       %[result1]      \n\t"
624         "add      %[load5],             %[load5],       %[result2]      \n\t"
625 
626         "sub      %[load6],             %[result1],     %[result2]      \n\t"
627         "sub      %[load6],             %[load6],       %[result3]      \n\t"
628         "add      %[load6],             %[load6],       %[result4]      \n\t"
629 
630         "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
631         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
632 
633         "extp     %[step1_5],           $ac1,           31              \n\t"
634         "extp     %[step1_6],           $ac3,           31              \n\t"
635 
636         "add      %[step1_4],           %[result1],     %[result2]      \n\t"
637         "add      %[step1_7],           %[result4],     %[result3]      \n\t"
638 
639         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
640           [load8] "=&r"(load8), [result1] "=&r"(result1),
641           [result2] "=&r"(result2), [result3] "=&r"(result3),
642           [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
643           [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
644           [step1_7] "=r"(step1_7)
645         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
646           [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
647           [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
648           [cospi_16_64] "r"(cospi_16_64));
649 
650     __asm__ __volatile__(
651         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
652         "mthi     $zero,                $ac0                            \n\t"
653         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
654         "mthi     $zero,                $ac1                            \n\t"
655 
656         "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
657         "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
658         "add      %[load5],             %[load5],       %[step2_10]     \n\t"
659 
660         "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
661 
662         "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
663         "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
664         "add      %[load6],             %[load6],       %[step2_9]      \n\t"
665 
666         "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
667 
668         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
669         "mthi     $zero,                $ac2                            \n\t"
670         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
671         "mthi     $zero,                $ac3                            \n\t"
672 
673         "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
674         "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
675         "add      %[load5],             %[load5],       %[step2_11]     \n\t"
676 
677         "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
678 
679         "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
680         "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
681         "add      %[load6],             %[load6],       %[step2_8]      \n\t"
682 
683         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
684 
685         "extp     %[step1_10],          $ac0,           31              \n\t"
686         "extp     %[step1_13],          $ac1,           31              \n\t"
687         "extp     %[step1_11],          $ac2,           31              \n\t"
688         "extp     %[step1_12],          $ac3,           31              \n\t"
689 
690         : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
691           [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
692           [step1_13] "=r"(step1_13)
693         : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
694           [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
695           [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
696           [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
697           [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
698 
699     step1_8 = step2_8 + step2_11;
700     step1_9 = step2_9 + step2_10;
701     step1_14 = step2_13 + step2_14;
702     step1_15 = step2_12 + step2_15;
703 
704     __asm__ __volatile__(
705         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
706         "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
707         "add      %[load5],         %[load5],           %[step1_15]     \n\t"
708         "addi     %[load5],         %[load5],           32              \n\t"
709         "sra      %[load5],         %[load5],           6               \n\t"
710         "add      %[load7],         %[load7],           %[load5]        \n\t"
711         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
712         "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
713         "add      %[load6],         %[load6],           %[step1_14]     \n\t"
714         "sb       %[load5],         0(%[dest_pix])                      \n\t"
715         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
716         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
717         "addi     %[load6],         %[load6],           32              \n\t"
718         "sra      %[load6],         %[load6],           6               \n\t"
719         "add      %[load8],         %[load8],           %[load6]        \n\t"
720         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
721         "sb       %[load6],         0(%[dest_pix])                      \n\t"
722         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
723 
724         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
725         "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
726         "add      %[load5],         %[load5],           %[step1_13]     \n\t"
727         "addi     %[load5],         %[load5],           32              \n\t"
728         "sra      %[load5],         %[load5],           6               \n\t"
729         "add      %[load7],         %[load7],           %[load5]        \n\t"
730         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
731         "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
732         "add      %[load6],         %[load6],           %[step1_12]     \n\t"
733         "sb       %[load5],         0(%[dest_pix])                      \n\t"
734         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
735         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
736         "addi     %[load6],         %[load6],           32              \n\t"
737         "sra      %[load6],         %[load6],           6               \n\t"
738         "add      %[load8],         %[load8],           %[load6]        \n\t"
739         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
740         "sb       %[load6],         0(%[dest_pix])                      \n\t"
741         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
742 
743         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
744         "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
745         "add      %[load5],         %[load5],           %[step1_11]     \n\t"
746         "addi     %[load5],         %[load5],           32              \n\t"
747         "sra      %[load5],         %[load5],           6               \n\t"
748         "add      %[load7],         %[load7],           %[load5]        \n\t"
749         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
750         "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
751         "add      %[load6],         %[load6],           %[step1_10]     \n\t"
752         "sb       %[load5],         0(%[dest_pix])                      \n\t"
753         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
754         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
755         "addi     %[load6],         %[load6],           32              \n\t"
756         "sra      %[load6],         %[load6],           6               \n\t"
757         "add      %[load8],         %[load8],           %[load6]        \n\t"
758         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
759         "sb       %[load6],         0(%[dest_pix])                      \n\t"
760         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
761 
762         "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
763         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
764         "add      %[load5],         %[load5],           %[step1_9]      \n\t"
765         "addi     %[load5],         %[load5],           32              \n\t"
766         "sra      %[load5],         %[load5],           6               \n\t"
767         "add      %[load7],         %[load7],           %[load5]        \n\t"
768         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
769         "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
770         "add      %[load6],         %[load6],           %[step1_8]      \n\t"
771         "sb       %[load5],         0(%[dest_pix])                      \n\t"
772         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
773         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
774         "addi     %[load6],         %[load6],           32              \n\t"
775         "sra      %[load6],         %[load6],           6               \n\t"
776         "add      %[load8],         %[load8],           %[load6]        \n\t"
777         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
778         "sb       %[load6],         0(%[dest_pix])                      \n\t"
779         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
780 
781         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
782         "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
783         "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
784         "addi     %[load5],         %[load5],           32              \n\t"
785         "sra      %[load5],         %[load5],           6               \n\t"
786         "add      %[load7],         %[load7],           %[load5]        \n\t"
787         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
788         "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
789         "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
790         "sb       %[load5],         0(%[dest_pix])                      \n\t"
791         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
792         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
793         "addi     %[load6],         %[load6],           32              \n\t"
794         "sra      %[load6],         %[load6],           6               \n\t"
795         "add      %[load8],         %[load8],           %[load6]        \n\t"
796         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
797         "sb       %[load6],         0(%[dest_pix])                      \n\t"
798         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
799 
800         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
801         "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
802         "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
803         "addi     %[load5],         %[load5],           32              \n\t"
804         "sra      %[load5],         %[load5],           6               \n\t"
805         "add      %[load7],         %[load7],           %[load5]        \n\t"
806         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
807         "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
808         "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
809         "sb       %[load5],         0(%[dest_pix])                      \n\t"
810         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
811         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
812         "addi     %[load6],         %[load6],           32              \n\t"
813         "sra      %[load6],         %[load6],           6               \n\t"
814         "add      %[load8],         %[load8],           %[load6]        \n\t"
815         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
816         "sb       %[load6],         0(%[dest_pix])                      \n\t"
817         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
818 
819         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
820         "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
821         "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
822         "addi     %[load5],         %[load5],           32              \n\t"
823         "sra      %[load5],         %[load5],           6               \n\t"
824         "add      %[load7],         %[load7],           %[load5]        \n\t"
825         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
826         "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
827         "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
828         "sb       %[load5],         0(%[dest_pix])                      \n\t"
829         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
830         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
831         "addi     %[load6],         %[load6],           32              \n\t"
832         "sra      %[load6],         %[load6],           6               \n\t"
833         "add      %[load8],         %[load8],           %[load6]        \n\t"
834         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
835         "sb       %[load6],         0(%[dest_pix])                      \n\t"
836         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
837 
838         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
839         "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
840         "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
841         "addi     %[load5],         %[load5],           32              \n\t"
842         "sra      %[load5],         %[load5],           6               \n\t"
843         "add      %[load7],         %[load7],           %[load5]        \n\t"
844         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
845         "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
846         "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
847         "sb       %[load5],         0(%[dest_pix])                      \n\t"
848         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
849         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
850         "addi     %[load6],         %[load6],           32              \n\t"
851         "sra      %[load6],         %[load6],           6               \n\t"
852         "add      %[load8],         %[load8],           %[load6]        \n\t"
853         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
854         "sb       %[load6],         0(%[dest_pix])                      \n\t"
855 
856         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
857           [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
858         :
859         [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
860         [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
861         [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
862         [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
863         [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
864         [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
865         [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
866 
867     input += 16;
868   }
869 }
870 
vpx_idct16x16_256_add_dspr2(const int16_t * input,uint8_t * dest,int stride)871 void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
872                                  int stride) {
873   DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
874   uint32_t pos = 45;
875 
876   /* bit positon for extract from acc */
877   __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
878 
879   // First transform rows
880   idct16_rows_dspr2(input, out, 16);
881 
882   // Then transform columns and add to dest
883   idct16_cols_add_blk_dspr2(out, dest, stride);
884 }
885 
vpx_idct16x16_10_add_dspr2(const int16_t * input,uint8_t * dest,int stride)886 void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
887                                 int stride) {
888   DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
889   int16_t *outptr = out;
890   uint32_t i;
891   uint32_t pos = 45;
892 
893   /* bit positon for extract from acc */
894   __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
895 
896   // First transform rows. Since all non-zero dct coefficients are in
897   // upper-left 4x4 area, we only need to calculate first 4 rows here.
898   idct16_rows_dspr2(input, outptr, 4);
899 
900   outptr += 4;
901   for (i = 0; i < 6; ++i) {
902     __asm__ __volatile__(
903         "sw     $zero,    0(%[outptr])     \n\t"
904         "sw     $zero,   32(%[outptr])     \n\t"
905         "sw     $zero,   64(%[outptr])     \n\t"
906         "sw     $zero,   96(%[outptr])     \n\t"
907         "sw     $zero,  128(%[outptr])     \n\t"
908         "sw     $zero,  160(%[outptr])     \n\t"
909         "sw     $zero,  192(%[outptr])     \n\t"
910         "sw     $zero,  224(%[outptr])     \n\t"
911         "sw     $zero,  256(%[outptr])     \n\t"
912         "sw     $zero,  288(%[outptr])     \n\t"
913         "sw     $zero,  320(%[outptr])     \n\t"
914         "sw     $zero,  352(%[outptr])     \n\t"
915         "sw     $zero,  384(%[outptr])     \n\t"
916         "sw     $zero,  416(%[outptr])     \n\t"
917         "sw     $zero,  448(%[outptr])     \n\t"
918         "sw     $zero,  480(%[outptr])     \n\t"
919 
920         :
921         : [outptr] "r"(outptr));
922 
923     outptr += 2;
924   }
925 
926   // Then transform columns
927   idct16_cols_add_blk_dspr2(out, dest, stride);
928 }
929 
/*
 * 16x16 inverse DCT + add for a block where only the first coefficient,
 * input[0], is used.
 *
 * A single value a1 is derived from input[0] and added, with unsigned 8-bit
 * saturation, to every pixel of a 16x16 destination area.
 *
 * input:  coefficient block; only input[0] is read.
 * dest:   top-left pixel of the destination area; 16 bytes are read and
 *         rewritten on each of 16 rows using 32-bit loads and stores.
 * stride: byte distance between consecutive destination rows.
 */
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = (out + 32) >> 6, using an arithmetic shift */
  __asm__ __volatile__(
      "addi     %[out],     %[out],     32      \n\t"
      "sra      %[a1],      %[out],     6       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  if (a1 < 0) {
    /* Magnitude to subtract from every pixel.  replv.qb replicates only the
     * low 8 bits of its operand, so a magnitude above 255 would be silently
     * truncated (256 would become 0).  Clamp to 255 instead: a saturating
     * subtraction of 255 already forces every 8-bit pixel to 0, which is
     * what any larger magnitude must produce too. */
    absa1 = (a1 < -255) ? 255 : -a1;

    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb   %[vector_a1],   %[absa1]    \n\t"

                         : [vector_a1] "=r"(vector_a1)
                         : [absa1] "r"(absa1));

    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu           %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* the row is read and rewritten through dest */
          : "memory");
    }
  } else if (a1 > 255) {
    int32_t a11, a12, vector_a11, vector_a12;

    /* a1 does not fit in one byte lane: split it into two addends and apply
     * them with two successive saturating adds.
     *
     * use quad-byte
     * input and output memory are four byte aligned */
    a11 = a1 >> 1;
    a12 = a1 - a11;
    __asm__ __volatile__(
        "replv.qb       %[vector_a11],  %[a11]     \n\t"
        "replv.qb       %[vector_a12],  %[a12]     \n\t"

        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
        : [a11] "r"(a11), [a12] "r"(a12));

    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_1],    %[vector_1],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_2],    %[vector_2],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_3],    %[vector_3],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_4],    %[vector_4],    %[vector_a12]   \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu           %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
            [vector_a12] "r"(vector_a12)
          /* the row is read and rewritten through dest */
          : "memory");
    }
  } else {
    /* 0 <= a1 <= 255: a1 fits in one byte lane.
     *
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"

                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu           %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* the row is read and rewritten through dest */
          : "memory");
    }
  }
}
1059 
iadst16_dspr2(const int16_t * input,int16_t * output)1060 void iadst16_dspr2(const int16_t *input, int16_t *output) {
1061   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
1062 
1063   int x0 = input[15];
1064   int x1 = input[0];
1065   int x2 = input[13];
1066   int x3 = input[2];
1067   int x4 = input[11];
1068   int x5 = input[4];
1069   int x6 = input[9];
1070   int x7 = input[6];
1071   int x8 = input[7];
1072   int x9 = input[8];
1073   int x10 = input[5];
1074   int x11 = input[10];
1075   int x12 = input[3];
1076   int x13 = input[12];
1077   int x14 = input[1];
1078   int x15 = input[14];
1079 
1080   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1081         x13 | x14 | x15)) {
1082     output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
1083         output[6] = output[7] = output[8] = output[9] = output[10] =
1084             output[11] = output[12] = output[13] = output[14] = output[15] = 0;
1085     return;
1086   }
1087 
1088   // stage 1
1089   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1090   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1091   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1092   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1093   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1094   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1095   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1096   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1097   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1098   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1099   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1100   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1101   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1102   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1103   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1104   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1105 
1106   x0 = dct_const_round_shift(s0 + s8);
1107   x1 = dct_const_round_shift(s1 + s9);
1108   x2 = dct_const_round_shift(s2 + s10);
1109   x3 = dct_const_round_shift(s3 + s11);
1110   x4 = dct_const_round_shift(s4 + s12);
1111   x5 = dct_const_round_shift(s5 + s13);
1112   x6 = dct_const_round_shift(s6 + s14);
1113   x7 = dct_const_round_shift(s7 + s15);
1114   x8 = dct_const_round_shift(s0 - s8);
1115   x9 = dct_const_round_shift(s1 - s9);
1116   x10 = dct_const_round_shift(s2 - s10);
1117   x11 = dct_const_round_shift(s3 - s11);
1118   x12 = dct_const_round_shift(s4 - s12);
1119   x13 = dct_const_round_shift(s5 - s13);
1120   x14 = dct_const_round_shift(s6 - s14);
1121   x15 = dct_const_round_shift(s7 - s15);
1122 
1123   // stage 2
1124   s0 = x0;
1125   s1 = x1;
1126   s2 = x2;
1127   s3 = x3;
1128   s4 = x4;
1129   s5 = x5;
1130   s6 = x6;
1131   s7 = x7;
1132   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1133   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1134   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1135   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1136   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1137   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1138   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1139   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1140 
1141   x0 = s0 + s4;
1142   x1 = s1 + s5;
1143   x2 = s2 + s6;
1144   x3 = s3 + s7;
1145   x4 = s0 - s4;
1146   x5 = s1 - s5;
1147   x6 = s2 - s6;
1148   x7 = s3 - s7;
1149   x8 = dct_const_round_shift(s8 + s12);
1150   x9 = dct_const_round_shift(s9 + s13);
1151   x10 = dct_const_round_shift(s10 + s14);
1152   x11 = dct_const_round_shift(s11 + s15);
1153   x12 = dct_const_round_shift(s8 - s12);
1154   x13 = dct_const_round_shift(s9 - s13);
1155   x14 = dct_const_round_shift(s10 - s14);
1156   x15 = dct_const_round_shift(s11 - s15);
1157 
1158   // stage 3
1159   s0 = x0;
1160   s1 = x1;
1161   s2 = x2;
1162   s3 = x3;
1163   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1164   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1165   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1166   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1167   s8 = x8;
1168   s9 = x9;
1169   s10 = x10;
1170   s11 = x11;
1171   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1172   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1173   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
1174   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1175 
1176   x0 = s0 + s2;
1177   x1 = s1 + s3;
1178   x2 = s0 - s2;
1179   x3 = s1 - s3;
1180   x4 = dct_const_round_shift(s4 + s6);
1181   x5 = dct_const_round_shift(s5 + s7);
1182   x6 = dct_const_round_shift(s4 - s6);
1183   x7 = dct_const_round_shift(s5 - s7);
1184   x8 = s8 + s10;
1185   x9 = s9 + s11;
1186   x10 = s8 - s10;
1187   x11 = s9 - s11;
1188   x12 = dct_const_round_shift(s12 + s14);
1189   x13 = dct_const_round_shift(s13 + s15);
1190   x14 = dct_const_round_shift(s12 - s14);
1191   x15 = dct_const_round_shift(s13 - s15);
1192 
1193   // stage 4
1194   s2 = (-cospi_16_64) * (x2 + x3);
1195   s3 = cospi_16_64 * (x2 - x3);
1196   s6 = cospi_16_64 * (x6 + x7);
1197   s7 = cospi_16_64 * (-x6 + x7);
1198   s10 = cospi_16_64 * (x10 + x11);
1199   s11 = cospi_16_64 * (-x10 + x11);
1200   s14 = (-cospi_16_64) * (x14 + x15);
1201   s15 = cospi_16_64 * (x14 - x15);
1202 
1203   x2 = dct_const_round_shift(s2);
1204   x3 = dct_const_round_shift(s3);
1205   x6 = dct_const_round_shift(s6);
1206   x7 = dct_const_round_shift(s7);
1207   x10 = dct_const_round_shift(s10);
1208   x11 = dct_const_round_shift(s11);
1209   x14 = dct_const_round_shift(s14);
1210   x15 = dct_const_round_shift(s15);
1211 
1212   output[0] = x0;
1213   output[1] = -x8;
1214   output[2] = x12;
1215   output[3] = -x4;
1216   output[4] = x6;
1217   output[5] = x14;
1218   output[6] = x10;
1219   output[7] = x2;
1220   output[8] = x3;
1221   output[9] = x11;
1222   output[10] = x15;
1223   output[11] = x7;
1224   output[12] = x5;
1225   output[13] = -x13;
1226   output[14] = x9;
1227   output[15] = -x1;
1228 }
1229 
1230 #endif  // HAVE_DSPR2
1231