1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    quant_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    20/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
WelsQuant4x4_mmi(int16_t * pDct,const int16_t * ff,const int16_t * mf)43 void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) {
44   __asm__ volatile (
45     ".set       arch=loongson3a                 \n\t"
46     "xor        $f10, $f10, $f10                \n\t"
47     "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t"
48     "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t"
49 
50     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
51     "xor        $f4, $f4, $f4                   \n\t"
52     "xor        $f6, $f6, $f6                   \n\t"
53     "pcmpgth    $f4, $f4, $f0                   \n\t"
54     "pcmpgth    $f6, $f6, $f2                   \n\t"
55     "xor        $f0, $f0, $f4                   \n\t"
56     "xor        $f2, $f2, $f6                   \n\t"
57     "psubh      $f0, $f0, $f4                   \n\t"
58     "psubh      $f2, $f2, $f6                   \n\t"
59     "paddush    $f0, $f0, $f8                   \n\t"
60     "paddush    $f2, $f2, $f10                  \n\t"
61     "pmulhuh    $f0, $f0, $f12                  \n\t"
62     "pmulhuh    $f2, $f2, $f14                  \n\t"
63     "xor        $f0, $f0, $f4                   \n\t"
64     "xor        $f2, $f2, $f6                   \n\t"
65     "psubh      $f0, $f0, $f4                   \n\t"
66     "psubh      $f2, $f2, $f6                   \n\t"
67     "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
68 
69     "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
70     "xor        $f4, $f4, $f4                   \n\t"
71     "xor        $f6, $f6, $f6                   \n\t"
72     "pcmpgth    $f4, $f4, $f0                   \n\t"
73     "pcmpgth    $f6, $f6, $f2                   \n\t"
74     "xor        $f0, $f0, $f4                   \n\t"
75     "xor        $f2, $f2, $f6                   \n\t"
76     "psubh      $f0, $f0, $f4                   \n\t"
77     "psubh      $f2, $f2, $f6                   \n\t"
78     "paddush    $f0, $f0, $f8                   \n\t"
79     "paddush    $f2, $f2, $f10                  \n\t"
80     "pmulhuh    $f0, $f0, $f12                  \n\t"
81     "pmulhuh    $f2, $f2, $f14                  \n\t"
82     "xor        $f0, $f0, $f4                   \n\t"
83     "xor        $f2, $f2, $f6                   \n\t"
84     "psubh      $f0, $f0, $f4                   \n\t"
85     "psubh      $f2, $f2, $f6                   \n\t"
86     "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
87    :
88    : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
89    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
90   );
91 }
92 
WelsQuant4x4Dc_mmi(int16_t * pDct,const int16_t ff,int16_t mf)93 void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) {
94   __asm__ volatile (
95     ".set       arch=loongson3a                 \n\t"
96     "xor        $f10, $f10, $f10                \n\t"
97     "dmtc1      %[mf], $f12                     \n\t"
98     "pshufh     $f12, $f12, $f10                \n\t"
99 
100     "dmtc1      %[ff], $f8                      \n\t"
101     "pshufh     $f8, $f8, $f10                  \n\t"
102 
103     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
104     "xor        $f4, $f4, $f4                   \n\t"
105     "xor        $f6, $f6, $f6                   \n\t"
106     "pcmpgth    $f4, $f4, $f0                   \n\t"
107     "pcmpgth    $f6, $f6, $f2                   \n\t"
108     "xor        $f0, $f0, $f4                   \n\t"
109     "xor        $f2, $f2, $f6                   \n\t"
110     "psubh      $f0, $f0, $f4                   \n\t"
111     "psubh      $f2, $f2, $f6                   \n\t"
112     "paddush    $f0, $f0, $f8                   \n\t"
113     "paddush    $f2, $f2, $f8                   \n\t"
114     "pmulhuh    $f0, $f0, $f12                  \n\t"
115     "pmulhuh    $f2, $f2, $f12                  \n\t"
116     "xor        $f0, $f0, $f4                   \n\t"
117     "xor        $f2, $f2, $f6                   \n\t"
118     "psubh      $f0, $f0, $f4                   \n\t"
119     "psubh      $f2, $f2, $f6                   \n\t"
120     "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
121 
122     "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
123     "xor        $f4, $f4, $f4                   \n\t"
124     "xor        $f6, $f6, $f6                   \n\t"
125     "pcmpgth    $f4, $f4, $f0                   \n\t"
126     "pcmpgth    $f6, $f6, $f2                   \n\t"
127     "xor        $f0, $f0, $f4                   \n\t"
128     "xor        $f2, $f2, $f6                   \n\t"
129     "psubh      $f0, $f0, $f4                   \n\t"
130     "psubh      $f2, $f2, $f6                   \n\t"
131     "paddush    $f0, $f0, $f8                   \n\t"
132     "paddush    $f2, $f2, $f8                   \n\t"
133     "pmulhuh    $f0, $f0, $f12                  \n\t"
134     "pmulhuh    $f2, $f2, $f12                  \n\t"
135     "xor        $f0, $f0, $f4                   \n\t"
136     "xor        $f2, $f2, $f6                   \n\t"
137     "psubh      $f0, $f0, $f4                   \n\t"
138     "psubh      $f2, $f2, $f6                   \n\t"
139     "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
140    :
141    : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)
142    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
143   );
144 }
145 
WelsQuantFour4x4_mmi(int16_t * pDct,const int16_t * ff,const int16_t * mf)146 void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) {
147   __asm__ volatile (
148     ".set       arch=loongson3a                 \n\t"
149     "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t"
150     "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t"
151 
152     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
153     "xor        $f4, $f4, $f4                   \n\t"
154     "xor        $f6, $f6, $f6                   \n\t"
155     "pcmpgth    $f4, $f4, $f0                   \n\t"
156     "pcmpgth    $f6, $f6, $f2                   \n\t"
157     "xor        $f0, $f0, $f4                   \n\t"
158     "xor        $f2, $f2, $f6                   \n\t"
159     "psubh      $f0, $f0, $f4                   \n\t"
160     "psubh      $f2, $f2, $f6                   \n\t"
161     "paddush    $f0, $f0, $f8                   \n\t"
162     "paddush    $f2, $f2, $f10                  \n\t"
163     "pmulhuh    $f0, $f0, $f12                  \n\t"
164     "pmulhuh    $f2, $f2, $f14                  \n\t"
165     "xor        $f0, $f0, $f4                   \n\t"
166     "xor        $f2, $f2, $f6                   \n\t"
167     "psubh      $f0, $f0, $f4                   \n\t"
168     "psubh      $f2, $f2, $f6                   \n\t"
169     "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
170 
171     "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
172     "xor        $f4, $f4, $f4                   \n\t"
173     "xor        $f6, $f6, $f6                   \n\t"
174     "pcmpgth    $f4, $f4, $f0                   \n\t"
175     "pcmpgth    $f6, $f6, $f2                   \n\t"
176     "xor        $f0, $f0, $f4                   \n\t"
177     "xor        $f2, $f2, $f6                   \n\t"
178     "psubh      $f0, $f0, $f4                   \n\t"
179     "psubh      $f2, $f2, $f6                   \n\t"
180     "paddush    $f0, $f0, $f8                   \n\t"
181     "paddush    $f2, $f2, $f10                  \n\t"
182     "pmulhuh    $f0, $f0, $f12                  \n\t"
183     "pmulhuh    $f2, $f2, $f14                  \n\t"
184     "xor        $f0, $f0, $f4                   \n\t"
185     "xor        $f2, $f2, $f6                   \n\t"
186     "psubh      $f0, $f0, $f4                   \n\t"
187     "psubh      $f2, $f2, $f6                   \n\t"
188     "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
189 
190     "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
191     "xor        $f4, $f4, $f4                   \n\t"
192     "xor        $f6, $f6, $f6                   \n\t"
193     "pcmpgth    $f4, $f4, $f0                   \n\t"
194     "pcmpgth    $f6, $f6, $f2                   \n\t"
195     "xor        $f0, $f0, $f4                   \n\t"
196     "xor        $f2, $f2, $f6                   \n\t"
197     "psubh      $f0, $f0, $f4                   \n\t"
198     "psubh      $f2, $f2, $f6                   \n\t"
199     "paddush    $f0, $f0, $f8                   \n\t"
200     "paddush    $f2, $f2, $f10                  \n\t"
201     "pmulhuh    $f0, $f0, $f12                  \n\t"
202     "pmulhuh    $f2, $f2, $f14                  \n\t"
203     "xor        $f0, $f0, $f4                   \n\t"
204     "xor        $f2, $f2, $f6                   \n\t"
205     "psubh      $f0, $f0, $f4                   \n\t"
206     "psubh      $f2, $f2, $f6                   \n\t"
207     "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
208 
209     "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
210     "xor        $f4, $f4, $f4                   \n\t"
211     "xor        $f6, $f6, $f6                   \n\t"
212     "pcmpgth    $f4, $f4, $f0                   \n\t"
213     "pcmpgth    $f6, $f6, $f2                   \n\t"
214     "xor        $f0, $f0, $f4                   \n\t"
215     "xor        $f2, $f2, $f6                   \n\t"
216     "psubh      $f0, $f0, $f4                   \n\t"
217     "psubh      $f2, $f2, $f6                   \n\t"
218     "paddush    $f0, $f0, $f8                   \n\t"
219     "paddush    $f2, $f2, $f10                  \n\t"
220     "pmulhuh    $f0, $f0, $f12                  \n\t"
221     "pmulhuh    $f2, $f2, $f14                  \n\t"
222     "xor        $f0, $f0, $f4                   \n\t"
223     "xor        $f2, $f2, $f6                   \n\t"
224     "psubh      $f0, $f0, $f4                   \n\t"
225     "psubh      $f2, $f2, $f6                   \n\t"
226     "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
227 
228     "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
229     "xor        $f4, $f4, $f4                   \n\t"
230     "xor        $f6, $f6, $f6                   \n\t"
231     "pcmpgth    $f4, $f4, $f0                   \n\t"
232     "pcmpgth    $f6, $f6, $f2                   \n\t"
233     "xor        $f0, $f0, $f4                   \n\t"
234     "xor        $f2, $f2, $f6                   \n\t"
235     "psubh      $f0, $f0, $f4                   \n\t"
236     "psubh      $f2, $f2, $f6                   \n\t"
237     "paddush    $f0, $f0, $f8                   \n\t"
238     "paddush    $f2, $f2, $f10                  \n\t"
239     "pmulhuh    $f0, $f0, $f12                  \n\t"
240     "pmulhuh    $f2, $f2, $f14                  \n\t"
241     "xor        $f0, $f0, $f4                   \n\t"
242     "xor        $f2, $f2, $f6                   \n\t"
243     "psubh      $f0, $f0, $f4                   \n\t"
244     "psubh      $f2, $f2, $f6                   \n\t"
245     "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
246 
247     "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
248     "xor        $f4, $f4, $f4                   \n\t"
249     "xor        $f6, $f6, $f6                   \n\t"
250     "pcmpgth    $f4, $f4, $f0                   \n\t"
251     "pcmpgth    $f6, $f6, $f2                   \n\t"
252     "xor        $f0, $f0, $f4                   \n\t"
253     "xor        $f2, $f2, $f6                   \n\t"
254     "psubh      $f0, $f0, $f4                   \n\t"
255     "psubh      $f2, $f2, $f6                   \n\t"
256     "paddush    $f0, $f0, $f8                   \n\t"
257     "paddush    $f2, $f2, $f10                  \n\t"
258     "pmulhuh    $f0, $f0, $f12                  \n\t"
259     "pmulhuh    $f2, $f2, $f14                  \n\t"
260     "xor        $f0, $f0, $f4                   \n\t"
261     "xor        $f2, $f2, $f6                   \n\t"
262     "psubh      $f0, $f0, $f4                   \n\t"
263     "psubh      $f2, $f2, $f6                   \n\t"
264     "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
265 
266     "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
267     "xor        $f4, $f4, $f4                   \n\t"
268     "xor        $f6, $f6, $f6                   \n\t"
269     "pcmpgth    $f4, $f4, $f0                   \n\t"
270     "pcmpgth    $f6, $f6, $f2                   \n\t"
271     "xor        $f0, $f0, $f4                   \n\t"
272     "xor        $f2, $f2, $f6                   \n\t"
273     "psubh      $f0, $f0, $f4                   \n\t"
274     "psubh      $f2, $f2, $f6                   \n\t"
275     "paddush    $f0, $f0, $f8                   \n\t"
276     "paddush    $f2, $f2, $f10                  \n\t"
277     "pmulhuh    $f0, $f0, $f12                  \n\t"
278     "pmulhuh    $f2, $f2, $f14                  \n\t"
279     "xor        $f0, $f0, $f4                   \n\t"
280     "xor        $f2, $f2, $f6                   \n\t"
281     "psubh      $f0, $f0, $f4                   \n\t"
282     "psubh      $f2, $f2, $f6                   \n\t"
283     "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
284 
285     "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
286     "xor        $f4, $f4, $f4                   \n\t"
287     "xor        $f6, $f6, $f6                   \n\t"
288     "pcmpgth    $f4, $f4, $f0                   \n\t"
289     "pcmpgth    $f6, $f6, $f2                   \n\t"
290     "xor        $f0, $f0, $f4                   \n\t"
291     "xor        $f2, $f2, $f6                   \n\t"
292     "psubh      $f0, $f0, $f4                   \n\t"
293     "psubh      $f2, $f2, $f6                   \n\t"
294     "paddush    $f0, $f0, $f8                   \n\t"
295     "paddush    $f2, $f2, $f10                  \n\t"
296     "pmulhuh    $f0, $f0, $f12                  \n\t"
297     "pmulhuh    $f2, $f2, $f14                  \n\t"
298     "xor        $f0, $f0, $f4                   \n\t"
299     "xor        $f2, $f2, $f6                   \n\t"
300     "psubh      $f0, $f0, $f4                   \n\t"
301     "psubh      $f2, $f2, $f6                   \n\t"
302     "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
303    :
304    : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
305    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
306   );
307 }
308 
WelsQuantFour4x4Max_mmi(int16_t * pDct,const int16_t * ff,const int16_t * mf,int16_t * max)309 void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff,
310                              const int16_t *mf, int16_t *max) {
311   BACKUP_REG;
312   __asm__ volatile (
313     ".set       arch=loongson3a                 \n\t"
314     "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t"
315     "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t"
316 
317     "xor        $f16, $f16, $f16                \n\t"
318     "xor        $f18, $f18, $f18                \n\t"
319     "xor        $f20, $f20, $f20                \n\t"
320     "xor        $f22, $f22, $f22                \n\t"
321     "xor        $f24, $f24, $f24                \n\t"
322     "xor        $f26, $f26, $f26                \n\t"
323     "xor        $f28, $f28, $f28                \n\t"
324     "xor        $f30, $f30, $f30                \n\t"
325 
326     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
327     "xor        $f4, $f4, $f4                   \n\t"
328     "xor        $f6, $f6, $f6                   \n\t"
329     "pcmpgth    $f4, $f4, $f0                   \n\t"
330     "pcmpgth    $f6, $f6, $f2                   \n\t"
331     "xor        $f0, $f0, $f4                   \n\t"
332     "xor        $f2, $f2, $f6                   \n\t"
333     "psubh      $f0, $f0, $f4                   \n\t"
334     "psubh      $f2, $f2, $f6                   \n\t"
335     "paddush    $f0, $f0, $f8                   \n\t"
336     "paddush    $f2, $f2, $f10                  \n\t"
337     "pmulhuh    $f0, $f0, $f12                  \n\t"
338     "pmulhuh    $f2, $f2, $f14                  \n\t"
339     "pmaxsh     $f16, $f16, $f0                 \n\t"
340     "pmaxsh     $f18, $f18, $f2                 \n\t"
341     "xor        $f0, $f0, $f4                   \n\t"
342     "xor        $f2, $f2, $f6                   \n\t"
343     "psubh      $f0, $f0, $f4                   \n\t"
344     "psubh      $f2, $f2, $f6                   \n\t"
345     "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
346 
347     "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
348     "xor        $f4, $f4, $f4                   \n\t"
349     "xor        $f6, $f6, $f6                   \n\t"
350     "pcmpgth    $f4, $f4, $f0                   \n\t"
351     "pcmpgth    $f6, $f6, $f2                   \n\t"
352     "xor        $f0, $f0, $f4                   \n\t"
353     "xor        $f2, $f2, $f6                   \n\t"
354     "psubh      $f0, $f0, $f4                   \n\t"
355     "psubh      $f2, $f2, $f6                   \n\t"
356     "paddush    $f0, $f0, $f8                   \n\t"
357     "paddush    $f2, $f2, $f10                  \n\t"
358     "pmulhuh    $f0, $f0, $f12                  \n\t"
359     "pmulhuh    $f2, $f2, $f14                  \n\t"
360     "pmaxsh     $f16, $f16, $f0                 \n\t"
361     "pmaxsh     $f18, $f18, $f2                 \n\t"
362     "xor        $f0, $f0, $f4                   \n\t"
363     "xor        $f2, $f2, $f6                   \n\t"
364     "psubh      $f0, $f0, $f4                   \n\t"
365     "psubh      $f2, $f2, $f6                   \n\t"
366     "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
367 
368     "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
369     "xor        $f4, $f4, $f4                   \n\t"
370     "xor        $f6, $f6, $f6                   \n\t"
371     "pcmpgth    $f4, $f4, $f0                   \n\t"
372     "pcmpgth    $f6, $f6, $f2                   \n\t"
373     "xor        $f0, $f0, $f4                   \n\t"
374     "xor        $f2, $f2, $f6                   \n\t"
375     "psubh      $f0, $f0, $f4                   \n\t"
376     "psubh      $f2, $f2, $f6                   \n\t"
377     "paddush    $f0, $f0, $f8                   \n\t"
378     "paddush    $f2, $f2, $f10                  \n\t"
379     "pmulhuh    $f0, $f0, $f12                  \n\t"
380     "pmulhuh    $f2, $f2, $f14                  \n\t"
381     "pmaxsh     $f20, $f20, $f0                 \n\t"
382     "pmaxsh     $f22, $f22, $f2                 \n\t"
383     "xor        $f0, $f0, $f4                   \n\t"
384     "xor        $f2, $f2, $f6                   \n\t"
385     "psubh      $f0, $f0, $f4                   \n\t"
386     "psubh      $f2, $f2, $f6                   \n\t"
387     "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
388 
389     "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
390     "xor        $f4, $f4, $f4                   \n\t"
391     "xor        $f6, $f6, $f6                   \n\t"
392     "pcmpgth    $f4, $f4, $f0                   \n\t"
393     "pcmpgth    $f6, $f6, $f2                   \n\t"
394     "xor        $f0, $f0, $f4                   \n\t"
395     "xor        $f2, $f2, $f6                   \n\t"
396     "psubh      $f0, $f0, $f4                   \n\t"
397     "psubh      $f2, $f2, $f6                   \n\t"
398     "paddush    $f0, $f0, $f8                   \n\t"
399     "paddush    $f2, $f2, $f10                  \n\t"
400     "pmulhuh    $f0, $f0, $f12                  \n\t"
401     "pmulhuh    $f2, $f2, $f14                  \n\t"
402     "pmaxsh     $f20, $f20, $f0                 \n\t"
403     "pmaxsh     $f22, $f22, $f2                 \n\t"
404     "xor        $f0, $f0, $f4                   \n\t"
405     "xor        $f2, $f2, $f6                   \n\t"
406     "psubh      $f0, $f0, $f4                   \n\t"
407     "psubh      $f2, $f2, $f6                   \n\t"
408     "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
409 
410     "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
411     "xor        $f4, $f4, $f4                   \n\t"
412     "xor        $f6, $f6, $f6                   \n\t"
413     "pcmpgth    $f4, $f4, $f0                   \n\t"
414     "pcmpgth    $f6, $f6, $f2                   \n\t"
415     "xor        $f0, $f0, $f4                   \n\t"
416     "xor        $f2, $f2, $f6                   \n\t"
417     "psubh      $f0, $f0, $f4                   \n\t"
418     "psubh      $f2, $f2, $f6                   \n\t"
419     "paddush    $f0, $f0, $f8                   \n\t"
420     "paddush    $f2, $f2, $f10                  \n\t"
421     "pmulhuh    $f0, $f0, $f12                  \n\t"
422     "pmulhuh    $f2, $f2, $f14                  \n\t"
423     "pmaxsh     $f24, $f24, $f0                 \n\t"
424     "pmaxsh     $f26, $f26, $f2                 \n\t"
425     "xor        $f0, $f0, $f4                   \n\t"
426     "xor        $f2, $f2, $f6                   \n\t"
427     "psubh      $f0, $f0, $f4                   \n\t"
428     "psubh      $f2, $f2, $f6                   \n\t"
429     "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
430 
431     "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
432     "xor        $f4, $f4, $f4                   \n\t"
433     "xor        $f6, $f6, $f6                   \n\t"
434     "pcmpgth    $f4, $f4, $f0                   \n\t"
435     "pcmpgth    $f6, $f6, $f2                   \n\t"
436     "xor        $f0, $f0, $f4                   \n\t"
437     "xor        $f2, $f2, $f6                   \n\t"
438     "psubh      $f0, $f0, $f4                   \n\t"
439     "psubh      $f2, $f2, $f6                   \n\t"
440     "paddush    $f0, $f0, $f8                   \n\t"
441     "paddush    $f2, $f2, $f10                  \n\t"
442     "pmulhuh    $f0, $f0, $f12                  \n\t"
443     "pmulhuh    $f2, $f2, $f14                  \n\t"
444     "pmaxsh     $f24, $f24, $f0                 \n\t"
445     "pmaxsh     $f26, $f26, $f2                 \n\t"
446     "xor        $f0, $f0, $f4                   \n\t"
447     "xor        $f2, $f2, $f6                   \n\t"
448     "psubh      $f0, $f0, $f4                   \n\t"
449     "psubh      $f2, $f2, $f6                   \n\t"
450     "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
451 
452     "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
453     "xor        $f4, $f4, $f4                   \n\t"
454     "xor        $f6, $f6, $f6                   \n\t"
455     "pcmpgth    $f4, $f4, $f0                   \n\t"
456     "pcmpgth    $f6, $f6, $f2                   \n\t"
457     "xor        $f0, $f0, $f4                   \n\t"
458     "xor        $f2, $f2, $f6                   \n\t"
459     "psubh      $f0, $f0, $f4                   \n\t"
460     "psubh      $f2, $f2, $f6                   \n\t"
461     "paddush    $f0, $f0, $f8                   \n\t"
462     "paddush    $f2, $f2, $f10                  \n\t"
463     "pmulhuh    $f0, $f0, $f12                  \n\t"
464     "pmulhuh    $f2, $f2, $f14                  \n\t"
465     "pmaxsh     $f28, $f28, $f0                 \n\t"
466     "pmaxsh     $f30, $f30, $f2                 \n\t"
467     "xor        $f0, $f0, $f4                   \n\t"
468     "xor        $f2, $f2, $f6                   \n\t"
469     "psubh      $f0, $f0, $f4                   \n\t"
470     "psubh      $f2, $f2, $f6                   \n\t"
471     "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
472 
473     "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
474     "xor        $f4, $f4, $f4                   \n\t"
475     "xor        $f6, $f6, $f6                   \n\t"
476     "pcmpgth    $f4, $f4, $f0                   \n\t"
477     "pcmpgth    $f6, $f6, $f2                   \n\t"
478     "xor        $f0, $f0, $f4                   \n\t"
479     "xor        $f2, $f2, $f6                   \n\t"
480     "psubh      $f0, $f0, $f4                   \n\t"
481     "psubh      $f2, $f2, $f6                   \n\t"
482     "paddush    $f0, $f0, $f8                   \n\t"
483     "paddush    $f2, $f2, $f10                  \n\t"
484     "pmulhuh    $f0, $f0, $f12                  \n\t"
485     "pmulhuh    $f2, $f2, $f14                  \n\t"
486     "pmaxsh     $f28, $f28, $f0                 \n\t"
487     "pmaxsh     $f30, $f30, $f2                 \n\t"
488     "xor        $f0, $f0, $f4                   \n\t"
489     "xor        $f2, $f2, $f6                   \n\t"
490     "psubh      $f0, $f0, $f4                   \n\t"
491     "psubh      $f2, $f2, $f6                   \n\t"
492     "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
493 
494     "mov.d      $f0, $f18                       \n\t"
495     "punpckhhw  $f18, $f16, $f20                \n\t"
496     "punpcklhw  $f16, $f16, $f20                \n\t"
497     "punpckhhw  $f2, $f0, $f22                  \n\t"
498     "punpcklhw  $f0, $f0, $f22                  \n\t"
499 
500     "mov.d      $f20, $f26                      \n\t"
501     "punpckhhw  $f26, $f24, $f28                \n\t"
502     "punpcklhw  $f24, $f24, $f28                \n\t"
503     "punpckhhw  $f22, $f20, $f30                \n\t"
504     "punpcklhw  $f20, $f20, $f30                \n\t"
505 
506     "mov.d      $f28, $f18                      \n\t"
507     "punpckhwd  $f18, $f16, $f24                \n\t"
508     "punpcklwd  $f16, $f16, $f24                \n\t"
509     "punpckhwd  $f30, $f28, $f26                \n\t"
510     "punpcklwd  $f28, $f28, $f26                \n\t"
511 
512     "mov.d      $f24, $f2                       \n\t"
513     "punpckhwd  $f2, $f0, $f20                  \n\t"
514     "punpcklwd  $f0, $f0, $f20                  \n\t"
515     "punpckhwd  $f26, $f24, $f22                \n\t"
516     "punpcklwd  $f24, $f24, $f22                \n\t"
517 
518     "mov.d      $f20, $f18                      \n\t"
519     "mov.d      $f18, $f0                       \n\t"
520     "mov.d      $f22, $f2                       \n\t"
521 
522     "mov.d      $f0, $f30                       \n\t"
523     "mov.d      $f30, $f24                      \n\t"
524     "mov.d      $f2, $f26                       \n\t"
525 
526     "pmaxsh     $f0, $f0, $f16                  \n\t"
527     "pmaxsh     $f2, $f2, $f18                  \n\t"
528 
529     "pmaxsh     $f0, $f0, $f20                  \n\t"
530     "pmaxsh     $f2, $f2, $f22                  \n\t"
531 
532     "pmaxsh     $f0, $f0, $f28                  \n\t"
533     "pmaxsh     $f2, $f2, $f30                  \n\t"
534 
535     "mov.d      $f4, $f0                        \n\t"
536     "mov.d      $f6, $f2                        \n\t"
537 
538     "mov.d      $f0, $f2                        \n\t"
539     "mov.d      $f2, $f6                        \n\t"
540 
541     "pmaxsh     $f0, $f0, $f4                   \n\t"
542     "pmaxsh     $f2, $f2, $f6                   \n\t"
543 
544     "gssdlc1    $f0, 0x7(%[max])                \n\t"
545     "gssdrc1    $f0, 0x0(%[max])                \n\t"
546    :
547    : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
548      [max]"r"((short *)max)
549    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
550      "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
551   );
552   RECOVER_REG;
553 }
554