1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    intra_pred_com_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    23/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
/*
 * Asm fragment for one further row of 16x16 horizontal prediction.
 * Advances %[pPred] by 0x10 and %[pRef] by %[kiStride], loads the byte at
 * %[pRef] into $8, runs MMI_Copy16Times on it and stores the $f2/$f0 pair
 * at %[pPred] with gssqc1.
 * Requires %[pPred], %[pRef] and %[kiStride] operands in the enclosing asm
 * statement; writes $8, $f0 and $f2 and reads $f4.
 * NOTE(review): MMI_Copy16Times is defined outside this file; presumably it
 * replicates the byte in $8 across $f0/$f2 - confirm in asmdefs_mmi.h.
 */
43 #define MMI_PRED_H_16X16_ONE_LINE \
44   PTR_ADDIU  "%[pPred], %[pPred], 0x10                  \n\t" \
45   PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t" \
46   "lbu        $8, 0x0(%[pRef])                          \n\t" \
47   MMI_Copy16Times($f0, $f2, $f4, $8)                          \
48   "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
49 
/*
 * Asm fragment that accumulates two more left-neighbour bytes into $8.
 * Performed twice: advance %[pRef] by %[kiStride], load the byte at
 * %[pRef] - 1 into $9, add $9 to $8.
 * Requires %[pRef] and %[kiStride] operands in the enclosing asm statement;
 * $8 is the running sum and $9 is overwritten as scratch.
 */
50 #define LOAD_2_LEFT_AND_ADD \
51   PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t" \
52   "lbu        $9, -0x1(%[pRef])                         \n\t" \
53   PTR_ADDU   "$8, $8, $9                                \n\t" \
54   PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t" \
55   "lbu        $9, -0x1(%[pRef])                         \n\t" \
56   PTR_ADDU   "$8, $8, $9                                \n\t"
57 
/*
 * Asm fragment for one further row of 8x8 horizontal prediction.
 * Advances r0 by %[kiStride], loads the 8 bytes at r0 - 8 into f0, shifts
 * them right by f4, multiplies the halfwords by f2, shuffles with f6 as the
 * pshufh control and stores f0 at r1 + r1_offset.
 * With the register contents required below (shift of 0x38, a 0x01 byte
 * pattern, a zero shuffle control) only the byte at r0 - 1 survives the
 * shift and is then repeated over all eight output bytes.
 */
58 //f2 should be mmi_01bytes, f4 should be 0x38, f6 should be 0x0
59 #define MMI_PRED_H_8X8_ONE_LINE(f0, f2, f4, f6, r0, r1, r1_offset) \
60   PTR_ADDU   ""#r0", "#r0", %[kiStride]                 \n\t" \
61   "gsldxc1    "#f0", -0x8("#r0", $0)                    \n\t" \
62   "dsrl       "#f0", "#f0", "#f4"                       \n\t" \
63   "pmullh     "#f0", "#f0", "#f2"                       \n\t" \
64   "pshufh     "#f0", "#f0", "#f6"                       \n\t" \
65   "gssdxc1    "#f0", "#r1_offset"+0x0("#r1", $0)        \n\t"
66 
/*!
 * \brief   16x16 luma vertical intra prediction (Loongson MMI).
 *
 * Loads the 16 bytes located kiStride bytes before pRef (the row above the
 * block) and writes that same row to pPred sixteen times, at offsets 0x0
 * through 0xf0 in steps of 0x10.
 *
 * \param   pPred     destination; 256 bytes are written
 * \param   pRef      reference pointer; pRef - kiStride is read
 * \param   kiStride  byte distance between reference rows
 *
 * NOTE(review): gslqc1/gssqc1 are quad-word accesses; presumably both
 * pRef - kiStride and pPred must be 16-byte aligned - confirm.
 */
WelsI16x16LumaPredV_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)67 void WelsI16x16LumaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
68   __asm__ volatile (
69     ".set     arch=loongson3a                             \n\t"
    /* Step back one row and fetch the top neighbours into $f0/$f2. */
70     PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
71     "gslqc1     $f2, $f0, 0x0(%[pRef])                    \n\t"
72 
    /* Replicate that row into all 16 output rows. */
73     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
74     "gssqc1     $f2, $f0, 0x10(%[pPred])                  \n\t"
75     "gssqc1     $f2, $f0, 0x20(%[pPred])                  \n\t"
76     "gssqc1     $f2, $f0, 0x30(%[pPred])                  \n\t"
77     "gssqc1     $f2, $f0, 0x40(%[pPred])                  \n\t"
78     "gssqc1     $f2, $f0, 0x50(%[pPred])                  \n\t"
79     "gssqc1     $f2, $f0, 0x60(%[pPred])                  \n\t"
80     "gssqc1     $f2, $f0, 0x70(%[pPred])                  \n\t"
81     "gssqc1     $f2, $f0, 0x80(%[pPred])                  \n\t"
82     "gssqc1     $f2, $f0, 0x90(%[pPred])                  \n\t"
83     "gssqc1     $f2, $f0, 0xa0(%[pPred])                  \n\t"
84     "gssqc1     $f2, $f0, 0xb0(%[pPred])                  \n\t"
85     "gssqc1     $f2, $f0, 0xc0(%[pPred])                  \n\t"
86     "gssqc1     $f2, $f0, 0xd0(%[pPred])                  \n\t"
87     "gssqc1     $f2, $f0, 0xe0(%[pPred])                  \n\t"
88     "gssqc1     $f2, $f0, 0xf0(%[pPred])                  \n\t"
89     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
90     : [kiStride]"r"((int)kiStride)
91     : "memory", "$f0", "$f2"
92   );
93 }
94 
/*!
 * \brief   16x16 luma horizontal intra prediction (Loongson MMI).
 *
 * For each of the 16 rows, reads the byte immediately left of the row
 * (pRef[-1], then stepping by kiStride), expands it with MMI_Copy16Times and
 * stores the resulting $f2/$f0 pair as that row of pPred (rows are 0x10
 * bytes apart).
 *
 * \param   pPred     destination; 16 rows of 0x10 bytes are written
 * \param   pRef      reference pointer; the byte at pRef - 1 of each row is read
 * \param   kiStride  byte distance between reference rows
 *
 * NOTE(review): MMI_Copy16Times comes from asmdefs_mmi.h; presumably it
 * broadcasts the byte in $8 to every byte of $f0/$f2 with $f4 as a zero
 * operand - confirm.
 */
WelsI16x16LumaPredH_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)95 void WelsI16x16LumaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
96   __asm__ volatile (
97     ".set     arch=loongson3a                             \n\t"
    /* Row 0: point pRef at the left neighbour column, zero $f4. */
98     PTR_ADDIU  "%[pRef], %[pRef], -0x1                    \n\t"
99     "lbu        $8, 0x0(%[pRef])                          \n\t"
100     "xor        $f4, $f4, $f4                             \n\t"
101     MMI_Copy16Times($f0, $f2, $f4, $8)
102     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
103 
    /* Rows 1..15: each expansion advances pPred and pRef by one row. */
104     MMI_PRED_H_16X16_ONE_LINE
105     MMI_PRED_H_16X16_ONE_LINE
106     MMI_PRED_H_16X16_ONE_LINE
107     MMI_PRED_H_16X16_ONE_LINE
108     MMI_PRED_H_16X16_ONE_LINE
109     MMI_PRED_H_16X16_ONE_LINE
110     MMI_PRED_H_16X16_ONE_LINE
111     MMI_PRED_H_16X16_ONE_LINE
112     MMI_PRED_H_16X16_ONE_LINE
113     MMI_PRED_H_16X16_ONE_LINE
114     MMI_PRED_H_16X16_ONE_LINE
115     MMI_PRED_H_16X16_ONE_LINE
116     MMI_PRED_H_16X16_ONE_LINE
117     MMI_PRED_H_16X16_ONE_LINE
118     MMI_PRED_H_16X16_ONE_LINE
119     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
120     : [kiStride]"r"((int)kiStride)
121     : "memory", "$8", "$f0", "$f2", "$f4"
122   );
123 }
124 
WelsI16x16LumaPredDc_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)125 void WelsI16x16LumaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
126   unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
127                 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
128   __asm__ volatile (
129     ".set     arch=loongson3a                             \n\t"
130     PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
131     "gslqc1     $f2, $f0, 0x0(%[pRef])                    \n\t"
132     "xor        $f4, $f4, $f4                             \n\t"
133     "pasubub    $f0, $f0, $f4                             \n\t"
134     "pasubub    $f2, $f2, $f4                             \n\t"
135     "biadd      $f0, $f0                                  \n\t"
136     "biadd      $f2, $f2                                  \n\t"
137     "paddh      $f0, $f0, $f2                             \n\t"
138 
139     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
140     "lbu        $8, -0x1(%[pRef])                         \n\t"
141     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
142     "lbu        $9, -0x1(%[pRef])                         \n\t"
143     PTR_ADDU   "$8, $8, $9                                \n\t"
144 
145     LOAD_2_LEFT_AND_ADD
146     LOAD_2_LEFT_AND_ADD
147     LOAD_2_LEFT_AND_ADD
148     LOAD_2_LEFT_AND_ADD
149     LOAD_2_LEFT_AND_ADD
150     LOAD_2_LEFT_AND_ADD
151     LOAD_2_LEFT_AND_ADD
152 
153     "dli        $10, 0x5                                  \n\t"
154     "dmtc1      $10, $f6                                  \n\t"
155     PTR_ADDIU  "$8, 0x10                                  \n\t"
156     "dmtc1      $8, $f4                                   \n\t"
157     "paddh      $f0, $f0, $f4                             \n\t"
158     "psrlw      $f0, $f0, $f6                             \n\t"
159     "gsldxc1    $f6, 0x0(%[mmi_01bytes], $0)              \n\t"
160     "pmuluw     $f0, $f0, $f6                             \n\t"
161     "punpcklwd  $f0, $f0, $f0                             \n\t"
162     "mov.d      $f2, $f0                                  \n\t"
163 
164     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
165     "gssqc1     $f2, $f0, 0x10(%[pPred])                  \n\t"
166     "gssqc1     $f2, $f0, 0x20(%[pPred])                  \n\t"
167     "gssqc1     $f2, $f0, 0x30(%[pPred])                  \n\t"
168     "gssqc1     $f2, $f0, 0x40(%[pPred])                  \n\t"
169     "gssqc1     $f2, $f0, 0x50(%[pPred])                  \n\t"
170     "gssqc1     $f2, $f0, 0x60(%[pPred])                  \n\t"
171     "gssqc1     $f2, $f0, 0x70(%[pPred])                  \n\t"
172     "gssqc1     $f2, $f0, 0x80(%[pPred])                  \n\t"
173     "gssqc1     $f2, $f0, 0x90(%[pPred])                  \n\t"
174     "gssqc1     $f2, $f0, 0xa0(%[pPred])                  \n\t"
175     "gssqc1     $f2, $f0, 0xb0(%[pPred])                  \n\t"
176     "gssqc1     $f2, $f0, 0xc0(%[pPred])                  \n\t"
177     "gssqc1     $f2, $f0, 0xd0(%[pPred])                  \n\t"
178     "gssqc1     $f2, $f0, 0xe0(%[pPred])                  \n\t"
179     "gssqc1     $f2, $f0, 0xf0(%[pPred])                  \n\t"
180     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
181     : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
182     : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
183   );
184 }
185 
/*!
 * \brief   16x16 luma plane intra prediction (Loongson MMI).
 *
 * Derives a horizontal and a vertical gradient from the neighbouring row and
 * column using the weight tables below, scales each as (x * 5 + 0x20) >> 6,
 * builds a start value from two neighbour bytes, and then emits 16 rows of
 * 16 saturated bytes, each computed as (gradient * column weight + row
 * accumulator) >> 5.
 *
 * \param   pPred     destination; the loop writes 16 rows of 0x10 bytes
 * \param   pRef      reference pointer; neighbours above and left are read
 * \param   kiStride  byte distance between reference rows
 *
 * NOTE(review): BACKUP_REG / RECOVER_REG, SUMH_HORIZON, MMI_Copy8Times and
 * LOAD_COLUMN are defined outside this file. Presumably they save/restore
 * callee-saved FP registers, horizontally sum halfwords, broadcast a
 * halfword, and gather a byte column respectively - confirm in
 * asmdefs_mmi.h. Whether LOAD_COLUMN modifies %[pRef] cannot be told here.
 */
WelsI16x16LumaPredPlane_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)186 void WelsI16x16LumaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
187   short mmi_plane_inc_minus[8]__attribute__((aligned(16))) = {-7, -6, -5, -4,
188                                                               -3, -2, -1, 0};
189   short mmi_plane_inc[8]__attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
190   short mmi_plane_dec[8]__attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
191   BACKUP_REG;
192   __asm__ volatile (
193     ".set       arch=loongson3a                           \n\t"
    /* Move pRef to the top-left neighbour (one column left, one row up). */
194     PTR_ADDIU  "%[pRef], %[pRef], -0x1                    \n\t"
195     PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
196 
    /* Horizontal term: 8 bytes at pRef weighted by mmi_plane_dec are
     * subtracted from 8 bytes at pRef + 9 weighted by mmi_plane_inc.
     * $f28 is zero and widens bytes to halfwords. */
197     "gsldlc1    $f0, 0x7(%[pRef])                         \n\t"
198     "xor        $f28, $f28, $f28                          \n\t"
199     "gsldrc1    $f0, 0x0(%[pRef])                         \n\t"
200     "gslqc1     $f22, $f20, 0x0(%[mmi_plane_dec])         \n\t"
201     "punpckhbh  $f2, $f0, $f28                            \n\t"
202     "punpcklbh  $f0, $f0, $f28                            \n\t"
203     "gsldlc1    $f4, 0x10(%[pRef])                        \n\t"
204     "pmullh     $f0, $f0, $f20                            \n\t"
205     "pmullh     $f2, $f2, $f22                            \n\t"
206     "gsldrc1    $f4, 0x9(%[pRef])                         \n\t"
207     "gslqc1     $f26, $f24, 0x0(%[mmi_plane_inc])         \n\t"
208     "punpckhbh  $f6, $f4, $f28                            \n\t"
209     "punpcklbh  $f4, $f4, $f28                            \n\t"
210     "pmullh     $f4, $f4, $f24                            \n\t"
211     "pmullh     $f6, $f6, $f26                            \n\t"
212     "psubh      $f4, $f4, $f0                             \n\t"
213     "psubh      $f6, $f6, $f2                             \n\t"
214 
    /* Reduce, sign-extend the halfword, scale: $8 = ($8 * 5 + 0x20) >> 6,
     * then spread the result over $f4/$f6. */
215     "xor        $f8, $f8, $f8                             \n\t"
216     SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
217     "dmfc1      $8, $f4                                   \n\t"
218     "seh        $8, $8                                    \n\t"
219     "mul        $8, $8, 0x5                               \n\t"
220     PTR_ADDIU  "$8, $8, 0x20                              \n\t"
221     "sra        $8, $8, 0x6                               \n\t"
222     MMI_Copy8Times($f4, $f6, $f28, $8)
223 
    /* $9 = byte 0x10 past the top-left neighbour; then gather the first
     * column block via LOAD_COLUMN. */
224     "lbu        $9, 0x10(%[pRef])                         \n\t"
225     PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
226     LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16,
227                 $f18, %[pRef], %[kiStride], $11)
228 
    /* $9 = ($9 + byte at pRef + 8 * kiStride) << 4. */
229     PTR_ADDIU  "%[pRef], %[pRef], 0x3                     \n\t"
230     "dsll       $10, %[kiStride], 0x3                     \n\t"
231     PTR_ADDU   "$10, $10, %[pRef]                         \n\t"
232     "lbu        $8, 0x0($10)                              \n\t"
233     PTR_ADDU   "$9, $9, $8                                \n\t"
234     "dsll       $9, $9, 0x4                               \n\t"
235 
    /* Vertical term: second column block, then the same dec/inc weighting
     * and subtraction as for the horizontal term. */
236     PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
237     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
238     LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16,
239                 $f18, %[pRef], %[kiStride], $11)
240     "xor        $f16, $f16, $f16                          \n\t"
241     "xor        $f18, $f18, $f18                          \n\t"
242     "punpcklbh  $f0, $f2, $f18                            \n\t"
243     "punpckhbh  $f2, $f2, $f18                            \n\t"
244     "pmullh     $f0, $f0, $f20                            \n\t"
245     "pmullh     $f2, $f2, $f22                            \n\t"
246     "punpcklbh  $f28, $f30, $f18                          \n\t"
247     "punpckhbh  $f30, $f30, $f18                          \n\t"
248     "pmullh     $f28, $f28, $f24                          \n\t"
249     "pmullh     $f30, $f30, $f26                          \n\t"
250     "psubh      $f28, $f28, $f0                           \n\t"
251     "psubh      $f30, $f30, $f2                           \n\t"
252 
    /* Scale as before and spread over $f16/$f18 (the per-row increment). */
253     SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
254     "dmfc1      $8, $f28                                  \n\t"
255     "seh        $8, $8                                    \n\t"
256     "mul        $8, $8, 0x5                               \n\t"
257     PTR_ADDIU  "$8, $8, 0x20                              \n\t"
258     "sra        $8, $8, 0x6                               \n\t"
259     "xor        $f20, $f20, $f20                          \n\t"
260     MMI_Copy8Times($f16, $f18, $f20, $8)
261 
    /* Row-0 accumulator: $8 = $8 * -7 + ($9 + 0x10), spread over $f0/$f2. */
262     PTR_ADDIU  "$9, $9, 0x10                              \n\t"
263     "mul        $8, $8, -0x7                              \n\t"
264     PTR_ADDU   "$8, $8, $9                                \n\t"
265     "xor        $f20, $f20, $f20                          \n\t"
266     MMI_Copy8Times($f0, $f2, $f20, $8)
267 
    /* $8 becomes the row counter; $f20/$f22 take the weights -7..0 for the
     * first eight columns ($f24/$f26 still hold 1..8 for the last eight). */
268     "xor        $8, $8, $8                                \n\t"
269     "gslqc1     $f22, $f20, 0x0(%[mmi_plane_inc_minus])   \n\t"
270 
271     "dli        $10, 0x5                                  \n\t"
272     "dmtc1      $10, $f30                                 \n\t"
    /* This store targets the same address as the first store inside the
     * loop, which overwrites it. */
273     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
274     "1:                                                   \n\t"
    /* One row: (gradient * weight + accumulator) >> 5 for both halves,
     * packed to bytes with unsigned saturation. */
275     "pmullh     $f8, $f4, $f20                            \n\t"
276     "pmullh     $f10, $f6, $f22                           \n\t"
277     "paddh      $f8, $f8, $f0                             \n\t"
278     "paddh      $f10, $f10, $f2                           \n\t"
279     "psrah      $f8, $f8, $f30                            \n\t"
280     "psrah      $f10, $f10, $f30                          \n\t"
281     "pmullh     $f12, $f4, $f24                           \n\t"
282     "pmullh     $f14, $f6, $f26                           \n\t"
283     "paddh      $f12, $f12, $f0                           \n\t"
284     "paddh      $f14, $f14, $f2                           \n\t"
285     "psrah      $f12, $f12, $f30                          \n\t"
286     "psrah      $f14, $f14, $f30                          \n\t"
287     "packushb   $f8, $f8, $f10                            \n\t"
288     "packushb   $f10, $f12, $f14                          \n\t"
289     "gssqc1     $f10, $f8, 0x0(%[pPred])                  \n\t"
    /* Step the accumulator and output pointer; loop until $8 reaches 0x10. */
290     "paddh      $f0, $f0, $f16                            \n\t"
291     "paddh      $f2, $f2, $f18                            \n\t"
292     PTR_ADDIU  "%[pPred], %[pPred], 0x10                  \n\t"
293     PTR_ADDIU  "$8, $8, 0x1                               \n\t"
294     PTR_ADDIU  "$10, $8, -0x10                            \n\t"
295     "bnez       $10, 1b                                   \n\t"
296     "nop                                                  \n\t"
297     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
298     : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
299       [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
300     : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
301       "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
302       "$f28", "$f30"
303   );
304   RECOVER_REG;
305 }
306 
/*!
 * \brief   8x8 chroma plane intra prediction (Loongson MMI).
 *
 * Same structure as the 16x16 plane predictor, with four-tap weight tables:
 * each gradient is scaled as (x * 0x11 + 0x10) >> 5, and the loop emits
 * 8 rows of 8 saturated bytes computed as (gradient * column weight + row
 * accumulator) >> 5.
 *
 * \param   pPred     destination; the loop writes 8 rows of 8 bytes
 * \param   pRef      reference pointer; neighbours above and left are read
 * \param   kiStride  byte distance between reference rows
 *
 * NOTE(review): BACKUP_REG / RECOVER_REG, SUMH_HORIZON, MMI_Copy8Times and
 * LOAD_COLUMN_C are defined outside this file; their exact register effects
 * and whether LOAD_COLUMN_C modifies %[pRef] must be confirmed in
 * asmdefs_mmi.h.
 */
WelsIChromaPredPlane_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)307 void WelsIChromaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
308   short mmi_plane_inc_c[4]__attribute__((aligned(16))) = {1, 2, 3, 4};
309   short mmi_plane_dec_c[4]__attribute__((aligned(16))) = {4, 3, 2, 1};
310   short mmi_plane_mul_b_c[8]__attribute__((aligned(16))) = {-3, -2, -1, 0,
311                                                             1, 2, 3, 4};
312   BACKUP_REG;
313   __asm__ volatile (
314     ".set       arch=loongson3a                           \n\t"
    /* Move pRef to the top-left neighbour (one column left, one row up). */
315     PTR_ADDIU  "%[pRef], %[pRef], -0x1                    \n\t"
316     PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
317 
    /* Horizontal term: low four bytes at pRef weighted by mmi_plane_dec_c
     * are subtracted from the low four bytes at pRef + 5 weighted by
     * mmi_plane_inc_c. $f28 is zero and widens bytes to halfwords. */
318     "gsldlc1    $f0, 0x7(%[pRef])                         \n\t"
319     "xor        $f28, $f28, $f28                          \n\t"
320     "gsldrc1    $f0, 0x0(%[pRef])                         \n\t"
321     "gsldxc1    $f20, 0x0(%[mmi_plane_dec_c], $0)         \n\t"
322     "punpcklbh  $f0, $f0, $f28                            \n\t"
323     "gsldlc1    $f4, 0xc(%[pRef])                         \n\t"
324     "pmullh     $f0, $f0, $f20                            \n\t"
325     "gsldrc1    $f4, 0x5(%[pRef])                         \n\t"
326     "gsldxc1    $f24, 0x0(%[mmi_plane_inc_c], $0)         \n\t"
327     "punpcklbh  $f4, $f4, $f28                            \n\t"
328     "pmullh     $f4, $f4, $f24                            \n\t"
329     "psubh      $f4, $f4, $f0                             \n\t"
330 
    /* Reduce, sign-extend, scale: $8 = ($8 * 0x11 + 0x10) >> 5, then spread
     * over $f4/$f6. */
331     "xor        $f6, $f6, $f6                             \n\t"
332     "xor        $f8, $f8, $f8                             \n\t"
333     SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
334     "dmfc1      $8, $f4                                   \n\t"
335     "seh        $8, $8                                    \n\t"
336     "mul        $8, $8, 0x11                              \n\t"
337     PTR_ADDIU  "$8, $8, 0x10                              \n\t"
338     "sra        $8, $8, 0x5                               \n\t"
339     MMI_Copy8Times($f4, $f6, $f28, $8)
340 
    /* $8 = byte 8 past the top-left neighbour; then gather the first
     * column block via LOAD_COLUMN_C. */
341     "lbu        $8, 0x8(%[pRef])                          \n\t"
342     PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
343     LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
344 
    /* $9 = (byte at pRef + 4 * kiStride + $8) << 4. */
345     PTR_ADDIU  "%[pRef], %[pRef], 0x3                     \n\t"
346     "dsll       $10, %[kiStride], 0x2                     \n\t"
347     PTR_ADDU   "$10, $10, %[pRef]                         \n\t"
348     "lbu        $9, 0x0($10)                              \n\t"
349     PTR_ADDU   "$9, $9, $8                                \n\t"
350     "dsll       $9, $9, 0x4                               \n\t"
351 
    /* Vertical term: second column block, weighted and subtracted the same
     * way as the horizontal term (using the high four bytes). */
352     PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
353     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
354     LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
355     "xor        $f16, $f16, $f16                          \n\t"
356     "punpckhbh  $f0, $f0, $f16                            \n\t"
357     "pmullh     $f0, $f0, $f20                            \n\t"
358     "punpckhbh  $f28, $f28, $f16                          \n\t"
359     "pmullh     $f28, $f28, $f24                          \n\t"
360     "psubh      $f28, $f28, $f0                           \n\t"
361 
    /* Scale as before and spread over $f16/$f18 (the per-row increment). */
362     "xor        $f30, $f30, $f30                          \n\t"
363     "xor        $f8, $f8, $f8                             \n\t"
364     SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
365     "dmfc1      $8, $f28                                  \n\t"
366     "seh        $8, $8                                    \n\t"
367     "mul        $8, $8, 0x11                              \n\t"
368     PTR_ADDIU  "$8, $8, 0x10                              \n\t"
369     "sra        $8, $8, 0x5                               \n\t"
370     MMI_Copy8Times($f16, $f18, $f8, $8)
371 
    /* Row-0 accumulator: $8 = $8 * -3 + ($9 + 0x10), spread over $f0/$f2. */
372     PTR_ADDIU  "$9, $9, 0x10                              \n\t"
373     "mul        $8, $8, -0x3                              \n\t"
374     PTR_ADDU   "$8, $8, $9                                \n\t"
375     MMI_Copy8Times($f0, $f2, $f8, $8)
376 
    /* $8 becomes the row counter; $f20/$f22 take the column weights -3..4. */
377     "xor        $8, $8, $8                                \n\t"
378     "gslqc1     $f22, $f20, 0x0(%[mmi_plane_mul_b_c])     \n\t"
379 
380     "dli        $10, 0x5                                  \n\t"
381     "dmtc1      $10, $f30                                 \n\t"
    /* NOTE(review): this quad-word store covers the first two output rows;
     * both are rewritten by the first two loop iterations. */
382     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
383 
384     "1:                                                   \n\t"
    /* One row: (gradient * weight + accumulator) >> 5, packed to 8 bytes
     * with unsigned saturation. */
385     "pmullh     $f8, $f4, $f20                            \n\t"
386     "pmullh     $f10, $f6, $f22                           \n\t"
387     "paddh      $f8, $f8, $f0                             \n\t"
388     "paddh      $f10, $f10, $f2                           \n\t"
389     "psrah      $f8, $f8, $f30                            \n\t"
390     "psrah      $f10, $f10, $f30                          \n\t"
391     "packushb   $f8, $f8, $f10                            \n\t"
392     "gssdxc1    $f8, 0x0(%[pPred], $0)                    \n\t"
    /* Step the accumulator and output pointer; loop until $8 reaches 8. */
393     "paddh      $f0, $f0, $f16                            \n\t"
394     "paddh      $f2, $f2, $f18                            \n\t"
395     PTR_ADDIU  "%[pPred], %[pPred], 0x8                   \n\t"
396     PTR_ADDIU  "$8, $8, 0x1                               \n\t"
397     PTR_ADDIU  "$10, $8, -0x8                             \n\t"
398     "bnez       $10, 1b                                   \n\t"
399     "nop                                                  \n\t"
400     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
401     : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
402       [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
403     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
404       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
405   );
406   RECOVER_REG;
407 }
408 
/*!
 * \brief   8x8 chroma vertical intra prediction (Loongson MMI).
 *
 * Loads the 8 bytes located kiStride bytes before pRef (the row above the
 * block), duplicates them into a second register and issues four paired
 * stores at pPred offsets 0x0, 0x10, 0x20 and 0x30, so the same 8-byte row
 * fills the 0x40-byte output.
 *
 * \param   pPred     destination; 0x40 bytes are written
 * \param   pRef      reference pointer; pRef - kiStride is read
 * \param   kiStride  byte distance between reference rows
 */
WelsIChromaPredV_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)409 void WelsIChromaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
410   __asm__ volatile (
411     ".set       arch=loongson3a                           \n\t"
    /* Fetch the top row into $f0 and mirror it in $f2 so each paired store
     * writes two identical rows. */
412     PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
413     "gsldxc1    $f0, 0x0(%[pRef], $0)                     \n\t"
414     "mov.d      $f2, $f0                                  \n\t"
415 
416     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
417     "gssqc1     $f2, $f0, 0x10(%[pPred])                  \n\t"
418     "gssqc1     $f2, $f0, 0x20(%[pPred])                  \n\t"
419     "gssqc1     $f2, $f0, 0x30(%[pPred])                  \n\t"
420     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
421     : [kiStride]"r"((int)kiStride)
422     : "memory", "$f0", "$f2"
423   );
424 }
425 
/*!
 * \brief   8x8 chroma DC intra prediction (Loongson MMI).
 *
 * Computes one DC value per 4x4 quadrant from the 8 bytes above the block
 * and the 8 bytes left of it:
 *   - top-left:     (top[0..3] + left[0..3] + 4) >> 3
 *   - top-right:    (top[4..7] + 2) >> 2
 *   - bottom-left:  (left[4..7] + 2) >> 2
 *   - bottom-right: (top[4..7] + left[4..7] + 4) >> 3
 * Each value is replicated over four bytes, the two quadrant values of a
 * row are combined into one 8-byte pattern, and the patterns are stored to
 * rows 0..3 and rows 4..7 of pPred (rows are 8 bytes apart).
 *
 * \param   pPred     destination; 0x40 bytes are written
 * \param   pRef      reference pointer; reads the row at pRef - kiStride and
 *                    the byte at pRef - 1 of each of the 8 rows
 * \param   kiStride  byte distance between reference rows
 */
WelsIChromaPredDc_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)426 void WelsIChromaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
427   short mmi_0x02[4]__attribute__((aligned(16))) = {2, 0, 0, 0};
428   unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
429                 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
430   __asm__ volatile (
431     ".set       arch=loongson3a                           \n\t"
    /* $f0 = the 8 bytes of the row above the block. */
432     PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
433     "gsldxc1    $f0, 0x0(%[pRef], $0)                     \n\t"
434 
    /* $f2 = sum of the left bytes of rows 0..3. */
435     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
436     "lbu        $8, -0x1(%[pRef])                         \n\t"
437     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
438     "lbu        $9, -0x1(%[pRef])                         \n\t"
439     PTR_ADDU   "$8, $8, $9                                \n\t"
440     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
441     "lbu        $9, -0x1(%[pRef])                         \n\t"
442     PTR_ADDU   "$8, $8, $9                                \n\t"
443     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
444     "lbu        $9, -0x1(%[pRef])                         \n\t"
445     PTR_ADDU   "$8, $8, $9                                \n\t"
446     "dmtc1      $8, $f2                                   \n\t"
447 
    /* $f4 = sum of the left bytes of rows 4..7. */
448     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
449     "lbu        $8, -0x1(%[pRef])                         \n\t"
450     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
451     "lbu        $9, -0x1(%[pRef])                         \n\t"
452     PTR_ADDU   "$8, $8, $9                                \n\t"
453     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
454     "lbu        $9, -0x1(%[pRef])                         \n\t"
455     PTR_ADDU   "$8, $8, $9                                \n\t"
456     PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
457     "lbu        $9, -0x1(%[pRef])                         \n\t"
458     PTR_ADDU   "$8, $8, $9                                \n\t"
459     "dmtc1      $8, $f4                                   \n\t"
460 
    /* Split the top row: $f6 = sum of its low four bytes, $f0 = sum of its
     * high four bytes ($f8 is zero, so pasubub leaves bytes unchanged). */
461     "xor        $f8, $f8, $f8                             \n\t"
462     "punpcklwd  $f6, $f0, $f8                             \n\t"
463     "punpckhwd  $f0, $f0, $f8                             \n\t"
464     "pasubub    $f0, $f0, $f8                             \n\t"
465     "pasubub    $f6, $f6, $f8                             \n\t"
466     "biadd      $f0, $f0                                  \n\t"
467     "biadd      $f6, $f6                                  \n\t"
468 
    /* $f6 = top-left sum (top low + left rows 0..3);
     * $f2 = bottom-right sum (left rows 4..7 + top high). */
469     "dadd       $f6, $f6, $f2                             \n\t"
470     "dadd       $f2, $f4, $f0                             \n\t"
471 
472     "gsldxc1    $f8, 0x0(%[mmi_0x02], $0)                 \n\t"
473 
    /* Four-sample quadrants: (sum + 2) >> 2. */
474     "dli        $10, 0x2                                  \n\t"
475     "dmtc1      $10, $f10                                 \n\t"
476     "dadd       $f0, $f0, $f8                             \n\t"
477     "dsrl       $f0, $f0, $f10                            \n\t"
478 
479     "dadd       $f4, $f4, $f8                             \n\t"
480     "dsrl       $f4, $f4, $f10                            \n\t"
481 
    /* Eight-sample quadrants: (sum + 2 + 2) >> 3. */
482     "dli        $10, 0x3                                  \n\t"
483     "dmtc1      $10, $f10                                 \n\t"
484     "dadd       $f6, $f6, $f8                             \n\t"
485     "dadd       $f6, $f6, $f8                             \n\t"
486     "dsrl       $f6, $f6, $f10                            \n\t"
487 
488     "dadd       $f2, $f2, $f8                             \n\t"
489     "dadd       $f2, $f2, $f8                             \n\t"
490     "dsrl       $f2, $f2, $f10                            \n\t"
491 
    /* Replicate each DC over four bytes by multiplying with the 0x01 byte
     * pattern, then place the right-hand quadrant in the upper 32 bits. */
492     "dli        $10, 0x20                                 \n\t"
493     "dmtc1      $10, $f10                                 \n\t"
494     "gsldxc1    $f12, 0x0(%[mmi_01bytes], $0)             \n\t"
495     "pmuluw     $f0, $f0, $f12                            \n\t"
496     "pmuluw     $f6, $f6, $f12                            \n\t"
497     "dsll       $f0, $f0, $f10                            \n\t"
498     "xor        $f0, $f0, $f6                             \n\t"
499 
500     "pmuluw     $f4, $f4, $f12                            \n\t"
501     "pmuluw     $f2, $f2, $f12                            \n\t"
502     "dsll       $f2, $f2, $f10                            \n\t"
503     "xor        $f2, $f2, $f4                             \n\t"
504 
    /* Rows 0..3 use the top pattern, rows 4..7 the bottom pattern. */
505     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
506     "gssdxc1    $f0, 0x8(%[pPred], $0)                    \n\t"
507     "gssdxc1    $f0, 0x10(%[pPred], $0)                   \n\t"
508     "gssdxc1    $f0, 0x18(%[pPred], $0)                   \n\t"
509 
510     "gssdxc1    $f2, 0x20(%[pPred], $0)                   \n\t"
511     "gssdxc1    $f2, 0x28(%[pPred], $0)                   \n\t"
512     "gssdxc1    $f2, 0x30(%[pPred], $0)                   \n\t"
513     "gssdxc1    $f2, 0x38(%[pPred], $0)                   \n\t"
514     : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
515     : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes),
516       [mmi_0x02]"r"((unsigned char *)mmi_0x02)
517     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
518   );
519 }
520 
/*!
 * \brief   8x8 chroma horizontal intra prediction (Loongson MMI).
 *
 * For each of the 8 rows, loads the 8 bytes ending just before the row
 * (pRef - 8, then stepping by kiStride), shifts right by 0x38 so that only
 * the byte at pRef - 1 remains, replicates it across 8 bytes (multiply by
 * the 0x01 byte pattern, then pshufh with a zero control) and stores the
 * result as that row of pPred (rows are 8 bytes apart).
 *
 * \param   pPred     destination; 0x40 bytes are written
 * \param   pRef      reference pointer; the 8 bytes before each row are read
 * \param   kiStride  byte distance between reference rows
 *
 * NOTE(review): keeping the byte at pRef - 1 after the right shift relies
 * on little-endian byte order - confirm for the target configuration.
 */
WelsIChromaPredH_mmi(uint8_t * pPred,uint8_t * pRef,int32_t kiStride)521 void WelsIChromaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
522   unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
523                 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
524   __asm__ volatile (
525     ".set       arch=loongson3a                           \n\t"
    /* Constants shared by every row: $f2 = 0x01 bytes, $f4 = shift 0x38,
     * $f6 = zero shuffle control. */
526     "gsldxc1    $f2, 0x0(%[mmi_01bytes], $0)              \n\t"
527     "dli        $8, 0x38                                  \n\t"
528     "dmtc1      $8, $f4                                   \n\t"
529     "xor        $f6, $f6, $f6                             \n\t"
    /* Row 0. */
530     "gsldxc1    $f0, -0x8(%[pRef], $0)                    \n\t"
531     "dsrl       $f0, $f0, $f4                             \n\t"
532 
533     "pmullh     $f0, $f0, $f2                             \n\t"
534     "pshufh     $f0, $f0, $f6                             \n\t"
535     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
536 
    /* Rows 1..7: each expansion advances pRef by kiStride and stores at the
     * given pPred offset. */
537     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x8)
538     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x10)
539     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x18)
540     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x20)
541     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x28)
542     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x30)
543     MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x38)
544    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
545    : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
546    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
547   );
548 }
549