// Copyright (c) 2014-2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in
//    the documentation and/or other materials provided with the
//    distribution.
//  * Neither the name of Google, Inc. nor the names of its contributors
//    may be used to endorse or promote products derived from this
//    software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
28 
29 #include "convolver.h"
30 #include <algorithm>
31 #include "skia/include/core/SkTypes.h"
32 
33 #if defined(_MIPS_ARCH_LOONGSON3A)
34 
35 #include "MMIHelpers.h"
36 
37 namespace skia {
38 
39 // Convolves horizontally along a single row. The row data is given in
40 // |src_data| and continues for the num_values() of the filter.
ConvolveHorizontally_LS3(const unsigned char * src_data,const ConvolutionFilter1D & filter,unsigned char * out_row)41 void ConvolveHorizontally_LS3(const unsigned char* src_data,
42                                const ConvolutionFilter1D& filter,
43                                unsigned char* out_row) {
44   int num_values = filter.num_values();
45   int tmp, filter_offset, filter_length;
46   double zero, mask[4], shuf_50, shuf_fa;
47 
48   asm volatile (
49     ".set push \n\t"
50     ".set arch=loongson3a \n\t"
51     "xor %[zero], %[zero], %[zero] \n\t"
52     // |mask| will be used to decimate all extra filter coefficients that are
53     // loaded by SIMD when |filter_length| is not divisible by 4.
54     // mask[0] is not used in following algorithm.
55     "li %[tmp], 1 \n\t"
56     "dsll32 %[tmp], 0x10 \n\t"
57     "daddiu %[tmp], -1 \n\t"
58     "dmtc1 %[tmp], %[mask3] \n\t"
59     "dsrl %[tmp], 0x10 \n\t"
60     "mtc1 %[tmp], %[mask2] \n\t"
61     "dsrl %[tmp], 0x10 \n\t"
62     "mtc1 %[tmp], %[mask1] \n\t"
63     "ori %[tmp], $0, 0x50 \n\t"
64     "mtc1 %[tmp], %[shuf_50] \n\t"
65     "ori %[tmp], $0, 0xfa \n\t"
66     "mtc1 %[tmp], %[shuf_fa] \n\t"
67     ".set pop \n\t"
68     :[zero]"=f"(zero), [mask1]"=f"(mask[1]),
69      [mask2]"=f"(mask[2]), [mask3]"=f"(mask[3]),
70      [shuf_50]"=f"(shuf_50), [shuf_fa]"=f"(shuf_fa),
71      [tmp]"=&r"(tmp)
72   );
73 
74   // Output one pixel each iteration, calculating all channels (RGBA) together.
75   for (int out_x = 0; out_x < num_values; out_x++) {
76     const ConvolutionFilter1D::Fixed* filter_values =
77         filter.FilterForValue(out_x, &filter_offset, &filter_length);
78     double accumh, accuml;
79     // Compute the first pixel in this row that the filter affects. It will
80     // touch |filter_length| pixels (4 bytes each) after this.
81     const void *row_to_filter =
82         reinterpret_cast<const void*>(&src_data[filter_offset << 2]);
83 
84     asm volatile (
85       ".set push \n\t"
86       ".set arch=loongson3a \n\t"
87       _mm_xor(accum, accum, accum)
88       ".set pop \n\t"
89       :[accumh]"=f"(accumh), [accuml]"=f"(accuml)
90     );
91 
92     // We will load and accumulate with four coefficients per iteration.
93     for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
94       double src16h, src16l, mul_hih, mul_hil, mul_loh, mul_lol;
95       double coeffh, coeffl, src8h, src8l, th, tl, coeff16h, coeff16l;
96 
97       asm volatile (
98         ".set push \n\t"
99         ".set arch=loongson3a \n\t"
100         // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
101         // [16] xx xx xx xx c3 c2 c1 c0
102         "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
103         "gsldrc1 %[coeffl], (%[fval]) \n\t"
104         "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
105         // [16] xx xx xx xx c1 c1 c0 c0
106         _mm_pshuflh(coeff16, coeff, shuf_50)
107         // [16] c1 c1 c1 c1 c0 c0 c0 c0
108         _mm_punpcklhw(coeff16, coeff16, coeff16)
109         // Load four pixels => unpack the first two pixels to 16 bits =>
110         // multiply with coefficients => accumulate the convolution result.
111         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
112         "gsldlc1 %[src8h], 0xf(%[rtf]) \n\t"
113         "gsldrc1 %[src8h], 0x8(%[rtf]) \n\t"
114         "gsldlc1 %[src8l], 0x7(%[rtf]) \n\t"
115         "gsldrc1 %[src8l], 0x0(%[rtf]) \n\t"
116         // [16] a1 b1 g1 r1 a0 b0 g0 r0
117         _mm_punpcklbh(src16, src8, zero)
118         _mm_pmulhh(mul_hi, src16, coeff16)
119         _mm_pmullh(mul_lo, src16, coeff16)
120         // [32]  a0*c0 b0*c0 g0*c0 r0*c0
121         _mm_punpcklhw(t, mul_lo, mul_hi)
122         _mm_paddw(accum, accum, t)
123         // [32]  a1*c1 b1*c1 g1*c1 r1*c1
124         _mm_punpckhhw(t, mul_lo, mul_hi)
125         _mm_paddw(accum, accum, t)
126         // Duplicate 3rd and 4th coefficients for all channels =>
127         // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
128         // => accumulate the convolution results.
129         // [16] xx xx xx xx c3 c3 c2 c2
130         _mm_pshuflh(coeff16, coeff, shuf_fa)
131         // [16] c3 c3 c3 c3 c2 c2 c2 c2
132         _mm_punpcklhw(coeff16, coeff16, coeff16)
133         // [16] a3 g3 b3 r3 a2 g2 b2 r2
134         _mm_punpckhbh(src16, src8, zero)
135         _mm_pmulhh(mul_hi, src16, coeff16)
136         _mm_pmullh(mul_lo, src16, coeff16)
137         // [32]  a2*c2 b2*c2 g2*c2 r2*c2
138         _mm_punpcklhw(t, mul_lo, mul_hi)
139         _mm_paddw(accum, accum, t)
140         // [32]  a3*c3 b3*c3 g3*c3 r3*c3
141         _mm_punpckhhw(t, mul_lo, mul_hi)
142         _mm_paddw(accum, accum, t)
143         ".set pop \n\t"
144         :[th]"=&f"(th), [tl]"=&f"(tl),
145          [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
146          [accumh]"+f"(accumh), [accuml]"+f"(accuml),
147          [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
148          [coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
149          [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l),
150          [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
151          [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol)
152         :[zeroh]"f"(zero), [zerol]"f"(zero),
153          [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa),
154          [fval]"r"(filter_values), [rtf]"r"(row_to_filter)
155       );
156 
157       // Advance the pixel and coefficients pointers.
158       row_to_filter += 16;
159       filter_values += 4;
160     }
161 
162     // When |filter_length| is not divisible by 4, we need to decimate some of
163     // the filter coefficient that was loaded incorrectly to zero; Other than
164     // that the algorithm is same with above, except that the 4th pixel will be
165     // always absent.
166     int r = filter_length & 3;
167     if (r) {
168       double coeffh, coeffl, th, tl, coeff16h, coeff16l;
169       double src8h, src8l, src16h, src16l, mul_hih, mul_hil, mul_loh, mul_lol;
170 
171       asm volatile (
172         ".set push \n\t"
173         ".set arch=loongson3a \n\t"
174         "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
175         "gsldrc1 %[coeffl], (%[fval]) \n\t"
176         "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
177         // Mask out extra filter taps.
178         "and %[coeffl], %[coeffl], %[mask] \n\t"
179         _mm_pshuflh(coeff16, coeff, shuf_50)
180         _mm_punpcklhw(coeff16, coeff16, coeff16)
181         "gsldlc1 %[src8h], 0xf(%[rtf]) \n\t"
182         "gsldrc1 %[src8h], 0x8(%[rtf]) \n\t"
183         "gsldlc1 %[src8l], 0x7(%[rtf]) \n\t"
184         "gsldrc1 %[src8l], 0x0(%[rtf]) \n\t"
185         _mm_punpcklbh(src16, src8, zero)
186         _mm_pmulhh(mul_hi, src16, coeff16)
187         _mm_pmullh(mul_lo, src16, coeff16)
188         _mm_punpcklhw(t, mul_lo, mul_hi)
189         _mm_paddw(accum, accum, t)
190         _mm_punpckhhw(t, mul_lo, mul_hi)
191         _mm_paddw(accum, accum, t)
192         _mm_punpckhbh(src16, src8, zero)
193         _mm_pshuflh(coeff16, coeff, shuf_fa)
194         _mm_punpcklhw(coeff16, coeff16, coeff16)
195         _mm_pmulhh(mul_hi, src16, coeff16)
196         _mm_pmullh(mul_lo, src16, coeff16)
197         _mm_punpcklhw(t, mul_lo, mul_hi)
198         _mm_paddw(accum, accum, t)
199         ".set pop \n\t"
200         :[th]"=&f"(th), [tl]"=&f"(tl),
201          [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
202          [accumh]"+f"(accumh), [accuml]"+f"(accuml),
203          [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
204          [coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
205          [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l),
206          [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
207          [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol)
208         :[fval]"r"(filter_values), [rtf]"r"(row_to_filter),
209          [zeroh]"f"(zero), [zerol]"f"(zero), [mask]"f"(mask[r]),
210          [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa)
211       );
212     }
213 
214     double t, sra;
215     asm volatile (
216       ".set push \n\t"
217       ".set arch=loongson3a \n\t"
218       "ori %[tmp], $0, %[sk_sra] \n\t"
219       "mtc1 %[tmp], %[sra] \n\t"
220       // Shift right for fixed point implementation.
221       _mm_psraw(accum, accum, sra)
222       // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
223       _mm_packsswh(accum, accum, zero, t)
224       // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
225       _mm_packushb(accum, accum, zero, t)
226       // Store the pixel value of 32 bits.
227       "swc1 %[accuml], (%[out_row]) \n\t"
228       ".set pop \n\t"
229       :[sra]"=&f"(sra), [t]"=&f"(t), [tmp]"=&r"(tmp),
230        [accumh]"+f"(accumh), [accuml]"+f"(accuml)
231       :[sk_sra]"i"(ConvolutionFilter1D::kShiftBits),
232        [out_row]"r"(out_row), [zeroh]"f"(zero), [zerol]"f"(zero)
233       :"memory"
234     );
235 
236     out_row += 4;
237   }
238 }
239 
240 // Convolves horizontally along a single row. The row data is given in
241 // |src_data| and continues for the [begin, end) of the filter.
242 // Process one pixel at a time.
ConvolveHorizontally1_LS3(const unsigned char * src_data,const ConvolutionFilter1D & filter,unsigned char * out_row)243 void ConvolveHorizontally1_LS3(const unsigned char* src_data,
244                                const ConvolutionFilter1D& filter,
245                                unsigned char* out_row) {
246   int num_values = filter.num_values();
247   double zero;
248   double sra;
249 
250   asm volatile (
251     ".set push \n"
252     ".set arch=loongson3a \n"
253     "xor %[zero], %[zero], %[zero] \n"
254     "mtc1 %[sk_sra], %[sra] \n"
255     ".set pop \n"
256     :[zero]"=&f"(zero), [sra]"=&f"(sra)
257     :[sk_sra]"r"(ConvolutionFilter1D::kShiftBits)
258   );
259   // Loop over each pixel on this row in the output image.
260   for (int out_x = 0; out_x < num_values; out_x++) {
261     // Get the filter that determines the current output pixel.
262     int filter_offset;
263     int filter_length;
264     const ConvolutionFilter1D::Fixed* filter_values =
265         filter.FilterForValue(out_x, &filter_offset, &filter_length);
266 
267     // Compute the first pixel in this row that the filter affects. It will
268     // touch |filter_length| pixels (4 bytes each) after this.
269     const unsigned char* row_to_filter = &src_data[filter_offset * 4];
270 
271     // Apply the filter to the row to get the destination pixel in |accum|.
272     double accuml;
273     double accumh;
274     asm volatile (
275       ".set push \n"
276       ".set arch=loongson3a \n"
277       "xor %[accuml], %[accuml], %[accuml] \n"
278       "xor %[accumh], %[accumh], %[accumh] \n"
279       ".set pop \n"
280       :[accuml]"=&f"(accuml), [accumh]"=&f"(accumh)
281     );
282     for (int filter_x = 0; filter_x < filter_length; filter_x++) {
283       double src8;
284       double src16;
285       double coeff;
286       double coeff16;
287       asm volatile (
288         ".set push \n"
289         ".set arch=loongson3a \n"
290         "lwc1 %[src8], %[rtf] \n"
291         "mtc1 %[fv], %[coeff] \n"
292         "pshufh %[coeff16], %[coeff], %[zero] \n"
293         "punpcklbh %[src16], %[src8], %[zero] \n"
294         "pmullh %[src8], %[src16], %[coeff16] \n"
295         "pmulhh %[coeff], %[src16], %[coeff16] \n"
296         "punpcklhw %[src16], %[src8], %[coeff] \n"
297         "punpckhhw %[coeff16], %[src8], %[coeff] \n"
298         "paddw %[accuml], %[accuml], %[src16] \n"
299         "paddw %[accumh], %[accumh], %[coeff16] \n"
300         ".set pop \n"
301         :[accuml]"+f"(accuml), [accumh]"+f"(accumh),
302          [src8]"=&f"(src8), [src16]"=&f"(src16),
303          [coeff]"=&f"(coeff), [coeff16]"=&f"(coeff16)
304         :[rtf]"m"(row_to_filter[filter_x * 4]),
305          [fv]"r"(filter_values[filter_x]), [zero]"f"(zero)
306       );
307     }
308 
309     asm volatile (
310       ".set push \n"
311       ".set arch=loongson3a \n"
312       // Bring this value back in range. All of the filter scaling factors
313       // are in fixed point with kShiftBits bits of fractional part.
314       "psraw %[accuml], %[accuml], %[sra] \n"
315       "psraw %[accumh], %[accumh], %[sra] \n"
316       // Store the new pixel.
317       "packsswh %[accuml], %[accuml], %[accumh] \n"
318       "packushb %[accuml], %[accuml], %[zero] \n"
319       "swc1 %[accuml], %[out_row] \n"
320       ".set pop \n"
321       :[accuml]"+f"(accuml), [accumh]"+f"(accumh)
322       :[sra]"f"(sra), [zero]"f"(zero), [out_row]"m"(out_row[out_x * 4])
323       :"memory"
324     );
325   }
326 }
327 
328 // Convolves horizontally along four rows. The row data is given in
329 // |src_data| and continues for the num_values() of the filter.
330 // The algorithm is almost same as |ConvolveHorizontally_LS3|. Please
331 // refer to that function for detailed comments.
ConvolveHorizontally4_LS3(const unsigned char * src_data[4],const ConvolutionFilter1D & filter,unsigned char * out_row[4])332 void ConvolveHorizontally4_LS3(const unsigned char* src_data[4],
333                                 const ConvolutionFilter1D& filter,
334                                 unsigned char* out_row[4]) {
335   int num_values = filter.num_values();
336   int tmp, filter_offset, filter_length;
337   double zero, mask[4], shuf_50, shuf_fa;
338 
339   asm volatile (
340     ".set push \n\t"
341     ".set arch=loongson3a \n\t"
342     "xor %[zero], %[zero], %[zero] \n\t"
343     // |mask| will be used to decimate all extra filter coefficients that are
344     // loaded by SIMD when |filter_length| is not divisible by 4.
345     // mask[0] is not used in following algorithm.
346     "li %[tmp], 1 \n\t"
347     "dsll32 %[tmp], 0x10 \n\t"
348     "daddiu %[tmp], -1 \n\t"
349     "dmtc1 %[tmp], %[mask3] \n\t"
350     "dsrl %[tmp], 0x10 \n\t"
351     "mtc1 %[tmp], %[mask2] \n\t"
352     "dsrl %[tmp], 0x10 \n\t"
353     "mtc1 %[tmp], %[mask1] \n\t"
354     "ori %[tmp], $0, 0x50 \n\t"
355     "mtc1 %[tmp], %[shuf_50] \n\t"
356     "ori %[tmp], $0, 0xfa \n\t"
357     "mtc1 %[tmp], %[shuf_fa] \n\t"
358     ".set pop \n\t"
359     :[zero]"=f"(zero), [mask1]"=f"(mask[1]),
360      [mask2]"=f"(mask[2]), [mask3]"=f"(mask[3]),
361      [shuf_50]"=f"(shuf_50), [shuf_fa]"=f"(shuf_fa),
362      [tmp]"=&r"(tmp)
363   );
364 
365   // Output one pixel each iteration, calculating all channels (RGBA) together.
366   for (int out_x = 0; out_x < num_values; out_x++) {
367     const ConvolutionFilter1D::Fixed* filter_values =
368         filter.FilterForValue(out_x, &filter_offset, &filter_length);
369     double accum0h, accum0l, accum1h, accum1l;
370     double accum2h, accum2l, accum3h, accum3l;
371 
372     // four pixels in a column per iteration.
373     asm volatile (
374       ".set push \n\t"
375       ".set arch=loongson3a \n\t"
376       _mm_xor(accum0, accum0, accum0)
377       _mm_xor(accum1, accum1, accum1)
378       _mm_xor(accum2, accum2, accum2)
379       _mm_xor(accum3, accum3, accum3)
380       ".set pop \n\t"
381       :[accum0h]"=f"(accum0h), [accum0l]"=f"(accum0l),
382        [accum1h]"=f"(accum1h), [accum1l]"=f"(accum1l),
383        [accum2h]"=f"(accum2h), [accum2l]"=f"(accum2l),
384        [accum3h]"=f"(accum3h), [accum3l]"=f"(accum3l)
385     );
386 
387     int start = (filter_offset<<2);
388     // We will load and accumulate with four coefficients per iteration.
389     for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
390       double src8h, src8l, src16h, src16l;
391       double mul_hih, mul_hil, mul_loh, mul_lol, th, tl;
392       double coeffh, coeffl, coeff16loh, coeff16lol, coeff16hih, coeff16hil;
393 
394       asm volatile (
395         ".set push \n\t"
396         ".set arch=loongson3a \n\t"
397         // [16] xx xx xx xx c3 c2 c1 c0
398         "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
399         "gsldrc1 %[coeffl], (%[fval]) \n\t"
400         "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
401         // [16] xx xx xx xx c1 c1 c0 c0
402         _mm_pshuflh(coeff16lo, coeff, shuf_50)
403         // [16] c1 c1 c1 c1 c0 c0 c0 c0
404         _mm_punpcklhw(coeff16lo, coeff16lo, coeff16lo)
405         // [16] xx xx xx xx c3 c3 c2 c2
406         _mm_pshuflh(coeff16hi, coeff, shuf_fa)
407         // [16] c3 c3 c3 c3 c2 c2 c2 c2
408         _mm_punpcklhw(coeff16hi, coeff16hi, coeff16hi)
409         ".set pop \n\t"
410         :[coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
411          [coeff16loh]"=&f"(coeff16loh), [coeff16lol]"=&f"(coeff16lol),
412          [coeff16hih]"=&f"(coeff16hih), [coeff16hil]"=&f"(coeff16hil)
413         :[fval]"r"(filter_values), [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa)
414       );
415 
416 #define ITERATION(_src, _accumh, _accuml)                              \
417       asm volatile (                                                   \
418         ".set push \n\t"                                               \
419         ".set arch=loongson3a \n\t"                                    \
420         "gsldlc1 %[src8h], 0xf(%[src]) \n\t"                           \
421         "gsldrc1 %[src8h], 0x8(%[src]) \n\t"                           \
422         "gsldlc1 %[src8l], 0x7(%[src]) \n\t"                           \
423         "gsldrc1 %[src8l], 0x0(%[src]) \n\t"                           \
424         _mm_punpcklbh(src16, src8, zero)                               \
425         _mm_pmulhh(mul_hi, src16, coeff16lo)                           \
426         _mm_pmullh(mul_lo, src16, coeff16lo)                           \
427         _mm_punpcklhw(t, mul_lo, mul_hi)                               \
428         _mm_paddw(accum, accum, t)                                     \
429         _mm_punpckhhw(t, mul_lo, mul_hi)                               \
430         _mm_paddw(accum, accum, t)                                     \
431         _mm_punpckhbh(src16, src8, zero)                               \
432         _mm_pmulhh(mul_hi, src16, coeff16hi)                           \
433         _mm_pmullh(mul_lo, src16, coeff16hi)                           \
434         _mm_punpcklhw(t, mul_lo, mul_hi)                               \
435         _mm_paddw(accum, accum, t)                                     \
436         _mm_punpckhhw(t, mul_lo, mul_hi)                               \
437         _mm_paddw(accum, accum, t)                                     \
438         ".set pop \n\t"                                                \
439         :[th]"=&f"(th), [tl]"=&f"(tl),                                 \
440          [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),                     \
441          [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),                 \
442          [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),             \
443          [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol),             \
444          [accumh]"+f"(_accumh), [accuml]"+f"(_accuml)                  \
445         :[zeroh]"f"(zero), [zerol]"f"(zero), [src]"r"(_src),           \
446          [coeff16loh]"f"(coeff16loh), [coeff16lol]"f"(coeff16lol),     \
447          [coeff16hih]"f"(coeff16hih), [coeff16hil]"f"(coeff16hil)      \
448       );
449 
450       ITERATION(src_data[0] + start, accum0h, accum0l);
451       ITERATION(src_data[1] + start, accum1h, accum1l);
452       ITERATION(src_data[2] + start, accum2h, accum2l);
453       ITERATION(src_data[3] + start, accum3h, accum3l);
454 
455       start += 16;
456       filter_values += 4;
457     }
458 
459     int r = filter_length & 3;
460     if (r) {
461       double src8h, src8l, src16h, src16l;
462       double mul_hih, mul_hil, mul_loh, mul_lol, th, tl;
463       double coeffh, coeffl, coeff16loh, coeff16lol, coeff16hih, coeff16hil;
464 
465       asm volatile (
466         ".set push \n\t"
467         ".set arch=loongson3a \n\t"
468         "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
469         "gsldrc1 %[coeffl], (%[fval]) \n\t"
470         "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
471         // Mask out extra filter taps.
472         "and %[coeffl], %[coeffl], %[mask] \n\t"
473         _mm_pshuflh(coeff16lo, coeff, shuf_50)
474         /* c1 c1 c1 c1 c0 c0 c0 c0 */
475         _mm_punpcklhw(coeff16lo, coeff16lo, coeff16lo)
476         _mm_pshuflh(coeff16hi, coeff, shuf_fa)
477         _mm_punpcklhw(coeff16hi, coeff16hi, coeff16hi)
478         ".set pop \n\t"
479         :[coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
480          [coeff16loh]"=&f"(coeff16loh), [coeff16lol]"=&f"(coeff16lol),
481          [coeff16hih]"=&f"(coeff16hih), [coeff16hil]"=&f"(coeff16hil)
482         :[fval]"r"(filter_values), [mask]"f"(mask[r]),
483          [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa)
484       );
485 
486       ITERATION(src_data[0] + start, accum0h, accum0l);
487       ITERATION(src_data[1] + start, accum1h, accum1l);
488       ITERATION(src_data[2] + start, accum2h, accum2l);
489       ITERATION(src_data[3] + start, accum3h, accum3l);
490     }
491 
492     double t, sra;
493     asm volatile (
494       ".set push \n\t"
495       ".set arch=loongson3a \n\t"
496       "ori %[tmp], $0, %[sk_sra] \n\t"
497       "mtc1 %[tmp], %[sra] \n\t"
498       _mm_psraw(accum0, accum0, sra)
499       _mm_packsswh(accum0, accum0, zero, t)
500       _mm_packushb(accum0, accum0, zero, t)
501       _mm_psraw(accum1, accum1, sra)
502       _mm_packsswh(accum1, accum1, zero, t)
503       _mm_packushb(accum1, accum1, zero, t)
504       _mm_psraw(accum2, accum2, sra)
505       _mm_packsswh(accum2, accum2, zero, t)
506       _mm_packushb(accum2, accum2, zero, t)
507       _mm_psraw(accum3, accum3, sra)
508       _mm_packsswh(accum3, accum3, zero, t)
509       _mm_packushb(accum3, accum3, zero, t)
510       "swc1 %[accum0l], (%[out_row0]) \n\t"
511       "swc1 %[accum1l], (%[out_row1]) \n\t"
512       "swc1 %[accum2l], (%[out_row2]) \n\t"
513       "swc1 %[accum3l], (%[out_row3]) \n\t"
514       ".set pop \n\t"
515       :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
516        [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
517        [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
518        [accum3h]"+f"(accum3h), [accum3l]"+f"(accum3l),
519        [sra]"=&f"(sra), [t]"=&f"(t), [tmp]"=&r"(tmp)
520       :[zeroh]"f"(zero), [zerol]"f"(zero),
521        [out_row0]"r"(out_row[0]), [out_row1]"r"(out_row[1]),
522        [out_row2]"r"(out_row[2]), [out_row3]"r"(out_row[3]),
523        [sk_sra]"i"(ConvolutionFilter1D::kShiftBits)
524       :"memory"
525     );
526 
527     out_row[0] += 4;
528     out_row[1] += 4;
529     out_row[2] += 4;
530     out_row[3] += 4;
531   }
532 }
533 
534 // Does vertical convolution to produce one output row. The filter values and
535 // length are given in the first two parameters. These are applied to each
536 // of the rows pointed to in the |source_data_rows| array, with each row
537 // being |pixel_width| wide.
538 //
539 // The output must have room for |pixel_width * 4| bytes.
540 template<bool has_alpha>
ConvolveVertically_LS3_impl(const ConvolutionFilter1D::Fixed * filter_values,int filter_length,unsigned char * const * source_data_rows,int pixel_width,unsigned char * out_row)541 void ConvolveVertically_LS3_impl(const ConvolutionFilter1D::Fixed* filter_values,
542                                   int filter_length,
543                                   unsigned char* const* source_data_rows,
544                                   int pixel_width,
545                                   unsigned char* out_row) {
546   uint64_t tmp;
547   int width = pixel_width & ~3;
548   double zero, sra, coeff16h, coeff16l;
549   double accum0h, accum0l, accum1h, accum1l;
550   double accum2h, accum2l, accum3h, accum3l;
551   const void *src;
552   int out_x;
553 
554   asm volatile (
555     ".set push \n\t"
556     ".set arch=loongson3a \n\t"
557     "xor %[zero], %[zero], %[zero] \n\t"
558     "ori %[tmp], $0, %[sk_sra] \n\t"
559     "mtc1 %[tmp], %[sra] \n\t"
560     ".set pop \n\t"
561     :[zero]"=f"(zero), [sra]"=f"(sra), [tmp]"=&r"(tmp)
562     :[sk_sra]"i"(ConvolutionFilter1D::kShiftBits)
563   );
564 
565   // Output four pixels per iteration (16 bytes).
566   for (out_x = 0; out_x < width; out_x += 4) {
567     // Accumulated result for each pixel. 32 bits per RGBA channel.
568     asm volatile (
569       ".set push \n\t"
570       ".set arch=loongson3a \n\t"
571       _mm_xor(accum0, accum0, accum0)
572       _mm_xor(accum1, accum1, accum1)
573       _mm_xor(accum2, accum2, accum2)
574       _mm_xor(accum3, accum3, accum3)
575       ".set pop \n\t"
576       :[accum0h]"=f"(accum0h), [accum0l]"=f"(accum0l),
577        [accum1h]"=f"(accum1h), [accum1l]"=f"(accum1l),
578        [accum2h]"=f"(accum2h), [accum2l]"=f"(accum2l),
579        [accum3h]"=f"(accum3h), [accum3l]"=f"(accum3l)
580     );
581 
582     // Convolve with one filter coefficient per iteration.
583     for (int filter_y = 0; filter_y < filter_length; filter_y++) {
584       double src8h, src8l, src16h, src16l;
585       double mul_hih, mul_hil, mul_loh, mul_lol, th, tl;
586 
587       src = reinterpret_cast<const void*>(
588           &source_data_rows[filter_y][out_x << 2]);
589 
590       asm volatile (
591         ".set push \n\t"
592         ".set arch=loongson3a \n\t"
593         // Duplicate the filter coefficient 8 times.
594         // [16] cj cj cj cj cj cj cj cj
595         "gsldlc1 %[coeff16l], 7+%[fval] \n\t"
596         "gsldrc1 %[coeff16l], %[fval] \n\t"
597         "pshufh %[coeff16l], %[coeff16l], %[zerol] \n\t"
598         "mov.d %[coeff16h], %[coeff16l] \n\t"
599         // Load four pixels (16 bytes) together.
600         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
601         "gsldlc1 %[src8h], 0xf(%[src]) \n\t"
602         "gsldrc1 %[src8h], 0x8(%[src]) \n\t"
603         "gsldlc1 %[src8l], 0x7(%[src]) \n\t"
604         "gsldrc1 %[src8l], 0x0(%[src]) \n\t"
605         // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
606         // multiply with current coefficient => accumulate the result.
607         // [16] a1 b1 g1 r1 a0 b0 g0 r0
608         _mm_punpcklbh(src16, src8, zero)
609         _mm_pmulhh(mul_hi, src16, coeff16)
610         _mm_pmullh(mul_lo, src16, coeff16)
611         // [32] a0 b0 g0 r0
612         _mm_punpcklhw(t, mul_lo, mul_hi)
613         _mm_paddw(accum0, accum0, t)
614         // [32] a1 b1 g1 r1
615         _mm_punpckhhw(t, mul_lo, mul_hi)
616         _mm_paddw(accum1, accum1, t)
617         // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
618         // multiply with current coefficient => accumulate the result.
619         // [16] a3 b3 g3 r3 a2 b2 g2 r2
620         _mm_punpckhbh(src16, src8, zero)
621         _mm_pmulhh(mul_hi, src16, coeff16)
622         _mm_pmullh(mul_lo, src16, coeff16)
623         ".set pop \n\t"
624         :[th]"=&f"(th), [tl]"=&f"(tl),
625          [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
626          [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
627          [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
628          [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol),
629          [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
630          [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
631          [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l)
632         :[zeroh]"f"(zero), [zerol]"f"(zero),
633          [fval]"m"(filter_values[filter_y]),
634          [src]"r"(src)
635       );
636 
637       asm volatile (
638         ".set push \n\t"
639         ".set arch=loongson3a \n\t"
640         // [32] a2 b2 g2 r2
641         _mm_punpcklhw(t, mul_lo, mul_hi)
642         _mm_paddw(accum2, accum2, t)
643         // [32] a3 b3 g3 r3
644         _mm_punpckhhw(t, mul_lo, mul_hi)
645         _mm_paddw(accum3, accum3, t)
646         ".set pop \n\t"
647         :[th]"=&f"(th), [tl]"=&f"(tl),
648          [mul_hih]"+f"(mul_hih), [mul_hil]"+f"(mul_hil),
649          [mul_loh]"+f"(mul_loh), [mul_lol]"+f"(mul_lol),
650          [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
651          [accum3h]"+f"(accum3h), [accum3l]"+f"(accum3l)
652       );
653     }
654 
655     double t;
656     asm volatile (
657       ".set push \n\t"
658       ".set arch=loongson3a \n\t"
659       // Shift right for fixed point implementation.
660       _mm_psraw(accum0, accum0, sra)
661       _mm_psraw(accum1, accum1, sra)
662       _mm_psraw(accum2, accum2, sra)
663       _mm_psraw(accum3, accum3, sra)
664       // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
665       // [16] a1 b1 g1 r1 a0 b0 g0 r0
666       _mm_packsswh(accum0, accum0, accum1, t)
667       // [16] a3 b3 g3 r3 a2 b2 g2 r2
668       _mm_packsswh(accum2, accum2, accum3, t)
669       // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
670       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
671       _mm_packushb(accum0, accum0, accum2, t)
672       ".set pop \n\t"
673       :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
674        [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
675        [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
676        [accum3h]"+f"(accum3h), [accum3l]"+f"(accum3l),
677        [t]"=&f"(t)
678       :[sra]"f"(sra)
679     );
680 
681     if (has_alpha) {
682       double ah, al, bh, bl, srl8, srl16, sll24;
683 
684       asm volatile (
685         ".set push \n\t"
686         ".set arch=loongson3a \n\t"
687         "li %[tmp], 8 \n\t"
688         "mtc1 %[tmp], %[srl8] \n\t"
689         "li %[tmp], 16 \n\t"
690         "mtc1 %[tmp], %[srl16] \n\t"
691         "li %[tmp], 24 \n\t"
692         "mtc1 %[tmp], %[sll24] \n\t"
693         // Compute the max(ri, gi, bi) for each pixel.
694         // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
695         _mm_psraw(a, accum0, srl8)
696         // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
697         _mm_pmaxub(b, a, accum0) // Max of r and g.
698         // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
699         _mm_psrlw(a, accum0, srl16)
700         // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
701         _mm_pmaxub(b, a, b) // Max of r and g and b.
702         // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
703         _mm_psllw(b, b, sll24)
704         // Make sure the value of alpha channel is always larger than maximum
705         // value of color channels.
706         _mm_pmaxub(accum0, b, accum0)
707         ".set pop \n\t"
708         :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
709          [tmp]"=&r"(tmp), [ah]"=&f"(ah), [al]"=&f"(al),
710          [bh]"=&f"(bh), [bl]"=&f"(bl), [srl8]"=&f"(srl8),
711          [srl16]"=&f"(srl16), [sll24]"=&f"(sll24)
712       );
713     } else {
714       double maskh, maskl;
715 
716       asm volatile (
717         ".set push \n\t"
718         ".set arch=loongson3a \n\t"
719         // Set value of alpha channels to 0xFF.
720         "li %[tmp], 0xff000000 \n\t"
721         "mtc1 %[tmp], %[maskl] \n\t"
722         "punpcklwd %[maskl], %[maskl], %[maskl] \n\t"
723         "mov.d %[maskh], %[maskl] \n\t"
724         _mm_or(accum0, accum0, mask)
725       ".set pop \n\t"
726       :[maskh]"=&f"(maskh), [maskl]"=&f"(maskl),
727        [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
728        [tmp]"=&r"(tmp)
729       );
730     }
731 
732     // Store the convolution result (16 bytes) and advance the pixel pointers.
733     asm volatile (
734       ".set push \n\t"
735       ".set arch=loongson3a \n\t"
736       "gssdlc1 %[accum0h], 0xf(%[out_row]) \n\t"
737       "gssdrc1 %[accum0h], 0x8(%[out_row]) \n\t"
738       "gssdlc1 %[accum0l], 0x7(%[out_row]) \n\t"
739       "gssdrc1 %[accum0l], 0x0(%[out_row]) \n\t"
740       ".set pop \n\t"
741       ::[accum0h]"f"(accum0h), [accum0l]"f"(accum0l),
742         [out_row]"r"(out_row)
743       :"memory"
744     );
745     out_row += 16;
746   }
747 
748   // When the width of the output is not divisible by 4, We need to save one
749   // pixel (4 bytes) each time. And also the fourth pixel is always absent.
750   if (pixel_width & 3) {
751     asm volatile (
752       ".set push \n\t"
753       ".set arch=loongson3a \n\t"
754       _mm_xor(accum0, accum0, accum0)
755       _mm_xor(accum1, accum1, accum1)
756       _mm_xor(accum2, accum2, accum2)
757       ".set pop \n\t"
758       :[accum0h]"=&f"(accum0h), [accum0l]"=&f"(accum0l),
759        [accum1h]"=&f"(accum1h), [accum1l]"=&f"(accum1l),
760        [accum2h]"=&f"(accum2h), [accum2l]"=&f"(accum2l)
761     );
762     for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
763       double src8h, src8l, src16h, src16l;
764       double th, tl, mul_hih, mul_hil, mul_loh, mul_lol;
765       src = reinterpret_cast<const void*>(
766           &source_data_rows[filter_y][out_x<<2]);
767 
768       asm volatile (
769         ".set push \n\t"
770         ".set arch=loongson3a \n\t"
771         "gsldlc1 %[coeff16l], 7+%[fval] \n\t"
772         "gsldrc1 %[coeff16l], %[fval] \n\t"
773         "pshufh %[coeff16l], %[coeff16l], %[zerol] \n\t"
774         "mov.d %[coeff16h], %[coeff16l] \n\t"
775         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
776         "gsldlc1 %[src8h], 0xf(%[src]) \n\t"
777         "gsldrc1 %[src8h], 0x8(%[src]) \n\t"
778         "gsldlc1 %[src8l], 0x7(%[src]) \n\t"
779         "gsldrc1 %[src8l], 0x0(%[src]) \n\t"
780         // [16] a1 b1 g1 r1 a0 b0 g0 r0
781         _mm_punpcklbh(src16, src8, zero)
782         _mm_pmulhh(mul_hi, src16, coeff16)
783         _mm_pmullh(mul_lo, src16, coeff16)
784         // [32] a0 b0 g0 r0
785         _mm_punpcklhw(t, mul_lo, mul_hi)
786         _mm_paddw(accum0, accum0, t)
787         // [32] a1 b1 g1 r1
788         _mm_punpckhhw(t, mul_lo, mul_hi)
789         _mm_paddw(accum1, accum1, t)
790         // [16] a3 b3 g3 r3 a2 b2 g2 r2
791         _mm_punpckhbh(src16, src8, zero)
792         _mm_pmulhh(mul_hi, src16, coeff16)
793         _mm_pmullh(mul_lo, src16, coeff16)
794         // [32] a2 b2 g2 r2
795         _mm_punpcklhw(t, mul_lo, mul_hi)
796         _mm_paddw(accum2, accum2, t)
797         ".set pop \n\t"
798         :[th]"=&f"(th), [tl]"=&f"(tl),
799          [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
800          [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
801          [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
802          [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol),
803          [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
804          [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
805          [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
806          [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l)
807         :[zeroh]"f"(zero), [zerol]"f"(zero),
808          [fval]"m"(filter_values[filter_y]),
809          [src]"r"(src)
810       );
811     }
812 
813     double t;
814     asm volatile (
815       ".set push \n\t"
816       ".set arch=loongson3a \n\t"
817       _mm_psraw(accum0, accum0, sra)
818       _mm_psraw(accum1, accum1, sra)
819       _mm_psraw(accum2, accum2, sra)
820       // [16] a1 b1 g1 r1 a0 b0 g0 r0
821       _mm_packsswh(accum0, accum0, accum1, t)
822       // [16] a3 b3 g3 r3 a2 b2 g2 r2
823       _mm_packsswh(accum2, accum2, zero, t)
824       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
825       _mm_packushb(accum0, accum0, accum2, t)
826       ".set pop \n\t"
827       :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
828        [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
829        [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
830        [t]"=&f"(t)
831       :[zeroh]"f"(zero), [zerol]"f"(zero), [sra]"f"(sra)
832     );
833     if (has_alpha) {
834       double ah, al, bh, bl, srl8, srl16, sll24;
835 
836       asm volatile (
837         ".set push \n\t"
838         ".set arch=loongson3a \n\t"
839         "li %[tmp], 8 \n\t"
840         "mtc1 %[tmp], %[srl8] \n\t"
841         "li %[tmp], 16 \n\t"
842         "mtc1 %[tmp], %[srl16] \n\t"
843         "li %[tmp], 24 \n\t"
844         "mtc1 %[tmp], %[sll24] \n\t"
845         // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
846         _mm_psrlw(a, accum0, srl8)
847         // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
848         _mm_pmaxub(b, a, accum0) // Max of r and g.
849         // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
850         _mm_psrlw(a, accum0, srl16)
851         // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
852         _mm_pmaxub(b, a, b) // Max of r and g and b.
853         // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
854         _mm_psllw(b, b, sll24)
855         _mm_pmaxub(accum0, b, accum0)
856         ".set pop \n\t"
857         :[ah]"=&f"(ah), [al]"=&f"(al), [bh]"=&f"(bh), [bl]"=&f"(bl),
858          [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l), [tmp]"=&r"(tmp),
859          [srl8]"=&f"(srl8), [srl16]"=&f"(srl16), [sll24]"=&f"(sll24)
860       );
861     } else {
862       double maskh, maskl;
863 
864       asm volatile (
865         ".set push \n\t"
866         ".set arch=loongson3a \n\t"
867         // Set value of alpha channels to 0xFF.
868         "li %[tmp], 0xff000000 \n\t"
869         "mtc1 %[tmp], %[maskl] \n\t"
870         "punpcklwd %[maskl], %[maskl], %[maskl] \n\t"
871         "mov.d %[maskh], %[maskl] \n\t"
872         _mm_or(accum0, accum0, mask)
873         ".set pop \n\t"
874         :[maskh]"=&f"(maskh), [maskl]"=&f"(maskl),
875          [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
876          [tmp]"=&r"(tmp)
877       );
878     }
879 
880     double s4, s64;
881     asm volatile (
882       ".set push \n\t"
883       ".set arch=loongson3a \n\t"
884       "li %[tmp], 4 \n\t"
885       "mtc1 %[tmp], %[s4] \n\t"
886       "li %[tmp], 64 \n\t"
887       "mtc1 %[tmp], %[s64] \n\t"
888       ".set pop \n\t"
889       :[s4]"=f"(s4), [s64]"=f"(s64),
890        [tmp]"=&r"(tmp)
891     );
892     for (int out_x = width; out_x < pixel_width; out_x++) {
893       double t;
894 
895       asm volatile (
896         ".set push \n\t"
897         ".set arch=loongson3a \n\t"
898         "swc1 %[accum0l], (%[out_row]) \n\t"
899         _mm_psrlq(accum0, accum0, s4, s64, t)
900         ".set pop \n\t"
901         :[t]"=&f"(t),
902          [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l)
903         :[out_row]"r"(out_row), [s4]"f"(s4), [s64]"f"(s64)
904         :"memory"
905       );
906       out_row += 4;
907     }
908   }
909 }
910 
ConvolveVertically_LS3(const ConvolutionFilter1D::Fixed * filter_values,int filter_length,unsigned char * const * source_data_rows,int pixel_width,unsigned char * out_row,bool has_alpha)911 void ConvolveVertically_LS3(const ConvolutionFilter1D::Fixed* filter_values,
912                              int filter_length,
913                              unsigned char* const* source_data_rows,
914                              int pixel_width,
915                              unsigned char* out_row, bool has_alpha) {
916   if (has_alpha) {
917     ConvolveVertically_LS3_impl<true>(filter_values, filter_length,
918                                        source_data_rows, pixel_width, out_row);
919   } else {
920     ConvolveVertically_LS3_impl<false>(filter_values, filter_length,
921                                        source_data_rows, pixel_width, out_row);
922   }
923 }
924 
925 }  // namespace skia
926 
927 #endif /* _MIPS_ARCH_LOONGSON3A */
928