// Copyright (c) 2014-2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google, Inc. nor the names of its contributors
// may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.

#include "convolver.h"

#include <algorithm>
#include <cstdint>

#include "skia/include/core/SkTypes.h"
32
33 #if defined(_MIPS_ARCH_LOONGSON3A)
34
35 #include "MMIHelpers.h"
36
37 namespace skia {
38
39 // Convolves horizontally along a single row. The row data is given in
40 // |src_data| and continues for the num_values() of the filter.
ConvolveHorizontally_LS3(const unsigned char * src_data,const ConvolutionFilter1D & filter,unsigned char * out_row)41 void ConvolveHorizontally_LS3(const unsigned char* src_data,
42 const ConvolutionFilter1D& filter,
43 unsigned char* out_row) {
44 int num_values = filter.num_values();
45 int tmp, filter_offset, filter_length;
46 double zero, mask[4], shuf_50, shuf_fa;
47
48 asm volatile (
49 ".set push \n\t"
50 ".set arch=loongson3a \n\t"
51 "xor %[zero], %[zero], %[zero] \n\t"
52 // |mask| will be used to decimate all extra filter coefficients that are
53 // loaded by SIMD when |filter_length| is not divisible by 4.
54 // mask[0] is not used in following algorithm.
55 "li %[tmp], 1 \n\t"
56 "dsll32 %[tmp], 0x10 \n\t"
57 "daddiu %[tmp], -1 \n\t"
58 "dmtc1 %[tmp], %[mask3] \n\t"
59 "dsrl %[tmp], 0x10 \n\t"
60 "mtc1 %[tmp], %[mask2] \n\t"
61 "dsrl %[tmp], 0x10 \n\t"
62 "mtc1 %[tmp], %[mask1] \n\t"
63 "ori %[tmp], $0, 0x50 \n\t"
64 "mtc1 %[tmp], %[shuf_50] \n\t"
65 "ori %[tmp], $0, 0xfa \n\t"
66 "mtc1 %[tmp], %[shuf_fa] \n\t"
67 ".set pop \n\t"
68 :[zero]"=f"(zero), [mask1]"=f"(mask[1]),
69 [mask2]"=f"(mask[2]), [mask3]"=f"(mask[3]),
70 [shuf_50]"=f"(shuf_50), [shuf_fa]"=f"(shuf_fa),
71 [tmp]"=&r"(tmp)
72 );
73
74 // Output one pixel each iteration, calculating all channels (RGBA) together.
75 for (int out_x = 0; out_x < num_values; out_x++) {
76 const ConvolutionFilter1D::Fixed* filter_values =
77 filter.FilterForValue(out_x, &filter_offset, &filter_length);
78 double accumh, accuml;
79 // Compute the first pixel in this row that the filter affects. It will
80 // touch |filter_length| pixels (4 bytes each) after this.
81 const void *row_to_filter =
82 reinterpret_cast<const void*>(&src_data[filter_offset << 2]);
83
84 asm volatile (
85 ".set push \n\t"
86 ".set arch=loongson3a \n\t"
87 _mm_xor(accum, accum, accum)
88 ".set pop \n\t"
89 :[accumh]"=f"(accumh), [accuml]"=f"(accuml)
90 );
91
92 // We will load and accumulate with four coefficients per iteration.
93 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
94 double src16h, src16l, mul_hih, mul_hil, mul_loh, mul_lol;
95 double coeffh, coeffl, src8h, src8l, th, tl, coeff16h, coeff16l;
96
97 asm volatile (
98 ".set push \n\t"
99 ".set arch=loongson3a \n\t"
100 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
101 // [16] xx xx xx xx c3 c2 c1 c0
102 "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
103 "gsldrc1 %[coeffl], (%[fval]) \n\t"
104 "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
105 // [16] xx xx xx xx c1 c1 c0 c0
106 _mm_pshuflh(coeff16, coeff, shuf_50)
107 // [16] c1 c1 c1 c1 c0 c0 c0 c0
108 _mm_punpcklhw(coeff16, coeff16, coeff16)
109 // Load four pixels => unpack the first two pixels to 16 bits =>
110 // multiply with coefficients => accumulate the convolution result.
111 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
112 "gsldlc1 %[src8h], 0xf(%[rtf]) \n\t"
113 "gsldrc1 %[src8h], 0x8(%[rtf]) \n\t"
114 "gsldlc1 %[src8l], 0x7(%[rtf]) \n\t"
115 "gsldrc1 %[src8l], 0x0(%[rtf]) \n\t"
116 // [16] a1 b1 g1 r1 a0 b0 g0 r0
117 _mm_punpcklbh(src16, src8, zero)
118 _mm_pmulhh(mul_hi, src16, coeff16)
119 _mm_pmullh(mul_lo, src16, coeff16)
120 // [32] a0*c0 b0*c0 g0*c0 r0*c0
121 _mm_punpcklhw(t, mul_lo, mul_hi)
122 _mm_paddw(accum, accum, t)
123 // [32] a1*c1 b1*c1 g1*c1 r1*c1
124 _mm_punpckhhw(t, mul_lo, mul_hi)
125 _mm_paddw(accum, accum, t)
126 // Duplicate 3rd and 4th coefficients for all channels =>
127 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
128 // => accumulate the convolution results.
129 // [16] xx xx xx xx c3 c3 c2 c2
130 _mm_pshuflh(coeff16, coeff, shuf_fa)
131 // [16] c3 c3 c3 c3 c2 c2 c2 c2
132 _mm_punpcklhw(coeff16, coeff16, coeff16)
133 // [16] a3 g3 b3 r3 a2 g2 b2 r2
134 _mm_punpckhbh(src16, src8, zero)
135 _mm_pmulhh(mul_hi, src16, coeff16)
136 _mm_pmullh(mul_lo, src16, coeff16)
137 // [32] a2*c2 b2*c2 g2*c2 r2*c2
138 _mm_punpcklhw(t, mul_lo, mul_hi)
139 _mm_paddw(accum, accum, t)
140 // [32] a3*c3 b3*c3 g3*c3 r3*c3
141 _mm_punpckhhw(t, mul_lo, mul_hi)
142 _mm_paddw(accum, accum, t)
143 ".set pop \n\t"
144 :[th]"=&f"(th), [tl]"=&f"(tl),
145 [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
146 [accumh]"+f"(accumh), [accuml]"+f"(accuml),
147 [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
148 [coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
149 [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l),
150 [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
151 [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol)
152 :[zeroh]"f"(zero), [zerol]"f"(zero),
153 [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa),
154 [fval]"r"(filter_values), [rtf]"r"(row_to_filter)
155 );
156
157 // Advance the pixel and coefficients pointers.
158 row_to_filter += 16;
159 filter_values += 4;
160 }
161
162 // When |filter_length| is not divisible by 4, we need to decimate some of
163 // the filter coefficient that was loaded incorrectly to zero; Other than
164 // that the algorithm is same with above, except that the 4th pixel will be
165 // always absent.
166 int r = filter_length & 3;
167 if (r) {
168 double coeffh, coeffl, th, tl, coeff16h, coeff16l;
169 double src8h, src8l, src16h, src16l, mul_hih, mul_hil, mul_loh, mul_lol;
170
171 asm volatile (
172 ".set push \n\t"
173 ".set arch=loongson3a \n\t"
174 "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
175 "gsldrc1 %[coeffl], (%[fval]) \n\t"
176 "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
177 // Mask out extra filter taps.
178 "and %[coeffl], %[coeffl], %[mask] \n\t"
179 _mm_pshuflh(coeff16, coeff, shuf_50)
180 _mm_punpcklhw(coeff16, coeff16, coeff16)
181 "gsldlc1 %[src8h], 0xf(%[rtf]) \n\t"
182 "gsldrc1 %[src8h], 0x8(%[rtf]) \n\t"
183 "gsldlc1 %[src8l], 0x7(%[rtf]) \n\t"
184 "gsldrc1 %[src8l], 0x0(%[rtf]) \n\t"
185 _mm_punpcklbh(src16, src8, zero)
186 _mm_pmulhh(mul_hi, src16, coeff16)
187 _mm_pmullh(mul_lo, src16, coeff16)
188 _mm_punpcklhw(t, mul_lo, mul_hi)
189 _mm_paddw(accum, accum, t)
190 _mm_punpckhhw(t, mul_lo, mul_hi)
191 _mm_paddw(accum, accum, t)
192 _mm_punpckhbh(src16, src8, zero)
193 _mm_pshuflh(coeff16, coeff, shuf_fa)
194 _mm_punpcklhw(coeff16, coeff16, coeff16)
195 _mm_pmulhh(mul_hi, src16, coeff16)
196 _mm_pmullh(mul_lo, src16, coeff16)
197 _mm_punpcklhw(t, mul_lo, mul_hi)
198 _mm_paddw(accum, accum, t)
199 ".set pop \n\t"
200 :[th]"=&f"(th), [tl]"=&f"(tl),
201 [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
202 [accumh]"+f"(accumh), [accuml]"+f"(accuml),
203 [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
204 [coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
205 [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l),
206 [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
207 [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol)
208 :[fval]"r"(filter_values), [rtf]"r"(row_to_filter),
209 [zeroh]"f"(zero), [zerol]"f"(zero), [mask]"f"(mask[r]),
210 [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa)
211 );
212 }
213
214 double t, sra;
215 asm volatile (
216 ".set push \n\t"
217 ".set arch=loongson3a \n\t"
218 "ori %[tmp], $0, %[sk_sra] \n\t"
219 "mtc1 %[tmp], %[sra] \n\t"
220 // Shift right for fixed point implementation.
221 _mm_psraw(accum, accum, sra)
222 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
223 _mm_packsswh(accum, accum, zero, t)
224 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
225 _mm_packushb(accum, accum, zero, t)
226 // Store the pixel value of 32 bits.
227 "swc1 %[accuml], (%[out_row]) \n\t"
228 ".set pop \n\t"
229 :[sra]"=&f"(sra), [t]"=&f"(t), [tmp]"=&r"(tmp),
230 [accumh]"+f"(accumh), [accuml]"+f"(accuml)
231 :[sk_sra]"i"(ConvolutionFilter1D::kShiftBits),
232 [out_row]"r"(out_row), [zeroh]"f"(zero), [zerol]"f"(zero)
233 :"memory"
234 );
235
236 out_row += 4;
237 }
238 }
239
240 // Convolves horizontally along a single row. The row data is given in
241 // |src_data| and continues for the [begin, end) of the filter.
242 // Process one pixel at a time.
ConvolveHorizontally1_LS3(const unsigned char * src_data,const ConvolutionFilter1D & filter,unsigned char * out_row)243 void ConvolveHorizontally1_LS3(const unsigned char* src_data,
244 const ConvolutionFilter1D& filter,
245 unsigned char* out_row) {
246 int num_values = filter.num_values();
247 double zero;
248 double sra;
249
250 asm volatile (
251 ".set push \n"
252 ".set arch=loongson3a \n"
253 "xor %[zero], %[zero], %[zero] \n"
254 "mtc1 %[sk_sra], %[sra] \n"
255 ".set pop \n"
256 :[zero]"=&f"(zero), [sra]"=&f"(sra)
257 :[sk_sra]"r"(ConvolutionFilter1D::kShiftBits)
258 );
259 // Loop over each pixel on this row in the output image.
260 for (int out_x = 0; out_x < num_values; out_x++) {
261 // Get the filter that determines the current output pixel.
262 int filter_offset;
263 int filter_length;
264 const ConvolutionFilter1D::Fixed* filter_values =
265 filter.FilterForValue(out_x, &filter_offset, &filter_length);
266
267 // Compute the first pixel in this row that the filter affects. It will
268 // touch |filter_length| pixels (4 bytes each) after this.
269 const unsigned char* row_to_filter = &src_data[filter_offset * 4];
270
271 // Apply the filter to the row to get the destination pixel in |accum|.
272 double accuml;
273 double accumh;
274 asm volatile (
275 ".set push \n"
276 ".set arch=loongson3a \n"
277 "xor %[accuml], %[accuml], %[accuml] \n"
278 "xor %[accumh], %[accumh], %[accumh] \n"
279 ".set pop \n"
280 :[accuml]"=&f"(accuml), [accumh]"=&f"(accumh)
281 );
282 for (int filter_x = 0; filter_x < filter_length; filter_x++) {
283 double src8;
284 double src16;
285 double coeff;
286 double coeff16;
287 asm volatile (
288 ".set push \n"
289 ".set arch=loongson3a \n"
290 "lwc1 %[src8], %[rtf] \n"
291 "mtc1 %[fv], %[coeff] \n"
292 "pshufh %[coeff16], %[coeff], %[zero] \n"
293 "punpcklbh %[src16], %[src8], %[zero] \n"
294 "pmullh %[src8], %[src16], %[coeff16] \n"
295 "pmulhh %[coeff], %[src16], %[coeff16] \n"
296 "punpcklhw %[src16], %[src8], %[coeff] \n"
297 "punpckhhw %[coeff16], %[src8], %[coeff] \n"
298 "paddw %[accuml], %[accuml], %[src16] \n"
299 "paddw %[accumh], %[accumh], %[coeff16] \n"
300 ".set pop \n"
301 :[accuml]"+f"(accuml), [accumh]"+f"(accumh),
302 [src8]"=&f"(src8), [src16]"=&f"(src16),
303 [coeff]"=&f"(coeff), [coeff16]"=&f"(coeff16)
304 :[rtf]"m"(row_to_filter[filter_x * 4]),
305 [fv]"r"(filter_values[filter_x]), [zero]"f"(zero)
306 );
307 }
308
309 asm volatile (
310 ".set push \n"
311 ".set arch=loongson3a \n"
312 // Bring this value back in range. All of the filter scaling factors
313 // are in fixed point with kShiftBits bits of fractional part.
314 "psraw %[accuml], %[accuml], %[sra] \n"
315 "psraw %[accumh], %[accumh], %[sra] \n"
316 // Store the new pixel.
317 "packsswh %[accuml], %[accuml], %[accumh] \n"
318 "packushb %[accuml], %[accuml], %[zero] \n"
319 "swc1 %[accuml], %[out_row] \n"
320 ".set pop \n"
321 :[accuml]"+f"(accuml), [accumh]"+f"(accumh)
322 :[sra]"f"(sra), [zero]"f"(zero), [out_row]"m"(out_row[out_x * 4])
323 :"memory"
324 );
325 }
326 }
327
328 // Convolves horizontally along four rows. The row data is given in
329 // |src_data| and continues for the num_values() of the filter.
330 // The algorithm is almost same as |ConvolveHorizontally_LS3|. Please
331 // refer to that function for detailed comments.
ConvolveHorizontally4_LS3(const unsigned char * src_data[4],const ConvolutionFilter1D & filter,unsigned char * out_row[4])332 void ConvolveHorizontally4_LS3(const unsigned char* src_data[4],
333 const ConvolutionFilter1D& filter,
334 unsigned char* out_row[4]) {
335 int num_values = filter.num_values();
336 int tmp, filter_offset, filter_length;
337 double zero, mask[4], shuf_50, shuf_fa;
338
339 asm volatile (
340 ".set push \n\t"
341 ".set arch=loongson3a \n\t"
342 "xor %[zero], %[zero], %[zero] \n\t"
343 // |mask| will be used to decimate all extra filter coefficients that are
344 // loaded by SIMD when |filter_length| is not divisible by 4.
345 // mask[0] is not used in following algorithm.
346 "li %[tmp], 1 \n\t"
347 "dsll32 %[tmp], 0x10 \n\t"
348 "daddiu %[tmp], -1 \n\t"
349 "dmtc1 %[tmp], %[mask3] \n\t"
350 "dsrl %[tmp], 0x10 \n\t"
351 "mtc1 %[tmp], %[mask2] \n\t"
352 "dsrl %[tmp], 0x10 \n\t"
353 "mtc1 %[tmp], %[mask1] \n\t"
354 "ori %[tmp], $0, 0x50 \n\t"
355 "mtc1 %[tmp], %[shuf_50] \n\t"
356 "ori %[tmp], $0, 0xfa \n\t"
357 "mtc1 %[tmp], %[shuf_fa] \n\t"
358 ".set pop \n\t"
359 :[zero]"=f"(zero), [mask1]"=f"(mask[1]),
360 [mask2]"=f"(mask[2]), [mask3]"=f"(mask[3]),
361 [shuf_50]"=f"(shuf_50), [shuf_fa]"=f"(shuf_fa),
362 [tmp]"=&r"(tmp)
363 );
364
365 // Output one pixel each iteration, calculating all channels (RGBA) together.
366 for (int out_x = 0; out_x < num_values; out_x++) {
367 const ConvolutionFilter1D::Fixed* filter_values =
368 filter.FilterForValue(out_x, &filter_offset, &filter_length);
369 double accum0h, accum0l, accum1h, accum1l;
370 double accum2h, accum2l, accum3h, accum3l;
371
372 // four pixels in a column per iteration.
373 asm volatile (
374 ".set push \n\t"
375 ".set arch=loongson3a \n\t"
376 _mm_xor(accum0, accum0, accum0)
377 _mm_xor(accum1, accum1, accum1)
378 _mm_xor(accum2, accum2, accum2)
379 _mm_xor(accum3, accum3, accum3)
380 ".set pop \n\t"
381 :[accum0h]"=f"(accum0h), [accum0l]"=f"(accum0l),
382 [accum1h]"=f"(accum1h), [accum1l]"=f"(accum1l),
383 [accum2h]"=f"(accum2h), [accum2l]"=f"(accum2l),
384 [accum3h]"=f"(accum3h), [accum3l]"=f"(accum3l)
385 );
386
387 int start = (filter_offset<<2);
388 // We will load and accumulate with four coefficients per iteration.
389 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
390 double src8h, src8l, src16h, src16l;
391 double mul_hih, mul_hil, mul_loh, mul_lol, th, tl;
392 double coeffh, coeffl, coeff16loh, coeff16lol, coeff16hih, coeff16hil;
393
394 asm volatile (
395 ".set push \n\t"
396 ".set arch=loongson3a \n\t"
397 // [16] xx xx xx xx c3 c2 c1 c0
398 "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
399 "gsldrc1 %[coeffl], (%[fval]) \n\t"
400 "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
401 // [16] xx xx xx xx c1 c1 c0 c0
402 _mm_pshuflh(coeff16lo, coeff, shuf_50)
403 // [16] c1 c1 c1 c1 c0 c0 c0 c0
404 _mm_punpcklhw(coeff16lo, coeff16lo, coeff16lo)
405 // [16] xx xx xx xx c3 c3 c2 c2
406 _mm_pshuflh(coeff16hi, coeff, shuf_fa)
407 // [16] c3 c3 c3 c3 c2 c2 c2 c2
408 _mm_punpcklhw(coeff16hi, coeff16hi, coeff16hi)
409 ".set pop \n\t"
410 :[coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
411 [coeff16loh]"=&f"(coeff16loh), [coeff16lol]"=&f"(coeff16lol),
412 [coeff16hih]"=&f"(coeff16hih), [coeff16hil]"=&f"(coeff16hil)
413 :[fval]"r"(filter_values), [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa)
414 );
415
416 #define ITERATION(_src, _accumh, _accuml) \
417 asm volatile ( \
418 ".set push \n\t" \
419 ".set arch=loongson3a \n\t" \
420 "gsldlc1 %[src8h], 0xf(%[src]) \n\t" \
421 "gsldrc1 %[src8h], 0x8(%[src]) \n\t" \
422 "gsldlc1 %[src8l], 0x7(%[src]) \n\t" \
423 "gsldrc1 %[src8l], 0x0(%[src]) \n\t" \
424 _mm_punpcklbh(src16, src8, zero) \
425 _mm_pmulhh(mul_hi, src16, coeff16lo) \
426 _mm_pmullh(mul_lo, src16, coeff16lo) \
427 _mm_punpcklhw(t, mul_lo, mul_hi) \
428 _mm_paddw(accum, accum, t) \
429 _mm_punpckhhw(t, mul_lo, mul_hi) \
430 _mm_paddw(accum, accum, t) \
431 _mm_punpckhbh(src16, src8, zero) \
432 _mm_pmulhh(mul_hi, src16, coeff16hi) \
433 _mm_pmullh(mul_lo, src16, coeff16hi) \
434 _mm_punpcklhw(t, mul_lo, mul_hi) \
435 _mm_paddw(accum, accum, t) \
436 _mm_punpckhhw(t, mul_lo, mul_hi) \
437 _mm_paddw(accum, accum, t) \
438 ".set pop \n\t" \
439 :[th]"=&f"(th), [tl]"=&f"(tl), \
440 [src8h]"=&f"(src8h), [src8l]"=&f"(src8l), \
441 [src16h]"=&f"(src16h), [src16l]"=&f"(src16l), \
442 [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil), \
443 [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol), \
444 [accumh]"+f"(_accumh), [accuml]"+f"(_accuml) \
445 :[zeroh]"f"(zero), [zerol]"f"(zero), [src]"r"(_src), \
446 [coeff16loh]"f"(coeff16loh), [coeff16lol]"f"(coeff16lol), \
447 [coeff16hih]"f"(coeff16hih), [coeff16hil]"f"(coeff16hil) \
448 );
449
450 ITERATION(src_data[0] + start, accum0h, accum0l);
451 ITERATION(src_data[1] + start, accum1h, accum1l);
452 ITERATION(src_data[2] + start, accum2h, accum2l);
453 ITERATION(src_data[3] + start, accum3h, accum3l);
454
455 start += 16;
456 filter_values += 4;
457 }
458
459 int r = filter_length & 3;
460 if (r) {
461 double src8h, src8l, src16h, src16l;
462 double mul_hih, mul_hil, mul_loh, mul_lol, th, tl;
463 double coeffh, coeffl, coeff16loh, coeff16lol, coeff16hih, coeff16hil;
464
465 asm volatile (
466 ".set push \n\t"
467 ".set arch=loongson3a \n\t"
468 "gsldlc1 %[coeffl], 7(%[fval]) \n\t"
469 "gsldrc1 %[coeffl], (%[fval]) \n\t"
470 "xor %[coeffh], %[coeffh], %[coeffh] \n\t"
471 // Mask out extra filter taps.
472 "and %[coeffl], %[coeffl], %[mask] \n\t"
473 _mm_pshuflh(coeff16lo, coeff, shuf_50)
474 /* c1 c1 c1 c1 c0 c0 c0 c0 */
475 _mm_punpcklhw(coeff16lo, coeff16lo, coeff16lo)
476 _mm_pshuflh(coeff16hi, coeff, shuf_fa)
477 _mm_punpcklhw(coeff16hi, coeff16hi, coeff16hi)
478 ".set pop \n\t"
479 :[coeffh]"=&f"(coeffh), [coeffl]"=&f"(coeffl),
480 [coeff16loh]"=&f"(coeff16loh), [coeff16lol]"=&f"(coeff16lol),
481 [coeff16hih]"=&f"(coeff16hih), [coeff16hil]"=&f"(coeff16hil)
482 :[fval]"r"(filter_values), [mask]"f"(mask[r]),
483 [shuf_50]"f"(shuf_50), [shuf_fa]"f"(shuf_fa)
484 );
485
486 ITERATION(src_data[0] + start, accum0h, accum0l);
487 ITERATION(src_data[1] + start, accum1h, accum1l);
488 ITERATION(src_data[2] + start, accum2h, accum2l);
489 ITERATION(src_data[3] + start, accum3h, accum3l);
490 }
491
492 double t, sra;
493 asm volatile (
494 ".set push \n\t"
495 ".set arch=loongson3a \n\t"
496 "ori %[tmp], $0, %[sk_sra] \n\t"
497 "mtc1 %[tmp], %[sra] \n\t"
498 _mm_psraw(accum0, accum0, sra)
499 _mm_packsswh(accum0, accum0, zero, t)
500 _mm_packushb(accum0, accum0, zero, t)
501 _mm_psraw(accum1, accum1, sra)
502 _mm_packsswh(accum1, accum1, zero, t)
503 _mm_packushb(accum1, accum1, zero, t)
504 _mm_psraw(accum2, accum2, sra)
505 _mm_packsswh(accum2, accum2, zero, t)
506 _mm_packushb(accum2, accum2, zero, t)
507 _mm_psraw(accum3, accum3, sra)
508 _mm_packsswh(accum3, accum3, zero, t)
509 _mm_packushb(accum3, accum3, zero, t)
510 "swc1 %[accum0l], (%[out_row0]) \n\t"
511 "swc1 %[accum1l], (%[out_row1]) \n\t"
512 "swc1 %[accum2l], (%[out_row2]) \n\t"
513 "swc1 %[accum3l], (%[out_row3]) \n\t"
514 ".set pop \n\t"
515 :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
516 [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
517 [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
518 [accum3h]"+f"(accum3h), [accum3l]"+f"(accum3l),
519 [sra]"=&f"(sra), [t]"=&f"(t), [tmp]"=&r"(tmp)
520 :[zeroh]"f"(zero), [zerol]"f"(zero),
521 [out_row0]"r"(out_row[0]), [out_row1]"r"(out_row[1]),
522 [out_row2]"r"(out_row[2]), [out_row3]"r"(out_row[3]),
523 [sk_sra]"i"(ConvolutionFilter1D::kShiftBits)
524 :"memory"
525 );
526
527 out_row[0] += 4;
528 out_row[1] += 4;
529 out_row[2] += 4;
530 out_row[3] += 4;
531 }
532 }
533
534 // Does vertical convolution to produce one output row. The filter values and
535 // length are given in the first two parameters. These are applied to each
536 // of the rows pointed to in the |source_data_rows| array, with each row
537 // being |pixel_width| wide.
538 //
539 // The output must have room for |pixel_width * 4| bytes.
540 template<bool has_alpha>
ConvolveVertically_LS3_impl(const ConvolutionFilter1D::Fixed * filter_values,int filter_length,unsigned char * const * source_data_rows,int pixel_width,unsigned char * out_row)541 void ConvolveVertically_LS3_impl(const ConvolutionFilter1D::Fixed* filter_values,
542 int filter_length,
543 unsigned char* const* source_data_rows,
544 int pixel_width,
545 unsigned char* out_row) {
546 uint64_t tmp;
547 int width = pixel_width & ~3;
548 double zero, sra, coeff16h, coeff16l;
549 double accum0h, accum0l, accum1h, accum1l;
550 double accum2h, accum2l, accum3h, accum3l;
551 const void *src;
552 int out_x;
553
554 asm volatile (
555 ".set push \n\t"
556 ".set arch=loongson3a \n\t"
557 "xor %[zero], %[zero], %[zero] \n\t"
558 "ori %[tmp], $0, %[sk_sra] \n\t"
559 "mtc1 %[tmp], %[sra] \n\t"
560 ".set pop \n\t"
561 :[zero]"=f"(zero), [sra]"=f"(sra), [tmp]"=&r"(tmp)
562 :[sk_sra]"i"(ConvolutionFilter1D::kShiftBits)
563 );
564
565 // Output four pixels per iteration (16 bytes).
566 for (out_x = 0; out_x < width; out_x += 4) {
567 // Accumulated result for each pixel. 32 bits per RGBA channel.
568 asm volatile (
569 ".set push \n\t"
570 ".set arch=loongson3a \n\t"
571 _mm_xor(accum0, accum0, accum0)
572 _mm_xor(accum1, accum1, accum1)
573 _mm_xor(accum2, accum2, accum2)
574 _mm_xor(accum3, accum3, accum3)
575 ".set pop \n\t"
576 :[accum0h]"=f"(accum0h), [accum0l]"=f"(accum0l),
577 [accum1h]"=f"(accum1h), [accum1l]"=f"(accum1l),
578 [accum2h]"=f"(accum2h), [accum2l]"=f"(accum2l),
579 [accum3h]"=f"(accum3h), [accum3l]"=f"(accum3l)
580 );
581
582 // Convolve with one filter coefficient per iteration.
583 for (int filter_y = 0; filter_y < filter_length; filter_y++) {
584 double src8h, src8l, src16h, src16l;
585 double mul_hih, mul_hil, mul_loh, mul_lol, th, tl;
586
587 src = reinterpret_cast<const void*>(
588 &source_data_rows[filter_y][out_x << 2]);
589
590 asm volatile (
591 ".set push \n\t"
592 ".set arch=loongson3a \n\t"
593 // Duplicate the filter coefficient 8 times.
594 // [16] cj cj cj cj cj cj cj cj
595 "gsldlc1 %[coeff16l], 7+%[fval] \n\t"
596 "gsldrc1 %[coeff16l], %[fval] \n\t"
597 "pshufh %[coeff16l], %[coeff16l], %[zerol] \n\t"
598 "mov.d %[coeff16h], %[coeff16l] \n\t"
599 // Load four pixels (16 bytes) together.
600 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
601 "gsldlc1 %[src8h], 0xf(%[src]) \n\t"
602 "gsldrc1 %[src8h], 0x8(%[src]) \n\t"
603 "gsldlc1 %[src8l], 0x7(%[src]) \n\t"
604 "gsldrc1 %[src8l], 0x0(%[src]) \n\t"
605 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
606 // multiply with current coefficient => accumulate the result.
607 // [16] a1 b1 g1 r1 a0 b0 g0 r0
608 _mm_punpcklbh(src16, src8, zero)
609 _mm_pmulhh(mul_hi, src16, coeff16)
610 _mm_pmullh(mul_lo, src16, coeff16)
611 // [32] a0 b0 g0 r0
612 _mm_punpcklhw(t, mul_lo, mul_hi)
613 _mm_paddw(accum0, accum0, t)
614 // [32] a1 b1 g1 r1
615 _mm_punpckhhw(t, mul_lo, mul_hi)
616 _mm_paddw(accum1, accum1, t)
617 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
618 // multiply with current coefficient => accumulate the result.
619 // [16] a3 b3 g3 r3 a2 b2 g2 r2
620 _mm_punpckhbh(src16, src8, zero)
621 _mm_pmulhh(mul_hi, src16, coeff16)
622 _mm_pmullh(mul_lo, src16, coeff16)
623 ".set pop \n\t"
624 :[th]"=&f"(th), [tl]"=&f"(tl),
625 [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
626 [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
627 [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
628 [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol),
629 [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
630 [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
631 [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l)
632 :[zeroh]"f"(zero), [zerol]"f"(zero),
633 [fval]"m"(filter_values[filter_y]),
634 [src]"r"(src)
635 );
636
637 asm volatile (
638 ".set push \n\t"
639 ".set arch=loongson3a \n\t"
640 // [32] a2 b2 g2 r2
641 _mm_punpcklhw(t, mul_lo, mul_hi)
642 _mm_paddw(accum2, accum2, t)
643 // [32] a3 b3 g3 r3
644 _mm_punpckhhw(t, mul_lo, mul_hi)
645 _mm_paddw(accum3, accum3, t)
646 ".set pop \n\t"
647 :[th]"=&f"(th), [tl]"=&f"(tl),
648 [mul_hih]"+f"(mul_hih), [mul_hil]"+f"(mul_hil),
649 [mul_loh]"+f"(mul_loh), [mul_lol]"+f"(mul_lol),
650 [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
651 [accum3h]"+f"(accum3h), [accum3l]"+f"(accum3l)
652 );
653 }
654
655 double t;
656 asm volatile (
657 ".set push \n\t"
658 ".set arch=loongson3a \n\t"
659 // Shift right for fixed point implementation.
660 _mm_psraw(accum0, accum0, sra)
661 _mm_psraw(accum1, accum1, sra)
662 _mm_psraw(accum2, accum2, sra)
663 _mm_psraw(accum3, accum3, sra)
664 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
665 // [16] a1 b1 g1 r1 a0 b0 g0 r0
666 _mm_packsswh(accum0, accum0, accum1, t)
667 // [16] a3 b3 g3 r3 a2 b2 g2 r2
668 _mm_packsswh(accum2, accum2, accum3, t)
669 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
670 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
671 _mm_packushb(accum0, accum0, accum2, t)
672 ".set pop \n\t"
673 :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
674 [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
675 [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
676 [accum3h]"+f"(accum3h), [accum3l]"+f"(accum3l),
677 [t]"=&f"(t)
678 :[sra]"f"(sra)
679 );
680
681 if (has_alpha) {
682 double ah, al, bh, bl, srl8, srl16, sll24;
683
684 asm volatile (
685 ".set push \n\t"
686 ".set arch=loongson3a \n\t"
687 "li %[tmp], 8 \n\t"
688 "mtc1 %[tmp], %[srl8] \n\t"
689 "li %[tmp], 16 \n\t"
690 "mtc1 %[tmp], %[srl16] \n\t"
691 "li %[tmp], 24 \n\t"
692 "mtc1 %[tmp], %[sll24] \n\t"
693 // Compute the max(ri, gi, bi) for each pixel.
694 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
695 _mm_psraw(a, accum0, srl8)
696 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
697 _mm_pmaxub(b, a, accum0) // Max of r and g.
698 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
699 _mm_psrlw(a, accum0, srl16)
700 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
701 _mm_pmaxub(b, a, b) // Max of r and g and b.
702 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
703 _mm_psllw(b, b, sll24)
704 // Make sure the value of alpha channel is always larger than maximum
705 // value of color channels.
706 _mm_pmaxub(accum0, b, accum0)
707 ".set pop \n\t"
708 :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
709 [tmp]"=&r"(tmp), [ah]"=&f"(ah), [al]"=&f"(al),
710 [bh]"=&f"(bh), [bl]"=&f"(bl), [srl8]"=&f"(srl8),
711 [srl16]"=&f"(srl16), [sll24]"=&f"(sll24)
712 );
713 } else {
714 double maskh, maskl;
715
716 asm volatile (
717 ".set push \n\t"
718 ".set arch=loongson3a \n\t"
719 // Set value of alpha channels to 0xFF.
720 "li %[tmp], 0xff000000 \n\t"
721 "mtc1 %[tmp], %[maskl] \n\t"
722 "punpcklwd %[maskl], %[maskl], %[maskl] \n\t"
723 "mov.d %[maskh], %[maskl] \n\t"
724 _mm_or(accum0, accum0, mask)
725 ".set pop \n\t"
726 :[maskh]"=&f"(maskh), [maskl]"=&f"(maskl),
727 [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
728 [tmp]"=&r"(tmp)
729 );
730 }
731
732 // Store the convolution result (16 bytes) and advance the pixel pointers.
733 asm volatile (
734 ".set push \n\t"
735 ".set arch=loongson3a \n\t"
736 "gssdlc1 %[accum0h], 0xf(%[out_row]) \n\t"
737 "gssdrc1 %[accum0h], 0x8(%[out_row]) \n\t"
738 "gssdlc1 %[accum0l], 0x7(%[out_row]) \n\t"
739 "gssdrc1 %[accum0l], 0x0(%[out_row]) \n\t"
740 ".set pop \n\t"
741 ::[accum0h]"f"(accum0h), [accum0l]"f"(accum0l),
742 [out_row]"r"(out_row)
743 :"memory"
744 );
745 out_row += 16;
746 }
747
748 // When the width of the output is not divisible by 4, We need to save one
749 // pixel (4 bytes) each time. And also the fourth pixel is always absent.
750 if (pixel_width & 3) {
751 asm volatile (
752 ".set push \n\t"
753 ".set arch=loongson3a \n\t"
754 _mm_xor(accum0, accum0, accum0)
755 _mm_xor(accum1, accum1, accum1)
756 _mm_xor(accum2, accum2, accum2)
757 ".set pop \n\t"
758 :[accum0h]"=&f"(accum0h), [accum0l]"=&f"(accum0l),
759 [accum1h]"=&f"(accum1h), [accum1l]"=&f"(accum1l),
760 [accum2h]"=&f"(accum2h), [accum2l]"=&f"(accum2l)
761 );
762 for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
763 double src8h, src8l, src16h, src16l;
764 double th, tl, mul_hih, mul_hil, mul_loh, mul_lol;
765 src = reinterpret_cast<const void*>(
766 &source_data_rows[filter_y][out_x<<2]);
767
768 asm volatile (
769 ".set push \n\t"
770 ".set arch=loongson3a \n\t"
771 "gsldlc1 %[coeff16l], 7+%[fval] \n\t"
772 "gsldrc1 %[coeff16l], %[fval] \n\t"
773 "pshufh %[coeff16l], %[coeff16l], %[zerol] \n\t"
774 "mov.d %[coeff16h], %[coeff16l] \n\t"
775 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
776 "gsldlc1 %[src8h], 0xf(%[src]) \n\t"
777 "gsldrc1 %[src8h], 0x8(%[src]) \n\t"
778 "gsldlc1 %[src8l], 0x7(%[src]) \n\t"
779 "gsldrc1 %[src8l], 0x0(%[src]) \n\t"
780 // [16] a1 b1 g1 r1 a0 b0 g0 r0
781 _mm_punpcklbh(src16, src8, zero)
782 _mm_pmulhh(mul_hi, src16, coeff16)
783 _mm_pmullh(mul_lo, src16, coeff16)
784 // [32] a0 b0 g0 r0
785 _mm_punpcklhw(t, mul_lo, mul_hi)
786 _mm_paddw(accum0, accum0, t)
787 // [32] a1 b1 g1 r1
788 _mm_punpckhhw(t, mul_lo, mul_hi)
789 _mm_paddw(accum1, accum1, t)
790 // [16] a3 b3 g3 r3 a2 b2 g2 r2
791 _mm_punpckhbh(src16, src8, zero)
792 _mm_pmulhh(mul_hi, src16, coeff16)
793 _mm_pmullh(mul_lo, src16, coeff16)
794 // [32] a2 b2 g2 r2
795 _mm_punpcklhw(t, mul_lo, mul_hi)
796 _mm_paddw(accum2, accum2, t)
797 ".set pop \n\t"
798 :[th]"=&f"(th), [tl]"=&f"(tl),
799 [src8h]"=&f"(src8h), [src8l]"=&f"(src8l),
800 [src16h]"=&f"(src16h), [src16l]"=&f"(src16l),
801 [mul_hih]"=&f"(mul_hih), [mul_hil]"=&f"(mul_hil),
802 [mul_loh]"=&f"(mul_loh), [mul_lol]"=&f"(mul_lol),
803 [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
804 [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
805 [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
806 [coeff16h]"=&f"(coeff16h), [coeff16l]"=&f"(coeff16l)
807 :[zeroh]"f"(zero), [zerol]"f"(zero),
808 [fval]"m"(filter_values[filter_y]),
809 [src]"r"(src)
810 );
811 }
812
813 double t;
814 asm volatile (
815 ".set push \n\t"
816 ".set arch=loongson3a \n\t"
817 _mm_psraw(accum0, accum0, sra)
818 _mm_psraw(accum1, accum1, sra)
819 _mm_psraw(accum2, accum2, sra)
820 // [16] a1 b1 g1 r1 a0 b0 g0 r0
821 _mm_packsswh(accum0, accum0, accum1, t)
822 // [16] a3 b3 g3 r3 a2 b2 g2 r2
823 _mm_packsswh(accum2, accum2, zero, t)
824 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
825 _mm_packushb(accum0, accum0, accum2, t)
826 ".set pop \n\t"
827 :[accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
828 [accum1h]"+f"(accum1h), [accum1l]"+f"(accum1l),
829 [accum2h]"+f"(accum2h), [accum2l]"+f"(accum2l),
830 [t]"=&f"(t)
831 :[zeroh]"f"(zero), [zerol]"f"(zero), [sra]"f"(sra)
832 );
833 if (has_alpha) {
834 double ah, al, bh, bl, srl8, srl16, sll24;
835
836 asm volatile (
837 ".set push \n\t"
838 ".set arch=loongson3a \n\t"
839 "li %[tmp], 8 \n\t"
840 "mtc1 %[tmp], %[srl8] \n\t"
841 "li %[tmp], 16 \n\t"
842 "mtc1 %[tmp], %[srl16] \n\t"
843 "li %[tmp], 24 \n\t"
844 "mtc1 %[tmp], %[sll24] \n\t"
845 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
846 _mm_psrlw(a, accum0, srl8)
847 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
848 _mm_pmaxub(b, a, accum0) // Max of r and g.
849 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
850 _mm_psrlw(a, accum0, srl16)
851 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
852 _mm_pmaxub(b, a, b) // Max of r and g and b.
853 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
854 _mm_psllw(b, b, sll24)
855 _mm_pmaxub(accum0, b, accum0)
856 ".set pop \n\t"
857 :[ah]"=&f"(ah), [al]"=&f"(al), [bh]"=&f"(bh), [bl]"=&f"(bl),
858 [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l), [tmp]"=&r"(tmp),
859 [srl8]"=&f"(srl8), [srl16]"=&f"(srl16), [sll24]"=&f"(sll24)
860 );
861 } else {
862 double maskh, maskl;
863
864 asm volatile (
865 ".set push \n\t"
866 ".set arch=loongson3a \n\t"
867 // Set value of alpha channels to 0xFF.
868 "li %[tmp], 0xff000000 \n\t"
869 "mtc1 %[tmp], %[maskl] \n\t"
870 "punpcklwd %[maskl], %[maskl], %[maskl] \n\t"
871 "mov.d %[maskh], %[maskl] \n\t"
872 _mm_or(accum0, accum0, mask)
873 ".set pop \n\t"
874 :[maskh]"=&f"(maskh), [maskl]"=&f"(maskl),
875 [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l),
876 [tmp]"=&r"(tmp)
877 );
878 }
879
880 double s4, s64;
881 asm volatile (
882 ".set push \n\t"
883 ".set arch=loongson3a \n\t"
884 "li %[tmp], 4 \n\t"
885 "mtc1 %[tmp], %[s4] \n\t"
886 "li %[tmp], 64 \n\t"
887 "mtc1 %[tmp], %[s64] \n\t"
888 ".set pop \n\t"
889 :[s4]"=f"(s4), [s64]"=f"(s64),
890 [tmp]"=&r"(tmp)
891 );
892 for (int out_x = width; out_x < pixel_width; out_x++) {
893 double t;
894
895 asm volatile (
896 ".set push \n\t"
897 ".set arch=loongson3a \n\t"
898 "swc1 %[accum0l], (%[out_row]) \n\t"
899 _mm_psrlq(accum0, accum0, s4, s64, t)
900 ".set pop \n\t"
901 :[t]"=&f"(t),
902 [accum0h]"+f"(accum0h), [accum0l]"+f"(accum0l)
903 :[out_row]"r"(out_row), [s4]"f"(s4), [s64]"f"(s64)
904 :"memory"
905 );
906 out_row += 4;
907 }
908 }
909 }
910
ConvolveVertically_LS3(const ConvolutionFilter1D::Fixed * filter_values,int filter_length,unsigned char * const * source_data_rows,int pixel_width,unsigned char * out_row,bool has_alpha)911 void ConvolveVertically_LS3(const ConvolutionFilter1D::Fixed* filter_values,
912 int filter_length,
913 unsigned char* const* source_data_rows,
914 int pixel_width,
915 unsigned char* out_row, bool has_alpha) {
916 if (has_alpha) {
917 ConvolveVertically_LS3_impl<true>(filter_values, filter_length,
918 source_data_rows, pixel_width, out_row);
919 } else {
920 ConvolveVertically_LS3_impl<false>(filter_values, filter_length,
921 source_data_rows, pixel_width, out_row);
922 }
923 }
924
925 } // namespace skia
926
927 #endif /* _MIPS_ARCH_LOONGSON3A */
928