1 /*
2  * Copyright 2009 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkColorPriv.h"
11 #include "SkPaint.h"
12 #include "SkUtils.h"
13 
S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)14 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
15                                    const uint32_t* xy,
16                                    int count, uint32_t* colors) {
17     SkASSERT(count > 0 && colors != nullptr);
18     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
19     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
20     SkASSERT(s.fAlphaScale == 256);
21 
22     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
23     size_t rb = s.fPixmap.rowBytes();
24     uint32_t XY = *xy++;
25     unsigned y0 = XY >> 14;
26     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
27     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
28     unsigned subY = y0 & 0xF;
29 
30     // ( 0,  0,  0,  0,  0,  0,  0, 16)
31     __m128i sixteen = _mm_cvtsi32_si128(16);
32 
33     // ( 0,  0,  0,  0, 16, 16, 16, 16)
34     sixteen = _mm_shufflelo_epi16(sixteen, 0);
35 
36     // ( 0,  0,  0,  0,  0,  0,  0,  y)
37     __m128i allY = _mm_cvtsi32_si128(subY);
38 
39     // ( 0,  0,  0,  0,  y,  y,  y,  y)
40     allY = _mm_shufflelo_epi16(allY, 0);
41 
42     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
43     __m128i negY = _mm_sub_epi16(sixteen, allY);
44 
45     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
46     allY = _mm_unpacklo_epi64(allY, negY);
47 
48     // (16, 16, 16, 16, 16, 16, 16, 16 )
49     sixteen = _mm_shuffle_epi32(sixteen, 0);
50 
51     // ( 0,  0,  0,  0,  0,  0,  0,  0)
52     __m128i zero = _mm_setzero_si128();
53     do {
54         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
55         unsigned x0 = XX >> 18;
56         unsigned x1 = XX & 0x3FFF;
57 
58         // (0, 0, 0, 0, 0, 0, 0, x)
59         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
60 
61         // (0, 0, 0, 0, x, x, x, x)
62         allX = _mm_shufflelo_epi16(allX, 0);
63 
64         // (x, x, x, x, x, x, x, x)
65         allX = _mm_shuffle_epi32(allX, 0);
66 
67         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
68         __m128i negX = _mm_sub_epi16(sixteen, allX);
69 
70         // Load 4 samples (pixels).
71         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
72         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
73         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
74         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
75 
76         // (0, 0, a00, a10)
77         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
78 
79         // Expand to 16 bits per component.
80         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
81 
82         // ((a00 * (16-y)), (a10 * y)).
83         a00a10 = _mm_mullo_epi16(a00a10, allY);
84 
85         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
86         a00a10 = _mm_mullo_epi16(a00a10, negX);
87 
88         // (0, 0, a01, a10)
89         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
90 
91         // Expand to 16 bits per component.
92         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
93 
94         // (a01 * (16-y)), (a11 * y)
95         a01a11 = _mm_mullo_epi16(a01a11, allY);
96 
97         // (a01 * (16-y) * x), (a11 * y * x)
98         a01a11 = _mm_mullo_epi16(a01a11, allX);
99 
100         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
101         __m128i sum = _mm_add_epi16(a00a10, a01a11);
102 
103         // (DC, a00*w00 + a01*w01)
104         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
105 
106         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
107         sum = _mm_add_epi16(sum, shifted);
108 
109         // Divide each 16 bit component by 256.
110         sum = _mm_srli_epi16(sum, 8);
111 
112         // Pack lower 4 16 bit values of sum into lower 4 bytes.
113         sum = _mm_packus_epi16(sum, zero);
114 
115         // Extract low int and store.
116         *colors++ = _mm_cvtsi128_si32(sum);
117     } while (--count > 0);
118 }
119 
S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)120 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
121                                   const uint32_t* xy,
122                                   int count, uint32_t* colors) {
123     SkASSERT(count > 0 && colors != nullptr);
124     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
125     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
126     SkASSERT(s.fAlphaScale < 256);
127 
128     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
129     size_t rb = s.fPixmap.rowBytes();
130     uint32_t XY = *xy++;
131     unsigned y0 = XY >> 14;
132     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
133     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
134     unsigned subY = y0 & 0xF;
135 
136     // ( 0,  0,  0,  0,  0,  0,  0, 16)
137     __m128i sixteen = _mm_cvtsi32_si128(16);
138 
139     // ( 0,  0,  0,  0, 16, 16, 16, 16)
140     sixteen = _mm_shufflelo_epi16(sixteen, 0);
141 
142     // ( 0,  0,  0,  0,  0,  0,  0,  y)
143     __m128i allY = _mm_cvtsi32_si128(subY);
144 
145     // ( 0,  0,  0,  0,  y,  y,  y,  y)
146     allY = _mm_shufflelo_epi16(allY, 0);
147 
148     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
149     __m128i negY = _mm_sub_epi16(sixteen, allY);
150 
151     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
152     allY = _mm_unpacklo_epi64(allY, negY);
153 
154     // (16, 16, 16, 16, 16, 16, 16, 16 )
155     sixteen = _mm_shuffle_epi32(sixteen, 0);
156 
157     // ( 0,  0,  0,  0,  0,  0,  0,  0)
158     __m128i zero = _mm_setzero_si128();
159 
160     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
161     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
162 
163     do {
164         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
165         unsigned x0 = XX >> 18;
166         unsigned x1 = XX & 0x3FFF;
167 
168         // (0, 0, 0, 0, 0, 0, 0, x)
169         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
170 
171         // (0, 0, 0, 0, x, x, x, x)
172         allX = _mm_shufflelo_epi16(allX, 0);
173 
174         // (x, x, x, x, x, x, x, x)
175         allX = _mm_shuffle_epi32(allX, 0);
176 
177         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
178         __m128i negX = _mm_sub_epi16(sixteen, allX);
179 
180         // Load 4 samples (pixels).
181         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
182         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
183         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
184         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
185 
186         // (0, 0, a00, a10)
187         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
188 
189         // Expand to 16 bits per component.
190         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
191 
192         // ((a00 * (16-y)), (a10 * y)).
193         a00a10 = _mm_mullo_epi16(a00a10, allY);
194 
195         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
196         a00a10 = _mm_mullo_epi16(a00a10, negX);
197 
198         // (0, 0, a01, a10)
199         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
200 
201         // Expand to 16 bits per component.
202         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
203 
204         // (a01 * (16-y)), (a11 * y)
205         a01a11 = _mm_mullo_epi16(a01a11, allY);
206 
207         // (a01 * (16-y) * x), (a11 * y * x)
208         a01a11 = _mm_mullo_epi16(a01a11, allX);
209 
210         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
211         __m128i sum = _mm_add_epi16(a00a10, a01a11);
212 
213         // (DC, a00*w00 + a01*w01)
214         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
215 
216         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
217         sum = _mm_add_epi16(sum, shifted);
218 
219         // Divide each 16 bit component by 256.
220         sum = _mm_srli_epi16(sum, 8);
221 
222         // Multiply by alpha.
223         sum = _mm_mullo_epi16(sum, alpha);
224 
225         // Divide each 16 bit component by 256.
226         sum = _mm_srli_epi16(sum, 8);
227 
228         // Pack lower 4 16 bit values of sum into lower 4 bytes.
229         sum = _mm_packus_epi16(sum, zero);
230 
231         // Extract low int and store.
232         *colors++ = _mm_cvtsi128_si32(sum);
233     } while (--count > 0);
234 }
235 
ClampX_ClampY_pack_filter(SkFixed f,unsigned max,SkFixed one)236 static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
237                                                  SkFixed one) {
238     unsigned i = SkClampMax(f >> 16, max);
239     i = (i << 4) | ((f >> 12) & 0xF);
240     return (i << 14) | SkClampMax((f + one) >> 16, max);
241 }
242 
243 /*  SSE version of ClampX_ClampY_filter_scale()
244  *  portable version is in core/SkBitmapProcState_matrix.h
245  */
ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)246 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
247                                      int count, int x, int y) {
248     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
249                              SkMatrix::kScale_Mask)) == 0);
250     SkASSERT(s.fInvKy == 0);
251 
252     const unsigned maxX = s.fPixmap.width() - 1;
253     const SkFixed one = s.fFilterOneX;
254     const SkFixed dx = s.fInvSx;
255 
256     const SkBitmapProcStateAutoMapper mapper(s, x, y);
257     const SkFixed fy = mapper.fixedY();
258     const unsigned maxY = s.fPixmap.height() - 1;
259     // compute our two Y values up front
260     *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
261     // now initialize fx
262     SkFixed fx = mapper.fixedX();
263 
264     // test if we don't need to apply the tile proc
265     if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
266         (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
267         if (count >= 4) {
268             // SSE version of decal_filter_scale
269             while ((size_t(xy) & 0x0F) != 0) {
270                 SkASSERT((fx >> (16 + 14)) == 0);
271                 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
272                 fx += dx;
273                 count--;
274             }
275 
276             __m128i wide_1    = _mm_set1_epi32(1);
277             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
278             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
279                                               fx + dx, fx);
280 
281             while (count >= 4) {
282                 __m128i wide_out;
283 
284                 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
285                 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
286                                         _mm_srai_epi32(wide_fx, 16), wide_1));
287 
288                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
289 
290                 xy += 4;
291                 fx += dx * 4;
292                 wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
293                 count -= 4;
294             } // while count >= 4
295         } // if count >= 4
296 
297         while (count-- > 0) {
298             SkASSERT((fx >> (16 + 14)) == 0);
299             *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
300             fx += dx;
301         }
302     } else {
303         // SSE2 only support 16bit interger max & min, so only process the case
304         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
305         // height, there should be rare bitmap whose height will be greater
306         // than max 16bit interger in the real world.
307         if ((count >= 4) && (maxX <= 0xFFFF)) {
308             while (((size_t)xy & 0x0F) != 0) {
309                 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
310                 fx += dx;
311                 count--;
312             }
313 
314             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
315                                               fx + dx, fx);
316             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
317             __m128i wide_one  = _mm_set1_epi32(one);
318             __m128i wide_maxX = _mm_set1_epi32(maxX);
319             __m128i wide_mask = _mm_set1_epi32(0xF);
320 
321              while (count >= 4) {
322                 __m128i wide_i;
323                 __m128i wide_lo;
324                 __m128i wide_fx1;
325 
326                 // i = SkClampMax(f>>16,maxX)
327                 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
328                                        _mm_setzero_si128());
329                 wide_i = _mm_min_epi16(wide_i, wide_maxX);
330 
331                 // i<<4 | TILEX_LOW_BITS(fx)
332                 wide_lo = _mm_srli_epi32(wide_fx, 12);
333                 wide_lo = _mm_and_si128(wide_lo, wide_mask);
334                 wide_i  = _mm_slli_epi32(wide_i, 4);
335                 wide_i  = _mm_or_si128(wide_i, wide_lo);
336 
337                 // i<<14
338                 wide_i = _mm_slli_epi32(wide_i, 14);
339 
340                 // SkClampMax(((f+one))>>16,max)
341                 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
342                 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
343                                                         _mm_setzero_si128());
344                 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
345 
346                 // final combination
347                 wide_i = _mm_or_si128(wide_i, wide_fx1);
348                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
349 
350                 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
351                 fx += dx * 4;
352                 xy += 4;
353                 count -= 4;
354             } // while count >= 4
355         } // if count >= 4
356 
357         while (count-- > 0) {
358             *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
359             fx += dx;
360         }
361     }
362 }
363 
364 /*  SSE version of ClampX_ClampY_nofilter_scale()
365  *  portable version is in core/SkBitmapProcState_matrix.h
366  */
ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)367 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
368                                     uint32_t xy[], int count, int x, int y) {
369     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
370                              SkMatrix::kScale_Mask)) == 0);
371 
372     // we store y, x, x, x, x, x
373     const unsigned maxX = s.fPixmap.width() - 1;
374     const SkBitmapProcStateAutoMapper mapper(s, x, y);
375     const unsigned maxY = s.fPixmap.height() - 1;
376     *xy++ = SkClampMax(mapper.intY(), maxY);
377     SkFixed fx = mapper.fixedX();
378 
379     if (0 == maxX) {
380         // all of the following X values must be 0
381         memset(xy, 0, count * sizeof(uint16_t));
382         return;
383     }
384 
385     const SkFixed dx = s.fInvSx;
386 
387     // test if we don't need to apply the tile proc
388     if ((unsigned)(fx >> 16) <= maxX &&
389         (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
390         // SSE version of decal_nofilter_scale
391         if (count >= 8) {
392             while (((size_t)xy & 0x0F) != 0) {
393                 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
394                 fx += 2 * dx;
395                 count -= 2;
396             }
397 
398             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
399             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
400 
401             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
402                                              fx + dx, fx);
403             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
404 
405             while (count >= 8) {
406                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
407                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
408 
409                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
410                                                       wide_out_high);
411                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
412 
413                 wide_low = _mm_add_epi32(wide_low, wide_dx8);
414                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
415 
416                 xy += 4;
417                 fx += dx * 8;
418                 count -= 8;
419             }
420         } // if count >= 8
421 
422         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
423         while (count-- > 0) {
424             *xx++ = SkToU16(fx >> 16);
425             fx += dx;
426         }
427     } else {
428         // SSE2 only support 16bit interger max & min, so only process the case
429         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
430         // height, there should be rare bitmap whose height will be greater
431         // than max 16bit interger in the real world.
432         if ((count >= 8) && (maxX <= 0xFFFF)) {
433             while (((size_t)xy & 0x0F) != 0) {
434                 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
435                                         SkClampMax(fx >> 16, maxX));
436                 fx += 2 * dx;
437                 count -= 2;
438             }
439 
440             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
441             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
442 
443             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
444                                              fx + dx, fx);
445             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
446             __m128i wide_maxX = _mm_set1_epi32(maxX);
447 
448             while (count >= 8) {
449                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
450                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
451 
452                 wide_out_low  = _mm_max_epi16(wide_out_low,
453                                               _mm_setzero_si128());
454                 wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
455                 wide_out_high = _mm_max_epi16(wide_out_high,
456                                               _mm_setzero_si128());
457                 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
458 
459                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
460                                                       wide_out_high);
461                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
462 
463                 wide_low  = _mm_add_epi32(wide_low, wide_dx8);
464                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
465 
466                 xy += 4;
467                 fx += dx * 8;
468                 count -= 8;
469             }
470         } // if count >= 8
471 
472         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
473         while (count-- > 0) {
474             *xx++ = SkClampMax(fx >> 16, maxX);
475             fx += dx;
476         }
477     }
478 }
479 
480 /*  SSE version of ClampX_ClampY_filter_affine()
481  *  portable version is in core/SkBitmapProcState_matrix.h
482  */
ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)483 void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
484                                       uint32_t xy[], int count, int x, int y) {
485     const SkBitmapProcStateAutoMapper mapper(s, x, y);
486 
487     SkFixed oneX = s.fFilterOneX;
488     SkFixed oneY = s.fFilterOneY;
489     SkFixed fx = mapper.fixedX();
490     SkFixed fy = mapper.fixedY();
491     SkFixed dx = s.fInvSx;
492     SkFixed dy = s.fInvKy;
493     unsigned maxX = s.fPixmap.width() - 1;
494     unsigned maxY = s.fPixmap.height() - 1;
495 
496     if (count >= 2 && (maxX <= 0xFFFF)) {
497         SkFixed dx2 = dx + dx;
498         SkFixed dy2 = dy + dy;
499 
500         __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
501         __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
502         __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
503         __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
504         __m128i wide_mask = _mm_set1_epi32(0xF);
505 
506         while (count >= 2) {
507             // i = SkClampMax(f>>16,maxX)
508             __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
509                                            _mm_setzero_si128());
510             wide_i = _mm_min_epi16(wide_i, wide_max);
511 
512             // i<<4 | TILEX_LOW_BITS(f)
513             __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
514             wide_lo = _mm_and_si128(wide_lo, wide_mask);
515             wide_i  = _mm_slli_epi32(wide_i, 4);
516             wide_i  = _mm_or_si128(wide_i, wide_lo);
517 
518             // i<<14
519             wide_i = _mm_slli_epi32(wide_i, 14);
520 
521             // SkClampMax(((f+one))>>16,max)
522             __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
523             wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
524                                                    _mm_setzero_si128());
525             wide_f1 = _mm_min_epi16(wide_f1, wide_max);
526 
527             // final combination
528             wide_i = _mm_or_si128(wide_i, wide_f1);
529             _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
530 
531             wide_f = _mm_add_epi32(wide_f, wide_d2);
532 
533             fx += dx2;
534             fy += dy2;
535             xy += 4;
536             count -= 2;
537         } // while count >= 2
538     } // if count >= 2
539 
540     while (count-- > 0) {
541         *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
542         fy += dy;
543         *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
544         fx += dx;
545     }
546 }
547 
548 /*  SSE version of ClampX_ClampY_nofilter_affine()
549  *  portable version is in core/SkBitmapProcState_matrix.h
550  */
ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)551 void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
552                                       uint32_t xy[], int count, int x, int y) {
553     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
554     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
555                              SkMatrix::kScale_Mask |
556                              SkMatrix::kAffine_Mask)) == 0);
557 
558     const SkBitmapProcStateAutoMapper mapper(s, x, y);
559 
560     SkFixed fx = mapper.fixedX();
561     SkFixed fy = mapper.fixedY();
562     SkFixed dx = s.fInvSx;
563     SkFixed dy = s.fInvKy;
564     int maxX = s.fPixmap.width() - 1;
565     int maxY = s.fPixmap.height() - 1;
566 
567     if (count >= 4 && (maxX <= 0xFFFF)) {
568         while (((size_t)xy & 0x0F) != 0) {
569             *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
570                                   SkClampMax(fx >> 16, maxX);
571             fx += dx;
572             fy += dy;
573             count--;
574         }
575 
576         SkFixed dx4 = dx * 4;
577         SkFixed dy4 = dy * 4;
578 
579         __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
580                                           fx + dx, fx);
581         __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
582                                           fy + dy, fy);
583         __m128i wide_dx4  = _mm_set1_epi32(dx4);
584         __m128i wide_dy4  = _mm_set1_epi32(dy4);
585 
586         __m128i wide_maxX = _mm_set1_epi32(maxX);
587         __m128i wide_maxY = _mm_set1_epi32(maxY);
588 
589         while (count >= 4) {
590             // SkClampMax(fx>>16,maxX)
591             __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
592                                             _mm_setzero_si128());
593             wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
594 
595             // SkClampMax(fy>>16,maxY)
596             __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
597                                             _mm_setzero_si128());
598             wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
599 
600             // final combination
601             __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
602                                           wide_lo);
603             _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
604 
605             wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
606             wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
607 
608             fx += dx4;
609             fy += dy4;
610             xy += 4;
611             count -= 4;
612         } // while count >= 4
613     } // if count >= 4
614 
615     while (count-- > 0) {
616         *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
617                               SkClampMax(fx >> 16, maxX);
618         fx += dx;
619         fy += dy;
620     }
621 }
622