/*
 * Copyright © 2013 Soren Sandmann Pedersen
 * Copyright © 2013 Red Hat, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Soren Sandmann (soren.sandmann@gmail.com)
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "pixman-private.h"
#include "pixman-inlines.h"

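/* One cached scanline of horizontally interpolated pixels.  'y' is the
 * source row the data was fetched from, and 'buffer' holds the 16 bit
 * per-channel intermediate values, one uint64_t per output pixel, in the
 * interleaved layout produced by ssse3_fetch_horizontal().
 */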
typedef struct
{
    int              y;
    uint64_t *       buffer;
} line_t;

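/* Per-iterator state for the bilinear COVER fetcher: the two cached
 * lines bracketing the current sample position, the current fixed-point
 * source coordinates, and the storage backing the line buffers.
 */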
typedef struct
{
    line_t              lines[2];
    pixman_fixed_t      y;
    pixman_fixed_t      x;
    uint64_t            data[1];
} bilinear_info_t;

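/* Fetch and horizontally interpolate 'n' pixels of source row 'y',
 * starting at fixed-point position 'x' and stepping by 'ux', writing
 * the 16 bit intermediate results into line->buffer.
 */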
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
                        int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    __m128i vx = _mm_set_epi16 (
        - (x + 1), x, - (x + 1), x,
        - (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
    __m128i vux = _mm_set_epi16 (
        - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
        - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    while ((n -= 2) >= 0)
    {
        __m128i vw, vr, s;

        vrl1 = _mm_loadl_epi64 (
            (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
        /* vrl1: R1, L1 */

    final_pixel:
        vrl0 = _mm_loadl_epi64 (
            (__m128i *)(bits + pixman_fixed_to_int (x)));
        /* vrl0: R0, L0 */

        /* The weights are based on vx which is a vector of
         *
         *    - (x + 1), x, - (x + 1), x,
         *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
         *
         * so the 16 bit weights end up like this:
         *
         *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
         *
         * and after shifting and packing, we get these bytes:
         *
         *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *
         * which means the first and the second input pixel
         * have to be interleaved like this:
         *
         *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
         *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
         *
         * before maddubsw can be used.
         */

        vw = _mm_add_epi16 (
            vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
        /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
         */

        vw = _mm_packus_epi16 (vw, vw);
        /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
         */
        vx = _mm_add_epi16 (vx, vux);

        x += 2 * ux;

        vr = _mm_unpacklo_epi16 (vrl1, vrl0);
        /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

        s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
        /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

        vr = _mm_unpackhi_epi8 (vr, s);
        /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
         *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
         */

        vr = _mm_maddubs_epi16 (vr, vw);

        /* When the weight is 0, the inverse weight is
         * 128 which can't be represented in a signed byte.
         * As a result maddubsw computes the following:
         *
         *     r = l * -128 + r * 0
         *
         * rather than the desired
         *
         *     r = l * 128 + r * 0
         *
         * We fix this by taking the absolute value of the
         * result.
         */
        vr = _mm_abs_epi16 (vr);

        /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
        _mm_store_si128 (b++, vr);
    }

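    /* If the width was odd, n is now -1: go back and emit the final
     * pixel, with the second pixel of the pair zeroed out.
     */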
    if (n == -1)
    {
        vrl1 = _mm_setzero_si128();
        goto final_pixel;
    }

    line->y = y;
}

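/* Produce one destination scanline by interpolating vertically between
 * the two cached horizontally interpolated lines, refetching them when
 * the source row changes.
 */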
static uint32_t *
ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
{
    pixman_fixed_t fx, ux;
    bilinear_info_t *info = iter->data;
    line_t *line0, *line1;
    int y0, y1;
    int32_t dist_y;
    __m128i vw;
    int i;

    fx = info->x;
    ux = iter->image->common.transform->matrix[0][0];

    y0 = pixman_fixed_to_int (info->y);
    y1 = y0 + 1;

    line0 = &info->lines[y0 & 0x01];
    line1 = &info->lines[y1 & 0x01];

    if (line0->y != y0)
    {
        ssse3_fetch_horizontal (
            &iter->image->bits, line0, y0, fx, ux, iter->width);
    }

    if (line1->y != y1)
    {
        ssse3_fetch_horizontal (
            &iter->image->bits, line1, y1, fx, ux, iter->width);
    }

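    /* Vertical interpolation weight for this scanline, scaled up to a
     * 16 bit fraction and broadcast to every lane.
     */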
    dist_y = pixman_fixed_to_bilinear_weight (info->y);
    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);

    vw = _mm_set_epi16 (
        dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);

    for (i = 0; i + 3 < iter->width; i += 4)
    {
        __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
        __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
        __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
        __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
        __m128i r0, r1, tmp, p;

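        /* Interpolate vertically: r = top + (bot - top) * vw.  The
         * multiply treats (bot - top) as unsigned, so lanes where
         * bot < top pick up an extra vw from the 16 bit wraparound;
         * subtract it back out below.
         */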
        r0 = _mm_mulhi_epu16 (
            _mm_sub_epi16 (bot0, top0), vw);
        tmp = _mm_cmplt_epi16 (bot0, top0);
        tmp = _mm_and_si128 (tmp, vw);
        r0 = _mm_sub_epi16 (r0, tmp);
        r0 = _mm_add_epi16 (r0, top0);
        r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
        /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
        r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
        /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

        r1 = _mm_mulhi_epu16 (
            _mm_sub_epi16 (bot1, top1), vw);
        tmp = _mm_cmplt_epi16 (bot1, top1);
        tmp = _mm_and_si128 (tmp, vw);
        r1 = _mm_sub_epi16 (r1, tmp);
        r1 = _mm_add_epi16 (r1, top1);
        r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
        r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
        /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */

        p = _mm_packus_epi16 (r0, r1);

        _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
    }

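    /* Handle the remaining 1-3 pixels, two at a time, with a single
     * 32 bit store for an odd final pixel.
     */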
    while (i < iter->width)
    {
        __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
        __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
        __m128i r0, tmp, p;

        r0 = _mm_mulhi_epu16 (
            _mm_sub_epi16 (bot0, top0), vw);
        tmp = _mm_cmplt_epi16 (bot0, top0);
        tmp = _mm_and_si128 (tmp, vw);
        r0 = _mm_sub_epi16 (r0, tmp);
        r0 = _mm_add_epi16 (r0, top0);
        r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
        /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
        r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
        /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

        p = _mm_packus_epi16 (r0, r0);

        if (iter->width - i == 1)
        {
            *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
            i++;
        }
        else
        {
            _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
            i += 2;
        }
    }

    info->y += iter->image->common.transform->matrix[1][1];

    return iter->buffer;
}

static void
ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
{
    free (iter->data);
}

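/* Set up the COVER bilinear iterator: map the centre of the first
 * destination pixel through the transform and allocate the two-line
 * cache of horizontally interpolated pixels.
 */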
static void
ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
{
    int width = iter->width;
    bilinear_info_t *info;
    pixman_vector_t v;

    /* Reference point is the center of the pixel */
    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
    v.vector[2] = pixman_fixed_1;

    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
        goto fail;

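    /* sizeof (*info) already accounts for data[1]; the remaining
     * 2 * width - 1 entries hold the two line buffers, and the extra
     * 64 bytes leave room for the 16 byte alignment done below.
     */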
    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
    if (!info)
        goto fail;

    info->x = v.vector[0] - pixman_fixed_1 / 2;
    info->y = v.vector[1] - pixman_fixed_1 / 2;

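/* Round a pointer up to the next 16 byte boundary so the line buffers
 * can be used with aligned SSE loads and stores.
 */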
#define ALIGN(addr)                                                     \
    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))

    /* It is safe to set the y coordinates to -1 initially
     * because COVER_CLIP_BILINEAR ensures that we will only
     * be asked to fetch lines in the [0, height) interval
     */
    info->lines[0].y = -1;
    info->lines[0].buffer = ALIGN (&(info->data[0]));
    info->lines[1].y = -1;
    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);

    iter->get_scanline = ssse3_fetch_bilinear_cover;
    iter->fini = ssse3_bilinear_cover_iter_fini;

    iter->data = info;
    return;

fail:
    /* Something went wrong, either a bad matrix or OOM; in such cases,
     * we don't guarantee any particular rendering.
     */
    _pixman_log_error (
        FUNC, "Allocation failure or bad matrix, skipping rendering\n");

    iter->get_scanline = _pixman_iter_get_scanline_noop;
    iter->fini = NULL;
}

static const pixman_iter_info_t ssse3_iters[] =
{
    { PIXMAN_a8r8g8b8,
      (FAST_PATH_STANDARD_FLAGS                 |
       FAST_PATH_SCALE_TRANSFORM                |
       FAST_PATH_BILINEAR_FILTER                |
       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
      ITER_NARROW | ITER_SRC,
      ssse3_bilinear_cover_iter_init,
      NULL, NULL
    },

    { PIXMAN_null },
};

static const pixman_fast_path_t ssse3_fast_paths[] =
{
    { PIXMAN_OP_NONE },
};

pixman_implementation_t *
_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp =
        _pixman_implementation_create (fallback, ssse3_fast_paths);

    imp->iter_info = ssse3_iters;

    return imp;
}