/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"

/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

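/**
 * Compute a 16-bit mask for a 4x4 pixel block from a single plane (edge)
 * equation.  c is the plane value at the first pixel; dcdx and dcdy step
 * it by one pixel in x and y.  Bit i of the result is the sign bit of the
 * plane value at pixel i (row-major: bits 0-3 are the first row), i.e. it
 * is set for pixels that fail the plane test.
 */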
static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}


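/**
 * Accumulate two masks at once for a 4x4 block: *outmask collects the
 * sign bits of the plane values starting at c, while *partmask collects
 * the sign bits of the same values offset by cdiff.  The caller combines
 * the per-plane results to classify sub-blocks as fully outside, fully
 * inside, or partially covered.
 */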
static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}

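/*
 * Fixed-plane-count entry points: these force a full plane_mask (all N
 * planes active) and forward to the generic N-plane rasterizers generated
 * from lp_rast_tri_tmp.h below.
 */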
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_ms_3(task, arg2);
}

void
lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_ms_3_16(task, arg);
}

void
lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_ms_4(task, arg2);
}

#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"

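/*
 * SSE2 variants of the mask builders above.  The plane values for all 16
 * pixels are computed in four __m128i vectors, packed down to 16 signed
 * bytes (the saturating packs preserve the sign), and the 16 sign bits
 * are then extracted in one go with _mm_movemask_epi8.
 */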
static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* Pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* Pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}

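/*
 * Pack four vectors of per-pixel plane values (biased by cdiff) down to
 * bytes and return their 16 sign bits as a mask.
 */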
static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}

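/*
 * Bit masks selecting whole columns / rows of the 4x4 stamp within a
 * 16-bit coverage mask, plus lookup tables giving the mask that covers
 * the bottom-most / right-most 1..4 rows or columns of a partially
 * visible stamp (used by the template code included below).
 */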
#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12))
#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13))
#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14))
#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15))

#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3))
#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7))
#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11))
#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15))

#define STAMP_SIZE 4
static unsigned bottom_mask_tab[STAMP_SIZE] = {
   ROW3,
   ROW3 | ROW2,
   ROW3 | ROW2 | ROW1,
   ROW3 | ROW2 | ROW1 | ROW0,
};

static unsigned right_mask_tab[STAMP_SIZE] = {
   COLUMN3,
   COLUMN3 | COLUMN2,
   COLUMN3 | COLUMN2 | COLUMN1,
   COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0,
};


#define NR_PLANES 3

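/*
 * Rasterize a 16x16 block for a triangle with exactly three planes (the
 * three edges), using 32-bit plane arithmetic.  Note that for this path
 * the plane_mask field of the command argument is repurposed to carry the
 * block's position within the tile (low byte: x offset, next byte: y
 * offset), as the unpacking below shows.  All three plane equations are
 * stepped in parallel across the sixteen 4x4 stamps; stamps that are not
 * trivially rejected get a per-pixel mask built and are queued in out[],
 * then shaded in a second pass.
 */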
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Recalculate eo - easier than trying to load as scalars / shuffle. */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   /* Adjust dcdx. */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

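/*
 * Single 4x4 stamp variant of the above: the block position again arrives
 * in the plane_mask field, and one per-pixel mask is built for the three
 * planes without a trivial-reject pre-pass.
 */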
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Adjust dcdx. */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES

#else

#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#undef bool
#include <altivec.h>
#define bool _Bool
#include "util/u_pwr8.h"

static inline void
build_masks_ppc(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }

   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}

static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* Pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* Pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* Extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}

static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}

#define NR_PLANES 3

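/*
 * POWER8/Altivec version of the 16x16, three-plane rasterizer.  It
 * mirrors the SSE path above, with vec_perm plus a splatted byte-index
 * mask standing in for the SCALAR_EPI32 broadcasts (the shuffle indices
 * differ between little- and big-endian layouts).
 */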
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#if UTIL_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx. */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

#undef NR_PLANES

#else

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */

void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif

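/*
 * Select the mask-building implementation for the generic template below:
 * SSE on x86, Altivec on little-endian POWER8, otherwise the portable
 * scalar code.
 */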
#if defined(PIPE_ARCH_SSE)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#endif

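/*
 * Instantiate the generic rasterizer template from lp_rast_tri_tmp.h once
 * per plane count (1-8 planes).  The first batch is compiled with
 * RASTER_64 set (the wide, 64-bit plane-arithmetic variants), the second
 * with 32-bit planes (_32_ names), and a final batch below generates the
 * multisample (_ms_) variants.
 */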
#define RASTER_64 1

#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

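/*
 * Multisample variants: the same 64-bit template, compiled with
 * MULTISAMPLE defined so it emits the per-sample _ms_ entry points.
 */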
#define MULTISAMPLE 1
#define RASTER_64 1

#define TAG(x) x##_ms_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64