1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * Helper functions for swizzling/shuffling.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  */
34 
35 #include <inttypes.h>  /* for PRIx64 macro */
36 #include "util/compiler.h"
37 #include "util/u_debug.h"
38 
39 #include "lp_bld_type.h"
40 #include "lp_bld_const.h"
41 #include "lp_bld_init.h"
42 #include "lp_bld_logic.h"
43 #include "lp_bld_swizzle.h"
44 #include "lp_bld_pack.h"
45 
46 
47 LLVMValueRef
lp_build_broadcast(struct gallivm_state * gallivm,LLVMTypeRef vec_type,LLVMValueRef scalar)48 lp_build_broadcast(struct gallivm_state *gallivm,
49                    LLVMTypeRef vec_type,
50                    LLVMValueRef scalar)
51 {
52    LLVMValueRef res;
53 
54    if (LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind) {
55       /* scalar */
56       assert(vec_type == LLVMTypeOf(scalar));
57       res = scalar;
58    } else {
59       LLVMBuilderRef builder = gallivm->builder;
60       const unsigned length = LLVMGetVectorSize(vec_type);
61       LLVMValueRef undef = LLVMGetUndef(vec_type);
62       /* The shuffle vector is always made of int32 elements */
63       LLVMTypeRef i32_type = LLVMInt32TypeInContext(gallivm->context);
64       LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
65 
66       assert(LLVMGetElementType(vec_type) == LLVMTypeOf(scalar));
67 
68       res = LLVMBuildInsertElement(builder, undef, scalar, LLVMConstNull(i32_type), "");
69       res = LLVMBuildShuffleVector(builder, res, undef, LLVMConstNull(i32_vec_type), "");
70    }
71 
72    return res;
73 }
74 
75 
76 /**
77  * Broadcast
78  */
79 LLVMValueRef
lp_build_broadcast_scalar(struct lp_build_context * bld,LLVMValueRef scalar)80 lp_build_broadcast_scalar(struct lp_build_context *bld,
81                           LLVMValueRef scalar)
82 {
83    assert(lp_check_elem_type(bld->type, LLVMTypeOf(scalar)));
84 
85    return lp_build_broadcast(bld->gallivm, bld->vec_type, scalar);
86 }
87 
88 
89 /**
90  * Combined extract and broadcast (mere shuffle in most cases)
91  */
92 LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state * gallivm,struct lp_type src_type,struct lp_type dst_type,LLVMValueRef vector,LLVMValueRef index)93 lp_build_extract_broadcast(struct gallivm_state *gallivm,
94                            struct lp_type src_type,
95                            struct lp_type dst_type,
96                            LLVMValueRef vector,
97                            LLVMValueRef index)
98 {
99    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
100    LLVMValueRef res;
101 
102    assert(src_type.floating == dst_type.floating);
103    assert(src_type.width    == dst_type.width);
104 
105    assert(lp_check_value(src_type, vector));
106    assert(LLVMTypeOf(index) == i32t);
107 
108    if (src_type.length == 1) {
109       if (dst_type.length == 1) {
110          /*
111           * Trivial scalar -> scalar.
112           */
113 
114          res = vector;
115       }
116       else {
117          /*
118           * Broadcast scalar -> vector.
119           */
120 
121          res = lp_build_broadcast(gallivm,
122                                   lp_build_vec_type(gallivm, dst_type),
123                                   vector);
124       }
125    }
126    else {
127       if (dst_type.length > 1) {
128          /*
129           * shuffle - result can be of different length.
130           */
131 
132          LLVMValueRef shuffle;
133          shuffle = lp_build_broadcast(gallivm,
134                                       LLVMVectorType(i32t, dst_type.length),
135                                       index);
136          res = LLVMBuildShuffleVector(gallivm->builder, vector,
137                                       LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
138                                       shuffle, "");
139       }
140       else {
141          /*
142           * Trivial extract scalar from vector.
143           */
144           res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
145       }
146    }
147 
148    return res;
149 }
150 
151 
152 /**
153  * Swizzle one channel into other channels.
154  */
155 LLVMValueRef
lp_build_swizzle_scalar_aos(struct lp_build_context * bld,LLVMValueRef a,unsigned channel,unsigned num_channels)156 lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
157                             LLVMValueRef a,
158                             unsigned channel,
159                             unsigned num_channels)
160 {
161    LLVMBuilderRef builder = bld->gallivm->builder;
162    const struct lp_type type = bld->type;
163    const unsigned n = type.length;
164    unsigned i, j;
165 
166    if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1)
167       return a;
168 
169    assert(num_channels == 2 || num_channels == 4);
170 
171    /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
172     * using shuffles here actually causes worst results. More investigation is
173     * needed. */
174    if (LLVMIsConstant(a) ||
175        type.width >= 16) {
176       /*
177        * Shuffle.
178        */
179       LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
180       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
181 
182       for(j = 0; j < n; j += num_channels)
183          for(i = 0; i < num_channels; ++i)
184             shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
185 
186       return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
187    }
188    else if (num_channels == 2) {
189       /*
190        * Bit mask and shifts
191        *
192        *   XY XY .... XY  <= input
193        *   0Y 0Y .... 0Y
194        *   YY YY .... YY
195        *   YY YY .... YY  <= output
196        */
197       struct lp_type type2;
198       LLVMValueRef tmp = NULL;
199       int shift;
200 
201       a = LLVMBuildAnd(builder, a,
202                        lp_build_const_mask_aos(bld->gallivm,
203                                                type, 1 << channel, num_channels), "");
204 
205       type2 = type;
206       type2.floating = FALSE;
207       type2.width *= 2;
208       type2.length /= 2;
209 
210       a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), "");
211 
212       /*
213        * Vector element 0 is always channel X.
214        *
215        *                        76 54 32 10 (array numbering)
216        * Little endian reg in:  YX YX YX YX
217        * Little endian reg out: YY YY YY YY if shift right (shift == -1)
218        *                        XX XX XX XX if shift left (shift == 1)
219        *
220        *                        01 23 45 67 (array numbering)
221        * Big endian reg in:     XY XY XY XY
222        * Big endian reg out:    YY YY YY YY if shift left (shift == 1)
223        *                        XX XX XX XX if shift right (shift == -1)
224        *
225        */
226 #if UTIL_ARCH_LITTLE_ENDIAN
227       shift = channel == 0 ? 1 : -1;
228 #else
229       shift = channel == 0 ? -1 : 1;
230 #endif
231 
232       if (shift > 0) {
233          tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), "");
234       } else if (shift < 0) {
235          tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), "");
236       }
237 
238       assert(tmp);
239       if (tmp) {
240          a = LLVMBuildOr(builder, a, tmp, "");
241       }
242 
243       return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
244    }
245    else {
246       /*
247        * Bit mask and recursive shifts
248        *
249        * Little-endian registers:
250        *
251        *   7654 3210
252        *   WZYX WZYX .... WZYX  <= input
253        *   00Y0 00Y0 .... 00Y0  <= mask
254        *   00YY 00YY .... 00YY  <= shift right 1 (shift amount -1)
255        *   YYYY YYYY .... YYYY  <= shift left 2 (shift amount 2)
256        *
257        * Big-endian registers:
258        *
259        *   0123 4567
260        *   XYZW XYZW .... XYZW  <= input
261        *   0Y00 0Y00 .... 0Y00  <= mask
262        *   YY00 YY00 .... YY00  <= shift left 1 (shift amount 1)
263        *   YYYY YYYY .... YYYY  <= shift right 2 (shift amount -2)
264        *
265        * shifts[] gives little-endian shift amounts; we need to negate for big-endian.
266        */
267       struct lp_type type4;
268       const int shifts[4][2] = {
269          { 1,  2},
270          {-1,  2},
271          { 1, -2},
272          {-1, -2}
273       };
274       unsigned i;
275 
276       a = LLVMBuildAnd(builder, a,
277                        lp_build_const_mask_aos(bld->gallivm,
278                                                type, 1 << channel, 4), "");
279 
280       /*
281        * Build a type where each element is an integer that cover the four
282        * channels.
283        */
284 
285       type4 = type;
286       type4.floating = FALSE;
287       type4.width *= 4;
288       type4.length /= 4;
289 
290       a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
291 
292       for(i = 0; i < 2; ++i) {
293          LLVMValueRef tmp = NULL;
294          int shift = shifts[channel][i];
295 
296          /* See endianness diagram above */
297 #if UTIL_ARCH_BIG_ENDIAN
298          shift = -shift;
299 #endif
300 
301          if(shift > 0)
302             tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
303          if(shift < 0)
304             tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
305 
306          assert(tmp);
307          if(tmp)
308             a = LLVMBuildOr(builder, a, tmp, "");
309       }
310 
311       return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
312    }
313 }
314 
315 
316 /**
317  * Swizzle a vector consisting of an array of XYZW structs.
318  *
319  * This fills a vector of dst_len length with the swizzled channels from src.
320  *
321  * e.g. with swizzles = { 2, 1, 0 } and swizzle_count = 6 results in
322  *      RGBA RGBA = BGR BGR BG
323  *
324  * @param swizzles        the swizzle array
325  * @param num_swizzles    the number of elements in swizzles
326  * @param dst_len         the length of the result
327  */
328 LLVMValueRef
lp_build_swizzle_aos_n(struct gallivm_state * gallivm,LLVMValueRef src,const unsigned char * swizzles,unsigned num_swizzles,unsigned dst_len)329 lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
330                        LLVMValueRef src,
331                        const unsigned char* swizzles,
332                        unsigned num_swizzles,
333                        unsigned dst_len)
334 {
335    LLVMBuilderRef builder = gallivm->builder;
336    LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH];
337    unsigned i;
338 
339    assert(dst_len < LP_MAX_VECTOR_WIDTH);
340 
341    for (i = 0; i < dst_len; ++i) {
342       int swizzle = swizzles[i % num_swizzles];
343 
344       if (swizzle == LP_BLD_SWIZZLE_DONTCARE) {
345          shuffles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
346       } else {
347          shuffles[i] = lp_build_const_int32(gallivm, swizzle);
348       }
349    }
350 
351    return LLVMBuildShuffleVector(builder, src, LLVMGetUndef(LLVMTypeOf(src)), LLVMConstVector(shuffles, dst_len), "");
352 }
353 
354 
355 LLVMValueRef
lp_build_swizzle_aos(struct lp_build_context * bld,LLVMValueRef a,const unsigned char swizzles[4])356 lp_build_swizzle_aos(struct lp_build_context *bld,
357                      LLVMValueRef a,
358                      const unsigned char swizzles[4])
359 {
360    LLVMBuilderRef builder = bld->gallivm->builder;
361    const struct lp_type type = bld->type;
362    const unsigned n = type.length;
363    unsigned i, j;
364 
365    if (swizzles[0] == PIPE_SWIZZLE_X &&
366        swizzles[1] == PIPE_SWIZZLE_Y &&
367        swizzles[2] == PIPE_SWIZZLE_Z &&
368        swizzles[3] == PIPE_SWIZZLE_W) {
369       return a;
370    }
371 
372    if (swizzles[0] == swizzles[1] &&
373        swizzles[1] == swizzles[2] &&
374        swizzles[2] == swizzles[3]) {
375       switch (swizzles[0]) {
376       case PIPE_SWIZZLE_X:
377       case PIPE_SWIZZLE_Y:
378       case PIPE_SWIZZLE_Z:
379       case PIPE_SWIZZLE_W:
380          return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4);
381       case PIPE_SWIZZLE_0:
382          return bld->zero;
383       case PIPE_SWIZZLE_1:
384          return bld->one;
385       case LP_BLD_SWIZZLE_DONTCARE:
386          return bld->undef;
387       default:
388          assert(0);
389          return bld->undef;
390       }
391    }
392 
393    if (LLVMIsConstant(a) ||
394        type.width >= 16) {
395       /*
396        * Shuffle.
397        */
398       LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(bld->gallivm, type));
399       LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
400       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
401       LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];
402 
403       memset(aux, 0, sizeof aux);
404 
405       for(j = 0; j < n; j += 4) {
406          for(i = 0; i < 4; ++i) {
407             unsigned shuffle;
408             switch (swizzles[i]) {
409             default:
410                assert(0);
411 #if defined(NDEBUG) || defined(DEBUG)
412                FALLTHROUGH;
413 #endif
414             case PIPE_SWIZZLE_X:
415             case PIPE_SWIZZLE_Y:
416             case PIPE_SWIZZLE_Z:
417             case PIPE_SWIZZLE_W:
418                shuffle = j + swizzles[i];
419                shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
420                break;
421             case PIPE_SWIZZLE_0:
422                shuffle = type.length + 0;
423                shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
424                if (!aux[0]) {
425                   aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
426                }
427                break;
428             case PIPE_SWIZZLE_1:
429                shuffle = type.length + 1;
430                shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
431                if (!aux[1]) {
432                   aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
433                }
434                break;
435             case LP_BLD_SWIZZLE_DONTCARE:
436                shuffles[j + i] = LLVMGetUndef(i32t);
437                break;
438             }
439          }
440       }
441 
442       for (i = 0; i < n; ++i) {
443          if (!aux[i]) {
444             aux[i] = undef;
445          }
446       }
447 
448       return LLVMBuildShuffleVector(builder, a,
449                                     LLVMConstVector(aux, n),
450                                     LLVMConstVector(shuffles, n), "");
451    } else {
452       /*
453        * Bit mask and shifts.
454        *
455        * For example, this will convert BGRA to RGBA by doing
456        *
457        * Little endian:
458        *   rgba = (bgra & 0x00ff0000) >> 16
459        *        | (bgra & 0xff00ff00)
460        *        | (bgra & 0x000000ff) << 16
461        *
462        * Big endian:A
463        *   rgba = (bgra & 0x0000ff00) << 16
464        *        | (bgra & 0x00ff00ff)
465        *        | (bgra & 0xff000000) >> 16
466        *
467        * This is necessary not only for faster cause, but because X86 backend
468        * will refuse shuffles of <4 x i8> vectors
469        */
470       LLVMValueRef res;
471       struct lp_type type4;
472       unsigned cond = 0;
473       int chan;
474       int shift;
475 
476       /*
477        * Start with a mixture of 1 and 0.
478        */
479       for (chan = 0; chan < 4; ++chan) {
480          if (swizzles[chan] == PIPE_SWIZZLE_1) {
481             cond |= 1 << chan;
482          }
483       }
484       res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4);
485 
486       /*
487        * Build a type where each element is an integer that cover the four
488        * channels.
489        */
490       type4 = type;
491       type4.floating = FALSE;
492       type4.width *= 4;
493       type4.length /= 4;
494 
495       a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
496       res = LLVMBuildBitCast(builder, res, lp_build_vec_type(bld->gallivm, type4), "");
497 
498       /*
499        * Mask and shift the channels, trying to group as many channels in the
500        * same shift as possible.  The shift amount is positive for shifts left
501        * and negative for shifts right.
502        */
503       for (shift = -3; shift <= 3; ++shift) {
504          uint64_t mask = 0;
505 
506          assert(type4.width <= sizeof(mask)*8);
507 
508          /*
509           * Vector element numbers follow the XYZW order, so 0 is always X, etc.
510           * After widening 4 times we have:
511           *
512           *                                3210
513           * Little-endian register layout: WZYX
514           *
515           *                                0123
516           * Big-endian register layout:    XYZW
517           *
518           * For little-endian, higher-numbered channels are obtained by a shift right
519           * (negative shift amount) and lower-numbered channels by a shift left
520           * (positive shift amount).  The opposite is true for big-endian.
521           */
522          for (chan = 0; chan < 4; ++chan) {
523             if (swizzles[chan] < 4) {
524                /* We need to move channel swizzles[chan] into channel chan */
525 #if UTIL_ARCH_LITTLE_ENDIAN
526                if (swizzles[chan] - chan == -shift) {
527                   mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
528                }
529 #else
530                if (swizzles[chan] - chan == shift) {
531                   mask |= ((1ULL << type.width) - 1) << (type4.width - type.width) >> (swizzles[chan] * type.width);
532                }
533 #endif
534             }
535          }
536 
537          if (mask) {
538             LLVMValueRef masked;
539             LLVMValueRef shifted;
540             if (0)
541                debug_printf("shift = %i, mask = %" PRIx64 "\n", shift, mask);
542 
543             masked = LLVMBuildAnd(builder, a,
544                                   lp_build_const_int_vec(bld->gallivm, type4, mask), "");
545             if (shift > 0) {
546                shifted = LLVMBuildShl(builder, masked,
547                                       lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
548             } else if (shift < 0) {
549                shifted = LLVMBuildLShr(builder, masked,
550                                        lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
551             } else {
552                shifted = masked;
553             }
554 
555             res = LLVMBuildOr(builder, res, shifted, "");
556          }
557       }
558 
559       return LLVMBuildBitCast(builder, res,
560                               lp_build_vec_type(bld->gallivm, type), "");
561    }
562 }
563 
564 
565 /**
566  * Extended swizzle of a single channel of a SoA vector.
567  *
568  * @param bld         building context
569  * @param unswizzled  array with the 4 unswizzled values
570  * @param swizzle     one of the PIPE_SWIZZLE_*
571  *
572  * @return  the swizzled value.
573  */
574 LLVMValueRef
lp_build_swizzle_soa_channel(struct lp_build_context * bld,const LLVMValueRef * unswizzled,unsigned swizzle)575 lp_build_swizzle_soa_channel(struct lp_build_context *bld,
576                              const LLVMValueRef *unswizzled,
577                              unsigned swizzle)
578 {
579    switch (swizzle) {
580    case PIPE_SWIZZLE_X:
581    case PIPE_SWIZZLE_Y:
582    case PIPE_SWIZZLE_Z:
583    case PIPE_SWIZZLE_W:
584       return unswizzled[swizzle];
585    case PIPE_SWIZZLE_0:
586       return bld->zero;
587    case PIPE_SWIZZLE_1:
588       return bld->one;
589    default:
590       assert(0);
591       return bld->undef;
592    }
593 }
594 
595 
596 /**
597  * Extended swizzle of a SoA vector.
598  *
599  * @param bld         building context
600  * @param unswizzled  array with the 4 unswizzled values
601  * @param swizzles    array of PIPE_SWIZZLE_*
602  * @param swizzled    output swizzled values
603  */
604 void
lp_build_swizzle_soa(struct lp_build_context * bld,const LLVMValueRef * unswizzled,const unsigned char swizzles[4],LLVMValueRef * swizzled)605 lp_build_swizzle_soa(struct lp_build_context *bld,
606                      const LLVMValueRef *unswizzled,
607                      const unsigned char swizzles[4],
608                      LLVMValueRef *swizzled)
609 {
610    unsigned chan;
611 
612    for (chan = 0; chan < 4; ++chan) {
613       swizzled[chan] = lp_build_swizzle_soa_channel(bld, unswizzled,
614                                                     swizzles[chan]);
615    }
616 }
617 
618 
619 /**
620  * Do an extended swizzle of a SoA vector inplace.
621  *
622  * @param bld         building context
623  * @param values      intput/output array with the 4 values
624  * @param swizzles    array of PIPE_SWIZZLE_*
625  */
626 void
lp_build_swizzle_soa_inplace(struct lp_build_context * bld,LLVMValueRef * values,const unsigned char swizzles[4])627 lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
628                              LLVMValueRef *values,
629                              const unsigned char swizzles[4])
630 {
631    LLVMValueRef unswizzled[4];
632    unsigned chan;
633 
634    for (chan = 0; chan < 4; ++chan) {
635       unswizzled[chan] = values[chan];
636    }
637 
638    lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
639 }
640 
641 
642 /**
643  * Transpose from AOS <-> SOA
644  *
645  * @param single_type_lp   type of pixels
646  * @param src              the 4 * n pixel input
647  * @param dst              the 4 * n pixel output
648  */
649 void
lp_build_transpose_aos(struct gallivm_state * gallivm,struct lp_type single_type_lp,const LLVMValueRef src[4],LLVMValueRef dst[4])650 lp_build_transpose_aos(struct gallivm_state *gallivm,
651                        struct lp_type single_type_lp,
652                        const LLVMValueRef src[4],
653                        LLVMValueRef dst[4])
654 {
655    struct lp_type double_type_lp = single_type_lp;
656    LLVMTypeRef single_type;
657    LLVMTypeRef double_type;
658    LLVMValueRef t0 = NULL, t1 = NULL, t2 = NULL, t3 = NULL;
659 
660    double_type_lp.length >>= 1;
661    double_type_lp.width  <<= 1;
662 
663    double_type = lp_build_vec_type(gallivm, double_type_lp);
664    single_type = lp_build_vec_type(gallivm, single_type_lp);
665 
666    LLVMValueRef double_type_zero = LLVMConstNull(double_type);
667    /* Interleave x, y, z, w -> xy and zw */
668    if (src[0] || src[1]) {
669       LLVMValueRef src0 = src[0];
670       LLVMValueRef src1 = src[1];
671       if (!src0)
672          src0 = LLVMConstNull(single_type);
673       if (!src1)
674          src1 = LLVMConstNull(single_type);
675       t0 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 0);
676       t2 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 1);
677 
678       /* Cast to double width type for second interleave */
679       t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
680       t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
681    }
682    if (src[2] || src[3]) {
683       LLVMValueRef src2 = src[2];
684       LLVMValueRef src3 = src[3];
685       if (!src2)
686          src2 = LLVMConstNull(single_type);
687       if (!src3)
688          src3 = LLVMConstNull(single_type);
689       t1 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 0);
690       t3 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 1);
691 
692       /* Cast to double width type for second interleave */
693       t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
694       t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
695    }
696 
697    if (!t0)
698       t0 = double_type_zero;
699    if (!t1)
700       t1 = double_type_zero;
701    if (!t2)
702       t2 = double_type_zero;
703    if (!t3)
704       t3 = double_type_zero;
705 
706    /* Interleave xy, zw -> xyzw */
707    dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
708    dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
709    dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
710    dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
711 
712    /* Cast back to original single width type */
713    dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
714    dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
715    dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
716    dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
717 }
718 
719 
720 /**
721  * Transpose from AOS <-> SOA for num_srcs
722  */
723 void
lp_build_transpose_aos_n(struct gallivm_state * gallivm,struct lp_type type,const LLVMValueRef * src,unsigned num_srcs,LLVMValueRef * dst)724 lp_build_transpose_aos_n(struct gallivm_state *gallivm,
725                          struct lp_type type,
726                          const LLVMValueRef* src,
727                          unsigned num_srcs,
728                          LLVMValueRef* dst)
729 {
730    switch (num_srcs) {
731       case 1:
732          dst[0] = src[0];
733          break;
734 
735       case 2:
736       {
737          /* Note: we must use a temporary incase src == dst */
738          LLVMValueRef lo, hi;
739 
740          lo = lp_build_interleave2_half(gallivm, type, src[0], src[1], 0);
741          hi = lp_build_interleave2_half(gallivm, type, src[0], src[1], 1);
742 
743          dst[0] = lo;
744          dst[1] = hi;
745          break;
746       }
747 
748       case 4:
749          lp_build_transpose_aos(gallivm, type, src, dst);
750          break;
751 
752       default:
753          assert(0);
754    }
755 }
756 
757 
758 /**
759  * Pack n-th element of aos values,
760  * pad out to destination size.
761  * i.e. x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _
762  */
763 LLVMValueRef
lp_build_pack_aos_scalars(struct gallivm_state * gallivm,struct lp_type src_type,struct lp_type dst_type,const LLVMValueRef src,unsigned channel)764 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
765                           struct lp_type src_type,
766                           struct lp_type dst_type,
767                           const LLVMValueRef src,
768                           unsigned channel)
769 {
770    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
771    LLVMValueRef undef = LLVMGetUndef(i32t);
772    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
773    unsigned num_src = src_type.length / 4;
774    unsigned num_dst = dst_type.length;
775    unsigned i;
776 
777    assert(num_src <= num_dst);
778 
779    for (i = 0; i < num_src; i++) {
780       shuffles[i] = LLVMConstInt(i32t, i * 4 + channel, 0);
781    }
782    for (i = num_src; i < num_dst; i++) {
783       shuffles[i] = undef;
784    }
785 
786    if (num_dst == 1) {
787       return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
788    }
789    else {
790       return LLVMBuildShuffleVector(gallivm->builder, src, src,
791                                     LLVMConstVector(shuffles, num_dst), "");
792    }
793 }
794 
795 
796 /**
797  * Unpack and broadcast packed aos values consisting of only the
798  * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
799  */
800 LLVMValueRef
lp_build_unpack_broadcast_aos_scalars(struct gallivm_state * gallivm,struct lp_type src_type,struct lp_type dst_type,const LLVMValueRef src)801 lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
802                                       struct lp_type src_type,
803                                       struct lp_type dst_type,
804                                       const LLVMValueRef src)
805 {
806    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
807    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
808    unsigned num_dst = dst_type.length;
809    unsigned num_src = dst_type.length / 4;
810    unsigned i;
811 
812    assert(num_dst / 4 <= src_type.length);
813 
814    for (i = 0; i < num_src; i++) {
815       shuffles[i*4] = LLVMConstInt(i32t, i, 0);
816       shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
817       shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
818       shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
819    }
820 
821    if (num_src == 1) {
822       return lp_build_extract_broadcast(gallivm, src_type, dst_type,
823                                         src, shuffles[0]);
824    }
825    else {
826       return LLVMBuildShuffleVector(gallivm->builder, src, src,
827                                     LLVMConstVector(shuffles, num_dst), "");
828    }
829 }
830 
831