/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#include "pipe/p_defines.h"

#include "util/format/u_format.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_math.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_printf.h"
#include "lp_bld_intr.h"

static void
convert_to_soa(struct gallivm_state *gallivm,
               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
               LLVMValueRef dst_soa[4],
               const struct lp_type soa_type)
{
   unsigned j, k;
   struct lp_type aos_channel_type = soa_type;

   LLVMValueRef aos_channels[4];
   unsigned pixels_per_channel = soa_type.length / 4;

   debug_assert((soa_type.length % 4) == 0);

   aos_channel_type.length >>= 1;

   for (j = 0; j < 4; ++j) {
      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };

      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);

      for (k = 0; k < pixels_per_channel; ++k) {
         channel[k] = src_aos[j + 4 * k];
      }

      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
   }

   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
}


void
lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                            struct lp_build_context *bld,
                            const LLVMValueRef unswizzled[4],
                            LLVMValueRef swizzled_out[4])
{
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
      enum pipe_swizzle swizzle;
      LLVMValueRef depth_or_stencil;

      if (util_format_has_stencil(format_desc) &&
          !util_format_has_depth(format_desc)) {
         assert(!bld->type.floating);
         swizzle = format_desc->swizzle[1];
      }
      else {
         assert(bld->type.floating);
         swizzle = format_desc->swizzle[0];
      }
      /*
       * Return zzz1 or sss1 for depth-stencil formats here.
       * Correct swizzling will be handled by apply_sampler_swizzle() later.
       */
      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);

      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
      swizzled_out[3] = bld->one;
   }
   else {
      unsigned chan;
      for (chan = 0; chan < 4; ++chan) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
      }
   }
}



static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context *bld,
                          unsigned blockbits,
                          boolean srgb_chan,
                          struct util_format_channel_description chan_desc,
                          LLVMValueRef packed)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef input = packed;
   const unsigned width = chan_desc.size;
   const unsigned start = chan_desc.shift;
   const unsigned stop = start + width;

   /* Decode the input vector component */

   switch(chan_desc.type) {
   case UTIL_FORMAT_TYPE_VOID:
      input = bld->undef;
      break;

   case UTIL_FORMAT_TYPE_UNSIGNED:
      /*
       * Align the LSB
       */
      if (start) {
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, type, start), "");
      }

      /*
       * Zero the MSBs
       */
      if (stop < blockbits) {
         unsigned mask = ((unsigned long long)1 << width) - 1;
         input = LLVMBuildAnd(builder, input,
                              lp_build_const_int_vec(gallivm, type, mask), "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         if (srgb_chan) {
            struct lp_type conv_type = lp_uint_type(type);
            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
         }
         else {
            if(chan_desc.normalized)
               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
            else
               input = LLVMBuildUIToFP(builder, input, bld->vec_type, "");
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
          /* FIXME */
          assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_SIGNED:
      /*
       * Align the sign bit first.
       */
      if (stop < type.width) {
         unsigned bits = type.width - stop;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildShl(builder, input, bits_val, "");
      }

      /*
       * Align the LSB (with an arithmetic shift to preserve the sign)
       */
      if (chan_desc.size < type.width) {
         unsigned bits = type.width - chan_desc.size;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildAShr(builder, input, bits_val, "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         if (chan_desc.normalized) {
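            /*
             * Signed normalized: scale by 1 / (2^(size-1) - 1), e.g. 1/127
             * for an 8-bit channel, so the most positive representable
             * value maps to exactly 1.0.
             */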
            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildFMul(builder, input, scale_val, "");
            /*
             * The formula above will produce values below -1.0 for the most
             * negative values; compliance requires clamping them.
             * GTF-GL45.gtf33.GL3Tests.vertex_type_2_10_10_10_rev.vertex_type_2_10_10_10_rev_conversion.
             */
            input = lp_build_max(bld, input,
                                 lp_build_const_vec(gallivm, type, -1.0f));
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
          /* FIXME */
          assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_FLOAT:
      if (type.floating) {
         if (chan_desc.size == 16) {
            struct lp_type f16i_type = type;
            f16i_type.width /= 2;
            f16i_type.floating = 0;
            if (start) {
               input = LLVMBuildLShr(builder, input,
                                     lp_build_const_int_vec(gallivm, type, start), "");
            }
            input = LLVMBuildTrunc(builder, input,
                                   lp_build_vec_type(gallivm, f16i_type), "");
            input = lp_build_half_to_float(gallivm, input);
         } else {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
         }
         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   case UTIL_FORMAT_TYPE_FIXED:
      if (type.floating) {
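         /*
          * Fixed-point values are converted with int->float followed by a
          * scale; for a 32-bit channel (16.16 fixed point) the scale used
          * here is 1.0 / ((1 << 16) - 1).
          */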
         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         input = LLVMBuildFMul(builder, input, scale_val, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   default:
      assert(0);
      input = bld->undef;
      break;
   }

   return input;
}


/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is converting pixels with a depth of 32 bits or
 * less into floats.
 *
 * \param format_desc  the format of the 'packed' incoming pixel vector
 * \param type  the desired type for rgba_out (type.length = n, above)
 * \param packed  the incoming vector of packed pixels
 * \param rgba_out  returns the SoA R,G,B,A vectors
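 *
 * A minimal usage sketch (assuming a gallivm context and a <4 x i32> vector
 * of packed texels are already in hand; names are illustrative only):
 *
 *    const struct util_format_description *desc =
 *       util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
 *    LLVMValueRef rgba[4];
 *    lp_build_unpack_rgba_soa(gallivm, desc, lp_type_float_vec(32, 128),
 *                             packed, rgba);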
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      struct util_format_channel_description chan_desc = format_desc->channel[chan];
      boolean srgb_chan = FALSE;

      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
          format_desc->swizzle[3] != chan) {
         srgb_chan = TRUE;
      }

      inputs[chan] = lp_build_extract_soa_chan(&bld,
                                               format_desc->block.bits,
                                               srgb_chan,
                                               chan_desc,
                                               packed);
   }

   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}


/**
 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
 *
 * \param dst_type  The desired return type. For pure integer formats
 *                  this should be a 32bit wide int or uint vector type,
 *                  otherwise a float vector type.
 *
 * \param packed    The packed rgba8 values to unpack.
 *
 * \param rgba      The 4 SoA return vectors.
 */
void
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
   unsigned chan;

   /* XXX technically shouldn't use that for uint dst_type */
   packed = LLVMBuildBitCast(builder, packed,
                             lp_build_int_vec_type(gallivm, dst_type), "");

   /* Decode the input vector components */
   for (chan = 0; chan < 4; ++chan) {
#if UTIL_ARCH_LITTLE_ENDIAN
      unsigned start = chan*8;
#else
      unsigned start = (3-chan)*8;
#endif
      unsigned stop = start + 8;
      LLVMValueRef input;

      input = packed;

      if (start)
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, dst_type, start), "");

      if (stop < 32)
         input = LLVMBuildAnd(builder, input, mask, "");

      if (dst_type.floating)
         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);

      rgba[chan] = input;
   }
}



/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'.  The vector length
 *              is the number of texels to fetch
 * \param aligned if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block.  For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0,0).  For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache  optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats; care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 values are treated like fp32 except we fetch twice-wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX),
       * though we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if(num_gather == 4) {
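            /*
             * Gather i picks every 4th offset; e.g. with type.length == 8
             * (gather_type.length == 2), gather 0 reads offsets {0, 4} and
             * gather 1 reads {1, 5}.
             */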
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");

         }
         else if (num_gather == 2) {
            assert(num_gather == 2);
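            /*
             * E.g. with type.length == 8 (gather_type.length == 4), gather 0
             * reads offsets {0, 1, 4, 5} and gather 1 reads {2, 3, 6, 7}.
             */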
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic; we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use a single float shuffle with int data
          * and will use 3 int shuffles instead, and the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could choose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * It is tricky, though: sometimes llvm doesn't ignore the float->int
          * casts, so we must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * a couple of bytes and use an unaligned but pot-sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affects other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
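            /*
             * E.g. with type.length == 8, dst[0] selects elements
             * {0, 2, 8, 10, 4, 6, 12, 14} from the concatenation of
             * packed[0] and packed[1] (indices >= type.length refer to
             * packed[1]).
             */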
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

#if UTIL_ARCH_BIG_ENDIAN
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
#else
            vec_nr = chan_desc.shift / type.width;
#endif
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         for (i = 0; i < format_desc->nr_channels; i++)  {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case: the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb and rgtc too).
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;
      bool is_signed = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
                        format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
                        format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
                        format_desc->format == PIPE_FORMAT_LATC2_SNORM);

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;
      tmp_type.sign = is_signed;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(is_signed ? PIPE_FORMAT_R8G8B8A8_SNORM : PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed; only LA needs a special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - ETC formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      if (type.length == 1) {
         LLVMValueRef fetch = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                      aligned, base_ptr, offset,
                                                      i, j, cache);

         for (k = 0; k < 4; k++)
            rgba_out[k] = LLVMBuildExtractElement(gallivm->builder, fetch, lp_build_const_int32(gallivm, k), "");
         return;
      }

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}

static void
lp_build_insert_soa_chan(struct lp_build_context *bld,
                         unsigned blockbits,
                         struct util_format_channel_description chan_desc,
                         LLVMValueRef *output,
                         LLVMValueRef rgba)
{
    struct gallivm_state *gallivm = bld->gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type type = bld->type;
    const unsigned width = chan_desc.size;
    const unsigned start = chan_desc.shift;
    const uint32_t chan_mask = (1ULL << width) - 1;
    ASSERTED const unsigned stop = start + width;
    LLVMValueRef chan = NULL;
    switch(chan_desc.type) {
    case UTIL_FORMAT_TYPE_UNSIGNED:

       if (chan_desc.pure_integer) {
          chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
          LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, type, chan_mask);
          LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chan, mask_val, "");
          chan = LLVMBuildSelect(builder, mask, mask_val, chan, "");
       }
       else if (type.floating) {
          if (chan_desc.normalized) {
             rgba = lp_build_clamp(bld, rgba, bld->zero, bld->one);
             chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
          } else
             chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
       }
       if (start)
          chan = LLVMBuildShl(builder, chan,
                              lp_build_const_int_vec(gallivm, type, start), "");
       if (!*output)
          *output = chan;
       else
          *output = LLVMBuildOr(builder, *output, chan, "");
       break;
    case UTIL_FORMAT_TYPE_SIGNED:
       if (chan_desc.pure_integer) {
          chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
          /* clamp to SINT range for < 32-bit values */
          if (width < 32) {
             struct lp_build_context int_bld;
             lp_build_context_init(&int_bld, gallivm, lp_int_type(bld->type));
             chan = lp_build_clamp(&int_bld, chan,
                                   lp_build_const_int_vec(gallivm, type, -(1ULL << (width - 1))),
                                   lp_build_const_int_vec(gallivm, type, (1ULL << (width - 1)) - 1));
             chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
          }
       } else if (type.floating) {
          if (chan_desc.normalized) {
             char intrin[32];
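             /* SNORM scale, e.g. 127.0 for an 8-bit channel */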
             double scale = ((1 << (chan_desc.size - 1)) - 1);
             LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
             rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
             rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
             lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
             rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
          }
          chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
          chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
       }
       if (start)
          chan = LLVMBuildShl(builder, chan,
                              lp_build_const_int_vec(gallivm, type, start), "");
       if (!*output)
          *output = chan;
       else
          *output = LLVMBuildOr(builder, *output, chan, "");
       break;
    case UTIL_FORMAT_TYPE_FLOAT:
       if (type.floating) {
          if (chan_desc.size == 16) {
             chan = lp_build_float_to_half(gallivm, rgba);
             chan = LLVMBuildBitCast(builder, chan,
                                     lp_build_vec_type(gallivm, lp_type_int_vec(16, 16 * type.length)), "");
             chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
             if (start)
                chan = LLVMBuildShl(builder, chan,
                                    lp_build_const_int_vec(gallivm, type, start), "");
             if (!*output)
                *output = chan;
             else
                *output = LLVMBuildOr(builder, *output, chan, "");
          } else {
             assert(start == 0);
             assert(stop == 32);
             assert(type.width == 32);
             *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
          }
       } else
          assert(0);
       break;
    default:
       assert(0);
       *output = bld->undef;
    }
}

static void
lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
                       const struct util_format_description *format_desc,
                       struct lp_type type,
                       const LLVMValueRef rgba_in[4],
                       LLVMValueRef *packed)
{
   unsigned chan;
   struct lp_build_context bld;
   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      struct util_format_channel_description chan_desc = format_desc->channel[chan];

      lp_build_insert_soa_chan(&bld, format_desc->block.bits,
                               chan_desc,
                               packed,
                               rgba_in[chan]);
   }
}

void
lp_build_store_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef exec_mask,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef out_of_bounds,
                        const LLVMValueRef rgba_in[4])
{
   enum pipe_format format = format_desc->format;
   LLVMValueRef packed[4];
   unsigned num_stores = 0;

   memset(packed, 0, sizeof(LLVMValueRef) * 4);
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);

      num_stores = 1;
   } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */
      struct lp_build_context bld;

      lp_build_context_init(&bld, gallivm, type);
      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      unsigned store_width = util_next_power_of_two(format_desc->block.bits);
      num_stores = store_width / type.width;
      for (unsigned i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

            vec_nr = chan_desc.shift / type.width;
            chan_desc.shift %= type.width;

            lp_build_insert_soa_chan(&bld, blockbits,
                                     chan_desc,
                                     &packed[vec_nr],
                                     rgba_in[i]);
      }

      assert(num_stores == 4 || num_stores == 2);
      /* we can transpose and store at the same time */
   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
      packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
      num_stores = 1;
   } else
      assert(0);

   assert(exec_mask);

   LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
   LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
   LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);

   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
   should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
   for (unsigned i = 0; i < num_stores; i++) {
      struct lp_build_loop_state loop_state;

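      /* Each successive 32-bit word of a wide block is 4 bytes further on. */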
      LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
      store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");

      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));

      struct lp_build_if_state ifthen;
      LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
      lp_build_if(&ifthen, gallivm, cond);

      LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
      LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");

      if (format_desc->block.bits == 8) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
      } else if (format_desc->block.bits == 16) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
      } else
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
      LLVMBuildStore(gallivm->builder, data, this_offset);
      lp_build_endif(&ifthen);
      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
                             NULL, LLVMIntUGE);
   }
}