1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * AoS pixel format manipulation.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  */
34 
35 
36 #include "util/format/u_format.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include "util/u_pointer.h"
40 #include "util/u_string.h"
41 #include "util/u_cpu_detect.h"
42 
43 #include "lp_bld_arit.h"
44 #include "lp_bld_init.h"
45 #include "lp_bld_type.h"
46 #include "lp_bld_flow.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_swizzle.h"
50 #include "lp_bld_gather.h"
51 #include "lp_bld_debug.h"
52 #include "lp_bld_format.h"
53 #include "lp_bld_pack.h"
54 #include "lp_bld_intr.h"
55 #include "lp_bld_logic.h"
56 #include "lp_bld_bitarit.h"
57 #include "lp_bld_misc.h"
58 
59 /**
60  * Basic swizzling.  Rearrange the order of the unswizzled array elements
61  * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
62  * too.
63  * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
64  */
65 LLVMValueRef
lp_build_format_swizzle_aos(const struct util_format_description * desc,struct lp_build_context * bld,LLVMValueRef unswizzled)66 lp_build_format_swizzle_aos(const struct util_format_description *desc,
67                             struct lp_build_context *bld,
68                             LLVMValueRef unswizzled)
69 {
70    unsigned char swizzles[4];
71    unsigned chan;
72 
73    assert(bld->type.length % 4 == 0);
74 
75    for (chan = 0; chan < 4; ++chan) {
76       enum pipe_swizzle swizzle;
77 
78       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
79          /*
80           * For ZS formats do RGBA = ZZZ1
81           */
82          if (chan == 3) {
83             swizzle = PIPE_SWIZZLE_1;
84          } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
85             swizzle = PIPE_SWIZZLE_0;
86          } else {
87             swizzle = desc->swizzle[0];
88          }
89       } else {
90          swizzle = desc->swizzle[chan];
91       }
92       swizzles[chan] = swizzle;
93    }
94 
95    return lp_build_swizzle_aos(bld, unswizzled, swizzles);
96 }
97 
98 
99 /**
100  * Whether the format matches the vector type, apart of swizzles.
101  */
102 static inline boolean
format_matches_type(const struct util_format_description * desc,struct lp_type type)103 format_matches_type(const struct util_format_description *desc,
104                     struct lp_type type)
105 {
106    enum util_format_type chan_type;
107    unsigned chan;
108 
109    assert(type.length % 4 == 0);
110 
111    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
112        desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
113        desc->block.width != 1 ||
114        desc->block.height != 1) {
115       return FALSE;
116    }
117 
118    if (type.floating) {
119       chan_type = UTIL_FORMAT_TYPE_FLOAT;
120    } else if (type.fixed) {
121       chan_type = UTIL_FORMAT_TYPE_FIXED;
122    } else if (type.sign) {
123       chan_type = UTIL_FORMAT_TYPE_SIGNED;
124    } else {
125       chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
126    }
127 
128    for (chan = 0; chan < desc->nr_channels; ++chan) {
129       if (desc->channel[chan].size != type.width) {
130          return FALSE;
131       }
132 
133       if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
134          if (desc->channel[chan].type != chan_type ||
135              desc->channel[chan].normalized != type.norm) {
136             return FALSE;
137          }
138       }
139    }
140 
141    return TRUE;
142 }
143 
144 /*
145  * Do rounding when converting small unorm values to larger ones.
146  * Not quite 100% accurate, as it's done by appending MSBs, but
147  * should be good enough.
148  */
149 
150 static inline LLVMValueRef
scale_bits_up(struct gallivm_state * gallivm,int src_bits,int dst_bits,LLVMValueRef src,struct lp_type src_type)151 scale_bits_up(struct gallivm_state *gallivm,
152               int src_bits,
153               int dst_bits,
154               LLVMValueRef src,
155               struct lp_type src_type)
156 {
157    LLVMBuilderRef builder = gallivm->builder;
158    LLVMValueRef result = src;
159 
160    if (src_bits == 1 && dst_bits > 1) {
161       /*
162        * Useful for a1 - we'd need quite some repeated copies otherwise.
163        */
164       struct lp_build_context bld;
165       LLVMValueRef dst_mask;
166       lp_build_context_init(&bld, gallivm, src_type);
167       dst_mask = lp_build_const_int_vec(gallivm, src_type,
168                                         (1 << dst_bits) - 1),
169       result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
170                             lp_build_const_int_vec(gallivm, src_type, 0));
171       result = lp_build_andnot(&bld, dst_mask, result);
172    }
173    else if (dst_bits > src_bits) {
174       /* Scale up bits */
175       int db = dst_bits - src_bits;
176 
177       /* Shift left by difference in bits */
178       result = LLVMBuildShl(builder,
179                             src,
180                             lp_build_const_int_vec(gallivm, src_type, db),
181                             "");
182 
183       if (db <= src_bits) {
184          /* Enough bits in src to fill the remainder */
185          LLVMValueRef lower = LLVMBuildLShr(builder,
186                                             src,
187                                             lp_build_const_int_vec(gallivm, src_type,
188                                                                    src_bits - db),
189                                             "");
190 
191          result = LLVMBuildOr(builder, result, lower, "");
192       } else if (db > src_bits) {
193          /* Need to repeatedly copy src bits to fill remainder in dst */
194          unsigned n;
195 
196          for (n = src_bits; n < dst_bits; n *= 2) {
197             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
198 
199             result = LLVMBuildOr(builder,
200                                  result,
201                                  LLVMBuildLShr(builder, result, shuv, ""),
202                                  "");
203          }
204       }
205    } else {
206       assert (dst_bits == src_bits);
207    }
208 
209    return result;
210 }
211 
212 /**
213  * Unpack a single pixel into its XYZW components.
214  *
215  * @param desc  the pixel format for the packed pixel value
216  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
217  *
218  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
219  */
220 static inline LLVMValueRef
lp_build_unpack_arith_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * desc,LLVMValueRef packed)221 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
222                                const struct util_format_description *desc,
223                                LLVMValueRef packed)
224 {
225    LLVMBuilderRef builder = gallivm->builder;
226    LLVMValueRef shifted, casted, scaled, masked;
227    LLVMValueRef shifts[4];
228    LLVMValueRef masks[4];
229    LLVMValueRef scales[4];
230    LLVMTypeRef vec32_type;
231 
232    boolean normalized;
233    boolean needs_uitofp;
234    unsigned i;
235 
236    /* TODO: Support more formats */
237    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
238    assert(desc->block.width == 1);
239    assert(desc->block.height == 1);
240    assert(desc->block.bits <= 32);
241 
242    /* Do the intermediate integer computations with 32bit integers since it
243     * matches floating point size */
244    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
245 
246    vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
247 
248    /* Broadcast the packed value to all four channels
249     * before: packed = BGRA
250     * after: packed = {BGRA, BGRA, BGRA, BGRA}
251     */
252    packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
253                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
254                                    "");
255    packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
256                                    LLVMConstNull(vec32_type),
257                                    "");
258 
259    /* Initialize vector constants */
260    normalized = FALSE;
261    needs_uitofp = FALSE;
262 
263    /* Loop over 4 color components */
264    for (i = 0; i < 4; ++i) {
265       unsigned bits = desc->channel[i].size;
266       unsigned shift = desc->channel[i].shift;
267 
268       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
269          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
270          masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
271          scales[i] =  LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
272       }
273       else {
274          unsigned long long mask = (1ULL << bits) - 1;
275 
276          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
277 
278          if (bits == 32) {
279             needs_uitofp = TRUE;
280          }
281 
282          shifts[i] = lp_build_const_int32(gallivm, shift);
283          masks[i] = lp_build_const_int32(gallivm, mask);
284 
285          if (desc->channel[i].normalized) {
286             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
287             normalized = TRUE;
288          }
289          else
290             scales[i] =  lp_build_const_float(gallivm, 1.0);
291       }
292    }
293 
294    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
295     * into masked = {X, Y, Z, W}
296     */
297    if (desc->block.bits < 32 && normalized) {
298       /*
299        * Note: we cannot do the shift below on x86 natively until AVX2.
300        *
301        * Old llvm versions will resort to scalar extract/shift insert,
302        * which is definitely terrible, new versions will just do
303        * several vector shifts and shuffle/blend results together.
304        * We could turn this into a variable left shift plus a constant
305        * right shift, and llvm would then turn the variable left shift
306        * into a mul for us (albeit without sse41 the mul needs emulation
307        * too...). However, since we're going to do a float mul
308        * anyway, we just adjust that mul instead (plus the mask), skipping
309        * the shift completely.
310        * We could also use a extra mul when the format isn't normalized and
311        * we don't have AVX2 support, but don't bother for now. Unfortunately,
312        * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
313        * rgba8 if it ends up here), as that would require UIToFP, albeit that
314        * would be fixable with easy 16bit shuffle (unless there's channels
315        * crossing 16bit boundaries).
316        */
317       for (i = 0; i < 4; ++i) {
318          if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
319             unsigned bits = desc->channel[i].size;
320             unsigned shift = desc->channel[i].shift;
321             unsigned long long mask = ((1ULL << bits) - 1) << shift;
322             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
323             masks[i] = lp_build_const_int32(gallivm, mask);
324          }
325       }
326       masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
327    } else {
328       shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
329       masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
330    }
331 
332    if (!needs_uitofp) {
333       /* UIToFP can't be expressed in SSE2 */
334       casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
335    } else {
336       casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
337    }
338 
339    /*
340     * At this point 'casted' may be a vector of floats such as
341     * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
342     * by powers of two). Next, if the pixel values are normalized
343     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
344     */
345 
346    if (normalized)
347       scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
348    else
349       scaled = casted;
350 
351    return scaled;
352 }
353 
354 
355 /**
356  * Pack a single pixel.
357  *
358  * @param rgba 4 float vector with the unpacked components.
359  *
360  * XXX: This is mostly for reference and testing -- operating a single pixel at
361  * a time is rarely if ever needed.
362  */
363 LLVMValueRef
lp_build_pack_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * desc,LLVMValueRef rgba)364 lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
365                        const struct util_format_description *desc,
366                        LLVMValueRef rgba)
367 {
368    LLVMBuilderRef builder = gallivm->builder;
369    LLVMTypeRef type;
370    LLVMValueRef packed = NULL;
371    LLVMValueRef swizzles[4];
372    LLVMValueRef shifted, casted, scaled, unswizzled;
373    LLVMValueRef shifts[4];
374    LLVMValueRef scales[4];
375    boolean normalized;
376    unsigned i, j;
377 
378    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
379    assert(desc->block.width == 1);
380    assert(desc->block.height == 1);
381 
382    type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
383 
384    /* Unswizzle the color components into the source vector. */
385    for (i = 0; i < 4; ++i) {
386       for (j = 0; j < 4; ++j) {
387          if (desc->swizzle[j] == i)
388             break;
389       }
390       if (j < 4)
391          swizzles[i] = lp_build_const_int32(gallivm, j);
392       else
393          swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
394    }
395 
396    unswizzled = LLVMBuildShuffleVector(builder, rgba,
397                                        LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
398                                        LLVMConstVector(swizzles, 4), "");
399 
400    normalized = FALSE;
401    for (i = 0; i < 4; ++i) {
402       unsigned bits = desc->channel[i].size;
403       unsigned shift = desc->channel[i].shift;
404 
405       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
406          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
407          scales[i] =  LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
408       }
409       else {
410          unsigned mask = (1 << bits) - 1;
411 
412          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
413          assert(bits < 32);
414 
415          shifts[i] = lp_build_const_int32(gallivm, shift);
416 
417          if (desc->channel[i].normalized) {
418             scales[i] = lp_build_const_float(gallivm, mask);
419             normalized = TRUE;
420          }
421          else
422             scales[i] = lp_build_const_float(gallivm, 1.0);
423       }
424    }
425 
426    if (normalized)
427       scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
428    else
429       scaled = unswizzled;
430 
431    casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
432 
433    shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
434 
435    /* Bitwise or all components */
436    for (i = 0; i < 4; ++i) {
437       if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
438          LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
439                                                lp_build_const_int32(gallivm, i), "");
440          if (packed)
441             packed = LLVMBuildOr(builder, packed, component, "");
442          else
443             packed = component;
444       }
445    }
446 
447    if (!packed)
448       packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
449 
450    if (desc->block.bits < 32)
451       packed = LLVMBuildTrunc(builder, packed, type, "");
452 
453    return packed;
454 }
455 
456 
457 
458 
459 /**
460  * Fetch a pixel into a 4 float AoS.
461  *
462  * \param format_desc  describes format of the image we're fetching from
463  * \param aligned  whether the data is guaranteed to be aligned
464  * \param ptr  address of the pixel block (or the texel if uncompressed)
465  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
466  *              these will always be (0, 0).
467  * \param cache  optional value pointing to a lp_build_format_cache structure
468  * \return  a 4 element vector with the pixel's RGBA values.
469  */
470 LLVMValueRef
lp_build_fetch_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)471 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
472                         const struct util_format_description *format_desc,
473                         struct lp_type type,
474                         boolean aligned,
475                         LLVMValueRef base_ptr,
476                         LLVMValueRef offset,
477                         LLVMValueRef i,
478                         LLVMValueRef j,
479                         LLVMValueRef cache)
480 {
481    LLVMBuilderRef builder = gallivm->builder;
482    unsigned num_pixels = type.length / 4;
483    struct lp_build_context bld;
484 
485    assert(type.length <= LP_MAX_VECTOR_LENGTH);
486    assert(type.length % 4 == 0);
487 
488    lp_build_context_init(&bld, gallivm, type);
489 
490    /*
491     * Trivial case
492     *
    * The format matches the type (apart from a swizzle) so no need for
    * scaling or converting.
495     */
496 
497    if (format_matches_type(format_desc, type) &&
498        format_desc->block.bits <= type.width * 4 &&
499        /* XXX this shouldn't be needed */
500        util_is_power_of_two_or_zero(format_desc->block.bits)) {
501       LLVMValueRef packed;
502       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
503       struct lp_type fetch_type;
504       unsigned vec_len = type.width * type.length;
505 
506       /*
507        * The format matches the type (apart of a swizzle) so no need for
508        * scaling or converting.
509        */
510 
511       fetch_type = lp_type_uint(type.width*4);
512       packed = lp_build_gather(gallivm, type.length/4,
513                                format_desc->block.bits, fetch_type,
514                                aligned, base_ptr, offset, TRUE);
515 
516       assert(format_desc->block.bits <= vec_len);
517       (void) vec_len; /* silence unused var warning for non-debug build */
518 
519       packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
520       return lp_build_format_swizzle_aos(format_desc, &bld, packed);
521    }
522 
523    /*
524     * Bit arithmetic for converting small_unorm to unorm8.
525     *
526     * This misses some opportunities for optimizations (like skipping mask
527     * for the highest channel for instance, or doing bit scaling in parallel
528     * for channels with the same bit width) but it should be passable for
529     * all arithmetic formats.
530     */
531    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
532        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
533        util_format_fits_8unorm(format_desc) &&
534        type.width == 8 && type.norm == 1 && type.sign == 0 &&
535        type.fixed == 0 && type.floating == 0) {
536       LLVMValueRef packed, res = NULL, chans[4], rgba[4];
537       LLVMTypeRef dst_vec_type, conv_vec_type;
538       struct lp_type fetch_type, conv_type;
539       struct lp_build_context bld_conv;
540       unsigned j;
541 
542       fetch_type = lp_type_uint(type.width*4);
543       conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
544       dst_vec_type = lp_build_vec_type(gallivm, type);
545       conv_vec_type = lp_build_vec_type(gallivm, conv_type);
546       lp_build_context_init(&bld_conv, gallivm, conv_type);
547 
548       packed = lp_build_gather(gallivm, type.length/4,
549                                format_desc->block.bits, fetch_type,
550                                aligned, base_ptr, offset, TRUE);
551 
552       assert(format_desc->block.bits * type.length / 4 <=
553              type.width * type.length);
554 
555       packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
556 
557       for (j = 0; j < format_desc->nr_channels; ++j) {
558          unsigned mask = 0;
559          unsigned sa = format_desc->channel[j].shift;
560 
561          mask = (1 << format_desc->channel[j].size) - 1;
562 
563          /* Extract bits from source */
564          chans[j] = LLVMBuildLShr(builder, packed,
565                                   lp_build_const_int_vec(gallivm, conv_type, sa),
566                                   "");
567 
568          chans[j] = LLVMBuildAnd(builder, chans[j],
569                                  lp_build_const_int_vec(gallivm, conv_type, mask),
570                                  "");
571 
572          /* Scale bits */
573          if (type.norm) {
574             chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
575                                      type.width, chans[j], conv_type);
576          }
577       }
578       /*
579        * This is a hacked lp_build_format_swizzle_soa() since we need a
580        * normalized 1 but only 8 bits in a 32bit vector...
581        */
582       for (j = 0; j < 4; ++j) {
583          enum pipe_swizzle swizzle = format_desc->swizzle[j];
584          if (swizzle == PIPE_SWIZZLE_1) {
585             rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
586          } else {
587             rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
588          }
589          if (j == 0) {
590             res = rgba[j];
591          } else {
592             rgba[j] = LLVMBuildShl(builder, rgba[j],
593                                    lp_build_const_int_vec(gallivm, conv_type,
594                                                           j * type.width), "");
595             res = LLVMBuildOr(builder, res, rgba[j], "");
596          }
597       }
598       res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
599 
600       return res;
601    }
602 
603    /*
604     * Bit arithmetic
605     */
606 
607    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
608        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
609         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
610        format_desc->block.width == 1 &&
611        format_desc->block.height == 1 &&
612        /* XXX this shouldn't be needed */
613        util_is_power_of_two_or_zero(format_desc->block.bits) &&
614        format_desc->block.bits <= 32 &&
615        format_desc->is_bitmask &&
616        !format_desc->is_mixed &&
617        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
618         format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
619        !format_desc->channel[0].pure_integer) {
620 
621       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
622       LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
623       struct lp_type conv_type;
624       unsigned k, num_conv_src, num_conv_dst;
625 
626       /*
627        * Note this path is generally terrible for fetching multiple pixels.
628        * We should make sure we cannot hit this code path for anything but
629        * single pixels.
630        */
631 
632       /*
633        * Unpack a pixel at a time into a <4 x float> RGBA vector
634        */
635 
636       for (k = 0; k < num_pixels; ++k) {
637          LLVMValueRef packed;
638 
639          packed = lp_build_gather_elem(gallivm, num_pixels,
640                                        format_desc->block.bits, 32, aligned,
641                                        base_ptr, offset, k, FALSE);
642 
643          tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
644                                                   format_desc,
645                                                   packed);
646       }
647 
648       /*
649        * Type conversion.
650        *
651        * TODO: We could avoid floating conversion for integer to
652        * integer conversions.
653        */
654 
655       if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
656          debug_printf("%s: unpacking %s with floating point\n",
657                       __FUNCTION__, format_desc->short_name);
658       }
659 
660       conv_type = lp_float32_vec4_type();
661       num_conv_src = num_pixels;
662       num_conv_dst = 1;
663 
664       if (num_pixels % 8 == 0) {
665          lp_build_concat_n(gallivm, lp_float32_vec4_type(),
666                            tmps, num_pixels, tmps, num_pixels / 2);
667          conv_type.length *= num_pixels / 4;
668          num_conv_src = 4 * num_pixels / 8;
669          if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
670             /*
671              * FIXME: The fast float->unorm path (which is basically
672              * skipping the MIN/MAX which are extremely pointless in any
673              * case) requires that there's 2 destinations...
674              * In any case, we really should make sure we don't hit this
675              * code with multiple pixels for unorm8 dst types, it's
676              * completely hopeless even if we do hit the right conversion.
677              */
678             type.length /= num_pixels / 4;
679             num_conv_dst = num_pixels / 4;
680          }
681       }
682 
683       lp_build_conv(gallivm, conv_type, type,
684                     tmps, num_conv_src, res, num_conv_dst);
685 
686       if (num_pixels % 8 == 0 &&
687           (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
688          lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
689       }
690 
691       return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
692    }
693 
694    /* If all channels are of same type and we are not using half-floats */
695    if (format_desc->is_array &&
696        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
697       assert(!format_desc->is_mixed);
698       return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
699    }
700 
701    /*
702     * YUV / subsampled formats
703     */
704 
705    if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
706       struct lp_type tmp_type;
707       LLVMValueRef tmp;
708 
709       memset(&tmp_type, 0, sizeof tmp_type);
710       tmp_type.width = 8;
711       tmp_type.length = num_pixels * 4;
712       tmp_type.norm = TRUE;
713 
714       tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
715                                                format_desc,
716                                                num_pixels,
717                                                base_ptr,
718                                                offset,
719                                                i, j);
720 
721       lp_build_conv(gallivm,
722                     tmp_type, type,
723                     &tmp, 1, &tmp, 1);
724 
725       return tmp;
726    }
727 
728    /*
729     * s3tc rgb formats
730     */
731 
732    if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
733       struct lp_type tmp_type;
734       LLVMValueRef tmp;
735 
736       memset(&tmp_type, 0, sizeof tmp_type);
737       tmp_type.width = 8;
738       tmp_type.length = num_pixels * 4;
739       tmp_type.norm = TRUE;
740 
741       tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
742                                          format_desc,
743                                          num_pixels,
744                                          base_ptr,
745                                          offset,
746                                          i, j,
747                                          cache);
748 
749       lp_build_conv(gallivm,
750                     tmp_type, type,
751                     &tmp, 1, &tmp, 1);
752 
753        return tmp;
754    }
755 
756    /*
757     * rgtc rgb formats
758     */
759 
760    if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
761       struct lp_type tmp_type;
762       LLVMValueRef tmp;
763 
764       memset(&tmp_type, 0, sizeof tmp_type);
765       tmp_type.width = 8;
766       tmp_type.length = num_pixels * 4;
767       tmp_type.norm = TRUE;
768       tmp_type.sign = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
769                        format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
770                        format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
771                        format_desc->format == PIPE_FORMAT_LATC2_SNORM);
772 
773       tmp = lp_build_fetch_rgtc_rgba_aos(gallivm,
774                                          format_desc,
775                                          num_pixels,
776                                          base_ptr,
777                                          offset,
778                                          i, j,
779                                          cache);
780 
781       lp_build_conv(gallivm,
782                     tmp_type, type,
783                     &tmp, 1, &tmp, 1);
784 
785        return tmp;
786    }
787 
788    /*
789     * Fallback to util_format_description::fetch_rgba_8unorm().
790     */
791 
792    if (format_desc->fetch_rgba_8unorm &&
793        !type.floating && type.width == 8 && !type.sign && type.norm) {
794       /*
795        * Fallback to calling util_format_description::fetch_rgba_8unorm.
796        *
       * This is definitely not the most efficient way of fetching pixels, as
       * we miss the opportunity to do vectorization, but it is a convenient
       * option for formats or scenarios for which there was no opportunity
       * or incentive to optimize.
801        */
802 
803       LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
804       LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
805       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
806       LLVMValueRef function;
807       LLVMValueRef tmp_ptr;
808       LLVMValueRef tmp;
809       LLVMValueRef res;
810       unsigned k;
811 
812       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
813          debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
814                       __FUNCTION__, format_desc->short_name);
815       }
816 
817       /*
818        * Declare and bind format_desc->fetch_rgba_8unorm().
819        */
820 
821       {
822          /*
823           * Function to call looks like:
824           *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
825           */
826          LLVMTypeRef ret_type;
827          LLVMTypeRef arg_types[4];
828          LLVMTypeRef function_type;
829 
830          ret_type = LLVMVoidTypeInContext(gallivm->context);
831          arg_types[0] = pi8t;
832          arg_types[1] = pi8t;
833          arg_types[2] = i32t;
834          arg_types[3] = i32t;
835          function_type = LLVMFunctionType(ret_type, arg_types,
836                                           ARRAY_SIZE(arg_types), 0);
837 
838          if (gallivm->cache)
839             gallivm->cache->dont_cache = true;
840          /* make const pointer for the C fetch_rgba_8unorm function */
841          function = lp_build_const_int_pointer(gallivm,
842             func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
843 
844          /* cast the callee pointer to the function's type */
845          function = LLVMBuildBitCast(builder, function,
846                                      LLVMPointerType(function_type, 0),
847                                      "cast callee");
848       }
849 
850       tmp_ptr = lp_build_alloca(gallivm, i32t, "");
851 
852       res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
853 
854       /*
855        * Invoke format_desc->fetch_rgba_8unorm() for each pixel and place the
856        * 32-bit result in the output (inserted per element when num_pixels > 1).
857        */
858 
859       for (k = 0; k < num_pixels; ++k) {
860          LLVMValueRef index = lp_build_const_int32(gallivm, k);
861          LLVMValueRef args[4];
862 
863          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
864          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
865                                             base_ptr, offset, k);
866 
867          if (num_pixels == 1) {
868             args[2] = i;
869             args[3] = j;
870          }
871          else {
872             args[2] = LLVMBuildExtractElement(builder, i, index, "");
873             args[3] = LLVMBuildExtractElement(builder, j, index, "");
874          }
875 
876          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
877 
878          tmp = LLVMBuildLoad(builder, tmp_ptr, "");
879 
880          if (num_pixels == 1) {
881             res = tmp;
882          }
883          else {
884             res = LLVMBuildInsertElement(builder, res, tmp, index, "");
885          }
886       }
887 
888       /* Bitcast from <n x i32> to <4n x i8> */
889       res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
890 
891       return res;
892    }
893 
894    /*
895     * Fallback to util_format_description::fetch_rgba_float().
896     */
897 
898    if (format_desc->fetch_rgba_float) {
899       /*
900        * Fallback to calling util_format_description::fetch_rgba_float.
901        *
902        * This is definitely not the most efficient way of fetching pixels, as
903        * we miss the opportunity to do vectorization, but it is a convenient
904        * fallback for formats or scenarios for which there was no opportunity
905        * or incentive to optimize.
906        */
907 
908       LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
909       LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
910       LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
911       LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
912       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
913       LLVMValueRef function;
914       LLVMValueRef tmp_ptr;
915       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
916       LLVMValueRef res;
917       unsigned k;
918 
919       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
920          debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
921                       __FUNCTION__, format_desc->short_name);
922       }
923 
924       /*
925        * Declare and bind format_desc->fetch_rgba_float().
926        */
927 
928       {
929          /*
930           * Function to call looks like:
931           *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
932           */
933          LLVMTypeRef ret_type;
934          LLVMTypeRef arg_types[4];
935 
936          ret_type = LLVMVoidTypeInContext(gallivm->context);
937          arg_types[0] = pf32t;
938          arg_types[1] = pi8t;
939          arg_types[2] = i32t;
940          arg_types[3] = i32t;
941 
942          if (gallivm->cache)
943             gallivm->cache->dont_cache = true;
944          function = lp_build_const_func_pointer(gallivm,
945                                                 func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
946                                                 ret_type,
947                                                 arg_types, ARRAY_SIZE(arg_types),
948                                                 format_desc->short_name);
949       }
950 
951       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
952 
953       /*
954        * Invoke format_desc->fetch_rgba_float() for each pixel and collect each
955        * loaded 4 x float result in tmps[] for the conversion below.
956        */
957 
958       for (k = 0; k < num_pixels; ++k) {
959          LLVMValueRef args[4];
960 
961          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
962          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
963                                             base_ptr, offset, k);
964 
965          if (num_pixels == 1) {
966             args[2] = i;
967             args[3] = j;
968          }
969          else {
970             LLVMValueRef index = lp_build_const_int32(gallivm, k);
971             args[2] = LLVMBuildExtractElement(builder, i, index, "");
972             args[3] = LLVMBuildExtractElement(builder, j, index, "");
973          }
974 
975          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
976 
977          tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
978       }
979 
980       lp_build_conv(gallivm,
981                     lp_float32_vec4_type(),
982                     type,
983                     tmps, num_pixels, &res, 1);
984 
985       return res;
986    }
987 
988    assert(!util_format_is_pure_integer(format_desc->format));
989 
990    assert(0);
991    return lp_build_undef(gallivm, type);
992 }
993