1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * AoS pixel format manipulation.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  */
34 
35 
36 #include "util/format/u_format.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include "util/u_pointer.h"
40 #include "util/u_string.h"
41 #include "util/u_cpu_detect.h"
42 
43 #include "lp_bld_arit.h"
44 #include "lp_bld_init.h"
45 #include "lp_bld_type.h"
46 #include "lp_bld_flow.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_swizzle.h"
50 #include "lp_bld_gather.h"
51 #include "lp_bld_debug.h"
52 #include "lp_bld_format.h"
53 #include "lp_bld_pack.h"
54 #include "lp_bld_intr.h"
55 #include "lp_bld_logic.h"
56 #include "lp_bld_bitarit.h"
57 #include "lp_bld_misc.h"
58 
59 /**
60  * Basic swizzling.  Rearrange the order of the unswizzled array elements
61  * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
62  * too.
63  * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
64  */
65 LLVMValueRef
lp_build_format_swizzle_aos(const struct util_format_description * desc,struct lp_build_context * bld,LLVMValueRef unswizzled)66 lp_build_format_swizzle_aos(const struct util_format_description *desc,
67                             struct lp_build_context *bld,
68                             LLVMValueRef unswizzled)
69 {
70    unsigned char swizzles[4];
71    unsigned chan;
72 
73    assert(bld->type.length % 4 == 0);
74 
75    for (chan = 0; chan < 4; ++chan) {
76       enum pipe_swizzle swizzle;
77 
78       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
79          /*
80           * For ZS formats do RGBA = ZZZ1
81           */
82          if (chan == 3) {
83             swizzle = PIPE_SWIZZLE_1;
84          } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
85             swizzle = PIPE_SWIZZLE_0;
86          } else {
87             swizzle = desc->swizzle[0];
88          }
89       } else {
90          swizzle = desc->swizzle[chan];
91       }
92       swizzles[chan] = swizzle;
93    }
94 
95    return lp_build_swizzle_aos(bld, unswizzled, swizzles);
96 }
97 
98 
99 /**
100  * Whether the format matches the vector type, apart of swizzles.
101  */
102 static inline boolean
format_matches_type(const struct util_format_description * desc,struct lp_type type)103 format_matches_type(const struct util_format_description *desc,
104                     struct lp_type type)
105 {
106    enum util_format_type chan_type;
107    unsigned chan;
108 
109    assert(type.length % 4 == 0);
110 
111    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
112        desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
113        desc->block.width != 1 ||
114        desc->block.height != 1) {
115       return FALSE;
116    }
117 
118    if (type.floating) {
119       chan_type = UTIL_FORMAT_TYPE_FLOAT;
120    } else if (type.fixed) {
121       chan_type = UTIL_FORMAT_TYPE_FIXED;
122    } else if (type.sign) {
123       chan_type = UTIL_FORMAT_TYPE_SIGNED;
124    } else {
125       chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
126    }
127 
128    for (chan = 0; chan < desc->nr_channels; ++chan) {
129       if (desc->channel[chan].size != type.width) {
130          return FALSE;
131       }
132 
133       if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
134          if (desc->channel[chan].type != chan_type ||
135              desc->channel[chan].normalized != type.norm) {
136             return FALSE;
137          }
138       }
139    }
140 
141    return TRUE;
142 }
143 
144 /*
145  * Do rounding when converting small unorm values to larger ones.
146  * Not quite 100% accurate, as it's done by appending MSBs, but
147  * should be good enough.
148  */
149 
150 static inline LLVMValueRef
scale_bits_up(struct gallivm_state * gallivm,int src_bits,int dst_bits,LLVMValueRef src,struct lp_type src_type)151 scale_bits_up(struct gallivm_state *gallivm,
152               int src_bits,
153               int dst_bits,
154               LLVMValueRef src,
155               struct lp_type src_type)
156 {
157    LLVMBuilderRef builder = gallivm->builder;
158    LLVMValueRef result = src;
159 
160    if (src_bits == 1 && dst_bits > 1) {
161       /*
162        * Useful for a1 - we'd need quite some repeated copies otherwise.
163        */
164       struct lp_build_context bld;
165       LLVMValueRef dst_mask;
166       lp_build_context_init(&bld, gallivm, src_type);
167       dst_mask = lp_build_const_int_vec(gallivm, src_type,
168                                         (1 << dst_bits) - 1),
169       result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
170                             lp_build_const_int_vec(gallivm, src_type, 0));
171       result = lp_build_andnot(&bld, dst_mask, result);
172    }
173    else if (dst_bits > src_bits) {
174       /* Scale up bits */
175       int db = dst_bits - src_bits;
176 
177       /* Shift left by difference in bits */
178       result = LLVMBuildShl(builder,
179                             src,
180                             lp_build_const_int_vec(gallivm, src_type, db),
181                             "");
182 
183       if (db <= src_bits) {
184          /* Enough bits in src to fill the remainder */
185          LLVMValueRef lower = LLVMBuildLShr(builder,
186                                             src,
187                                             lp_build_const_int_vec(gallivm, src_type,
188                                                                    src_bits - db),
189                                             "");
190 
191          result = LLVMBuildOr(builder, result, lower, "");
192       } else if (db > src_bits) {
193          /* Need to repeatedly copy src bits to fill remainder in dst */
194          unsigned n;
195 
196          for (n = src_bits; n < dst_bits; n *= 2) {
197             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
198 
199             result = LLVMBuildOr(builder,
200                                  result,
201                                  LLVMBuildLShr(builder, result, shuv, ""),
202                                  "");
203          }
204       }
205    } else {
206       assert (dst_bits == src_bits);
207    }
208 
209    return result;
210 }
211 
212 /**
213  * Unpack a single pixel into its XYZW components.
214  *
215  * @param desc  the pixel format for the packed pixel value
216  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
217  *
218  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
219  */
220 static inline LLVMValueRef
lp_build_unpack_arith_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * desc,LLVMValueRef packed)221 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
222                                const struct util_format_description *desc,
223                                LLVMValueRef packed)
224 {
225    LLVMBuilderRef builder = gallivm->builder;
226    LLVMValueRef shifted, casted, scaled, masked;
227    LLVMValueRef shifts[4];
228    LLVMValueRef masks[4];
229    LLVMValueRef scales[4];
230    LLVMTypeRef vec32_type;
231 
232    boolean normalized;
233    boolean needs_uitofp;
234    unsigned i;
235 
236    /* TODO: Support more formats */
237    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
238    assert(desc->block.width == 1);
239    assert(desc->block.height == 1);
240    assert(desc->block.bits <= 32);
241 
242    /* Do the intermediate integer computations with 32bit integers since it
243     * matches floating point size */
244    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
245 
246    vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
247 
248    /* Broadcast the packed value to all four channels
249     * before: packed = BGRA
250     * after: packed = {BGRA, BGRA, BGRA, BGRA}
251     */
252    packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
253                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
254                                    "");
255    packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
256                                    LLVMConstNull(vec32_type),
257                                    "");
258 
259    /* Initialize vector constants */
260    normalized = FALSE;
261    needs_uitofp = FALSE;
262 
263    /* Loop over 4 color components */
264    for (i = 0; i < 4; ++i) {
265       unsigned bits = desc->channel[i].size;
266       unsigned shift = desc->channel[i].shift;
267 
268       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
269          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
270          masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
271          scales[i] =  LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
272       }
273       else {
274          unsigned long long mask = (1ULL << bits) - 1;
275 
276          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
277 
278          if (bits == 32) {
279             needs_uitofp = TRUE;
280          }
281 
282          shifts[i] = lp_build_const_int32(gallivm, shift);
283          masks[i] = lp_build_const_int32(gallivm, mask);
284 
285          if (desc->channel[i].normalized) {
286             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
287             normalized = TRUE;
288          }
289          else
290             scales[i] =  lp_build_const_float(gallivm, 1.0);
291       }
292    }
293 
294    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
295     * into masked = {X, Y, Z, W}
296     */
297    if (desc->block.bits < 32 && normalized) {
298       /*
299        * Note: we cannot do the shift below on x86 natively until AVX2.
300        *
301        * Old llvm versions will resort to scalar extract/shift insert,
302        * which is definitely terrible, new versions will just do
303        * several vector shifts and shuffle/blend results together.
304        * We could turn this into a variable left shift plus a constant
305        * right shift, and llvm would then turn the variable left shift
306        * into a mul for us (albeit without sse41 the mul needs emulation
307        * too...). However, since we're going to do a float mul
308        * anyway, we just adjust that mul instead (plus the mask), skipping
309        * the shift completely.
310        * We could also use a extra mul when the format isn't normalized and
311        * we don't have AVX2 support, but don't bother for now. Unfortunately,
312        * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
313        * rgba8 if it ends up here), as that would require UIToFP, albeit that
314        * would be fixable with easy 16bit shuffle (unless there's channels
315        * crossing 16bit boundaries).
316        */
317       for (i = 0; i < 4; ++i) {
318          if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
319             unsigned bits = desc->channel[i].size;
320             unsigned shift = desc->channel[i].shift;
321             unsigned long long mask = ((1ULL << bits) - 1) << shift;
322             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
323             masks[i] = lp_build_const_int32(gallivm, mask);
324          }
325       }
326       masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
327    } else {
328       shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
329       masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
330    }
331 
332    if (!needs_uitofp) {
333       /* UIToFP can't be expressed in SSE2 */
334       casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
335    } else {
336       casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
337    }
338 
339    /*
340     * At this point 'casted' may be a vector of floats such as
341     * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
342     * by powers of two). Next, if the pixel values are normalized
343     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
344     */
345 
346    if (normalized)
347       scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
348    else
349       scaled = casted;
350 
351    return scaled;
352 }
353 
354 
355 /**
356  * Pack a single pixel.
357  *
358  * @param rgba 4 float vector with the unpacked components.
359  *
360  * XXX: This is mostly for reference and testing -- operating a single pixel at
361  * a time is rarely if ever needed.
362  */
363 LLVMValueRef
lp_build_pack_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * desc,LLVMValueRef rgba)364 lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
365                        const struct util_format_description *desc,
366                        LLVMValueRef rgba)
367 {
368    LLVMBuilderRef builder = gallivm->builder;
369    LLVMTypeRef type;
370    LLVMValueRef packed = NULL;
371    LLVMValueRef swizzles[4];
372    LLVMValueRef shifted, casted, scaled, unswizzled;
373    LLVMValueRef shifts[4];
374    LLVMValueRef scales[4];
375    boolean normalized;
376    unsigned i, j;
377 
378    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
379    assert(desc->block.width == 1);
380    assert(desc->block.height == 1);
381 
382    type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
383 
384    /* Unswizzle the color components into the source vector. */
385    for (i = 0; i < 4; ++i) {
386       for (j = 0; j < 4; ++j) {
387          if (desc->swizzle[j] == i)
388             break;
389       }
390       if (j < 4)
391          swizzles[i] = lp_build_const_int32(gallivm, j);
392       else
393          swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
394    }
395 
396    unswizzled = LLVMBuildShuffleVector(builder, rgba,
397                                        LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
398                                        LLVMConstVector(swizzles, 4), "");
399 
400    normalized = FALSE;
401    for (i = 0; i < 4; ++i) {
402       unsigned bits = desc->channel[i].size;
403       unsigned shift = desc->channel[i].shift;
404 
405       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
406          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
407          scales[i] =  LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
408       }
409       else {
410          unsigned mask = (1 << bits) - 1;
411 
412          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
413          assert(bits < 32);
414 
415          shifts[i] = lp_build_const_int32(gallivm, shift);
416 
417          if (desc->channel[i].normalized) {
418             scales[i] = lp_build_const_float(gallivm, mask);
419             normalized = TRUE;
420          }
421          else
422             scales[i] = lp_build_const_float(gallivm, 1.0);
423       }
424    }
425 
426    if (normalized)
427       scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
428    else
429       scaled = unswizzled;
430 
431    casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
432 
433    shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
434 
435    /* Bitwise or all components */
436    for (i = 0; i < 4; ++i) {
437       if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
438          LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
439                                                lp_build_const_int32(gallivm, i), "");
440          if (packed)
441             packed = LLVMBuildOr(builder, packed, component, "");
442          else
443             packed = component;
444       }
445    }
446 
447    if (!packed)
448       packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
449 
450    if (desc->block.bits < 32)
451       packed = LLVMBuildTrunc(builder, packed, type, "");
452 
453    return packed;
454 }
455 
456 
457 
458 
/**
 * Fetch pixels into an AoS vector of the given type.
 *
 * \param format_desc  describes format of the image we're fetching from
 * \param type  type of the returned vector; type.length must be a multiple
 *              of 4, and type.length / 4 pixels are fetched
 * \param aligned  whether the data is guaranteed to be aligned
 * \param base_ptr  base address the offset is applied to
 * \param offset  offset of the pixel block (or the texel if uncompressed)
 *                relative to base_ptr
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0, 0).
 * \param cache  optional value pointing to a lp_build_format_cache structure
 * \return  a vector of the given type holding the pixels' RGBA values,
 *          4 elements per pixel.
 */
470 LLVMValueRef
lp_build_fetch_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)471 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
472                         const struct util_format_description *format_desc,
473                         struct lp_type type,
474                         boolean aligned,
475                         LLVMValueRef base_ptr,
476                         LLVMValueRef offset,
477                         LLVMValueRef i,
478                         LLVMValueRef j,
479                         LLVMValueRef cache)
480 {
481    const struct util_format_unpack_description *unpack =
482       util_format_unpack_description(format_desc->format);
483    LLVMBuilderRef builder = gallivm->builder;
484    unsigned num_pixels = type.length / 4;
485    struct lp_build_context bld;
486 
487    assert(type.length <= LP_MAX_VECTOR_LENGTH);
488    assert(type.length % 4 == 0);
489 
490    lp_build_context_init(&bld, gallivm, type);
491 
492    /*
493     * Trivial case
494     *
495     * The format matches the type (apart of a swizzle) so no need for
496     * scaling or converting.
497     */
498 
499    if (format_matches_type(format_desc, type) &&
500        format_desc->block.bits <= type.width * 4 &&
501        /* XXX this shouldn't be needed */
502        util_is_power_of_two_or_zero(format_desc->block.bits)) {
503       LLVMValueRef packed;
504       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
505       struct lp_type fetch_type;
506       unsigned vec_len = type.width * type.length;
507 
508       /*
509        * The format matches the type (apart of a swizzle) so no need for
510        * scaling or converting.
511        */
512 
513       fetch_type = lp_type_uint(type.width*4);
514       packed = lp_build_gather(gallivm, type.length/4,
515                                format_desc->block.bits, fetch_type,
516                                aligned, base_ptr, offset, TRUE);
517 
518       assert(format_desc->block.bits <= vec_len);
519       (void) vec_len; /* silence unused var warning for non-debug build */
520 
521       packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
522       return lp_build_format_swizzle_aos(format_desc, &bld, packed);
523    }
524 
525    /*
526     * Bit arithmetic for converting small_unorm to unorm8.
527     *
528     * This misses some opportunities for optimizations (like skipping mask
529     * for the highest channel for instance, or doing bit scaling in parallel
530     * for channels with the same bit width) but it should be passable for
531     * all arithmetic formats.
532     */
533    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
534        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
535        util_format_fits_8unorm(format_desc) &&
536        type.width == 8 && type.norm == 1 && type.sign == 0 &&
537        type.fixed == 0 && type.floating == 0) {
538       LLVMValueRef packed, res = NULL, chans[4], rgba[4];
539       LLVMTypeRef dst_vec_type, conv_vec_type;
540       struct lp_type fetch_type, conv_type;
541       struct lp_build_context bld_conv;
542       unsigned j;
543 
544       fetch_type = lp_type_uint(type.width*4);
545       conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
546       dst_vec_type = lp_build_vec_type(gallivm, type);
547       conv_vec_type = lp_build_vec_type(gallivm, conv_type);
548       lp_build_context_init(&bld_conv, gallivm, conv_type);
549 
550       packed = lp_build_gather(gallivm, type.length/4,
551                                format_desc->block.bits, fetch_type,
552                                aligned, base_ptr, offset, TRUE);
553 
554       assert(format_desc->block.bits * type.length / 4 <=
555              type.width * type.length);
556 
557       packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
558 
559       for (j = 0; j < format_desc->nr_channels; ++j) {
560          unsigned mask = 0;
561          unsigned sa = format_desc->channel[j].shift;
562 
563          mask = (1 << format_desc->channel[j].size) - 1;
564 
565          /* Extract bits from source */
566          chans[j] = LLVMBuildLShr(builder, packed,
567                                   lp_build_const_int_vec(gallivm, conv_type, sa),
568                                   "");
569 
570          chans[j] = LLVMBuildAnd(builder, chans[j],
571                                  lp_build_const_int_vec(gallivm, conv_type, mask),
572                                  "");
573 
574          /* Scale bits */
575          if (type.norm) {
576             chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
577                                      type.width, chans[j], conv_type);
578          }
579       }
580       /*
581        * This is a hacked lp_build_format_swizzle_soa() since we need a
582        * normalized 1 but only 8 bits in a 32bit vector...
583        */
584       for (j = 0; j < 4; ++j) {
585          enum pipe_swizzle swizzle = format_desc->swizzle[j];
586          if (swizzle == PIPE_SWIZZLE_1) {
587             rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
588          } else {
589             rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
590          }
591          if (j == 0) {
592             res = rgba[j];
593          } else {
594             rgba[j] = LLVMBuildShl(builder, rgba[j],
595                                    lp_build_const_int_vec(gallivm, conv_type,
596                                                           j * type.width), "");
597             res = LLVMBuildOr(builder, res, rgba[j], "");
598          }
599       }
600       res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
601 
602       return res;
603    }
604 
605    /*
606     * Bit arithmetic
607     */
608 
609    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
610        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
611         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
612        format_desc->block.width == 1 &&
613        format_desc->block.height == 1 &&
614        /* XXX this shouldn't be needed */
615        util_is_power_of_two_or_zero(format_desc->block.bits) &&
616        format_desc->block.bits <= 32 &&
617        format_desc->is_bitmask &&
618        !format_desc->is_mixed &&
619        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
620         format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
621        !format_desc->channel[0].pure_integer) {
622 
623       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
624       LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
625       struct lp_type conv_type;
626       unsigned k, num_conv_src, num_conv_dst;
627 
628       /*
629        * Note this path is generally terrible for fetching multiple pixels.
630        * We should make sure we cannot hit this code path for anything but
631        * single pixels.
632        */
633 
634       /*
635        * Unpack a pixel at a time into a <4 x float> RGBA vector
636        */
637 
638       for (k = 0; k < num_pixels; ++k) {
639          LLVMValueRef packed;
640 
641          packed = lp_build_gather_elem(gallivm, num_pixels,
642                                        format_desc->block.bits, 32, aligned,
643                                        base_ptr, offset, k, FALSE);
644 
645          tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
646                                                   format_desc,
647                                                   packed);
648       }
649 
650       /*
651        * Type conversion.
652        *
653        * TODO: We could avoid floating conversion for integer to
654        * integer conversions.
655        */
656 
657       if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
658          debug_printf("%s: unpacking %s with floating point\n",
659                       __FUNCTION__, format_desc->short_name);
660       }
661 
662       conv_type = lp_float32_vec4_type();
663       num_conv_src = num_pixels;
664       num_conv_dst = 1;
665 
666       if (num_pixels % 8 == 0) {
667          lp_build_concat_n(gallivm, lp_float32_vec4_type(),
668                            tmps, num_pixels, tmps, num_pixels / 2);
669          conv_type.length *= num_pixels / 4;
670          num_conv_src = 4 * num_pixels / 8;
671          if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
672             /*
673              * FIXME: The fast float->unorm path (which is basically
674              * skipping the MIN/MAX which are extremely pointless in any
675              * case) requires that there's 2 destinations...
676              * In any case, we really should make sure we don't hit this
677              * code with multiple pixels for unorm8 dst types, it's
678              * completely hopeless even if we do hit the right conversion.
679              */
680             type.length /= num_pixels / 4;
681             num_conv_dst = num_pixels / 4;
682          }
683       }
684 
685       lp_build_conv(gallivm, conv_type, type,
686                     tmps, num_conv_src, res, num_conv_dst);
687 
688       if (num_pixels % 8 == 0 &&
689           (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
690          lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
691       }
692 
693       return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
694    }
695 
696    /* If all channels are of same type and we are not using half-floats */
697    if (format_desc->is_array &&
698        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
699       assert(!format_desc->is_mixed);
700       return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
701    }
702 
703    /*
704     * YUV / subsampled formats
705     */
706 
707    if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
708       struct lp_type tmp_type;
709       LLVMValueRef tmp;
710 
711       memset(&tmp_type, 0, sizeof tmp_type);
712       tmp_type.width = 8;
713       tmp_type.length = num_pixels * 4;
714       tmp_type.norm = TRUE;
715 
716       tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
717                                                format_desc,
718                                                num_pixels,
719                                                base_ptr,
720                                                offset,
721                                                i, j);
722 
723       lp_build_conv(gallivm,
724                     tmp_type, type,
725                     &tmp, 1, &tmp, 1);
726 
727       return tmp;
728    }
729 
730    /*
731     * s3tc rgb formats
732     */
733 
734    if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
735       struct lp_type tmp_type;
736       LLVMValueRef tmp;
737 
738       memset(&tmp_type, 0, sizeof tmp_type);
739       tmp_type.width = 8;
740       tmp_type.length = num_pixels * 4;
741       tmp_type.norm = TRUE;
742 
743       tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
744                                          format_desc,
745                                          num_pixels,
746                                          base_ptr,
747                                          offset,
748                                          i, j,
749                                          cache);
750 
751       lp_build_conv(gallivm,
752                     tmp_type, type,
753                     &tmp, 1, &tmp, 1);
754 
755        return tmp;
756    }
757 
758    /*
759     * rgtc rgb formats
760     */
761 
762    if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
763       struct lp_type tmp_type;
764       LLVMValueRef tmp;
765 
766       memset(&tmp_type, 0, sizeof tmp_type);
767       tmp_type.width = 8;
768       tmp_type.length = num_pixels * 4;
769       tmp_type.norm = TRUE;
770       tmp_type.sign = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
771                        format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
772                        format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
773                        format_desc->format == PIPE_FORMAT_LATC2_SNORM);
774 
775       tmp = lp_build_fetch_rgtc_rgba_aos(gallivm,
776                                          format_desc,
777                                          num_pixels,
778                                          base_ptr,
779                                          offset,
780                                          i, j,
781                                          cache);
782 
783       lp_build_conv(gallivm,
784                     tmp_type, type,
785                     &tmp, 1, &tmp, 1);
786 
787        return tmp;
788    }
789 
790    /*
791     * Fallback to util_format_description::fetch_rgba_8unorm().
792     */
793 
794    if (unpack->fetch_rgba_8unorm &&
795        !type.floating && type.width == 8 && !type.sign && type.norm) {
      /*
       * Fallback to calling util_format_description::fetch_rgba_8unorm.
       *
       * This is definitely not the most efficient way of fetching pixels, as
       * we miss the opportunity to do vectorization, but it is convenient
       * for formats or scenarios for which there was no opportunity or
       * incentive to optimize.
       */
804 
805       LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
806       LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
807       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
808       LLVMValueRef function;
809       LLVMValueRef tmp_ptr;
810       LLVMValueRef tmp;
811       LLVMValueRef res;
812       unsigned k;
813 
814       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
815          debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
816                       __FUNCTION__, format_desc->short_name);
817       }
818 
819       /*
820        * Declare and bind format_desc->fetch_rgba_8unorm().
821        */
822 
823       {
824          /*
825           * Function to call looks like:
826           *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
827           */
828          LLVMTypeRef ret_type;
829          LLVMTypeRef arg_types[4];
830          LLVMTypeRef function_type;
831 
832          ret_type = LLVMVoidTypeInContext(gallivm->context);
833          arg_types[0] = pi8t;
834          arg_types[1] = pi8t;
835          arg_types[2] = i32t;
836          arg_types[3] = i32t;
837          function_type = LLVMFunctionType(ret_type, arg_types,
838                                           ARRAY_SIZE(arg_types), 0);
839 
840          if (gallivm->cache)
841             gallivm->cache->dont_cache = true;
842          /* make const pointer for the C fetch_rgba_8unorm function */
843          function = lp_build_const_int_pointer(gallivm,
844             func_to_pointer((func_pointer) unpack->fetch_rgba_8unorm));
845 
846          /* cast the callee pointer to the function's type */
847          function = LLVMBuildBitCast(builder, function,
848                                      LLVMPointerType(function_type, 0),
849                                      "cast callee");
850       }
851 
852       tmp_ptr = lp_build_alloca(gallivm, i32t, "");
853 
854       res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
855 
856       /*
857        * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
858        * in the SoA vectors.
859        */
860 
861       for (k = 0; k < num_pixels; ++k) {
862          LLVMValueRef index = lp_build_const_int32(gallivm, k);
863          LLVMValueRef args[4];
864 
865          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
866          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
867                                             base_ptr, offset, k);
868 
869          if (num_pixels == 1) {
870             args[2] = i;
871             args[3] = j;
872          }
873          else {
874             args[2] = LLVMBuildExtractElement(builder, i, index, "");
875             args[3] = LLVMBuildExtractElement(builder, j, index, "");
876          }
877 
878          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
879 
880          tmp = LLVMBuildLoad(builder, tmp_ptr, "");
881 
882          if (num_pixels == 1) {
883             res = tmp;
884          }
885          else {
886             res = LLVMBuildInsertElement(builder, res, tmp, index, "");
887          }
888       }
889 
890       /* Bitcast from <n x i32> to <4n x i8> */
891       res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
892 
893       return res;
894    }
895 
896    /*
897     * Fallback to fetch_rgba().
898     */
899 
900    util_format_fetch_rgba_func_ptr fetch_rgba =
901       util_format_fetch_rgba_func(format_desc->format);
902    if (fetch_rgba) {
903       /*
904        * Fallback to calling util_format_description::fetch_rgba_float.
905        *
906        * This is definitely not the most efficient way of fetching pixels, as
907        * we miss the opportunity to do vectorization, but this it is a
908        * convenient for formats or scenarios for which there was no opportunity
909        * or incentive to optimize.
910        */
911 
912       LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
913       LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
914       LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
915       LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
916       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
917       LLVMValueRef function;
918       LLVMValueRef tmp_ptr;
919       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
920       LLVMValueRef res;
921       unsigned k;
922 
923       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
924          debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
925                       __FUNCTION__, format_desc->short_name);
926       }
927 
928       /*
929        * Declare and bind unpack->fetch_rgba_float().
930        */
931 
932       {
933          /*
934           * Function to call looks like:
935           *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
936           */
937          LLVMTypeRef ret_type;
938          LLVMTypeRef arg_types[4];
939 
940          ret_type = LLVMVoidTypeInContext(gallivm->context);
941          arg_types[0] = pf32t;
942          arg_types[1] = pi8t;
943          arg_types[2] = i32t;
944          arg_types[3] = i32t;
945 
946          if (gallivm->cache)
947             gallivm->cache->dont_cache = true;
948          function = lp_build_const_func_pointer(gallivm,
949                                                 func_to_pointer((func_pointer) fetch_rgba),
950                                                 ret_type,
951                                                 arg_types, ARRAY_SIZE(arg_types),
952                                                 format_desc->short_name);
953       }
954 
955       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
956 
957       /*
958        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
959        * in the SoA vectors.
960        */
961 
962       for (k = 0; k < num_pixels; ++k) {
963          LLVMValueRef args[4];
964 
965          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
966          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
967                                             base_ptr, offset, k);
968 
969          if (num_pixels == 1) {
970             args[2] = i;
971             args[3] = j;
972          }
973          else {
974             LLVMValueRef index = lp_build_const_int32(gallivm, k);
975             args[2] = LLVMBuildExtractElement(builder, i, index, "");
976             args[3] = LLVMBuildExtractElement(builder, j, index, "");
977          }
978 
979          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
980 
981          tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
982       }
983 
984       lp_build_conv(gallivm,
985                     lp_float32_vec4_type(),
986                     type,
987                     tmps, num_pixels, &res, 1);
988 
989       return res;
990    }
991 
992    assert(!util_format_is_pure_integer(format_desc->format));
993 
994    assert(0);
995    return lp_build_undef(gallivm, type);
996 }
997