/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy, given that we have all the necessary information readily
 *   available
 * - LLVM's optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when the source arguments are known to be in the
 *   [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values of a or b (0 or 1) are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN, the other one should be returned (required by both
       * D3D10+ and OpenCL).
       * The SSE intrinsics return the second operand if either operand is
       * NaN, so we need special code to handle those cases.
       */
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, min);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
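
/*
 * Illustrative scalar sketch (comment only, not compiled) of the
 * GALLIVM_NAN_RETURN_OTHER contract implemented above; the helper name is
 * hypothetical:
 *
 *    float min_nan_return_other(float a, float b)
 *    {
 *       if (isnan(a)) return b;
 *       if (isnan(b)) return a;
 *       return a < b ? a : b;
 *    }
 *
 * The SSE path builds this on top of minps, which by itself would return
 * its second operand, b, whenever either input is NaN; selecting a
 * wherever b is NaN restores the "return the other input" behavior.
 */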


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));

   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b (0 or 1) are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, max);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->one)
      return bld->zero;
   if (a == bld->zero)
      return bld->one;

   if (type.norm && !type.floating && !type.fixed && !type.sign) {
      if (LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if (LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
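
/*
 * Illustration (not compiled): for unsigned normalized types the ~a path
 * above is exact because the all-ones bit pattern represents 1.0, e.g.
 * for 8-bit unorm values 1.0 - a maps to 255 - a == (uint8_t)~a.
 */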


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
         return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if (type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * Newer llvm versions no longer support these intrinsics, but
          * recognize the pattern instead. Since auto-upgrade of intrinsics
          * doesn't work for jit code, it is important that we match the
          * pattern llvm uses (and pray llvm doesn't change it - and hope they
          * decide on the same pattern for all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. It does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses the (non arch-specific) intrinsics.
          */
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
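
/*
 * Illustrative scalar sketch (not compiled) of the signed clamping in
 * lp_build_add above, for 8-bit lanes; the helper name is hypothetical.
 * The vector code clamps a into [min - b, max - b] before adding so the
 * sum cannot wrap:
 *
 *    int8_t sadd_sat8(int8_t a, int8_t b)
 *    {
 *       int aa = (b > 0) ? MIN2(a, INT8_MAX - b)
 *                        : MAX2(a, INT8_MIN - b);
 *       return (int8_t)(aa + b);
 *    }
 */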


/**
 * Return the scalar sum of the elements of a.
 * Callers should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * We use repeated shuffles/adds here. Note that with multiple vectors
    * this can be done more efficiently, as outlined in the Intel
    * optimization manual.
    * Note: this could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
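
/*
 * Illustration (not compiled) of the reduction above for a length-8
 * vector a:
 *
 *    v   = a[0..3] + a[4..7];      -- 4-wide add
 *    v   = v[0..1] + v[2..3];      -- 2-wide add
 *    res = v[0] + v[1];            -- final scalar add
 */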

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
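
/*
 * Illustration (not compiled): with src[i] = { x_i, y_i, z_i, w_i } the
 * function above returns
 *
 *    { x_0+y_0+z_0+w_0, x_1+y_1+z_1+w_1, x_2+y_2+z_2+w_2, x_3+y_3+z_3+w_3 }
 *
 * i.e. element i is the horizontal sum of src[i].
 */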


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 (which also determines
 * the output order).
 *
 * Returns a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
         return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if (type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match the llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. It does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses the (non arch-specific) intrinsics.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return res;
}


/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the OpenGL criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ...
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
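
/*
 * Illustrative scalar sketch (not compiled) of the formula above for
 * unsigned 8-bit values computed in 16-bit lanes (n = 8); the helper
 * name is hypothetical:
 *
 *    uint8_t mul_norm_u8(uint8_t a, uint8_t b)
 *    {
 *       uint32_t t = (uint32_t)a * b;
 *       return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
 *    }
 *
 * e.g. a = b = 255 gives t = 65025 and (65025 + 254 + 128) >> 8 = 255,
 * while a = b = 0 gives 0, matching the OpenGL criteria exactly.
 */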

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return b;
   if (b == bld->zero)
      return bld->zero;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if (type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if (shift) {
         if (type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if (shift) {
         if (type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required):
    * it tries to handle real 64bit inputs (which of course can't happen,
    * since the 32bit numbers are zero-extended to 64bit before the 64bit
    * umul, but apparently llvm does not recognize this widening mul). This
    * includes 6 (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not; without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
        util_get_cpu_caps()->has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for 8, 16 and 32 bit wide types.
 * Returns the low N bits of the 2N-bit product; the high N bits are
 * returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   if (bld->type.width < 32)
      type_tmp.width = 32;
   else
      type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}
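
/*
 * Illustrative scalar sketch (not compiled) of the lohi multiply above,
 * per 32-bit lane; the helper name is hypothetical:
 *
 *    uint32_t mul_32_lohi(uint32_t a, uint32_t b, uint32_t *hi)
 *    {
 *       uint64_t t = (uint64_t)a * b;
 *       *hi = (uint32_t)(t >> 32);
 *       return (uint32_t)t;
 *    }
 *
 * (with sign extension instead of zero extension for signed types).
 */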


/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector by constant scalar multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if (b == 0)
      return bld->zero;

   if (b == 1)
      return a;

   if (b == -1)
      return lp_build_negate(bld, a);

   if (b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if (util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if (bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if (b == bld->zero)
      return bld->undef;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so it makes no sense to try that */
   if (FALSE &&
      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param flags  LP_BLD_LERP_x flags. LP_BLD_LERP_WIDE_NORMALIZED means we
 *        are interpolating normalized values, encoded in integer lanes
 *        twice as wide as the original type.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit of the n-bit value to its least
             * significant bit, so that later we can just divide by 2**n
             * instead of 2**n - 1.
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         /*
          * For this multiply, higher internal precision is required to pass
          * CTS, the most efficient path to that is pmulhrsw on ssse3 and
          * above. This could be opencoded on other arches if conformance
          * was required.
          */
         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
         } else {
            res = lp_build_mul(bld, x, delta);
            res = lp_build_shr_imm(bld, res, half_width);
         }
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits
          */
         /* XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}
1428 
1429 
1430 /**
1431  * Linear interpolation.
1432  */
1433 LLVMValueRef
1434 lp_build_lerp(struct lp_build_context *bld,
1435               LLVMValueRef x,
1436               LLVMValueRef v0,
1437               LLVMValueRef v1,
1438               unsigned flags)
1439 {
1440    const struct lp_type type = bld->type;
1441    LLVMValueRef res;
1442 
1443    assert(lp_check_value(type, x));
1444    assert(lp_check_value(type, v0));
1445    assert(lp_check_value(type, v1));
1446 
1447    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1448 
1449    if (type.norm) {
1450       struct lp_type wide_type;
1451       struct lp_build_context wide_bld;
1452       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1453 
1454       assert(type.length >= 2);
1455 
1456       /*
1457        * Create a wider integer type, enough to hold the
1458        * intermediate result of the multiplication.
1459        */
1460       memset(&wide_type, 0, sizeof wide_type);
1461       wide_type.sign   = type.sign;
1462       wide_type.width  = type.width*2;
1463       wide_type.length = type.length/2;
1464 
1465       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1466 
1467       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1468       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1469       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1470 
1471       /*
1472        * Lerp both halves.
1473        */
1474 
1475       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1476 
1477       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1478       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1479 
1480       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1481    } else {
1482       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1483    }
1484 
1485    return res;
1486 }
1487 
1488 
1489 /**
1490  * Bilinear interpolation.
1491  *
1492  * Value indices are in v_{yx} order.
1493  */
1494 LLVMValueRef
1495 lp_build_lerp_2d(struct lp_build_context *bld,
1496                  LLVMValueRef x,
1497                  LLVMValueRef y,
1498                  LLVMValueRef v00,
1499                  LLVMValueRef v01,
1500                  LLVMValueRef v10,
1501                  LLVMValueRef v11,
1502                  unsigned flags)
1503 {
1504    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1505    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1506    return lp_build_lerp(bld, y, v0, v1, flags);
1507 }
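
/*
 * Expanding the three calls, the bilinear result is
 *
 *    lerp(y, lerp(x, v00, v01), lerp(x, v10, v11))
 *       = (1-y)*((1-x)*v00 + x*v01) + y*((1-x)*v10 + x*v11)
 *
 * so e.g. v01 is the sample at (x, y) = (1, 0), matching the v_{yx} naming.
 */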
1508 
1509 
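/**
 * Trilinear interpolation.
 *
 * Value indices are in v_{zyx} order.
 */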
1510 LLVMValueRef
1511 lp_build_lerp_3d(struct lp_build_context *bld,
1512                  LLVMValueRef x,
1513                  LLVMValueRef y,
1514                  LLVMValueRef z,
1515                  LLVMValueRef v000,
1516                  LLVMValueRef v001,
1517                  LLVMValueRef v010,
1518                  LLVMValueRef v011,
1519                  LLVMValueRef v100,
1520                  LLVMValueRef v101,
1521                  LLVMValueRef v110,
1522                  LLVMValueRef v111,
1523                  unsigned flags)
1524 {
1525    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1526    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1527    return lp_build_lerp(bld, z, v0, v1, flags);
1528 }
1529 
1530 
1531 /**
1532  * Generate min(a, b)
1533  * Do checks for special cases, but NaN behavior is undefined.
1534  */
1535 LLVMValueRef
1536 lp_build_min(struct lp_build_context *bld,
1537              LLVMValueRef a,
1538              LLVMValueRef b)
1539 {
1540    assert(lp_check_value(bld->type, a));
1541    assert(lp_check_value(bld->type, b));
1542 
1543    if(a == bld->undef || b == bld->undef)
1544       return bld->undef;
1545 
1546    if(a == b)
1547       return a;
1548 
1549    if (bld->type.norm) {
1550       if (!bld->type.sign) {
1551          if (a == bld->zero || b == bld->zero) {
1552             return bld->zero;
1553          }
1554       }
1555       if(a == bld->one)
1556          return b;
1557       if(b == bld->one)
1558          return a;
1559    }
1560 
1561    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1562 }
1563 
1564 /**
1565  * Generate min(a, b)
1566  * NaN's are handled according to the behavior specified by the
1567  * nan_behavior argument.
1568  */
1569 LLVMValueRef
1570 lp_build_min_ext(struct lp_build_context *bld,
1571                  LLVMValueRef a,
1572                  LLVMValueRef b,
1573                  enum gallivm_nan_behavior nan_behavior)
1574 {
1575    assert(lp_check_value(bld->type, a));
1576    assert(lp_check_value(bld->type, b));
1577 
1578    if(a == bld->undef || b == bld->undef)
1579       return bld->undef;
1580 
1581    if(a == b)
1582       return a;
1583 
1584    if (bld->type.norm) {
1585       if (!bld->type.sign) {
1586          if (a == bld->zero || b == bld->zero) {
1587             return bld->zero;
1588          }
1589       }
1590       if(a == bld->one)
1591          return b;
1592       if(b == bld->one)
1593          return a;
1594    }
1595 
1596    return lp_build_min_simple(bld, a, b, nan_behavior);
1597 }
1598 
1599 /**
1600  * Generate max(a, b)
1601  * Do checks for special cases, but NaN behavior is undefined.
1602  */
1603 LLVMValueRef
1604 lp_build_max(struct lp_build_context *bld,
1605              LLVMValueRef a,
1606              LLVMValueRef b)
1607 {
1608    assert(lp_check_value(bld->type, a));
1609    assert(lp_check_value(bld->type, b));
1610 
1611    if(a == bld->undef || b == bld->undef)
1612       return bld->undef;
1613 
1614    if(a == b)
1615       return a;
1616 
1617    if(bld->type.norm) {
1618       if(a == bld->one || b == bld->one)
1619          return bld->one;
1620       if (!bld->type.sign) {
1621          if (a == bld->zero) {
1622             return b;
1623          }
1624          if (b == bld->zero) {
1625             return a;
1626          }
1627       }
1628    }
1629 
1630    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1631 }
1632 
1633 
1634 /**
1635  * Generate max(a, b)
1636  * Checks for special cases.
1637  * NaN's are handled according to the behavior specified by the
1638  * nan_behavior argument.
1639  */
1640 LLVMValueRef
1641 lp_build_max_ext(struct lp_build_context *bld,
1642                   LLVMValueRef a,
1643                   LLVMValueRef b,
1644                   enum gallivm_nan_behavior nan_behavior)
1645 {
1646    assert(lp_check_value(bld->type, a));
1647    assert(lp_check_value(bld->type, b));
1648 
1649    if(a == bld->undef || b == bld->undef)
1650       return bld->undef;
1651 
1652    if(a == b)
1653       return a;
1654 
1655    if(bld->type.norm) {
1656       if(a == bld->one || b == bld->one)
1657          return bld->one;
1658       if (!bld->type.sign) {
1659          if (a == bld->zero) {
1660             return b;
1661          }
1662          if (b == bld->zero) {
1663             return a;
1664          }
1665       }
1666    }
1667 
1668    return lp_build_max_simple(bld, a, b, nan_behavior);
1669 }
1670 
1671 /**
1672  * Generate clamp(a, min, max)
1673  * NaN behavior (for any of a, min, max) is undefined.
1674  * Do checks for special cases.
1675  */
1676 LLVMValueRef
1677 lp_build_clamp(struct lp_build_context *bld,
1678                LLVMValueRef a,
1679                LLVMValueRef min,
1680                LLVMValueRef max)
1681 {
1682    assert(lp_check_value(bld->type, a));
1683    assert(lp_check_value(bld->type, min));
1684    assert(lp_check_value(bld->type, max));
1685 
1686    a = lp_build_min(bld, a, max);
1687    a = lp_build_max(bld, a, min);
1688    return a;
1689 }
1690 
1691 
1692 /**
1693  * Generate clamp(a, 0, 1)
1694  * A NaN will get converted to zero.
1695  */
1696 LLVMValueRef
1697 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1698                                 LLVMValueRef a)
1699 {
1700    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1701    a = lp_build_min(bld, a, bld->one);
1702    return a;
1703 }
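
/*
 * E.g. tracing a NaN input through the two calls above:
 * lp_build_max_ext(NaN, 0.0) with GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
 * yields the non-NaN operand 0.0, and lp_build_min(0.0, 1.0) leaves it
 * there, which is how a NaN ends up as zero.
 */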
1704 
1705 
1706 /**
1707  * Generate abs(a)
1708  */
1709 LLVMValueRef
1710 lp_build_abs(struct lp_build_context *bld,
1711              LLVMValueRef a)
1712 {
1713    LLVMBuilderRef builder = bld->gallivm->builder;
1714    const struct lp_type type = bld->type;
1715    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1716 
1717    assert(lp_check_value(type, a));
1718 
1719    if(!type.sign)
1720       return a;
1721 
1722    if(type.floating) {
1723       char intrinsic[32];
1724       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1725       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1726    }
1727 
1728    if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1729       switch(type.width) {
1730       case 8:
1731          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1732       case 16:
1733          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1734       case 32:
1735          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1736       }
1737    }
1738    else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1739       switch(type.width) {
1740       case 8:
1741          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1742       case 16:
1743          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1744       case 32:
1745          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1746       }
1747    }
1748 
1749    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1750                           a, LLVMBuildNeg(builder, a, ""));
1751 }
1752 
1753 
1754 LLVMValueRef
1755 lp_build_negate(struct lp_build_context *bld,
1756                 LLVMValueRef a)
1757 {
1758    LLVMBuilderRef builder = bld->gallivm->builder;
1759 
1760    assert(lp_check_value(bld->type, a));
1761 
1762    if (bld->type.floating)
1763       a = LLVMBuildFNeg(builder, a, "");
1764    else
1765       a = LLVMBuildNeg(builder, a, "");
1766 
1767    return a;
1768 }
1769 
1770 
1771 /** Return -1, 0 or +1 depending on the sign of a */
1772 LLVMValueRef
1773 lp_build_sgn(struct lp_build_context *bld,
1774              LLVMValueRef a)
1775 {
1776    LLVMBuilderRef builder = bld->gallivm->builder;
1777    const struct lp_type type = bld->type;
1778    LLVMValueRef cond;
1779    LLVMValueRef res;
1780 
1781    assert(lp_check_value(type, a));
1782 
1783    /* Handle non-zero case */
1784    if(!type.sign) {
1785       /* if not zero then sign must be positive */
1786       res = bld->one;
1787    }
1788    else if(type.floating) {
1789       LLVMTypeRef vec_type;
1790       LLVMTypeRef int_type;
1791       LLVMValueRef mask;
1792       LLVMValueRef sign;
1793       LLVMValueRef one;
1794       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1795 
1796       int_type = lp_build_int_vec_type(bld->gallivm, type);
1797       vec_type = lp_build_vec_type(bld->gallivm, type);
1798       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1799 
1800       /* Take the sign bit of a and OR it into the constant 1.0 */
1801       sign = LLVMBuildBitCast(builder, a, int_type, "");
1802       sign = LLVMBuildAnd(builder, sign, mask, "");
1803       one = LLVMConstBitCast(bld->one, int_type);
1804       res = LLVMBuildOr(builder, sign, one, "");
1805       res = LLVMBuildBitCast(builder, res, vec_type, "");
1806    }
1807    else
1808    {
1809       /* signed int/norm/fixed point */
1810       /* could use psign with sse3 and appropriate vectors here */
1811       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1812       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1813       res = lp_build_select(bld, cond, bld->one, minus_one);
1814    }
1815 
1816    /* Handle zero */
1817    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1818    res = lp_build_select(bld, cond, bld->zero, res);
1819 
1820    return res;
1821 }
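
/*
 * Worked example for the float bit trick above: for a = -3.5f the masked
 * sign is 0x80000000 and 1.0f is 0x3f800000, so the OR gives 0xbf800000,
 * which is exactly -1.0f; for positive a the sign bit is 0 and the OR
 * leaves +1.0f unchanged.
 */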
1822 
1823 
1824 /**
1825  * Set the sign of float vector 'a' according to 'sign'.
1826  * If sign==0, return abs(a).
1827  * If sign==1, return -abs(a).
1828  * Other values for sign produce undefined results.
1829  */
1830 LLVMValueRef
1831 lp_build_set_sign(struct lp_build_context *bld,
1832                   LLVMValueRef a, LLVMValueRef sign)
1833 {
1834    LLVMBuilderRef builder = bld->gallivm->builder;
1835    const struct lp_type type = bld->type;
1836    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1837    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1838    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1839    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1840                              ~((unsigned long long) 1 << (type.width - 1)));
1841    LLVMValueRef val, res;
1842 
1843    assert(type.floating);
1844    assert(lp_check_value(type, a));
1845 
1846    /* val = reinterpret_cast<int>(a) */
1847    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1848    /* val = val & mask */
1849    val = LLVMBuildAnd(builder, val, mask, "");
1850    /* sign = sign << shift */
1851    sign = LLVMBuildShl(builder, sign, shift, "");
1852    /* res = val | sign */
1853    res = LLVMBuildOr(builder, val, sign, "");
1854    /* res = reinterpret_cast<float>(res) */
1855    res = LLVMBuildBitCast(builder, res, vec_type, "");
1856 
1857    return res;
1858 }
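
/*
 * E.g. with sign = 1 in every lane this computes, for 32-bit floats,
 * (bits(a) & 0x7fffffff) | (1 << 31), i.e. -abs(a); with sign = 0 the OR
 * is a no-op and the result is abs(a).
 */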
1859 
1860 
1861 /**
1862  * Convert vector of (or scalar) int to vector of (or scalar) float.
1863  */
1864 LLVMValueRef
1865 lp_build_int_to_float(struct lp_build_context *bld,
1866                       LLVMValueRef a)
1867 {
1868    LLVMBuilderRef builder = bld->gallivm->builder;
1869    const struct lp_type type = bld->type;
1870    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1871 
1872    assert(type.floating);
1873 
1874    return LLVMBuildSIToFP(builder, a, vec_type, "");
1875 }
1876 
1877 static boolean
1878 arch_rounding_available(const struct lp_type type)
1879 {
1880    if ((util_get_cpu_caps()->has_sse4_1 &&
1881        (type.length == 1 || type.width*type.length == 128)) ||
1882        (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1883        (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1884       return TRUE;
1885    else if ((util_get_cpu_caps()->has_altivec &&
1886             (type.width == 32 && type.length == 4)))
1887       return TRUE;
1888    else if (util_get_cpu_caps()->has_neon)
1889       return TRUE;
1890 
1891    return FALSE;
1892 }
1893 
1894 enum lp_build_round_mode
1895 {
1896    LP_BUILD_ROUND_NEAREST = 0,
1897    LP_BUILD_ROUND_FLOOR = 1,
1898    LP_BUILD_ROUND_CEIL = 2,
1899    LP_BUILD_ROUND_TRUNCATE = 3
1900 };
1901 
1902 static inline LLVMValueRef
1903 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1904                              LLVMValueRef a)
1905 {
1906    LLVMBuilderRef builder = bld->gallivm->builder;
1907    const struct lp_type type = bld->type;
1908    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1909    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1910    const char *intrinsic;
1911    LLVMValueRef res;
1912 
1913    assert(type.floating);
1914    /* using the double precision conversions is a bit more complicated */
1915    assert(type.width == 32);
1916 
1917    assert(lp_check_value(type, a));
1918    assert(util_get_cpu_caps()->has_sse2);
1919 
1920    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1921    if (type.length == 1) {
1922       LLVMTypeRef vec_type;
1923       LLVMValueRef undef;
1924       LLVMValueRef arg;
1925       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1926 
1927       vec_type = LLVMVectorType(bld->elem_type, 4);
1928 
1929       intrinsic = "llvm.x86.sse.cvtss2si";
1930 
1931       undef = LLVMGetUndef(vec_type);
1932 
1933       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1934 
1935       res = lp_build_intrinsic_unary(builder, intrinsic,
1936                                      ret_type, arg);
1937    }
1938    else {
1939       if (type.width * type.length == 128) {
1940          intrinsic = "llvm.x86.sse2.cvtps2dq";
1941       }
1942       else {
1943          assert(type.width*type.length == 256);
1944          assert(util_get_cpu_caps()->has_avx);
1945 
1946          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1947       }
1948       res = lp_build_intrinsic_unary(builder, intrinsic,
1949                                      ret_type, a);
1950    }
1951 
1952    return res;
1953 }
1954 
1955 
1956
1957 /* Round to integral float (vector) using AltiVec vrfi* rounding intrinsics. */
1958 static inline LLVMValueRef
1959 lp_build_round_altivec(struct lp_build_context *bld,
1960                        LLVMValueRef a,
1961                        enum lp_build_round_mode mode)
1962 {
1963    LLVMBuilderRef builder = bld->gallivm->builder;
1964    const struct lp_type type = bld->type;
1965    const char *intrinsic = NULL;
1966 
1967    assert(type.floating);
1968 
1969    assert(lp_check_value(type, a));
1970    assert(util_get_cpu_caps()->has_altivec);
1971 
1972    (void)type;
1973 
1974    switch (mode) {
1975    case LP_BUILD_ROUND_NEAREST:
1976       intrinsic = "llvm.ppc.altivec.vrfin";
1977       break;
1978    case LP_BUILD_ROUND_FLOOR:
1979       intrinsic = "llvm.ppc.altivec.vrfim";
1980       break;
1981    case LP_BUILD_ROUND_CEIL:
1982       intrinsic = "llvm.ppc.altivec.vrfip";
1983       break;
1984    case LP_BUILD_ROUND_TRUNCATE:
1985       intrinsic = "llvm.ppc.altivec.vrfiz";
1986       break;
1987    }
1988 
1989    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1990 }
1991 
1992 static inline LLVMValueRef
1993 lp_build_round_arch(struct lp_build_context *bld,
1994                     LLVMValueRef a,
1995                     enum lp_build_round_mode mode)
1996 {
1997    if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
1998       LLVMBuilderRef builder = bld->gallivm->builder;
1999       const struct lp_type type = bld->type;
2000       const char *intrinsic_root;
2001       char intrinsic[32];
2002 
2003       assert(type.floating);
2004       assert(lp_check_value(type, a));
2005       (void)type;
2006 
2007       switch (mode) {
2008       case LP_BUILD_ROUND_NEAREST:
2009          intrinsic_root = "llvm.nearbyint";
2010          break;
2011       case LP_BUILD_ROUND_FLOOR:
2012          intrinsic_root = "llvm.floor";
2013          break;
2014       case LP_BUILD_ROUND_CEIL:
2015          intrinsic_root = "llvm.ceil";
2016          break;
2017       case LP_BUILD_ROUND_TRUNCATE:
2018          intrinsic_root = "llvm.trunc";
2019          break;
2020       default:
2021          unreachable("unhandled lp_build_round_mode");
2022       }
2023 
2024       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2025       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2026    }
2027    else /* (util_get_cpu_caps()->has_altivec) */
2028      return lp_build_round_altivec(bld, a, mode);
2029 }
2030 
2031 /**
2032  * Return the integer part of a float (vector) value (== round toward zero).
2033  * The returned value is a float (vector).
2034  * Ex: trunc(-1.5) = -1.0
2035  */
2036 LLVMValueRef
2037 lp_build_trunc(struct lp_build_context *bld,
2038                LLVMValueRef a)
2039 {
2040    LLVMBuilderRef builder = bld->gallivm->builder;
2041    const struct lp_type type = bld->type;
2042 
2043    assert(type.floating);
2044    assert(lp_check_value(type, a));
2045 
2046    if (type.width == 16) {
2047       char intrinsic[64];
2048       lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2049       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2050    }
2051 
2052    if (arch_rounding_available(type)) {
2053       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2054    }
2055    else {
2056       const struct lp_type type = bld->type;
2057       struct lp_type inttype;
2058       struct lp_build_context intbld;
2059       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2060       LLVMValueRef trunc, res, anosign, mask;
2061       LLVMTypeRef int_vec_type = bld->int_vec_type;
2062       LLVMTypeRef vec_type = bld->vec_type;
2063 
2064       inttype = type;
2065       inttype.floating = 0;
2066       lp_build_context_init(&intbld, bld->gallivm, inttype);
2067 
2068       /* round by truncation */
2069       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2070       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2071 
2072       /* mask out sign bit */
2073       anosign = lp_build_abs(bld, a);
2074       /*
2075        * mask out all values if anosign > 2^24
2076        * This should work both for large ints (all rounding is a no-op for them
2077        * because such floats are always exact) as well as special cases like
2078        * NaNs, Infs (taking advantage of the fact they use max exponent).
2079        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2080        */
2081       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2082       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2083       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2084       return lp_build_select(bld, mask, a, res);
2085    }
2086 }
2087 
2088 
2089 /**
2090  * Return float (vector) rounded to nearest integer (vector).  The returned
2091  * value is a float (vector).
2092  * Ex: round(0.9) = 1.0
2093  * Ex: round(-1.5) = -2.0
2094  */
2095 LLVMValueRef
2096 lp_build_round(struct lp_build_context *bld,
2097                LLVMValueRef a)
2098 {
2099    LLVMBuilderRef builder = bld->gallivm->builder;
2100    const struct lp_type type = bld->type;
2101 
2102    assert(type.floating);
2103    assert(lp_check_value(type, a));
2104 
2105    if (type.width == 16) {
2106       char intrinsic[64];
2107       lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2108       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2109    }
2110 
2111    if (arch_rounding_available(type)) {
2112       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2113    }
2114    else {
2115       const struct lp_type type = bld->type;
2116       struct lp_type inttype;
2117       struct lp_build_context intbld;
2118       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2119       LLVMValueRef res, anosign, mask;
2120       LLVMTypeRef int_vec_type = bld->int_vec_type;
2121       LLVMTypeRef vec_type = bld->vec_type;
2122 
2123       inttype = type;
2124       inttype.floating = 0;
2125       lp_build_context_init(&intbld, bld->gallivm, inttype);
2126 
2127       res = lp_build_iround(bld, a);
2128       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2129 
2130       /* mask out sign bit */
2131       anosign = lp_build_abs(bld, a);
2132       /*
2133        * mask out all values if anosign > 2^24
2134        * This should work both for large ints (all rounding is a no-op for them
2135        * because such floats are always exact) as well as special cases like
2136        * NaNs, Infs (taking advantage of the fact they use max exponent).
2137        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2138        */
2139       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2140       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2141       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2142       return lp_build_select(bld, mask, a, res);
2143    }
2144 }
2145 
2146 
2147 /**
2148  * Return floor of float (vector), result is a float (vector)
2149  * Ex: floor(1.1) = 1.0
2150  * Ex: floor(-1.1) = -2.0
2151  */
2152 LLVMValueRef
2153 lp_build_floor(struct lp_build_context *bld,
2154                LLVMValueRef a)
2155 {
2156    LLVMBuilderRef builder = bld->gallivm->builder;
2157    const struct lp_type type = bld->type;
2158 
2159    assert(type.floating);
2160    assert(lp_check_value(type, a));
2161 
2162    if (arch_rounding_available(type)) {
2163       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2164    }
2165    else {
2166       const struct lp_type type = bld->type;
2167       struct lp_type inttype;
2168       struct lp_build_context intbld;
2169       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2170       LLVMValueRef trunc, res, anosign, mask;
2171       LLVMTypeRef int_vec_type = bld->int_vec_type;
2172       LLVMTypeRef vec_type = bld->vec_type;
2173 
2174       if (type.width != 32) {
2175          char intrinsic[32];
2176          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2177          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2178       }
2179 
2180       assert(type.width == 32); /* might want to handle doubles at some point */
2181 
2182       inttype = type;
2183       inttype.floating = 0;
2184       lp_build_context_init(&intbld, bld->gallivm, inttype);
2185 
2186       /* round by truncation */
2187       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2188       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2189 
2190       if (type.sign) {
2191          LLVMValueRef tmp;
2192 
2193          /*
2194           * fix values if rounding is wrong (for non-special cases)
2195           * - this is the case if trunc > a
2196           */
2197          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2198          /* tmp = trunc > a ? 1.0 : 0.0 */
2199          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2200          tmp = lp_build_and(&intbld, mask, tmp);
2201          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2202          res = lp_build_sub(bld, res, tmp);
2203       }
2204 
2205       /* mask out sign bit */
2206       anosign = lp_build_abs(bld, a);
2207       /*
2208        * mask out all values if anosign > 2^24
2209        * This should work both for large ints (all rounding is a no-op for them
2210        * because such floats are always exact) as well as special cases like
2211        * NaNs, Infs (taking advantage of the fact they use max exponent).
2212        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2213        */
2214       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2215       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2216       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2217       return lp_build_select(bld, mask, a, res);
2218    }
2219 }
2220 
2221 
2222 /**
2223  * Return ceiling of float (vector), returning float (vector).
2224  * Ex: ceil( 1.1) = 2.0
2225  * Ex: ceil(-1.1) = -1.0
2226  */
2227 LLVMValueRef
2228 lp_build_ceil(struct lp_build_context *bld,
2229               LLVMValueRef a)
2230 {
2231    LLVMBuilderRef builder = bld->gallivm->builder;
2232    const struct lp_type type = bld->type;
2233 
2234    assert(type.floating);
2235    assert(lp_check_value(type, a));
2236 
2237    if (arch_rounding_available(type)) {
2238       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2239    }
2240    else {
2241       const struct lp_type type = bld->type;
2242       struct lp_type inttype;
2243       struct lp_build_context intbld;
2244       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2245       LLVMValueRef trunc, res, anosign, mask, tmp;
2246       LLVMTypeRef int_vec_type = bld->int_vec_type;
2247       LLVMTypeRef vec_type = bld->vec_type;
2248 
2249       if (type.width != 32) {
2250          char intrinsic[32];
2251          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2252          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2253       }
2254 
2255       assert(type.width == 32); /* might want to handle doubles at some point */
2256 
2257       inttype = type;
2258       inttype.floating = 0;
2259       lp_build_context_init(&intbld, bld->gallivm, inttype);
2260 
2261       /* round by truncation */
2262       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2263       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2264 
2265       /*
2266        * fix values if rounding is wrong (for non-special cases)
2267        * - this is the case if trunc < a
2268        */
2269       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2270       /* tmp = trunc < a ? 1.0 : 0.0 */
2271       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2272       tmp = lp_build_and(&intbld, mask, tmp);
2273       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2274       res = lp_build_add(bld, trunc, tmp);
2275 
2276       /* mask out sign bit */
2277       anosign = lp_build_abs(bld, a);
2278       /*
2279        * mask out all values if anosign > 2^24
2280        * This should work both for large ints (all rounding is a no-op for them
2281        * because such floats are always exact) as well as special cases like
2282        * NaNs, Infs (taking advantage of the fact they use max exponent).
2283        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2284        */
2285       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2286       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2287       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2288       return lp_build_select(bld, mask, a, res);
2289    }
2290 }
2291 
2292 
2293 /**
2294  * Return fractional part of 'a' computed as a - floor(a)
2295  * Typically used in texture coord arithmetic.
2296  */
2297 LLVMValueRef
2298 lp_build_fract(struct lp_build_context *bld,
2299                LLVMValueRef a)
2300 {
2301    assert(bld->type.floating);
2302    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2303 }
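
/*
 * E.g. fract(-0.25) = -0.25 - floor(-0.25) = -0.25 - (-1.0) = 0.75.
 * Mathematically the result lies in [0, 1); in floating point it can
 * round up to exactly 1.0 for tiny negative inputs, which is the case
 * clamp_fract() below guards against.
 */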
2304 
2305 
2306 /**
2307  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2308  * against 0.99999(9). (Will also return that value for NaNs.)
2309  */
2310 static inline LLVMValueRef
2311 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2312 {
2313    LLVMValueRef max;
2314 
2315    /* this is the largest number smaller than 1.0 representable as float */
2316    max = lp_build_const_vec(bld->gallivm, bld->type,
2317                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2318    return lp_build_min_ext(bld, fract, max,
2319                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2320 }
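
/*
 * For 32-bit floats (23 mantissa bits) the constant above evaluates to
 * 1.0 - 1.0/2^24 = 0.99999994f, the largest float strictly below 1.0.
 */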
2321 
2322 
2323 /**
2324  * Same as lp_build_fract, but guarantees that the result is always smaller
2325  * than one. Will also return the smaller-than-one value for infs, NaNs.
2326  */
2327 LLVMValueRef
2328 lp_build_fract_safe(struct lp_build_context *bld,
2329                     LLVMValueRef a)
2330 {
2331    return clamp_fract(bld, lp_build_fract(bld, a));
2332 }
2333 
2334 
2335 /**
2336  * Return the integer part of a float (vector) value (== round toward zero).
2337  * The returned value is an integer (vector).
2338  * Ex: itrunc(-1.5) = -1
2339  */
2340 LLVMValueRef
2341 lp_build_itrunc(struct lp_build_context *bld,
2342                 LLVMValueRef a)
2343 {
2344    LLVMBuilderRef builder = bld->gallivm->builder;
2345    const struct lp_type type = bld->type;
2346    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2347 
2348    assert(type.floating);
2349    assert(lp_check_value(type, a));
2350 
2351    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2352 }
2353 
2354 
2355 /**
2356  * Return float (vector) rounded to nearest integer (vector).  The returned
2357  * value is an integer (vector).
2358  * Ex: iround(0.9) = 1
2359  * Ex: iround(-1.5) = -2
2360  */
2361 LLVMValueRef
2362 lp_build_iround(struct lp_build_context *bld,
2363                 LLVMValueRef a)
2364 {
2365    LLVMBuilderRef builder = bld->gallivm->builder;
2366    const struct lp_type type = bld->type;
2367    LLVMTypeRef int_vec_type = bld->int_vec_type;
2368    LLVMValueRef res;
2369 
2370    assert(type.floating);
2371 
2372    assert(lp_check_value(type, a));
2373 
2374    if ((util_get_cpu_caps()->has_sse2 &&
2375        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2376        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2377       return lp_build_iround_nearest_sse2(bld, a);
2378    }
2379    if (arch_rounding_available(type)) {
2380       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2381    }
2382    else {
2383       LLVMValueRef half;
2384 
2385       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2386 
2387       if (type.sign) {
2388          LLVMTypeRef vec_type = bld->vec_type;
2389          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2390                                     (unsigned long long)1 << (type.width - 1));
2391          LLVMValueRef sign;
2392 
2393          /* get sign bit */
2394          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2395          sign = LLVMBuildAnd(builder, sign, mask, "");
2396 
2397          /* sign * 0.5 */
2398          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2399          half = LLVMBuildOr(builder, sign, half, "");
2400          half = LLVMBuildBitCast(builder, half, vec_type, "");
2401       }
2402 
2403       res = LLVMBuildFAdd(builder, a, half, "");
2404    }
2405 
2406    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2407 
2408    return res;
2409 }
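
/*
 * The nextafterf(0.5, 0.0) magic half above matters just below the
 * halfway point: for a = 0.49999997f (the largest float below 0.5),
 * a + 0.5f rounds up to 1.0f and would truncate to 1, while
 * a + 0.49999997f gives 0.99999994f, which truncates to the correct 0.
 * For a = 1.5f the sum still reaches 2.0f, so exact halfway cases keep
 * rounding away from zero as documented.
 */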
2410 
2411 
2412 /**
2413  * Return floor of float (vector), result is an int (vector)
2414  * Ex: ifloor(1.1) = 1
2415  * Ex: ifloor(-1.1) = -2
2416  */
2417 LLVMValueRef
2418 lp_build_ifloor(struct lp_build_context *bld,
2419                 LLVMValueRef a)
2420 {
2421    LLVMBuilderRef builder = bld->gallivm->builder;
2422    const struct lp_type type = bld->type;
2423    LLVMTypeRef int_vec_type = bld->int_vec_type;
2424    LLVMValueRef res;
2425 
2426    assert(type.floating);
2427    assert(lp_check_value(type, a));
2428 
2429    res = a;
2430    if (type.sign) {
2431       if (arch_rounding_available(type)) {
2432          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2433       }
2434       else {
2435          struct lp_type inttype;
2436          struct lp_build_context intbld;
2437          LLVMValueRef trunc, itrunc, mask;
2438 
2439          assert(type.floating);
2440          assert(lp_check_value(type, a));
2441 
2442          inttype = type;
2443          inttype.floating = 0;
2444          lp_build_context_init(&intbld, bld->gallivm, inttype);
2445 
2446          /* round by truncation */
2447          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2448          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2449 
2450          /*
2451           * fix values if rounding is wrong (for non-special cases)
2452           * - this is the case if trunc > a
2453           * The results of doing this with NaNs, very large values etc.
2454           * are undefined but this seems to be the case anyway.
2455           */
2456          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2457          /* cheapie minus one with mask since the mask is minus one / zero */
2458          return lp_build_add(&intbld, itrunc, mask);
2459       }
2460    }
2461 
2462    /* round toward zero */
2463    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2464 
2465    return res;
2466 }
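
/*
 * Worked example for the mask trick above: for a = -1.1, itrunc = -1 and
 * trunc = -1.0f; trunc > a, so the compare mask is all ones, i.e. the
 * integer -1, and itrunc + mask = -2, the correct floor. For a = 1.1 the
 * mask is 0 and itrunc = 1 is already right.
 */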
2467 
2468 
2469 /**
2470  * Return ceiling of float (vector), returning int (vector).
2471  * Ex: iceil( 1.1) = 2
2472  * Ex: iceil(-1.1) = -1
2473  */
2474 LLVMValueRef
2475 lp_build_iceil(struct lp_build_context *bld,
2476                LLVMValueRef a)
2477 {
2478    LLVMBuilderRef builder = bld->gallivm->builder;
2479    const struct lp_type type = bld->type;
2480    LLVMTypeRef int_vec_type = bld->int_vec_type;
2481    LLVMValueRef res;
2482 
2483    assert(type.floating);
2484    assert(lp_check_value(type, a));
2485 
2486    if (arch_rounding_available(type)) {
2487       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2488    }
2489    else {
2490       struct lp_type inttype;
2491       struct lp_build_context intbld;
2492       LLVMValueRef trunc, itrunc, mask;
2493 
2494       assert(type.floating);
2495       assert(lp_check_value(type, a));
2496 
2497       inttype = type;
2498       inttype.floating = 0;
2499       lp_build_context_init(&intbld, bld->gallivm, inttype);
2500 
2501       /* round by truncation */
2502       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2503       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2504 
2505       /*
2506        * fix values if rounding is wrong (for non-special cases)
2507        * - this is the case if trunc < a
2508        * The results of doing this with NaNs, very large values etc.
2509        * are undefined but this seems to be the case anyway.
2510        */
2511       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2512       /* cheapie plus one with mask since the mask is minus one / zero */
2513       return lp_build_sub(&intbld, itrunc, mask);
2514    }
2515 
2516    /* round toward zero */
2517    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2518 
2519    return res;
2520 }
2521 
2522 
2523 /**
2524  * Combined ifloor() & fract().
2525  *
2526  * Preferred to calling the functions separately, as it will ensure that the
2527  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2528  */
2529 void
2530 lp_build_ifloor_fract(struct lp_build_context *bld,
2531                       LLVMValueRef a,
2532                       LLVMValueRef *out_ipart,
2533                       LLVMValueRef *out_fpart)
2534 {
2535    LLVMBuilderRef builder = bld->gallivm->builder;
2536    const struct lp_type type = bld->type;
2537    LLVMValueRef ipart;
2538 
2539    assert(type.floating);
2540    assert(lp_check_value(type, a));
2541 
2542    if (arch_rounding_available(type)) {
2543       /*
2544        * floor() is easier.
2545        */
2546 
2547       ipart = lp_build_floor(bld, a);
2548       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2549       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2550    }
2551    else {
2552       /*
2553        * ifloor() is easier.
2554        */
2555 
2556       *out_ipart = lp_build_ifloor(bld, a);
2557       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2558       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2559    }
2560 }
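
/*
 * Typical use is texture coordinate setup (a sketch; the names here are
 * hypothetical):
 *
 *    LLVMValueRef itexel, weight;
 *    lp_build_ifloor_fract(&bld, coord, &itexel, &weight);
 *
 * yielding the integer texel index and the [0, 1) blend weight from a
 * single truncation instead of paying for floor() and ifloor() separately.
 */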
2561 
2562 
2563 /**
2564  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2565  * always smaller than one.
2566  */
2567 void
2568 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2569                            LLVMValueRef a,
2570                            LLVMValueRef *out_ipart,
2571                            LLVMValueRef *out_fpart)
2572 {
2573    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2574    *out_fpart = clamp_fract(bld, *out_fpart);
2575 }
2576 
2577 
2578 LLVMValueRef
2579 lp_build_sqrt(struct lp_build_context *bld,
2580               LLVMValueRef a)
2581 {
2582    LLVMBuilderRef builder = bld->gallivm->builder;
2583    const struct lp_type type = bld->type;
2584    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2585    char intrinsic[32];
2586 
2587    assert(lp_check_value(type, a));
2588 
2589    assert(type.floating);
2590    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2591 
2592    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2593 }
2594 
2595 
2596 /**
2597  * Do one Newton-Raphson step to improve reciprocal precision:
2598  *
2599  *   x_{i+1} = x_i + x_i * (1 - a * x_i)
2600  *
2601  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2602  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2603  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2604  * halo. It would be necessary to clamp the argument to prevent this.
2605  *
2606  * See also:
2607  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2608  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2609  */
2610 static inline LLVMValueRef
2611 lp_build_rcp_refine(struct lp_build_context *bld,
2612                     LLVMValueRef a,
2613                     LLVMValueRef rcp_a)
2614 {
2615    LLVMBuilderRef builder = bld->gallivm->builder;
2616    LLVMValueRef neg_a;
2617    LLVMValueRef res;
2618 
2619    neg_a = LLVMBuildFNeg(builder, a, "");
2620    res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2621    res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2622 
2623    return res;
2624 }
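
/*
 * E.g. one step on rcp(3.0) from the estimate x0 = 0.33:
 * 1 - 3.0 * 0.33 = 0.01, and x1 = 0.33 + 0.33 * 0.01 = 0.3333,
 * roughly doubling the number of correct digits (quadratic convergence).
 */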
2625 
2626 
2627 LLVMValueRef
2628 lp_build_rcp(struct lp_build_context *bld,
2629              LLVMValueRef a)
2630 {
2631    LLVMBuilderRef builder = bld->gallivm->builder;
2632    const struct lp_type type = bld->type;
2633 
2634    assert(lp_check_value(type, a));
2635 
2636    if(a == bld->zero)
2637       return bld->undef;
2638    if(a == bld->one)
2639       return bld->one;
2640    if(a == bld->undef)
2641       return bld->undef;
2642 
2643    assert(type.floating);
2644 
2645    if(LLVMIsConstant(a))
2646       return LLVMConstFDiv(bld->one, a);
2647 
2648    /*
2649     * We don't use RCPPS because:
2650     * - it only has 10 bits of precision
2651     * - it doesn't even get the reciprocal of 1.0 exactly
2652     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2653     * - for recent processors the benefit over DIVPS is marginal, and
2654     *   case-dependent
2655     *
2656     * We could still use it on certain processors if benchmarks show that the
2657     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2658     * particular uses that require less workarounds.
2659     */
2660 
2661    if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2662          (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))) {
2663       const unsigned num_iterations = 0;
2664       LLVMValueRef res;
2665       unsigned i;
2666       const char *intrinsic = NULL;
2667 
2668       if (type.length == 4) {
2669          intrinsic = "llvm.x86.sse.rcp.ps";
2670       }
2671       else {
2672          intrinsic = "llvm.x86.avx.rcp.ps.256";
2673       }
2674 
2675       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2676 
2677       for (i = 0; i < num_iterations; ++i) {
2678          res = lp_build_rcp_refine(bld, a, res);
2679       }
2680 
2681       return res;
2682    }
2683 
2684    return LLVMBuildFDiv(builder, bld->one, a, "");
2685 }
2686 
2687 
2688 /**
2689  * Do one Newton-Raphson step to improve rsqrt precision:
2690  *
2691  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2692  *
2693  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2694  */
2695 static inline LLVMValueRef
2696 lp_build_rsqrt_refine(struct lp_build_context *bld,
2697                       LLVMValueRef a,
2698                       LLVMValueRef rsqrt_a)
2699 {
2700    LLVMBuilderRef builder = bld->gallivm->builder;
2701    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2702    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2703    LLVMValueRef res;
2704 
2705    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2706    res = LLVMBuildFMul(builder, a, res, "");
2707    res = LLVMBuildFSub(builder, three, res, "");
2708    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2709    res = LLVMBuildFMul(builder, half, res, "");
2710 
2711    return res;
2712 }
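
/*
 * E.g. one step on rsqrt(4.0) from the estimate x0 = 0.49:
 * x1 = 0.5 * 0.49 * (3.0 - 4.0 * 0.49 * 0.49)
 *    = 0.5 * 0.49 * 2.0396 = 0.499702, already much closer to the
 * exact 0.5 (quadratic convergence here too).
 */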
2713 
2714 
2715 /**
2716  * Generate 1/sqrt(a).
2717  * Result is undefined for values < 0, infinity for +0.
2718  */
2719 LLVMValueRef
2720 lp_build_rsqrt(struct lp_build_context *bld,
2721                LLVMValueRef a)
2722 {
2723    const struct lp_type type = bld->type;
2724 
2725    assert(lp_check_value(type, a));
2726 
2727    assert(type.floating);
2728 
2729    /*
2730     * This should be faster but all denormals will end up as infinity.
2731     */
2732    if (0 && lp_build_fast_rsqrt_available(type)) {
2733       const unsigned num_iterations = 1;
2734       LLVMValueRef res;
2735       unsigned i;
2736 
2737       /* rsqrt(1.0) != 1.0 here */
2738       res = lp_build_fast_rsqrt(bld, a);
2739 
2740       if (num_iterations) {
2741          /*
2742           * Newton-Raphson will result in NaN instead of infinity for zero,
2743           * and NaN instead of zero for infinity.
2744           * Also, need to ensure rsqrt(1.0) == 1.0.
2745           * All numbers smaller than FLT_MIN will result in +infinity
2746           * (rsqrtps treats all denormals as zero).
2747           */
2748          LLVMValueRef cmp;
2749          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2750          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2751 
2752          for (i = 0; i < num_iterations; ++i) {
2753             res = lp_build_rsqrt_refine(bld, a, res);
2754          }
2755          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2756          res = lp_build_select(bld, cmp, inf, res);
2757          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2758          res = lp_build_select(bld, cmp, bld->zero, res);
2759          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2760          res = lp_build_select(bld, cmp, bld->one, res);
2761       }
2762 
2763       return res;
2764    }
2765 
2766    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2767 }
2768 
2769 /**
2770  * Return whether a fast (but inaccurate) rsqrt instruction is available.
2771  * Callers may want to avoid calling rsqrt_fast if it's not: e.g. for
2772  * calculating x^0.5 one may do rsqrt_fast(x) * x, but if that is
2773  * unavailable it would turn into sqrt/div/mul, so it is obviously
2774  * much better to just call sqrt, skipping both the div and the mul.
2775  */
2776 boolean
2777 lp_build_fast_rsqrt_available(struct lp_type type)
2778 {
2779    assert(type.floating);
2780 
2781    if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2782        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2783       return true;
2784    }
2785    return false;
2786 }
2787 
2788 
2789 /**
2790  * Generate 1/sqrt(a).
2791  * Result is undefined for values < 0, infinity for +0.
2792  * Precision is limited, only ~10 bits guaranteed
2793  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2794  */
2795 LLVMValueRef
2796 lp_build_fast_rsqrt(struct lp_build_context *bld,
2797                     LLVMValueRef a)
2798 {
2799    LLVMBuilderRef builder = bld->gallivm->builder;
2800    const struct lp_type type = bld->type;
2801 
2802    assert(lp_check_value(type, a));
2803 
2804    if (lp_build_fast_rsqrt_available(type)) {
2805       const char *intrinsic = NULL;
2806 
2807       if (type.length == 4) {
2808          intrinsic = "llvm.x86.sse.rsqrt.ps";
2809       }
2810       else {
2811          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2812       }
2813       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2814    }
2815    else {
2816       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2817    }
2818    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2819 }
2820 
2821 
2822 /**
2823  * Generate sin(a) or cos(a) using polynomial approximation.
2824  * TODO: it might be worth recognizing sin and cos of the same source
2825  * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2826  * would be way cheaper than calculating (nearly) everything twice...
2827  * Not sure it's common enough to be worth bothering with, however; the
2828  * scs opcode could also benefit from calculating both, though.
2829  */
2830 static LLVMValueRef
2831 lp_build_sin_or_cos(struct lp_build_context *bld,
2832                     LLVMValueRef a,
2833                     boolean cos)
2834 {
2835    struct gallivm_state *gallivm = bld->gallivm;
2836    LLVMBuilderRef b = gallivm->builder;
2837    struct lp_type int_type = lp_int_type(bld->type);
2838 
2839    /*
2840     *  take the absolute value,
2841     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2842     */
2843 
2844    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2845    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2846 
2847    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2848    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2849 
2850    /*
2851     * scale by 4/Pi
2852     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2853     */
2854 
2855    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2856    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2857 
2858    /*
2859     * store the integer part of y in mm0
2860     * emm2 = _mm_cvttps_epi32(y);
2861     */
2862 
2863    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2864 
2865    /*
2866     * j=(j+1) & (~1) (see the cephes sources)
2867     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2868     */
2869 
2870    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2871    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2872    /*
2873     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2874     */
2875    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2876    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2877 
2878    /*
2879     * y = _mm_cvtepi32_ps(emm2);
2880     */
2881    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2882 
2883    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2884    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2885    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2886    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2887 
2888    /*
2889     * Argument used for poly selection and sign bit determination
2890     * is different for sin vs. cos.
2891     */
2892    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2893                                emm2_and;
2894 
2895    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2896                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2897                                               const_29, "sign_bit") :
2898                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2899                                                               LLVMBuildShl(b, emm2_add,
2900                                                                            const_29, ""), ""),
2901                                               sign_mask, "sign_bit");
2902 
2903    /*
2904     * get the polynomial selection mask
2905     * there is one polynomial for 0 <= x <= Pi/4
2906     * and another one for Pi/4 < x <= Pi/2
2907     * Both branches will be computed.
2908     *
2909     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2910     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2911     */
2912 
2913    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2914    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2915                                              int_type, PIPE_FUNC_EQUAL,
2916                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2917 
2918    /*
2919     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2920     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2921     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2922     */
2923    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2924    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2925    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2926 
2927    /*
2928     * The magic pass: "Extended precision modular arithmetic"
2929     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2930     */
2931    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2932    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2933    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
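
   /*
    * Splitting the reduction constant into DP1 + DP2 + DP3 (Cody-Waite
    * style) keeps each y * DPn product nearly exact, so the three chained
    * subtractions lose far less precision than a single x - y * Pi/4
    * with one rounded constant would.
    */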
2934 
2935    /*
2936     * Evaluate the first polynomial (0 <= x <= Pi/4)
2937     *
2938     * z = _mm_mul_ps(x,x);
2939     */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * Select the correct result from the two polynomials.
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * Update the sign.
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if (type.width == 16) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
      LLVMValueRef args[] = { a };
      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
   }

   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if (type.width == 16) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
      LLVMValueRef args[] = { a };
      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
   }

   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

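   /*
    * pow(x, y) = exp2(y * log2(x)).  The explicit x == 0 test below forces
    * pow(0, y) to 0, avoiding the NaN that -inf * 0 would otherwise
    * produce when y == 0.
    */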
   LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
   LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));

   res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
   return res;
}


/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if (type.floating && type.width == 16) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
      LLVMValueRef args[] = { x };
      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
   }

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
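   /* 2^ipart is built directly in float: (ipart + 127) is placed into the
    * IEEE-754 exponent field (bits 30:23).  The clamp of x above keeps
    * ipart + 127 within [0, 255], so the shift cannot overflow. */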
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}



/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}



/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   if (bld->type.width == 16) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
      LLVMValueRef args[] = { x };
      if (p_log2)
         *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
      return;
   }

   assert(lp_check_value(bld->type, x));

   if (p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if (p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

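      /*
       * log2(mant) = 2/ln(2) * atanh(y).  atanh(y)/y is an even function
       * of y, so it is approximated by a polynomial P(z) in z = y^2 (the
       * minimax fit above), giving log2(mant) = y * P(z).
       */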
      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * a description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

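   /* Note: LLVM's srem/frem follow C99 semantics: a non-zero remainder
    * takes the sign of the dividend x.  Integer remainder by zero is
    * undefined behavior. */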
   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating point inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

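   /* An ordered x == x comparison is true for every value except NaN, so
    * the NaN lanes are obtained by negating it. */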
   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/*
 * Returns all 1's for floating point numbers that are finite
 * and all zeros for -inf, +inf and NaN.
 */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

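   /* x is finite iff its biased exponent field (bits 30:23) is not all
    * ones: an all-ones exponent encodes +/-Inf (zero mantissa) or NaN
    * (non-zero mantissa). */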
   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

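   /* Converse of the lp_build_isfinite() test: Inf and NaN are exactly
    * the values whose exponent field is all ones. */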
   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
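      /* stmxcsr stores the 32-bit MXCSR control/status register to the
       * given memory location. */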
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_get_cpu_caps()->has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_get_cpu_caps()->has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}