1 /**************************************************************************
2  *
3  * Copyright 2009-2010 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 /**
30  * @file
31  * Helper
32  *
33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
34  * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36  * these implementation details from the other modules.
37  *
38  * We also do simple expression simplification here. Reasons are:
39  * - it is very easy given we have all necessary information readily available
40  * - LLVM optimization passes fail to simplify several vector expressions
41  * - We often know value constraints which the optimization passes have no way
42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
43  *
44  * @author Jose Fonseca <jfonseca@vmware.com>
45  */
46 
47 
48 #include <float.h>
49 
50 #include <llvm/Config/llvm-config.h>
51 
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56 
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67 
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71 
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75 
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79 
80 #define EXP_POLY_DEGREE 5
81 
82 #define LOG_POLY_DEGREE 4
83 
84 
85 /**
86  * Generate min(a, b).
87  * No checks for the special-case values 0 or 1 of a or b are done.
88  * NaNs are handled according to the behavior specified by the
89  * nan_behavior argument.
90  */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93                     LLVMValueRef a,
94                     LLVMValueRef b,
95                     enum gallivm_nan_behavior nan_behavior)
96 {
97    const struct lp_type type = bld->type;
98    const char *intrinsic = NULL;
99    unsigned intr_size = 0;
100    LLVMValueRef cond;
101 
102    assert(lp_check_value(type, a));
103    assert(lp_check_value(type, b));
104 
105    /* TODO: optimize the constant case */
106 
107    if (type.floating && util_get_cpu_caps()->has_sse) {
108       if (type.width == 32) {
109          if (type.length == 1) {
110             intrinsic = "llvm.x86.sse.min.ss";
111             intr_size = 128;
112          }
113          else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114             intrinsic = "llvm.x86.sse.min.ps";
115             intr_size = 128;
116          }
117          else {
118             intrinsic = "llvm.x86.avx.min.ps.256";
119             intr_size = 256;
120          }
121       }
122       if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123          if (type.length == 1) {
124             intrinsic = "llvm.x86.sse2.min.sd";
125             intr_size = 128;
126          }
127          else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128             intrinsic = "llvm.x86.sse2.min.pd";
129             intr_size = 128;
130          }
131          else {
132             intrinsic = "llvm.x86.avx.min.pd.256";
133             intr_size = 256;
134          }
135       }
136    }
137    else if (type.floating && util_get_cpu_caps()->has_altivec) {
138       if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140                       __FUNCTION__);
141       }
142       if (type.width == 32 && type.length == 4) {
143          intrinsic = "llvm.ppc.altivec.vminfp";
144          intr_size = 128;
145       }
146    } else if (util_get_cpu_caps()->has_altivec) {
147       intr_size = 128;
148       if (type.width == 8) {
149          if (!type.sign) {
150             intrinsic = "llvm.ppc.altivec.vminub";
151          } else {
152             intrinsic = "llvm.ppc.altivec.vminsb";
153          }
154       } else if (type.width == 16) {
155          if (!type.sign) {
156             intrinsic = "llvm.ppc.altivec.vminuh";
157          } else {
158             intrinsic = "llvm.ppc.altivec.vminsh";
159          }
160       } else if (type.width == 32) {
161          if (!type.sign) {
162             intrinsic = "llvm.ppc.altivec.vminuw";
163          } else {
164             intrinsic = "llvm.ppc.altivec.vminsw";
165          }
166       }
167    }
168 
169    if (intrinsic) {
170       /* We need to handle NaNs for floating point numbers. If one of the
171        * inputs is NaN the other should be returned (required by both D3D10+
172        * and OpenCL).
173        * The SSE intrinsics return the second operand when either input is
174        * NaN by default, so we need special code to handle those cases.
175        */
176       if (util_get_cpu_caps()->has_sse && type.floating &&
177           nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178          LLVMValueRef isnan, min;
179          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180                                                    type,
181                                                    intr_size, a, b);
182          isnan = lp_build_isnan(bld, b);
183          return lp_build_select(bld, isnan, a, min);
184       } else {
185          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186                                                     type,
187                                                     intr_size, a, b);
188       }
189    }
190 
191    if (type.floating) {
192       switch (nan_behavior) {
193       case GALLIVM_NAN_RETURN_OTHER: {
194          LLVMValueRef isnan = lp_build_isnan(bld, a);
195          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197          return lp_build_select(bld, cond, a, b);
198       }
199          break;
200       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202          return lp_build_select(bld, cond, a, b);
203       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205          return lp_build_select(bld, cond, b, a);
206       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208          return lp_build_select(bld, cond, a, b);
209          break;
210       default:
211          assert(0);
212          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213          return lp_build_select(bld, cond, a, b);
214       }
215    } else {
216       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217       return lp_build_select(bld, cond, a, b);
218    }
219 }
220 
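/*
 * Illustrative scalar sketch (exposition only; this helper is not part of
 * gallivm): the GALLIVM_NAN_RETURN_OTHER min semantics built above. If either
 * input is a NaN the other input is returned; note that x86 minps/minpd on
 * their own return the second operand whenever one input is a NaN, hence the
 * extra isnan/select in the intrinsic path above.
 */
static inline float
lp_example_min_return_other(float a, float b)
{
   if (b != b)              /* b is NaN -> return a */
      return a;
   return a < b ? a : b;    /* if a is NaN the compare is false -> return b */
}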
221 
222 LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c)223 lp_build_fmuladd(LLVMBuilderRef builder,
224                  LLVMValueRef a,
225                  LLVMValueRef b,
226                  LLVMValueRef c)
227 {
228    LLVMTypeRef type = LLVMTypeOf(a);
229    assert(type == LLVMTypeOf(b));
230    assert(type == LLVMTypeOf(c));
231 
232    char intrinsic[32];
233    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234    LLVMValueRef args[] = { a, b, c };
235    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236 }
237 
238 
239 /**
240  * Generate max(a, b).
241  * No checks for the special-case values 0 or 1 of a or b are done.
242  * NaNs are handled according to the behavior specified by the
243  * nan_behavior argument.
244  */
245 static LLVMValueRef
246 lp_build_max_simple(struct lp_build_context *bld,
247                     LLVMValueRef a,
248                     LLVMValueRef b,
249                     enum gallivm_nan_behavior nan_behavior)
250 {
251    const struct lp_type type = bld->type;
252    const char *intrinsic = NULL;
253    unsigned intr_size = 0;
254    LLVMValueRef cond;
255 
256    assert(lp_check_value(type, a));
257    assert(lp_check_value(type, b));
258 
259    /* TODO: optimize the constant case */
260 
261    if (type.floating && util_get_cpu_caps()->has_sse) {
262       if (type.width == 32) {
263          if (type.length == 1) {
264             intrinsic = "llvm.x86.sse.max.ss";
265             intr_size = 128;
266          }
267          else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268             intrinsic = "llvm.x86.sse.max.ps";
269             intr_size = 128;
270          }
271          else {
272             intrinsic = "llvm.x86.avx.max.ps.256";
273             intr_size = 256;
274          }
275       }
276       if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277          if (type.length == 1) {
278             intrinsic = "llvm.x86.sse2.max.sd";
279             intr_size = 128;
280          }
281          else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282             intrinsic = "llvm.x86.sse2.max.pd";
283             intr_size = 128;
284          }
285          else {
286             intrinsic = "llvm.x86.avx.max.pd.256";
287             intr_size = 256;
288          }
289       }
290    }
291    else if (type.floating && util_get_cpu_caps()->has_altivec) {
292       if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294                       __FUNCTION__);
295       }
296       if (type.width == 32 && type.length == 4) {
297          intrinsic = "llvm.ppc.altivec.vmaxfp";
298          intr_size = 128;
299       }
300    } else if (util_get_cpu_caps()->has_altivec) {
301      intr_size = 128;
302      if (type.width == 8) {
303        if (!type.sign) {
304          intrinsic = "llvm.ppc.altivec.vmaxub";
305        } else {
306          intrinsic = "llvm.ppc.altivec.vmaxsb";
307        }
308      } else if (type.width == 16) {
309        if (!type.sign) {
310          intrinsic = "llvm.ppc.altivec.vmaxuh";
311        } else {
312          intrinsic = "llvm.ppc.altivec.vmaxsh";
313        }
314      } else if (type.width == 32) {
315        if (!type.sign) {
316          intrinsic = "llvm.ppc.altivec.vmaxuw";
317        } else {
318          intrinsic = "llvm.ppc.altivec.vmaxsw";
319        }
320      }
321    }
322 
323    if (intrinsic) {
324       if (util_get_cpu_caps()->has_sse && type.floating &&
325           nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326          LLVMValueRef isnan, max;
327          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328                                                    type,
329                                                    intr_size, a, b);
330          isnan = lp_build_isnan(bld, b);
331          return lp_build_select(bld, isnan, a, max);
332       } else {
333          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334                                                     type,
335                                                     intr_size, a, b);
336       }
337    }
338 
339    if (type.floating) {
340       switch (nan_behavior) {
341       case GALLIVM_NAN_RETURN_OTHER: {
342          LLVMValueRef isnan = lp_build_isnan(bld, a);
343          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345          return lp_build_select(bld, cond, a, b);
346       }
347          break;
348       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350          return lp_build_select(bld, cond, a, b);
351       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353          return lp_build_select(bld, cond, b, a);
354       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356          return lp_build_select(bld, cond, a, b);
357          break;
358       default:
359          assert(0);
360          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361          return lp_build_select(bld, cond, a, b);
362       }
363    } else {
364       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365       return lp_build_select(bld, cond, a, b);
366    }
367 }
368 
369 
370 /**
371  * Generate 1 - a, or ~a depending on bld->type.
372  */
373 LLVMValueRef
374 lp_build_comp(struct lp_build_context *bld,
375               LLVMValueRef a)
376 {
377    LLVMBuilderRef builder = bld->gallivm->builder;
378    const struct lp_type type = bld->type;
379 
380    assert(lp_check_value(type, a));
381 
382    if(a == bld->one)
383       return bld->zero;
384    if(a == bld->zero)
385       return bld->one;
386 
387    if(type.norm && !type.floating && !type.fixed && !type.sign) {
388       if(LLVMIsConstant(a))
389          return LLVMConstNot(a);
390       else
391          return LLVMBuildNot(builder, a, "");
392    }
393 
394    if(LLVMIsConstant(a))
395       if (type.floating)
396           return LLVMConstFSub(bld->one, a);
397       else
398           return LLVMConstSub(bld->one, a);
399    else
400       if (type.floating)
401          return LLVMBuildFSub(builder, bld->one, a, "");
402       else
403          return LLVMBuildSub(builder, bld->one, a, "");
404 }
405 
406 
407 /**
408  * Generate a + b
409  */
410 LLVMValueRef
411 lp_build_add(struct lp_build_context *bld,
412              LLVMValueRef a,
413              LLVMValueRef b)
414 {
415    LLVMBuilderRef builder = bld->gallivm->builder;
416    const struct lp_type type = bld->type;
417    LLVMValueRef res;
418 
419    assert(lp_check_value(type, a));
420    assert(lp_check_value(type, b));
421 
422    if (a == bld->zero)
423       return b;
424    if (b == bld->zero)
425       return a;
426    if (a == bld->undef || b == bld->undef)
427       return bld->undef;
428 
429    if (type.norm) {
430       const char *intrinsic = NULL;
431 
432       if (!type.sign && (a == bld->one || b == bld->one))
433         return bld->one;
434 
435       if (!type.floating && !type.fixed) {
436          if (LLVM_VERSION_MAJOR >= 8) {
437             char intrin[32];
438             intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
439             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
440             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
441          }
442          if (type.width * type.length == 128) {
443             if (util_get_cpu_caps()->has_sse2) {
444                if (type.width == 8)
445                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
446                if (type.width == 16)
447                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
448             } else if (util_get_cpu_caps()->has_altivec) {
449                if (type.width == 8)
450                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
451                if (type.width == 16)
452                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
453             }
454          }
455          if (type.width * type.length == 256) {
456             if (util_get_cpu_caps()->has_avx2) {
457                if (type.width == 8)
458                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
459                if (type.width == 16)
460                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
461             }
462          }
463       }
464 
465       if (intrinsic)
466          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
467    }
468 
469    if(type.norm && !type.floating && !type.fixed) {
470       if (type.sign) {
471          uint64_t sign = (uint64_t)1 << (type.width - 1);
472          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
473          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
474          /* a_clamp_max is the maximum a for positive b,
475             a_clamp_min is the minimum a for negative b. */
476          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
478          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
479       }
480    }
481 
482    if(LLVMIsConstant(a) && LLVMIsConstant(b))
483       if (type.floating)
484          res = LLVMConstFAdd(a, b);
485       else
486          res = LLVMConstAdd(a, b);
487    else
488       if (type.floating)
489          res = LLVMBuildFAdd(builder, a, b, "");
490       else
491          res = LLVMBuildAdd(builder, a, b, "");
492 
493    /* clamp to ceiling of 1.0 */
494    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
495       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
496 
497    if (type.norm && !type.floating && !type.fixed) {
498       if (!type.sign) {
499          /*
500           * newer llvm versions no longer support the intrinsics, but recognize
501           * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
502           * code, it is important we match the pattern llvm uses (and pray llvm
503           * doesn't change it - and hope they decide on the same pattern for
504           * all backends supporting it...).
505           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
506           * interfere with llvm's ability to recognize the pattern but seems
507           * a bit brittle.
508           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
509           */
510          LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
511          res = lp_build_select(bld, overflowed,
512                                LLVMConstAllOnes(bld->int_vec_type), res);
513       }
514    }
515 
516    /* XXX clamp to floor of -1 or 0??? */
517 
518    return res;
519 }
520 
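/*
 * Illustrative scalar sketch (exposition only; this helper is not part of
 * gallivm): the unsigned saturated add pattern matched above. For unsigned
 * arithmetic the wrapped sum is smaller than either input exactly when
 * overflow occurred, in which case the result is clamped to all ones.
 */
static inline uint8_t
lp_example_uadd_sat8(uint8_t a, uint8_t b)
{
   uint8_t res = (uint8_t)(a + b);
   return (a > res) ? (uint8_t)0xff : res;   /* a > res <=> overflow */
}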
521 
522 /** Return the scalar sum of the elements of a.
523  * Callers should avoid this operation whenever possible.
524  */
525 LLVMValueRef
526 lp_build_horizontal_add(struct lp_build_context *bld,
527                         LLVMValueRef a)
528 {
529    LLVMBuilderRef builder = bld->gallivm->builder;
530    const struct lp_type type = bld->type;
531    LLVMValueRef index, res;
532    unsigned i, length;
533    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
534    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
535    LLVMValueRef vecres, elem2;
536 
537    assert(lp_check_value(type, a));
538 
539    if (type.length == 1) {
540       return a;
541    }
542 
543    assert(!bld->type.norm);
544 
545    /*
546     * For byte vectors we could do much better with psadbw.
547     * Using repeated shuffle/adds here. Note with multiple vectors
548     * this can be done more efficiently as outlined in the intel
549     * optimization manual.
550     * Note: could cause data rearrangement if used with smaller element
551     * sizes.
552     */
553 
554    vecres = a;
555    length = type.length / 2;
556    while (length > 1) {
557       LLVMValueRef vec1, vec2;
558       for (i = 0; i < length; i++) {
559          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
560          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
561       }
562       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
563                                     LLVMConstVector(shuffles1, length), "");
564       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
565                                     LLVMConstVector(shuffles2, length), "");
566       if (type.floating) {
567          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
568       }
569       else {
570          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
571       }
572       length = length >> 1;
573    }
574 
575    /* always have vector of size 2 here */
576    assert(length == 1);
577 
578    index = lp_build_const_int32(bld->gallivm, 0);
579    res = LLVMBuildExtractElement(builder, vecres, index, "");
580    index = lp_build_const_int32(bld->gallivm, 1);
581    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
582 
583    if (type.floating)
584       res = LLVMBuildFAdd(builder, res, elem2, "");
585     else
586       res = LLVMBuildAdd(builder, res, elem2, "");
587 
588    return res;
589 }
590 
591 /**
592  * Return the horizontal sums of 4 float vectors as a float4 vector.
593  * This uses the technique as outlined in Intel Optimization Manual.
594  */
595 static LLVMValueRef
596 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
597                             LLVMValueRef src[4])
598 {
599    struct gallivm_state *gallivm = bld->gallivm;
600    LLVMBuilderRef builder = gallivm->builder;
601    LLVMValueRef shuffles[4];
602    LLVMValueRef tmp[4];
603    LLVMValueRef sumtmp[2], shuftmp[2];
604 
605    /* lower half of regs */
606    shuffles[0] = lp_build_const_int32(gallivm, 0);
607    shuffles[1] = lp_build_const_int32(gallivm, 1);
608    shuffles[2] = lp_build_const_int32(gallivm, 4);
609    shuffles[3] = lp_build_const_int32(gallivm, 5);
610    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
611                                    LLVMConstVector(shuffles, 4), "");
612    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
613                                    LLVMConstVector(shuffles, 4), "");
614 
615    /* upper half of regs */
616    shuffles[0] = lp_build_const_int32(gallivm, 2);
617    shuffles[1] = lp_build_const_int32(gallivm, 3);
618    shuffles[2] = lp_build_const_int32(gallivm, 6);
619    shuffles[3] = lp_build_const_int32(gallivm, 7);
620    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
621                                    LLVMConstVector(shuffles, 4), "");
622    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
623                                    LLVMConstVector(shuffles, 4), "");
624 
625    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
626    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
627 
628    shuffles[0] = lp_build_const_int32(gallivm, 0);
629    shuffles[1] = lp_build_const_int32(gallivm, 2);
630    shuffles[2] = lp_build_const_int32(gallivm, 4);
631    shuffles[3] = lp_build_const_int32(gallivm, 6);
632    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
633                                        LLVMConstVector(shuffles, 4), "");
634 
635    shuffles[0] = lp_build_const_int32(gallivm, 1);
636    shuffles[1] = lp_build_const_int32(gallivm, 3);
637    shuffles[2] = lp_build_const_int32(gallivm, 5);
638    shuffles[3] = lp_build_const_int32(gallivm, 7);
639    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640                                        LLVMConstVector(shuffles, 4), "");
641 
642    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
643 }
644 
645 
646 /*
647  * partially horizontally add 2-4 float vectors with length nx4,
648  * i.e. only four adjacent values in each vector will be added,
649  * assuming values are really grouped in 4 which also determines
650  * output order.
651  *
652  * Return a vector of the same length as the initial vectors,
653  * with the excess elements (if any) being undefined.
654  * The element order is independent of number of input vectors.
655  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
656  * the output order thus will be
657  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
658  */
659 LLVMValueRef
660 lp_build_hadd_partial4(struct lp_build_context *bld,
661                        LLVMValueRef vectors[],
662                        unsigned num_vecs)
663 {
664    struct gallivm_state *gallivm = bld->gallivm;
665    LLVMBuilderRef builder = gallivm->builder;
666    LLVMValueRef ret_vec;
667    LLVMValueRef tmp[4];
668    const char *intrinsic = NULL;
669 
670    assert(num_vecs >= 2 && num_vecs <= 4);
671    assert(bld->type.floating);
672 
673    /* only use this with at least 2 vectors, as it is sort of expensive
674     * (depending on cpu) and we always need two horizontal adds anyway;
675     * for a single vector a shuffle/add approach might be better.
676     */
677 
678    tmp[0] = vectors[0];
679    tmp[1] = vectors[1];
680 
681    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
682    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
683 
684    if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
685        bld->type.length == 4) {
686       intrinsic = "llvm.x86.sse3.hadd.ps";
687    }
688    else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
689             bld->type.length == 8) {
690       intrinsic = "llvm.x86.avx.hadd.ps.256";
691    }
692    if (intrinsic) {
693       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
694                                        lp_build_vec_type(gallivm, bld->type),
695                                        tmp[0], tmp[1]);
696       if (num_vecs > 2) {
697          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
698                                           lp_build_vec_type(gallivm, bld->type),
699                                           tmp[2], tmp[3]);
700       }
701       else {
702          tmp[1] = tmp[0];
703       }
704       return lp_build_intrinsic_binary(builder, intrinsic,
705                                        lp_build_vec_type(gallivm, bld->type),
706                                        tmp[0], tmp[1]);
707    }
708 
709    if (bld->type.length == 4) {
710       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
711    }
712    else {
713       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
714       unsigned j;
715       unsigned num_iter = bld->type.length / 4;
716       struct lp_type parttype = bld->type;
717       parttype.length = 4;
718       for (j = 0; j < num_iter; j++) {
719          LLVMValueRef partsrc[4];
720          unsigned i;
721          for (i = 0; i < 4; i++) {
722             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
723          }
724          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
725       }
726       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
727    }
728    return ret_vec;
729 }
730 
731 /**
732  * Generate a - b
733  */
734 LLVMValueRef
735 lp_build_sub(struct lp_build_context *bld,
736              LLVMValueRef a,
737              LLVMValueRef b)
738 {
739    LLVMBuilderRef builder = bld->gallivm->builder;
740    const struct lp_type type = bld->type;
741    LLVMValueRef res;
742 
743    assert(lp_check_value(type, a));
744    assert(lp_check_value(type, b));
745 
746    if (b == bld->zero)
747       return a;
748    if (a == bld->undef || b == bld->undef)
749       return bld->undef;
750    if (a == b)
751       return bld->zero;
752 
753    if (type.norm) {
754       const char *intrinsic = NULL;
755 
756       if (!type.sign && b == bld->one)
757         return bld->zero;
758 
759       if (!type.floating && !type.fixed) {
760          if (LLVM_VERSION_MAJOR >= 8) {
761             char intrin[32];
762             intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
763             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
764             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
765          }
766          if (type.width * type.length == 128) {
767             if (util_get_cpu_caps()->has_sse2) {
768                if (type.width == 8)
769                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
770                if (type.width == 16)
771                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
772             } else if (util_get_cpu_caps()->has_altivec) {
773                if (type.width == 8)
774                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
775                if (type.width == 16)
776                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
777             }
778          }
779          if (type.width * type.length == 256) {
780             if (util_get_cpu_caps()->has_avx2) {
781                if (type.width == 8)
782                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
783                if (type.width == 16)
784                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
785             }
786          }
787       }
788 
789       if (intrinsic)
790          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
791    }
792 
793    if(type.norm && !type.floating && !type.fixed) {
794       if (type.sign) {
795          uint64_t sign = (uint64_t)1 << (type.width - 1);
796          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
797          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
798          /* a_clamp_max is the maximum a for negative b,
799             a_clamp_min is the minimum a for positive b. */
800          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
801          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
803       } else {
804          /*
805           * This must match the llvm pattern for saturated unsigned sub.
806           * (lp_build_max_simple actually does the job with its current
807           * definition but do it explicitly here.)
808           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
809           * interfere with llvm's ability to recognize the pattern but seems
810           * a bit brittle.
811           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
812           */
813          LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
814          a = lp_build_select(bld, no_ov, a, b);
815       }
816    }
817 
818    if(LLVMIsConstant(a) && LLVMIsConstant(b))
819       if (type.floating)
820          res = LLVMConstFSub(a, b);
821       else
822          res = LLVMConstSub(a, b);
823    else
824       if (type.floating)
825          res = LLVMBuildFSub(builder, a, b, "");
826       else
827          res = LLVMBuildSub(builder, a, b, "");
828 
829    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
830       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
831 
832    return res;
833 }
834 
835 
836 
837 /**
838  * Normalized multiplication.
839  *
840  * There are several approaches for (using 8-bit normalized multiplication as
841  * an example):
842  *
843  * - alpha plus one
844  *
845  *     makes the following approximation to the division (Sree)
846  *
847  *       a*b/255 ~= (a*(b + 1)) >> 8
848  *
849  *     which is the fastest method that satisfies the following OpenGL criteria of
850  *
851  *       0*0 = 0 and 255*255 = 255
852  *
853  * - geometric series
854  *
855  *     takes the geometric series approximation to the division
856  *
857  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
858  *
859  *     in this case just the first two terms to fit in 16bit arithmetic
860  *
861  *       t/255 ~= (t + (t >> 8)) >> 8
862  *
863  *     note that just by itself it doesn't satisfy the OpenGL criteria, as it
864  *     yields 255*255 = 254, so the special case b = 255 must be accounted for,
865  *     or rounding must be used.
866  *
867  * - geometric series plus rounding
868  *
869  *     when using the geometric series division, instead of truncating the
870  *     result, use rounding in the approximation (Jim Blinn)
871  *
872  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
873  *
874  *     achieving exact results.
875  *
876  *
877  *
878  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
879  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
880  * @sa Michael Herf, The "double blend trick", May 2000,
881  *     http://www.stereopsis.com/doubleblend.html
882  */
883 LLVMValueRef
884 lp_build_mul_norm(struct gallivm_state *gallivm,
885                   struct lp_type wide_type,
886                   LLVMValueRef a, LLVMValueRef b)
887 {
888    LLVMBuilderRef builder = gallivm->builder;
889    struct lp_build_context bld;
890    unsigned n;
891    LLVMValueRef half;
892    LLVMValueRef ab;
893 
894    assert(!wide_type.floating);
895    assert(lp_check_value(wide_type, a));
896    assert(lp_check_value(wide_type, b));
897 
898    lp_build_context_init(&bld, gallivm, wide_type);
899 
900    n = wide_type.width / 2;
901    if (wide_type.sign) {
902       --n;
903    }
904 
905    /*
906     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
907     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
908     */
909 
910    /*
911     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
912     */
913 
914    ab = LLVMBuildMul(builder, a, b, "");
915    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
916 
917    /*
918     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
919     */
920 
921    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
922    if (wide_type.sign) {
923       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
924       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
925       half = lp_build_select(&bld, sign, minus_half, half);
926    }
927    ab = LLVMBuildAdd(builder, ab, half, "");
928 
929    /* Final division */
930    ab = lp_build_shr_imm(&bld, ab, n);
931 
932    return ab;
933 }
934 
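/*
 * Illustrative scalar sketch (exposition only; this helper is not part of
 * gallivm): the unsigned 8-bit instance of the approximation used above,
 * a*b/255 ~= (t + (t >> 8) + 0x80) >> 8 with t = a*b. It satisfies the
 * endpoint criteria 0*0 = 0, 255*255 = 255 and a*255 = a.
 */
static inline uint8_t
lp_example_mul_norm8(uint8_t a, uint8_t b)
{
   unsigned t = (unsigned)a * b;                  /* full 16-bit product */
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);  /* ~ t / 255 */
}
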
935 /**
936  * Generate a * b
937  */
938 LLVMValueRef
939 lp_build_mul(struct lp_build_context *bld,
940              LLVMValueRef a,
941              LLVMValueRef b)
942 {
943    LLVMBuilderRef builder = bld->gallivm->builder;
944    const struct lp_type type = bld->type;
945    LLVMValueRef shift;
946    LLVMValueRef res;
947 
948    assert(lp_check_value(type, a));
949    assert(lp_check_value(type, b));
950 
951    if(a == bld->zero)
952       return bld->zero;
953    if(a == bld->one)
954       return b;
955    if(b == bld->zero)
956       return bld->zero;
957    if(b == bld->one)
958       return a;
959    if(a == bld->undef || b == bld->undef)
960       return bld->undef;
961 
962    if (!type.floating && !type.fixed && type.norm) {
963       struct lp_type wide_type = lp_wider_type(type);
964       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
965 
966       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
967       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
968 
969       /* PMULLW, PSRLW, PADDW */
970       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
971       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
972 
973       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
974 
975       return ab;
976    }
977 
978    if(type.fixed)
979       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
980    else
981       shift = NULL;
982 
983    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
984       if (type.floating)
985          res = LLVMConstFMul(a, b);
986       else
987          res = LLVMConstMul(a, b);
988       if(shift) {
989          if(type.sign)
990             res = LLVMConstAShr(res, shift);
991          else
992             res = LLVMConstLShr(res, shift);
993       }
994    }
995    else {
996       if (type.floating)
997          res = LLVMBuildFMul(builder, a, b, "");
998       else
999          res = LLVMBuildMul(builder, a, b, "");
1000       if(shift) {
1001          if(type.sign)
1002             res = LLVMBuildAShr(builder, res, shift, "");
1003          else
1004             res = LLVMBuildLShr(builder, res, shift, "");
1005       }
1006    }
1007 
1008    return res;
1009 }
1010 
1011 /*
1012  * Widening mul, valid for 32x32 bit -> 64bit only.
1013  * Result is low 32bits, high bits returned in res_hi.
1014  *
1015  * Emits code that is meant to be compiled for the host CPU.
1016  */
1017 LLVMValueRef
1018 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1019                          LLVMValueRef a,
1020                          LLVMValueRef b,
1021                          LLVMValueRef *res_hi)
1022 {
1023    struct gallivm_state *gallivm = bld->gallivm;
1024    LLVMBuilderRef builder = gallivm->builder;
1025 
1026    assert(bld->type.width == 32);
1027    assert(bld->type.floating == 0);
1028    assert(bld->type.fixed == 0);
1029    assert(bld->type.norm == 0);
1030 
1031    /*
1032     * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1033     * for x86 simd is atrocious (even if the high bits weren't required),
1034     * trying to handle real 64bit inputs (which of course can't happen due
1035     * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1036     * apparently llvm does not recognize this widening mul). This includes 6
1037     * (instead of 2) pmuludq instructions plus extra adds and shifts.
1038     * The same story applies to signed mul, albeit fixing this requires sse41.
1039     * https://llvm.org/bugs/show_bug.cgi?id=30845
1040     * So, whip up our own code, albeit only for length 4 and 8 (which
1041     * should be good enough)...
1042     * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1043     * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1044     * for signed), which the fallback code does not, without this llvm
1045     * will likely still produce atrocious code.
1046     */
1047    if (LLVM_VERSION_MAJOR < 7 &&
1048        (bld->type.length == 4 || bld->type.length == 8) &&
1049        ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1050         util_get_cpu_caps()->has_sse4_1)) {
1051       const char *intrinsic = NULL;
1052       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1053       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1054       struct lp_type type_wide = lp_wider_type(bld->type);
1055       LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1056       unsigned i;
1057       for (i = 0; i < bld->type.length; i += 2) {
1058          shuf[i] = lp_build_const_int32(gallivm, i+1);
1059          shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1060       }
1061       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1062       aeven = a;
1063       beven = b;
1064       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1065       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1066 
1067       if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1068          if (bld->type.sign) {
1069             intrinsic = "llvm.x86.avx2.pmul.dq";
1070          } else {
1071             intrinsic = "llvm.x86.avx2.pmulu.dq";
1072          }
1073          muleven = lp_build_intrinsic_binary(builder, intrinsic,
1074                                              wider_type, aeven, beven);
1075          mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1076                                             wider_type, aodd, bodd);
1077       }
1078       else {
1079          /* for consistent naming look elsewhere... */
1080          if (bld->type.sign) {
1081             intrinsic = "llvm.x86.sse41.pmuldq";
1082          } else {
1083             intrinsic = "llvm.x86.sse2.pmulu.dq";
1084          }
1085          /*
1086           * XXX If we only have AVX but not AVX2 this is a pain.
1087           * lp_build_intrinsic_binary_anylength() can't handle it
1088           * (due to src and dst type not being identical).
1089           */
1090          if (bld->type.length == 8) {
1091             LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1092             LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1093             LLVMValueRef muleven2[2], mulodd2[2];
1094             struct lp_type type_wide_half = type_wide;
1095             LLVMTypeRef wtype_half;
1096             type_wide_half.length = 2;
1097             wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1098             aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1099             aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1100             bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1101             bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1102             aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1103             aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1104             boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1105             boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1106             muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1107                                                     wtype_half, aevenlo, bevenlo);
1108             mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1109                                                    wtype_half, aoddlo, boddlo);
1110             muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1111                                                     wtype_half, aevenhi, bevenhi);
1112             mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1113                                                    wtype_half, aoddhi, boddhi);
1114             muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1115             mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1116 
1117          }
1118          else {
1119             muleven = lp_build_intrinsic_binary(builder, intrinsic,
1120                                                 wider_type, aeven, beven);
1121             mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1122                                                wider_type, aodd, bodd);
1123          }
1124       }
1125       muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1126       mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1127 
1128       for (i = 0; i < bld->type.length; i += 2) {
1129          shuf[i] = lp_build_const_int32(gallivm, i + 1);
1130          shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1131       }
1132       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1133       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1134 
1135       for (i = 0; i < bld->type.length; i += 2) {
1136          shuf[i] = lp_build_const_int32(gallivm, i);
1137          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1138       }
1139       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140       return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1141    }
1142    else {
1143       return lp_build_mul_32_lohi(bld, a, b, res_hi);
1144    }
1145 }
1146 
1147 
1148 /*
1149  * Widening mul, valid for <= 32 (8, 16, 32) -> 64
1150  * Result is low N bits, high bits returned in res_hi.
1151  *
1152  * Emits generic code.
1153  */
1154 LLVMValueRef
1155 lp_build_mul_32_lohi(struct lp_build_context *bld,
1156                      LLVMValueRef a,
1157                      LLVMValueRef b,
1158                      LLVMValueRef *res_hi)
1159 {
1160    struct gallivm_state *gallivm = bld->gallivm;
1161    LLVMBuilderRef builder = gallivm->builder;
1162    LLVMValueRef tmp, shift, res_lo;
1163    struct lp_type type_tmp;
1164    LLVMTypeRef wide_type, narrow_type;
1165 
1166    type_tmp = bld->type;
1167    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1168    if (bld->type.width < 32)
1169       type_tmp.width = 32;
1170    else
1171       type_tmp.width *= 2;
1172    wide_type = lp_build_vec_type(gallivm, type_tmp);
1173    shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1174 
1175    if (bld->type.sign) {
1176       a = LLVMBuildSExt(builder, a, wide_type, "");
1177       b = LLVMBuildSExt(builder, b, wide_type, "");
1178    } else {
1179       a = LLVMBuildZExt(builder, a, wide_type, "");
1180       b = LLVMBuildZExt(builder, b, wide_type, "");
1181    }
1182    tmp = LLVMBuildMul(builder, a, b, "");
1183 
1184    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1185 
1186    /* Since we truncate anyway, LShr and AShr are equivalent. */
1187    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1188    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1189 
1190    return res_lo;
1191 }
1192 
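/*
 * Illustrative scalar sketch (exposition only; this helper is not part of
 * gallivm): the per-element contract of the lo/hi widening multiply above,
 * shown for the unsigned 32 x 32 -> 64 bit case.
 */
static inline uint32_t
lp_example_mul_32_lohi(uint32_t a, uint32_t b, uint32_t *hi)
{
   uint64_t t = (uint64_t)a * b;   /* widen, then multiply */
   *hi = (uint32_t)(t >> 32);      /* high 32 bits of the product */
   return (uint32_t)t;             /* low 32 bits of the product */
}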
1193 
1194 /* a * b + c */
1195 LLVMValueRef
1196 lp_build_mad(struct lp_build_context *bld,
1197              LLVMValueRef a,
1198              LLVMValueRef b,
1199              LLVMValueRef c)
1200 {
1201    const struct lp_type type = bld->type;
1202    if (type.floating) {
1203       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1204    } else {
1205       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1206    }
1207 }
1208 
1209 
1210 /**
1211  * Small vector x scale multiplication optimization.
1212  */
1213 LLVMValueRef
1214 lp_build_mul_imm(struct lp_build_context *bld,
1215                  LLVMValueRef a,
1216                  int b)
1217 {
1218    LLVMBuilderRef builder = bld->gallivm->builder;
1219    LLVMValueRef factor;
1220 
1221    assert(lp_check_value(bld->type, a));
1222 
1223    if(b == 0)
1224       return bld->zero;
1225 
1226    if(b == 1)
1227       return a;
1228 
1229    if(b == -1)
1230       return lp_build_negate(bld, a);
1231 
1232    if(b == 2 && bld->type.floating)
1233       return lp_build_add(bld, a, a);
1234 
1235    if(util_is_power_of_two_or_zero(b)) {
1236       unsigned shift = ffs(b) - 1;
1237 
1238       if(bld->type.floating) {
1239 #if 0
1240          /*
1241           * Power of two multiplication by directly manipulating the exponent.
1242           *
1243           * XXX: This might not always be faster, it will introduce a small error
1244           * for multiplication by zero, and it will produce wrong results
1245           * for Inf and NaN.
1246           */
1247          unsigned mantissa = lp_mantissa(bld->type);
1248          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1249          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1250          a = LLVMBuildAdd(builder, a, factor, "");
1251          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1252          return a;
1253 #endif
1254       }
1255       else {
1256          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1257          return LLVMBuildShl(builder, a, factor, "");
1258       }
1259    }
1260 
1261    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1262    return lp_build_mul(bld, a, factor);
1263 }
1264 
1265 
1266 /**
1267  * Generate a / b
1268  */
1269 LLVMValueRef
1270 lp_build_div(struct lp_build_context *bld,
1271              LLVMValueRef a,
1272              LLVMValueRef b)
1273 {
1274    LLVMBuilderRef builder = bld->gallivm->builder;
1275    const struct lp_type type = bld->type;
1276 
1277    assert(lp_check_value(type, a));
1278    assert(lp_check_value(type, b));
1279 
1280    if(a == bld->zero)
1281       return bld->zero;
1282    if(a == bld->one && type.floating)
1283       return lp_build_rcp(bld, b);
1284    if(b == bld->zero)
1285       return bld->undef;
1286    if(b == bld->one)
1287       return a;
1288    if(a == bld->undef || b == bld->undef)
1289       return bld->undef;
1290 
1291    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1292       if (type.floating)
1293          return LLVMConstFDiv(a, b);
1294       else if (type.sign)
1295          return LLVMConstSDiv(a, b);
1296       else
1297          return LLVMConstUDiv(a, b);
1298    }
1299 
1300    /* fast rcp is disabled (just uses div), so it makes no sense to try that */
1301    if(FALSE &&
1302       ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1303        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1304       type.floating)
1305       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1306 
1307    if (type.floating)
1308       return LLVMBuildFDiv(builder, a, b, "");
1309    else if (type.sign)
1310       return LLVMBuildSDiv(builder, a, b, "");
1311    else
1312       return LLVMBuildUDiv(builder, a, b, "");
1313 }
1314 
1315 
1316 /**
1317  * Linear interpolation helper.
1318  *
1319  * @param normalized whether we are interpolating normalized values,
1320  *        encoded in normalized integers, twice as wide.
1321  *
1322  * @sa http://www.stereopsis.com/doubleblend.html
1323  */
1324 static inline LLVMValueRef
1325 lp_build_lerp_simple(struct lp_build_context *bld,
1326                      LLVMValueRef x,
1327                      LLVMValueRef v0,
1328                      LLVMValueRef v1,
1329                      unsigned flags)
1330 {
1331    unsigned half_width = bld->type.width/2;
1332    LLVMBuilderRef builder = bld->gallivm->builder;
1333    LLVMValueRef delta;
1334    LLVMValueRef res;
1335 
1336    assert(lp_check_value(bld->type, x));
1337    assert(lp_check_value(bld->type, v0));
1338    assert(lp_check_value(bld->type, v1));
1339 
1340    delta = lp_build_sub(bld, v1, v0);
1341 
1342    if (bld->type.floating) {
1343       assert(flags == 0);
1344       return lp_build_mad(bld, x, delta, v0);
1345    }
1346 
1347    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1348       if (!bld->type.sign) {
1349          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1350             /*
1351              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1352              * most-significant bit to the least-significant bit, so that
1353              * later we can just divide by 2**n instead of 2**n - 1.
1354              */
1355 
1356             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1357          }
1358 
1359          /* (x * delta) >> n */
1360 	 /*
1361 	  * For this multiply, higher internal precision is required to pass CTS,
1362 	  * the most efficient path to that is pmulhrsw on ssse3 and above.
1363 	  * This could be opencoded on other arches if conformance was required.
1364 	  */
1365          if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1366             res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1367             res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1368          } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1369             res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1370             res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1371          } else {
1372             res = lp_build_mul(bld, x, delta);
1373             res = lp_build_shr_imm(bld, res, half_width);
1374          }
1375       } else {
1376          /*
1377           * The rescaling trick above doesn't work for signed numbers, so
1378           * use the 2**n - 1 division approximation in lp_build_mul_norm
1379           * instead.
1380           */
1381          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1382          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1383       }
1384    } else {
1385       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1386       res = lp_build_mul(bld, x, delta);
1387    }
1388 
1389    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1390       /*
1391        * At this point both res and v0 only use the lower half of the bits,
1392        * the rest is zero. Instead of add / mask, do add with half wide type.
1393        */
1394       struct lp_type narrow_type;
1395       struct lp_build_context narrow_bld;
1396 
1397       memset(&narrow_type, 0, sizeof narrow_type);
1398       narrow_type.sign   = bld->type.sign;
1399       narrow_type.width  = bld->type.width/2;
1400       narrow_type.length = bld->type.length*2;
1401 
1402       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1403       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1404       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1405       res = lp_build_add(&narrow_bld, v0, res);
1406       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1407    } else {
1408       res = lp_build_add(bld, v0, res);
1409 
1410       if (bld->type.fixed) {
1411          /*
1412           * We need to mask out the high order bits when lerping 8bit
1413           * normalized colors stored in 16 bits.
1414           */
1415          /* XXX: This step is necessary for lerping 8bit colors stored in
1416           * 16 bits, but it will be wrong for true fixed point use cases.
1417           * Basically we need a more powerful lp_type, capable of further
1418           * distinguishing the values interpretation from the value storage.
1419           */
1420          LLVMValueRef low_bits;
1421          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1422          res = LLVMBuildAnd(builder, res, low_bits, "");
1423       }
1424    }
1425 
1426    return res;
1427 }
1428 
1429 
1430 /**
1431  * Linear interpolation.
1432  */
1433 LLVMValueRef
1434 lp_build_lerp(struct lp_build_context *bld,
1435               LLVMValueRef x,
1436               LLVMValueRef v0,
1437               LLVMValueRef v1,
1438               unsigned flags)
1439 {
1440    const struct lp_type type = bld->type;
1441    LLVMValueRef res;
1442 
1443    assert(lp_check_value(type, x));
1444    assert(lp_check_value(type, v0));
1445    assert(lp_check_value(type, v1));
1446 
1447    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1448 
1449    if (type.norm) {
1450       struct lp_type wide_type;
1451       struct lp_build_context wide_bld;
1452       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1453 
1454       assert(type.length >= 2);
1455 
1456       /*
1457        * Create a wider integer type, enough to hold the
1458        * intermediate result of the multiplication.
1459        */
1460       memset(&wide_type, 0, sizeof wide_type);
1461       wide_type.sign   = type.sign;
1462       wide_type.width  = type.width*2;
1463       wide_type.length = type.length/2;
1464 
1465       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1466 
1467       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1468       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1469       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1470 
1471       /*
1472        * Lerp both halves.
1473        */
1474 
1475       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1476 
1477       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1478       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1479 
1480       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1481    } else {
1482       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1483    }
1484 
1485    return res;
1486 }
1487 
1488 
1489 /**
1490  * Bilinear interpolation.
1491  *
1492  * Value indices are in v_{yx} order (first index is y, second is x).
1493  */
1494 LLVMValueRef
1495 lp_build_lerp_2d(struct lp_build_context *bld,
1496                  LLVMValueRef x,
1497                  LLVMValueRef y,
1498                  LLVMValueRef v00,
1499                  LLVMValueRef v01,
1500                  LLVMValueRef v10,
1501                  LLVMValueRef v11,
1502                  unsigned flags)
1503 {
1504    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1505    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1506    return lp_build_lerp(bld, y, v0, v1, flags);
1507 }
1508 
1509 
1510 LLVMValueRef
1511 lp_build_lerp_3d(struct lp_build_context *bld,
1512                  LLVMValueRef x,
1513                  LLVMValueRef y,
1514                  LLVMValueRef z,
1515                  LLVMValueRef v000,
1516                  LLVMValueRef v001,
1517                  LLVMValueRef v010,
1518                  LLVMValueRef v011,
1519                  LLVMValueRef v100,
1520                  LLVMValueRef v101,
1521                  LLVMValueRef v110,
1522                  LLVMValueRef v111,
1523                  unsigned flags)
1524 {
1525    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1526    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1527    return lp_build_lerp(bld, z, v0, v1, flags);
1528 }
1529 
1530 
1531 /**
1532  * Generate min(a, b)
1533  * Do checks for special cases but not for NaNs.
1534  */
1535 LLVMValueRef
1536 lp_build_min(struct lp_build_context *bld,
1537              LLVMValueRef a,
1538              LLVMValueRef b)
1539 {
1540    assert(lp_check_value(bld->type, a));
1541    assert(lp_check_value(bld->type, b));
1542 
1543    if(a == bld->undef || b == bld->undef)
1544       return bld->undef;
1545 
1546    if(a == b)
1547       return a;
1548 
1549    if (bld->type.norm) {
1550       if (!bld->type.sign) {
1551          if (a == bld->zero || b == bld->zero) {
1552             return bld->zero;
1553          }
1554       }
1555       if(a == bld->one)
1556          return b;
1557       if(b == bld->one)
1558          return a;
1559    }
1560 
1561    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1562 }
1563 
1564 /**
1565  * Generate min(a, b)
1566  * NaN's are handled according to the behavior specified by the
1567  * nan_behavior argument.
1568  */
1569 LLVMValueRef
1570 lp_build_min_ext(struct lp_build_context *bld,
1571                  LLVMValueRef a,
1572                  LLVMValueRef b,
1573                  enum gallivm_nan_behavior nan_behavior)
1574 {
1575    assert(lp_check_value(bld->type, a));
1576    assert(lp_check_value(bld->type, b));
1577 
1578    if(a == bld->undef || b == bld->undef)
1579       return bld->undef;
1580 
1581    if(a == b)
1582       return a;
1583 
1584    if (bld->type.norm) {
1585       if (!bld->type.sign) {
1586          if (a == bld->zero || b == bld->zero) {
1587             return bld->zero;
1588          }
1589       }
1590       if(a == bld->one)
1591          return b;
1592       if(b == bld->one)
1593          return a;
1594    }
1595 
1596    return lp_build_min_simple(bld, a, b, nan_behavior);
1597 }
1598 
1599 /**
1600  * Generate max(a, b)
1601  * Do checks for special cases, but NaN behavior is undefined.
1602  */
1603 LLVMValueRef
1604 lp_build_max(struct lp_build_context *bld,
1605              LLVMValueRef a,
1606              LLVMValueRef b)
1607 {
1608    assert(lp_check_value(bld->type, a));
1609    assert(lp_check_value(bld->type, b));
1610 
1611    if(a == bld->undef || b == bld->undef)
1612       return bld->undef;
1613 
1614    if(a == b)
1615       return a;
1616 
1617    if(bld->type.norm) {
1618       if(a == bld->one || b == bld->one)
1619          return bld->one;
1620       if (!bld->type.sign) {
1621          if (a == bld->zero) {
1622             return b;
1623          }
1624          if (b == bld->zero) {
1625             return a;
1626          }
1627       }
1628    }
1629 
1630    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1631 }
1632 
1633 
1634 /**
1635  * Generate max(a, b)
1636  * Checks for special cases.
1637  * NaN's are handled according to the behavior specified by the
1638  * nan_behavior argument.
1639  */
1640 LLVMValueRef
1641 lp_build_max_ext(struct lp_build_context *bld,
1642                   LLVMValueRef a,
1643                   LLVMValueRef b,
1644                   enum gallivm_nan_behavior nan_behavior)
1645 {
1646    assert(lp_check_value(bld->type, a));
1647    assert(lp_check_value(bld->type, b));
1648 
1649    if(a == bld->undef || b == bld->undef)
1650       return bld->undef;
1651 
1652    if(a == b)
1653       return a;
1654 
1655    if(bld->type.norm) {
1656       if(a == bld->one || b == bld->one)
1657          return bld->one;
1658       if (!bld->type.sign) {
1659          if (a == bld->zero) {
1660             return b;
1661          }
1662          if (b == bld->zero) {
1663             return a;
1664          }
1665       }
1666    }
1667 
1668    return lp_build_max_simple(bld, a, b, nan_behavior);
1669 }
1670 
1671 /**
1672  * Generate clamp(a, min, max)
1673  * NaN behavior (for any of a, min, max) is undefined.
1674  * Do checks for special cases.
1675  */
1676 LLVMValueRef
1677 lp_build_clamp(struct lp_build_context *bld,
1678                LLVMValueRef a,
1679                LLVMValueRef min,
1680                LLVMValueRef max)
1681 {
1682    assert(lp_check_value(bld->type, a));
1683    assert(lp_check_value(bld->type, min));
1684    assert(lp_check_value(bld->type, max));
1685 
1686    a = lp_build_min(bld, a, max);
1687    a = lp_build_max(bld, a, min);
1688    return a;
1689 }
1690 
1691 
1692 /**
1693  * Generate clamp(a, 0, 1)
1694  * A NaN will get converted to zero.
1695  */
1696 LLVMValueRef
1697 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1698                                 LLVMValueRef a)
1699 {
1700    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1701    a = lp_build_min(bld, a, bld->one);
1702    return a;
1703 }
1704 
1705 
1706 /**
1707  * Generate abs(a)
1708  */
1709 LLVMValueRef
1710 lp_build_abs(struct lp_build_context *bld,
1711              LLVMValueRef a)
1712 {
1713    LLVMBuilderRef builder = bld->gallivm->builder;
1714    const struct lp_type type = bld->type;
1715    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1716 
1717    assert(lp_check_value(type, a));
1718 
1719    if(!type.sign)
1720       return a;
1721 
1722    if(type.floating) {
1723       char intrinsic[32];
1724       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1725       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1726    }
1727 
1728    if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1729       switch(type.width) {
1730       case 8:
1731          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1732       case 16:
1733          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1734       case 32:
1735          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1736       }
1737    }
1738    else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1739       switch(type.width) {
1740       case 8:
1741          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1742       case 16:
1743          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1744       case 32:
1745          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1746       }
1747    }
1748 
1749    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1750                           a, LLVMBuildNeg(builder, a, ""));
1751 }
1752 
1753 
1754 LLVMValueRef
1755 lp_build_negate(struct lp_build_context *bld,
1756                 LLVMValueRef a)
1757 {
1758    LLVMBuilderRef builder = bld->gallivm->builder;
1759 
1760    assert(lp_check_value(bld->type, a));
1761 
1762    if (bld->type.floating)
1763       a = LLVMBuildFNeg(builder, a, "");
1764    else
1765       a = LLVMBuildNeg(builder, a, "");
1766 
1767    return a;
1768 }
1769 
1770 
1771 /** Return -1, 0 or +1 depending on the sign of a */
1772 LLVMValueRef
1773 lp_build_sgn(struct lp_build_context *bld,
1774              LLVMValueRef a)
1775 {
1776    LLVMBuilderRef builder = bld->gallivm->builder;
1777    const struct lp_type type = bld->type;
1778    LLVMValueRef cond;
1779    LLVMValueRef res;
1780 
1781    assert(lp_check_value(type, a));
1782 
1783    /* Handle non-zero case */
1784    if(!type.sign) {
1785       /* if not zero then sign must be positive */
1786       res = bld->one;
1787    }
1788    else if(type.floating) {
1789       LLVMTypeRef vec_type;
1790       LLVMTypeRef int_type;
1791       LLVMValueRef mask;
1792       LLVMValueRef sign;
1793       LLVMValueRef one;
1794       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1795 
1796       int_type = lp_build_int_vec_type(bld->gallivm, type);
1797       vec_type = lp_build_vec_type(bld->gallivm, type);
1798       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1799 
1800       /* OR the sign bit into the bit pattern of the constant 1.0, giving +1.0 or -1.0 */
1801       sign = LLVMBuildBitCast(builder, a, int_type, "");
1802       sign = LLVMBuildAnd(builder, sign, mask, "");
1803       one = LLVMConstBitCast(bld->one, int_type);
1804       res = LLVMBuildOr(builder, sign, one, "");
1805       res = LLVMBuildBitCast(builder, res, vec_type, "");
1806    }
1807    else
1808    {
1809       /* signed int/norm/fixed point */
1810       /* could use psign with sse3 and appropriate vectors here */
1811       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1812       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1813       res = lp_build_select(bld, cond, bld->one, minus_one);
1814    }
1815 
1816    /* Handle zero */
1817    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1818    res = lp_build_select(bld, cond, bld->zero, res);
1819 
1820    return res;
1821 }
1822 
1823 
1824 /**
1825  * Set the sign of float vector 'a' according to 'sign'.
1826  * If sign==0, return abs(a).
1827  * If sign==1, return -abs(a).
1828  * Other values for sign produce undefined results.
1829  */
1830 LLVMValueRef
1831 lp_build_set_sign(struct lp_build_context *bld,
1832                   LLVMValueRef a, LLVMValueRef sign)
1833 {
1834    LLVMBuilderRef builder = bld->gallivm->builder;
1835    const struct lp_type type = bld->type;
1836    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1837    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1838    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1839    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1840                              ~((unsigned long long) 1 << (type.width - 1)));
1841    LLVMValueRef val, res;
1842 
1843    assert(type.floating);
1844    assert(lp_check_value(type, a));
1845 
1846    /* val = reinterpret_cast<int>(a) */
1847    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1848    /* val = val & mask */
1849    val = LLVMBuildAnd(builder, val, mask, "");
1850    /* sign = sign << shift */
1851    sign = LLVMBuildShl(builder, sign, shift, "");
1852    /* res = val | sign */
1853    res = LLVMBuildOr(builder, val, sign, "");
1854    /* res = reinterpret_cast<float>(res) */
1855    res = LLVMBuildBitCast(builder, res, vec_type, "");
1856 
1857    return res;
1858 }
1859 
1860 
1861 /**
1862  * Convert vector of (or scalar) int to vector of (or scalar) float.
1863  */
1864 LLVMValueRef
1865 lp_build_int_to_float(struct lp_build_context *bld,
1866                       LLVMValueRef a)
1867 {
1868    LLVMBuilderRef builder = bld->gallivm->builder;
1869    const struct lp_type type = bld->type;
1870    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1871 
1872    assert(type.floating);
1873 
1874    return LLVMBuildSIToFP(builder, a, vec_type, "");
1875 }
1876 
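/**
 * Return TRUE if the CPU can round a float vector of the given type
 * directly, i.e. whether lp_build_round_arch() below can be used.
 */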
1877 static boolean
1878 arch_rounding_available(const struct lp_type type)
1879 {
1880    if ((util_get_cpu_caps()->has_sse4_1 &&
1881        (type.length == 1 || type.width*type.length == 128)) ||
1882        (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1883        (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1884       return TRUE;
1885    else if ((util_get_cpu_caps()->has_altivec &&
1886             (type.width == 32 && type.length == 4)))
1887       return TRUE;
1888    else if (util_get_cpu_caps()->has_neon)
1889       return TRUE;
1890    else if (util_get_cpu_caps()->family == CPU_S390X)
1891       return TRUE;
1892 
1893    return FALSE;
1894 }
1895 
1896 enum lp_build_round_mode
1897 {
1898    LP_BUILD_ROUND_NEAREST = 0,
1899    LP_BUILD_ROUND_FLOOR = 1,
1900    LP_BUILD_ROUND_CEIL = 2,
1901    LP_BUILD_ROUND_TRUNCATE = 3
1902 };
1903 
1904 static inline LLVMValueRef
1905 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1906                              LLVMValueRef a)
1907 {
1908    LLVMBuilderRef builder = bld->gallivm->builder;
1909    const struct lp_type type = bld->type;
1910    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1911    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1912    const char *intrinsic;
1913    LLVMValueRef res;
1914 
1915    assert(type.floating);
1916    /* using the double precision conversions is a bit more complicated */
1917    assert(type.width == 32);
1918 
1919    assert(lp_check_value(type, a));
1920    assert(util_get_cpu_caps()->has_sse2);
1921 
1922    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1923    if (type.length == 1) {
1924       LLVMTypeRef vec_type;
1925       LLVMValueRef undef;
1926       LLVMValueRef arg;
1927       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1928 
1929       vec_type = LLVMVectorType(bld->elem_type, 4);
1930 
1931       intrinsic = "llvm.x86.sse.cvtss2si";
1932 
1933       undef = LLVMGetUndef(vec_type);
1934 
1935       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1936 
1937       res = lp_build_intrinsic_unary(builder, intrinsic,
1938                                      ret_type, arg);
1939    }
1940    else {
1941       if (type.width * type.length == 128) {
1942          intrinsic = "llvm.x86.sse2.cvtps2dq";
1943       }
1944       else {
1945          assert(type.width*type.length == 256);
1946          assert(util_get_cpu_caps()->has_avx);
1947 
1948          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1949       }
1950       res = lp_build_intrinsic_unary(builder, intrinsic,
1951                                      ret_type, a);
1952    }
1953 
1954    return res;
1955 }
1956 
1957 
1958 /* Round a float (vector) to integer using AltiVec vrfi* instructions,
1959  * according to 'mode'. */
1960 static inline LLVMValueRef
1961 lp_build_round_altivec(struct lp_build_context *bld,
1962                        LLVMValueRef a,
1963                        enum lp_build_round_mode mode)
1964 {
1965    LLVMBuilderRef builder = bld->gallivm->builder;
1966    const struct lp_type type = bld->type;
1967    const char *intrinsic = NULL;
1968 
1969    assert(type.floating);
1970 
1971    assert(lp_check_value(type, a));
1972    assert(util_get_cpu_caps()->has_altivec);
1973 
1974    (void)type;
1975 
1976    switch (mode) {
1977    case LP_BUILD_ROUND_NEAREST:
1978       intrinsic = "llvm.ppc.altivec.vrfin";
1979       break;
1980    case LP_BUILD_ROUND_FLOOR:
1981       intrinsic = "llvm.ppc.altivec.vrfim";
1982       break;
1983    case LP_BUILD_ROUND_CEIL:
1984       intrinsic = "llvm.ppc.altivec.vrfip";
1985       break;
1986    case LP_BUILD_ROUND_TRUNCATE:
1987       intrinsic = "llvm.ppc.altivec.vrfiz";
1988       break;
1989    }
1990 
1991    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1992 }
1993 
1994 static inline LLVMValueRef
1995 lp_build_round_arch(struct lp_build_context *bld,
1996                     LLVMValueRef a,
1997                     enum lp_build_round_mode mode)
1998 {
1999    if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon ||
2000        util_get_cpu_caps()->family == CPU_S390X) {
2001       LLVMBuilderRef builder = bld->gallivm->builder;
2002       const struct lp_type type = bld->type;
2003       const char *intrinsic_root;
2004       char intrinsic[32];
2005 
2006       assert(type.floating);
2007       assert(lp_check_value(type, a));
2008       (void)type;
2009 
2010       switch (mode) {
2011       case LP_BUILD_ROUND_NEAREST:
2012          intrinsic_root = "llvm.nearbyint";
2013          break;
2014       case LP_BUILD_ROUND_FLOOR:
2015          intrinsic_root = "llvm.floor";
2016          break;
2017       case LP_BUILD_ROUND_CEIL:
2018          intrinsic_root = "llvm.ceil";
2019          break;
2020       case LP_BUILD_ROUND_TRUNCATE:
2021          intrinsic_root = "llvm.trunc";
2022          break;
2023       default:
2024          unreachable("unhandled lp_build_round_mode");
2025       }
2026 
2027       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2028       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2029    }
2030    else /* (util_get_cpu_caps()->has_altivec) */
2031      return lp_build_round_altivec(bld, a, mode);
2032 }
2033 
2034 /**
2035  * Return the integer part of a float (vector) value (== round toward zero).
2036  * The returned value is a float (vector).
2037  * Ex: trunc(-1.5) = -1.0
2038  */
2039 LLVMValueRef
2040 lp_build_trunc(struct lp_build_context *bld,
2041                LLVMValueRef a)
2042 {
2043    LLVMBuilderRef builder = bld->gallivm->builder;
2044    const struct lp_type type = bld->type;
2045 
2046    assert(type.floating);
2047    assert(lp_check_value(type, a));
2048 
2049    if (type.width == 16) {
2050       char intrinsic[64];
2051       lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2052       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2053    }
2054 
2055    if (arch_rounding_available(type)) {
2056       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2057    }
2058    else {
2059       const struct lp_type type = bld->type;
2060       struct lp_type inttype;
2061       struct lp_build_context intbld;
2062       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2063       LLVMValueRef trunc, res, anosign, mask;
2064       LLVMTypeRef int_vec_type = bld->int_vec_type;
2065       LLVMTypeRef vec_type = bld->vec_type;
2066 
2067       inttype = type;
2068       inttype.floating = 0;
2069       lp_build_context_init(&intbld, bld->gallivm, inttype);
2070 
2071       /* round by truncation */
2072       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2073       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2074 
2075       /* mask out sign bit */
2076       anosign = lp_build_abs(bld, a);
2077       /*
2078        * mask out all values if anosign > 2^24
2079        * This should work both for large ints (all rounding is no-op for them
2080        * because such floats are always exact) as well as special cases like
2081        * NaNs, Infs (taking advantage of the fact they use max exponent).
2082        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2083        */
2084       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2085       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2086       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2087       return lp_build_select(bld, mask, a, res);
2088    }
2089 }
2090 
2091 
2092 /**
2093  * Return float (vector) rounded to nearest integer (vector).  The returned
2094  * value is a float (vector).
2095  * Ex: round(0.9) = 1.0
2096  * Ex: round(-1.5) = -2.0
2097  */
2098 LLVMValueRef
2099 lp_build_round(struct lp_build_context *bld,
2100                LLVMValueRef a)
2101 {
2102    LLVMBuilderRef builder = bld->gallivm->builder;
2103    const struct lp_type type = bld->type;
2104 
2105    assert(type.floating);
2106    assert(lp_check_value(type, a));
2107 
2108    if (type.width == 16) {
2109       char intrinsic[64];
2110       lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2111       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2112    }
2113 
2114    if (arch_rounding_available(type)) {
2115       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2116    }
2117    else {
2118       const struct lp_type type = bld->type;
2119       struct lp_type inttype;
2120       struct lp_build_context intbld;
2121       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2122       LLVMValueRef res, anosign, mask;
2123       LLVMTypeRef int_vec_type = bld->int_vec_type;
2124       LLVMTypeRef vec_type = bld->vec_type;
2125 
2126       inttype = type;
2127       inttype.floating = 0;
2128       lp_build_context_init(&intbld, bld->gallivm, inttype);
2129 
2130       res = lp_build_iround(bld, a);
2131       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2132 
2133       /* mask out sign bit */
2134       anosign = lp_build_abs(bld, a);
2135       /*
2136        * mask out all values if anosign > 2^24
2137        * This should work both for large ints (all rounding is no-op for them
2138        * because such floats are always exact) as well as special cases like
2139        * NaNs, Infs (taking advantage of the fact they use max exponent).
2140        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2141        */
2142       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2143       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2144       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2145       return lp_build_select(bld, mask, a, res);
2146    }
2147 }
2148 
2149 
2150 /**
2151  * Return floor of float (vector), result is a float (vector)
2152  * Ex: floor(1.1) = 1.0
2153  * Ex: floor(-1.1) = -2.0
2154  */
2155 LLVMValueRef
2156 lp_build_floor(struct lp_build_context *bld,
2157                LLVMValueRef a)
2158 {
2159    LLVMBuilderRef builder = bld->gallivm->builder;
2160    const struct lp_type type = bld->type;
2161 
2162    assert(type.floating);
2163    assert(lp_check_value(type, a));
2164 
2165    if (arch_rounding_available(type)) {
2166       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2167    }
2168    else {
2169       const struct lp_type type = bld->type;
2170       struct lp_type inttype;
2171       struct lp_build_context intbld;
2172       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173       LLVMValueRef trunc, res, anosign, mask;
2174       LLVMTypeRef int_vec_type = bld->int_vec_type;
2175       LLVMTypeRef vec_type = bld->vec_type;
2176 
2177       if (type.width != 32) {
2178          char intrinsic[32];
2179          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2180          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2181       }
2182 
2183       assert(type.width == 32); /* might want to handle doubles at some point */
2184 
2185       inttype = type;
2186       inttype.floating = 0;
2187       lp_build_context_init(&intbld, bld->gallivm, inttype);
2188 
2189       /* round by truncation */
2190       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2191       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2192 
2193       if (type.sign) {
2194          LLVMValueRef tmp;
2195 
2196          /*
2197           * fix values if rounding is wrong (for non-special cases)
2198           * - this is the case if trunc > a
2199           */
2200          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2201          /* tmp = trunc > a ? 1.0 : 0.0 */
2202          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2203          tmp = lp_build_and(&intbld, mask, tmp);
2204          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2205          res = lp_build_sub(bld, res, tmp);
2206       }
2207 
2208       /* mask out sign bit */
2209       anosign = lp_build_abs(bld, a);
2210       /*
2211        * mask out all values if anosign > 2^24
2212        * This should work both for large ints (all rounding is no-op for them
2213        * because such floats are always exact) as well as special cases like
2214        * NaNs, Infs (taking advantage of the fact they use max exponent).
2215        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2216        */
2217       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2218       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2219       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2220       return lp_build_select(bld, mask, a, res);
2221    }
2222 }
2223 
2224 
2225 /**
2226  * Return ceiling of float (vector), returning float (vector).
2227  * Ex: ceil( 1.1) = 2.0
2228  * Ex: ceil(-1.1) = -1.0
2229  */
2230 LLVMValueRef
2231 lp_build_ceil(struct lp_build_context *bld,
2232               LLVMValueRef a)
2233 {
2234    LLVMBuilderRef builder = bld->gallivm->builder;
2235    const struct lp_type type = bld->type;
2236 
2237    assert(type.floating);
2238    assert(lp_check_value(type, a));
2239 
2240    if (arch_rounding_available(type)) {
2241       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2242    }
2243    else {
2244       const struct lp_type type = bld->type;
2245       struct lp_type inttype;
2246       struct lp_build_context intbld;
2247       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2248       LLVMValueRef trunc, res, anosign, mask, tmp;
2249       LLVMTypeRef int_vec_type = bld->int_vec_type;
2250       LLVMTypeRef vec_type = bld->vec_type;
2251 
2252       if (type.width != 32) {
2253          char intrinsic[32];
2254          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2255          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2256       }
2257 
2258       assert(type.width == 32); /* might want to handle doubles at some point */
2259 
2260       inttype = type;
2261       inttype.floating = 0;
2262       lp_build_context_init(&intbld, bld->gallivm, inttype);
2263 
2264       /* round by truncation */
2265       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2266       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2267 
2268       /*
2269        * fix values if rounding is wrong (for non-special cases)
2270        * - this is the case if trunc < a
2271        */
2272       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2273       /* tmp = trunc < a ? 1.0 : 0.0 */
2274       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2275       tmp = lp_build_and(&intbld, mask, tmp);
2276       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2277       res = lp_build_add(bld, trunc, tmp);
2278 
2279       /* mask out sign bit */
2280       anosign = lp_build_abs(bld, a);
2281       /*
2282        * mask out all values if anosign > 2^24
2283        * This should work both for large ints (all rounding is no-op for them
2284        * because such floats are always exact) as well as special cases like
2285        * NaNs, Infs (taking advantage of the fact they use max exponent).
2286        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2287        */
2288       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2289       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2290       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2291       return lp_build_select(bld, mask, a, res);
2292    }
2293 }
2294 
2295 
2296 /**
2297  * Return fractional part of 'a' computed as a - floor(a)
2298  * Typically used in texture coord arithmetic.
2299  */
2300 LLVMValueRef
2301 lp_build_fract(struct lp_build_context *bld,
2302                LLVMValueRef a)
2303 {
2304    assert(bld->type.floating);
2305    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2306 }
2307 
2308 
2309 /**
2310  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2311  * against 0.99999(9). (Will also return that value for NaNs.)
2312  */
2313 static inline LLVMValueRef
2314 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2315 {
2316    LLVMValueRef max;
2317 
2318    /* this is the largest number smaller than 1.0 representable as float */
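   /* (for 32-bit floats this evaluates to 1.0 - 2^-24 = 0.99999994) */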
2319    max = lp_build_const_vec(bld->gallivm, bld->type,
2320                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2321    return lp_build_min_ext(bld, fract, max,
2322                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2323 }
2324 
2325 
2326 /**
2327  * Same as lp_build_fract, but guarantees that the result is always smaller
2328  * than one. Will also return the smaller-than-one value for infs, NaNs.
2329  */
2330 LLVMValueRef
2331 lp_build_fract_safe(struct lp_build_context *bld,
2332                     LLVMValueRef a)
2333 {
2334    return clamp_fract(bld, lp_build_fract(bld, a));
2335 }
2336 
2337 
2338 /**
2339  * Return the integer part of a float (vector) value (== round toward zero).
2340  * The returned value is an integer (vector).
2341  * Ex: itrunc(-1.5) = -1
2342  */
2343 LLVMValueRef
2344 lp_build_itrunc(struct lp_build_context *bld,
2345                 LLVMValueRef a)
2346 {
2347    LLVMBuilderRef builder = bld->gallivm->builder;
2348    const struct lp_type type = bld->type;
2349    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2350 
2351    assert(type.floating);
2352    assert(lp_check_value(type, a));
2353 
2354    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2355 }
2356 
2357 
2358 /**
2359  * Return float (vector) rounded to nearest integer (vector).  The returned
2360  * value is an integer (vector).
2361  * Ex: iround(0.9) = 1
2362  * Ex: iround(-1.5) = -2
2363  */
2364 LLVMValueRef
2365 lp_build_iround(struct lp_build_context *bld,
2366                 LLVMValueRef a)
2367 {
2368    LLVMBuilderRef builder = bld->gallivm->builder;
2369    const struct lp_type type = bld->type;
2370    LLVMTypeRef int_vec_type = bld->int_vec_type;
2371    LLVMValueRef res;
2372 
2373    assert(type.floating);
2374 
2375    assert(lp_check_value(type, a));
2376 
2377    if ((util_get_cpu_caps()->has_sse2 &&
2378        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2379        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2380       return lp_build_iround_nearest_sse2(bld, a);
2381    }
2382    if (arch_rounding_available(type)) {
2383       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2384    }
2385    else {
2386       LLVMValueRef half;
2387 
2388       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
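      /*
       * Note: using the largest float below 0.5 rather than 0.5 itself keeps
       * inputs just under 0.5 from summing to 1.0 after float rounding;
       * e.g. 0.49999997f + 0.5f rounds to 1.0f and would truncate to 1.
       */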
2389 
2390       if (type.sign) {
2391          LLVMTypeRef vec_type = bld->vec_type;
2392          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2393                                     (unsigned long long)1 << (type.width - 1));
2394          LLVMValueRef sign;
2395 
2396          /* get sign bit */
2397          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2398          sign = LLVMBuildAnd(builder, sign, mask, "");
2399 
2400          /* give 0.5 the sign of 'a' (i.e. copysign) */
2401          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2402          half = LLVMBuildOr(builder, sign, half, "");
2403          half = LLVMBuildBitCast(builder, half, vec_type, "");
2404       }
2405 
2406       res = LLVMBuildFAdd(builder, a, half, "");
2407    }
2408 
2409    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2410 
2411    return res;
2412 }
2413 
2414 
2415 /**
2416  * Return floor of float (vector), result is an int (vector)
2417  * Ex: ifloor(1.1) = 1
2418  * Ex: ifloor(-1.1) = -2
2419  */
2420 LLVMValueRef
2421 lp_build_ifloor(struct lp_build_context *bld,
2422                 LLVMValueRef a)
2423 {
2424    LLVMBuilderRef builder = bld->gallivm->builder;
2425    const struct lp_type type = bld->type;
2426    LLVMTypeRef int_vec_type = bld->int_vec_type;
2427    LLVMValueRef res;
2428 
2429    assert(type.floating);
2430    assert(lp_check_value(type, a));
2431 
2432    res = a;
2433    if (type.sign) {
2434       if (arch_rounding_available(type)) {
2435          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2436       }
2437       else {
2438          struct lp_type inttype;
2439          struct lp_build_context intbld;
2440          LLVMValueRef trunc, itrunc, mask;
2441 
2442          assert(type.floating);
2443          assert(lp_check_value(type, a));
2444 
2445          inttype = type;
2446          inttype.floating = 0;
2447          lp_build_context_init(&intbld, bld->gallivm, inttype);
2448 
2449          /* round by truncation */
2450          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2451          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2452 
2453          /*
2454           * fix values if rounding is wrong (for non-special cases)
2455           * - this is the case if trunc > a
2456           * The results of doing this with NaNs, very large values etc.
2457           * are undefined but this seems to be the case anyway.
2458           */
2459          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2460          /* cheapie minus one with mask since the mask is minus one / zero */
2461          return lp_build_add(&intbld, itrunc, mask);
2462       }
2463    }
2464 
2465    /* convert to int, rounding toward zero */
2466    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2467 
2468    return res;
2469 }
2470 
2471 
2472 /**
2473  * Return ceiling of float (vector), returning int (vector).
2474  * Ex: iceil( 1.1) = 2
2475  * Ex: iceil(-1.1) = -1
2476  */
2477 LLVMValueRef
2478 lp_build_iceil(struct lp_build_context *bld,
2479                LLVMValueRef a)
2480 {
2481    LLVMBuilderRef builder = bld->gallivm->builder;
2482    const struct lp_type type = bld->type;
2483    LLVMTypeRef int_vec_type = bld->int_vec_type;
2484    LLVMValueRef res;
2485 
2486    assert(type.floating);
2487    assert(lp_check_value(type, a));
2488 
2489    if (arch_rounding_available(type)) {
2490       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2491    }
2492    else {
2493       struct lp_type inttype;
2494       struct lp_build_context intbld;
2495       LLVMValueRef trunc, itrunc, mask;
2496 
2497       assert(type.floating);
2498       assert(lp_check_value(type, a));
2499 
2500       inttype = type;
2501       inttype.floating = 0;
2502       lp_build_context_init(&intbld, bld->gallivm, inttype);
2503 
2504       /* round by truncation */
2505       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2506       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2507 
2508       /*
2509        * fix values if rounding is wrong (for non-special cases)
2510        * - this is the case if trunc < a
2511        * The results of doing this with NaNs, very large values etc.
2512        * are undefined but this seems to be the case anyway.
2513        */
2514       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2515       /* cheapie plus one with mask since the mask is minus one / zero */
2516       return lp_build_sub(&intbld, itrunc, mask);
2517    }
2518 
2519    /* convert to int, rounding toward zero */
2520    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2521 
2522    return res;
2523 }
2524 
2525 
2526 /**
2527  * Combined ifloor() & fract().
2528  *
2529  * Preferred to calling the functions separately, as it will ensure that the
2530  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2531  */
2532 void
2533 lp_build_ifloor_fract(struct lp_build_context *bld,
2534                       LLVMValueRef a,
2535                       LLVMValueRef *out_ipart,
2536                       LLVMValueRef *out_fpart)
2537 {
2538    LLVMBuilderRef builder = bld->gallivm->builder;
2539    const struct lp_type type = bld->type;
2540    LLVMValueRef ipart;
2541 
2542    assert(type.floating);
2543    assert(lp_check_value(type, a));
2544 
2545    if (arch_rounding_available(type)) {
2546       /*
2547        * floor() is easier.
2548        */
2549 
2550       ipart = lp_build_floor(bld, a);
2551       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2552       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2553    }
2554    else {
2555       /*
2556        * ifloor() is easier.
2557        */
2558 
2559       *out_ipart = lp_build_ifloor(bld, a);
2560       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2561       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2562    }
2563 }
2564 
2565 
2566 /**
2567  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2568  * always smaller than one.
2569  */
2570 void
2571 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2572                            LLVMValueRef a,
2573                            LLVMValueRef *out_ipart,
2574                            LLVMValueRef *out_fpart)
2575 {
2576    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2577    *out_fpart = clamp_fract(bld, *out_fpart);
2578 }
2579 
2580 
2581 LLVMValueRef
2582 lp_build_sqrt(struct lp_build_context *bld,
2583               LLVMValueRef a)
2584 {
2585    LLVMBuilderRef builder = bld->gallivm->builder;
2586    const struct lp_type type = bld->type;
2587    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2588    char intrinsic[32];
2589 
2590    assert(lp_check_value(type, a));
2591 
2592    assert(type.floating);
2593    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2594 
2595    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2596 }
2597 
2598 
2599 /**
2600  * Do one Newton-Raphson step to improve reciprocal precision:
2601  *
2602  *   x_{i+1} = x_i + x_i * (1 - a * x_i)
2603  *
2604  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2605  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2606  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2607  * halo. It would be necessary to clamp the argument to prevent this.
2608  *
2609  * See also:
2610  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2611  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2612  */
2613 static inline LLVMValueRef
2614 lp_build_rcp_refine(struct lp_build_context *bld,
2615                     LLVMValueRef a,
2616                     LLVMValueRef rcp_a)
2617 {
2618    LLVMBuilderRef builder = bld->gallivm->builder;
2619    LLVMValueRef neg_a;
2620    LLVMValueRef res;
2621 
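   /*
    * Two fused multiply-adds: first the error term e = 1 - a * rcp_a,
    * then the refined value rcp_a + rcp_a * e.
    */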
2622    neg_a = LLVMBuildFNeg(builder, a, "");
2623    res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2624    res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2625 
2626    return res;
2627 }
2628 
2629 
2630 LLVMValueRef
2631 lp_build_rcp(struct lp_build_context *bld,
2632              LLVMValueRef a)
2633 {
2634    LLVMBuilderRef builder = bld->gallivm->builder;
2635    const struct lp_type type = bld->type;
2636 
2637    assert(lp_check_value(type, a));
2638 
2639    if(a == bld->zero)
2640       return bld->undef;
2641    if(a == bld->one)
2642       return bld->one;
2643    if(a == bld->undef)
2644       return bld->undef;
2645 
2646    assert(type.floating);
2647 
2648    if(LLVMIsConstant(a))
2649       return LLVMConstFDiv(bld->one, a);
2650 
2651    /*
2652     * We don't use RCPPS because:
2653     * - it only has 10 bits of precision
2654     * - it doesn't even get the reciprocal of 1.0 exactly
2655     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2656     * - for recent processors the benefit over DIVPS is marginal and case
2657     *   dependent
2658     *
2659     * We could still use it on certain processors if benchmarks show that the
2660     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2661     * particular uses that require less workarounds.
2662     */
2663 
2664    if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2665          (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2666       const unsigned num_iterations = 0;
2667       LLVMValueRef res;
2668       unsigned i;
2669       const char *intrinsic = NULL;
2670 
2671       if (type.length == 4) {
2672          intrinsic = "llvm.x86.sse.rcp.ps";
2673       }
2674       else {
2675          intrinsic = "llvm.x86.avx.rcp.ps.256";
2676       }
2677 
2678       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2679 
2680       for (i = 0; i < num_iterations; ++i) {
2681          res = lp_build_rcp_refine(bld, a, res);
2682       }
2683 
2684       return res;
2685    }
2686 
2687    return LLVMBuildFDiv(builder, bld->one, a, "");
2688 }
2689 
2690 
2691 /**
2692  * Do one Newton-Raphson step to improve rsqrt precision:
2693  *
2694  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2695  *
2696  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2697  */
2698 static inline LLVMValueRef
2699 lp_build_rsqrt_refine(struct lp_build_context *bld,
2700                       LLVMValueRef a,
2701                       LLVMValueRef rsqrt_a)
2702 {
2703    LLVMBuilderRef builder = bld->gallivm->builder;
2704    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2705    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2706    LLVMValueRef res;
2707 
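   /* Evaluate 0.5 * rsqrt_a * (3.0 - a * rsqrt_a * rsqrt_a) step by step. */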
2708    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2709    res = LLVMBuildFMul(builder, a, res, "");
2710    res = LLVMBuildFSub(builder, three, res, "");
2711    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2712    res = LLVMBuildFMul(builder, half, res, "");
2713 
2714    return res;
2715 }
2716 
2717 
2718 /**
2719  * Generate 1/sqrt(a).
2720  * Result is undefined for values < 0, infinity for +0.
2721  */
2722 LLVMValueRef
2723 lp_build_rsqrt(struct lp_build_context *bld,
2724                LLVMValueRef a)
2725 {
2726    const struct lp_type type = bld->type;
2727 
2728    assert(lp_check_value(type, a));
2729 
2730    assert(type.floating);
2731 
2732    /*
2733     * This should be faster but all denormals will end up as infinity.
2734     */
2735    if (0 && lp_build_fast_rsqrt_available(type)) {
2736       const unsigned num_iterations = 1;
2737       LLVMValueRef res;
2738       unsigned i;
2739 
2740       /* rsqrt(1.0) != 1.0 here */
2741       res = lp_build_fast_rsqrt(bld, a);
2742 
2743       if (num_iterations) {
2744          /*
2745           * Newton-Raphson will result in NaN instead of infinity for zero,
2746           * and NaN instead of zero for infinity.
2747           * Also, need to ensure rsqrt(1.0) == 1.0.
2748           * All numbers smaller than FLT_MIN will result in +infinity
2749           * (rsqrtps treats all denormals as zero).
2750           */
2751          LLVMValueRef cmp;
2752          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2753          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2754 
2755          for (i = 0; i < num_iterations; ++i) {
2756             res = lp_build_rsqrt_refine(bld, a, res);
2757          }
2758          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2759          res = lp_build_select(bld, cmp, inf, res);
2760          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2761          res = lp_build_select(bld, cmp, bld->zero, res);
2762          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2763          res = lp_build_select(bld, cmp, bld->one, res);
2764       }
2765 
2766       return res;
2767    }
2768 
2769    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2770 }
2771 
2772 /**
2773  * Return whether a fast (inaccurate) rsqrt instruction is available.
2774  * (The caller may want to avoid calling rsqrt_fast if it's not available:
2775  * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2776  * unavailable the result would be sqrt/div/mul, so it is obviously
2777  * much better to just call sqrt, skipping both div and mul.)
2778  */
2779 boolean
2780 lp_build_fast_rsqrt_available(struct lp_type type)
2781 {
2782    assert(type.floating);
2783 
2784    if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2785        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2786       return true;
2787    }
2788    return false;
2789 }
2790 
2791 
2792 /**
2793  * Generate 1/sqrt(a).
2794  * Result is undefined for values < 0, infinity for +0.
2795  * Precision is limited, only ~10 bits guaranteed
2796  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2797  */
2798 LLVMValueRef
2799 lp_build_fast_rsqrt(struct lp_build_context *bld,
2800                     LLVMValueRef a)
2801 {
2802    LLVMBuilderRef builder = bld->gallivm->builder;
2803    const struct lp_type type = bld->type;
2804 
2805    assert(lp_check_value(type, a));
2806 
2807    if (lp_build_fast_rsqrt_available(type)) {
2808       const char *intrinsic = NULL;
2809 
2810       if (type.length == 4) {
2811          intrinsic = "llvm.x86.sse.rsqrt.ps";
2812       }
2813       else {
2814          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2815       }
2816       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2817    }
2818    else {
2819       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2820    }
2821    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2822 }
2823 
2824 
2825 /**
2826  * Generate sin(a) or cos(a) using polynomial approximation.
2827  * TODO: it might be worth recognizing sin and cos with the same source
2828  * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2829  * would be way cheaper than calculating (nearly) everything twice...
2830  * Not sure it's common enough to be worth bothering with, however; the scs
2831  * opcode could also benefit from calculating both.
2832  */
2833 static LLVMValueRef
2834 lp_build_sin_or_cos(struct lp_build_context *bld,
2835                     LLVMValueRef a,
2836                     boolean cos)
2837 {
2838    struct gallivm_state *gallivm = bld->gallivm;
2839    LLVMBuilderRef b = gallivm->builder;
2840    struct lp_type int_type = lp_int_type(bld->type);
2841 
2842    /*
2843     *  take the absolute value,
2844     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2845     */
2846 
2847    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2848    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2849 
2850    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2851    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2852 
2853    /*
2854     * scale by 4/Pi
2855     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2856     */
2857 
2858    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2859    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2860 
2861    /*
2862     * store the integer part of y in mm0
2863     * emm2 = _mm_cvttps_epi32(y);
2864     */
2865 
2866    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2867 
2868    /*
2869     * j=(j+1) & (~1) (see the cephes sources)
2870     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2871     */
2872 
2873    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2874    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2875    /*
2876     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2877     */
2878    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2879    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2880 
2881    /*
2882     * y = _mm_cvtepi32_ps(emm2);
2883     */
2884    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2885 
2886    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2887    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2888    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2889    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2890 
2891    /*
2892     * Argument used for poly selection and sign bit determination
2893     * is different for sin vs. cos.
2894     */
2895    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2896                                emm2_and;
2897 
2898    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2899                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2900                                               const_29, "sign_bit") :
2901                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2902                                                               LLVMBuildShl(b, emm2_add,
2903                                                                            const_29, ""), ""),
2904                                               sign_mask, "sign_bit");
2905 
2906    /*
2907     * get the polynomial selection mask
2908     * there is one polynomial for 0 <= x <= Pi/4
2909     * and another one for Pi/4 < x <= Pi/2
2910     * Both branches will be computed.
2911     *
2912     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2913     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2914     */
2915 
2916    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2917    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2918                                              int_type, PIPE_FUNC_EQUAL,
2919                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2920 
2921    /*
2922     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2923     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2924     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2925     */
2926    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2927    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2928    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2929 
2930    /*
2931     * The magic pass: "Extended precision modular arithmetic"
2932     * x = ((x - y * DP1) - y * DP2) - y * DP3;
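    * (DP1 + DP2 + DP3 sum to approximately -Pi/4, split across three
    * constants so the subtractions retain extra precision.)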
2933     */
2934    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2935    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2936    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2937 
2938    /*
2939     * Evaluate the first polynomial (0 <= x <= Pi/4)
2940     *
2941     * z = _mm_mul_ps(x,x);
2942     */
2943    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2944 
2945    /*
2946     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2947     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2948     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2949     */
2950    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2951    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2952    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2953 
2954    /*
2955     * y = *(v4sf*)_ps_coscof_p0;
2956     * y = _mm_mul_ps(y, z);
2957     */
2958    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2959    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2960    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2961    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2962 
2963 
2964    /*
2965     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2966     * y = _mm_sub_ps(y, tmp);
2967     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2968     */
2969    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2970    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2971    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2972    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2973    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2974 
2975    /*
2976     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2977     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2978     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2979     */
2980    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2981    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2982    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2983 
2984    /*
2985     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2986     *
2987     * y2 = *(v4sf*)_ps_sincof_p0;
2988     * y2 = _mm_mul_ps(y2, z);
2989     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2990     * y2 = _mm_mul_ps(y2, z);
2991     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2992     * y2 = _mm_mul_ps(y2, z);
2993     * y2 = _mm_mul_ps(y2, x);
2994     * y2 = _mm_add_ps(y2, x);
2995     */
2996 
2997    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2998    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2999    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3000    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3001 
3002    /*
3003     * select the correct result from the two polynoms
3004     * xmm3 = poly_mask;
3005     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3006     * y = _mm_andnot_ps(xmm3, y);
3007     * y = _mm_or_ps(y,y2);
3008     */
3009    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3010    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3011    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3012    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3013    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3014    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
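
   /*
    * Per channel, the and/andnot/or sequence above is just a branchless
    * select on the raw float bits (a scalar sketch; names illustrative):
    *
    *    uint32_t sel  = poly_mask;                    // all ones or all zeros
    *    uint32_t bits = (sel & bits(y2_9)) | (~sel & bits(y_10));
    *
    * i.e. "poly_mask ? y2 : y" -- both polynomials are always evaluated
    * and the mask picks the right one for each lane.
    */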

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   return y_result;
}


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if (type.width == 16) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
      LLVMValueRef args[] = { a };
      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
   }

   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if (type.width == 16) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
      LLVMValueRef args[] = { a };
      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
   }

   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
   LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));

   res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
   return res;
}
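
/*
 * For reference, the identity used above in scalar form.  This is only a
 * sketch (it assumes the C library's exp2f/log2f from <math.h>); the
 * vector path goes through lp_build_log2_safe / lp_build_exp2 instead.
 */
#if 0
static float
ref_pow(float x, float y)
{
   /* pow(x, y) = exp2(y * log2(x)), with a zero base forced to yield 0
    * to mirror the compare/select on x == 0 above. */
   if (x == 0.0f)
      return 0.0f;
   return exp2f(y * log2f(x));
}
#endif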


/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}
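
/*
 * The rewrite above in scalar terms (illustrative only):
 *
 *    exp(x) = 2^(x * log2(e)) ~= exp2f(x * 1.4426950408889634f)
 *
 * so the whole cost is one multiply on top of exp2.
 */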


/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}
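
/*
 * A scalar sketch of the even/odd split above (illustrative only): both
 * halves are plain Horner evaluations in x^2, which roughly halves the
 * length of the dependency chain compared with one Horner run over all
 * the coefficients.  Assumes num_coeffs > 0.
 */
#if 0
static float
ref_polynomial(float x, const double *coeffs, unsigned num_coeffs)
{
   float x2 = x * x;
   float even = 0.0f, odd = 0.0f;
   unsigned i;

   for (i = num_coeffs; i--; ) {
      if (i % 2 == 0)
         even = even * x2 + (float)coeffs[i];
      else
         odd = odd * x2 + (float)coeffs[i];
   }

   return odd * x + even;
}
#endif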


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if (type.floating && type.width == 16) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
      LLVMValueRef args[] = { x };
      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
   }

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
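
/*
 * A scalar sketch of the scheme above (illustrative only; assumes 32-bit
 * IEEE-754 floats, floorf() from <math.h>, and ignores the NaN handling
 * that the vector code gets from lp_build_min_ext/lp_build_max_ext):
 */
#if 0
static float
ref_exp2(float x)
{
   union { int32_t i; float f; } pun;
   float fpart, p;
   int ipart, i;

   /* clamp so the biased exponent built below stays representable */
   if (x > 128.0f)
      x = 128.0f;
   if (x < -126.99999f)
      x = -126.99999f;

   ipart = (int)floorf(x);      /* integer part               */
   fpart = x - (float)ipart;    /* fractional part, in [0, 1[ */

   /* 2^ipart assembled directly as an IEEE-754 exponent field */
   pun.i = (ipart + 127) << 23;

   /* 2^fpart from the minimax polynomial, then recombine */
   p = 0.0f;
   for (i = ARRAY_SIZE(lp_build_exp2_polynomial); i--; )
      p = p * fpart + (float)lp_build_exp2_polynomial[i];

   return pun.f * p;
}
#endif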



/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
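
/*
 * Equivalent scalar bit manipulation (a sketch; 32-bit floats assumed):
 *
 *    union { float f; uint32_t i; } pun = { x };
 *    int e = (int)((pun.i >> 23) & 0xff) - (127 - bias);
 *
 * i.e. shift the biased exponent field down, mask it, and subtract the
 * IEEE bias (optionally folding in the caller-supplied bias).
 */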


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
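
/*
 * Equivalent scalar form (a sketch; 32-bit floats assumed): keep the 23
 * mantissa bits and overwrite the exponent field with that of 1.0, which
 * yields a value in [1, 2[ equal to x / 2**floor(log2(x)):
 *
 *    union { float f; uint32_t i; } pun = { x };
 *    pun.i = (pun.i & 0x007fffff) | 0x3f800000;
 *    return pun.f;
 */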



/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (down to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   if (bld->type.width == 16) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
      LLVMValueRef args[] = { x };
      if (p_log2)
         *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
      return;
   }

   assert(lp_check_value(bld->type, x));

   if (p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if (p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type,  0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type,  0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type,  INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type,  INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type,  -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type,  NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
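
/*
 * A scalar sketch of the fast path above (no edge-case handling; 32-bit
 * IEEE-754 floats assumed, names are illustrative):
 */
#if 0
static float
ref_log2(float x)
{
   union { float f; uint32_t i; } pun;
   float logexp, mant, y, z, p;
   int i;

   pun.f = x;

   /* floor(log2(x)) read straight out of the exponent field */
   logexp = (float)(int)((pun.i >> 23) & 0xff) - 127.0f;

   /* mantissa scaled into [1, 2[ */
   pun.i = (pun.i & 0x007fffff) | 0x3f800000;
   mant = pun.f;

   /* log2(mant) ~= y * P(y^2) with y = (mant - 1) / (mant + 1) */
   y = (mant - 1.0f) / (mant + 1.0f);
   z = y * y;
   p = 0.0f;
   for (i = ARRAY_SIZE(lp_build_log2_polynomial); i--; )
      p = p * z + (float)lp_build_log2_polynomial[i];

   return y * p + logexp;
}
#endif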


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
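
/*
 * The approximation above in scalar terms (illustrative only):
 *
 *    ipart = floor(log2(x)) - 1;       // from the exponent bits
 *    fpart = x / 2^floor(log2(x));     // mantissa, in [1, 2[
 *    log2(x) ~= ipart + fpart
 *
 * At x = 2^n this is exactly (n - 1) + 1 = n; between powers of two it
 * interpolates linearly in the mantissa.
 */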


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
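
/*
 * Why the sqrt(2) scale works (a sketch): for x > 0,
 *
 *    log2(x * sqrt(2)) = log2(x) + 0.5
 *
 * so taking floor(log2()) of the scaled value -- which is exactly what
 * lp_build_extract_exponent() returns -- rounds log2(x) to the nearest
 * integer instead of truncating it.
 */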

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating point inputs, creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/* Returns all 1's for floating point numbers that are
 * finite and returns all zeros for -inf,
 * inf and NaNs */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

/*
 * Returns true if the number is NaN or Inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}
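
/*
 * Scalar sketch of the exponent-field test the three helpers above rely
 * on (32-bit floats assumed; names are illustrative):
 *
 *    uint32_t e = bits(x) & 0x7f800000;
 *    finite     = (e != 0x7f800000);    // lp_build_isfinite
 *    inf_or_nan = (e == 0x7f800000);    // lp_build_is_inf_or_nan
 *    nan        = !(x == x);            // lp_build_isnan
 *
 * Inf and NaN are exactly the encodings whose exponent bits are all ones,
 * so a single masked compare separates them from every finite value.
 */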


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_get_cpu_caps()->has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_get_cpu_caps()->has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}
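
/*
 * For reference, the host-side equivalent of the state change generated
 * above (a sketch only; the generated code performs the same
 * stmxcsr/modify/ldmxcsr dance at shader run time rather than here):
 */
#if 0
static void
ref_set_denorms_zero(boolean zero)
{
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;
   unsigned mxcsr = _mm_getcsr();

   if (util_get_cpu_caps()->has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;

   if (zero)
      mxcsr |= daz_ftz;
   else
      mxcsr &= ~daz_ftz;

   _mm_setcsr(mxcsr);
}
#endif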

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}