/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values of a or b (such as 1 or 0) are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is a NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of a NaN by
       * default, so we need special code to handle those cases.
       */
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, min);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));

   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b (such as 1 or 0) are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, max);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
         return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * Newer llvm versions no longer support the intrinsics, but
          * recognize the pattern instead. Since auto-upgrade of intrinsics
          * doesn't work for jit code, it is important we match the pattern
          * llvm uses (and pray llvm doesn't change it - and hope they decide
          * on the same pattern for all backends supporting it...).
          * A scalar sketch of this pattern follows the function.
          * NOTE: cmp/select does sext/trunc of the mask. This does not seem
          * to interfere with llvm's ability to recognize the pattern but
          * seems a bit brittle.
          * NOTE: llvm 9+ always uses the (non arch specific) intrinsic.
          */
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
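
/*
 * For reference, a minimal scalar sketch of the saturated unsigned add
 * pattern matched above (illustrative only, not part of this module; the
 * helper name and the 8-bit width are assumptions for the example, and the
 * types come from <stdint.h>):
 *
 *    static inline uint8_t
 *    add_sat_u8(uint8_t a, uint8_t b)
 *    {
 *       uint8_t sum = a + b;
 *       return sum < a ? 0xff : sum;   // wrapped around -> clamp to all ones
 *    }
 *
 * This is the add/compare/select shape that llvm recognizes and turns into
 * a saturated add (e.g. paddusb, or the llvm.uadd.sat intrinsic) on capable
 * targets.
 */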


/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here. Note that with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual. A worked 4-wide example follows the function.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
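
/*
 * For example, for a 4-wide vector the loop above performs a log2-style
 * shuffle/add reduction (a worked example, not extra code):
 *
 *    vecres = <a0 a1 a2 a3>                 length = 2
 *    vec1 = <a0 a1>, vec2 = <a2 a3>         shuffles with indices [0,1], [2,3]
 *    vecres = <a0+a2  a1+a3>                one vector add; length = 1
 *    res = (a0+a2) + (a1+a3)                final scalar extract/add
 */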

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4, which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
         return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_get_cpu_caps()->has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_get_cpu_caps()->has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_get_cpu_caps()->has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match the llvm pattern for saturated unsigned sub
          * (analogous to the scalar sketch after lp_build_add above).
          * (lp_build_max_simple actually does the job with its current
          * definition, but we do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. This does not seem
          * to interfere with llvm's ability to recognize the pattern but
          * seems a bit brittle.
          * NOTE: llvm 9+ always uses the (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ...
 *
 *     in this case just the first two terms to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 would yield 254, so the special case b = 255 must be accounted
 *     for, or roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the
 *     result, use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 * A scalar sketch of the rounded variant follows the function below.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
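
/*
 * For reference, a minimal scalar sketch of the rounded geometric series
 * trick implemented above, for the unsigned 8-bit case (illustrative only;
 * the helper name is made up for the example, and the types come from
 * <stdint.h>):
 *
 *    static inline uint8_t
 *    mul_norm_u8(uint8_t a, uint8_t b)
 *    {
 *       uint32_t t = (uint32_t)a * b;
 *       return (uint8_t)((t + (t >> 8) + 0x80) >> 8);   // exact a*b/255
 *    }
 *
 * E.g. mul_norm_u8(255, 255) = 255 and mul_norm_u8(255, 128) = 128,
 * matching the OpenGL criteria described above.
 */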

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64 bit only.
 * Result is the low 32 bits; the high bits are returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required):
    * it tries to handle real 64-bit inputs (which of course can't happen,
    * since we use a 64-bit mul on 32-bit numbers zero-extended to 64 bits,
    * but apparently llvm does not recognize this widening mul). This
    * includes 6 (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not; without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
        util_get_cpu_caps()->has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for <= 32 (8, 16, 32) -> 64.
 * Result is the low N bits; the high bits are returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   if (bld->type.width < 32)
      type_tmp.width = 32;
   else
      type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}


/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so it makes no sense to try that */
   if(FALSE &&
      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param flags  LP_BLD_LERP_WIDE_NORMALIZED means we are interpolating
 *               normalized values, encoded in integers twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             * A scalar sketch of this path follows the function.
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         /*
          * For this multiply, higher internal precision is required to pass
          * CTS; the most efficient path to that is pmulhrsw on ssse3 and
          * above. This could be open-coded on other arches if conformance
          * was required.
          */
         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
         } else {
            res = lp_build_mul(bld, x, delta);
            res = lp_build_shr_imm(bld, res, half_width);
         }
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits;
       * the rest is zero. Instead of add / mask, do the add with the half
       * wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign = bld->type.sign;
      narrow_type.width = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8-bit
          * normalized colors stored in 16 bits.
          */
         /* XXX: This step is necessary for lerping 8-bit colors stored in
          * 16 bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}
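
/*
 * For reference, a minimal scalar sketch of the unsigned normalized path
 * above, for 8-bit values lerped in 16-bit arithmetic (illustrative only;
 * the helper name is made up for the example, the types come from
 * <stdint.h>, and an arithmetic right shift of negative values is assumed):
 *
 *    static inline uint8_t
 *    lerp_norm_u8(uint8_t x, uint8_t v0, uint8_t v1)
 *    {
 *       int w = x + (x >> 7);           // rescale [0, 255] -> [0, 256]
 *       return (uint8_t)(v0 + ((w * (v1 - v0)) >> 8));
 *    }
 *
 * so that x = 255 yields exactly v1 and x = 0 yields exactly v0, matching
 * the weight rescaling trick in the comment above.
 */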


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign = type.sign;
      wide_type.width = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}

/**
 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   }

   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}
1752
1753
1754 LLVMValueRef
1755 lp_build_negate(struct lp_build_context *bld,
1756 LLVMValueRef a)
1757 {
1758 LLVMBuilderRef builder = bld->gallivm->builder;
1759
1760 assert(lp_check_value(bld->type, a));
1761
1762 if (bld->type.floating)
1763 a = LLVMBuildFNeg(builder, a, "");
1764 else
1765 a = LLVMBuildNeg(builder, a, "");
1766
1767 return a;
1768 }
1769
1770
1771 /** Return -1, 0 or +1 depending on the sign of a */
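/*
 * Illustrative note (not part of the original code): for floats the non-zero
 * path below amounts to copysign(1.0, a) done with integer ops.  E.g. for
 * a = -2.5f (bits 0xC0200000), masking keeps the sign bit 0x80000000, and
 * OR-ing it into 1.0f (0x3F800000) yields 0xBF800000 == -1.0f.
 */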
1772 LLVMValueRef
1773 lp_build_sgn(struct lp_build_context *bld,
1774 LLVMValueRef a)
1775 {
1776 LLVMBuilderRef builder = bld->gallivm->builder;
1777 const struct lp_type type = bld->type;
1778 LLVMValueRef cond;
1779 LLVMValueRef res;
1780
1781 assert(lp_check_value(type, a));
1782
1783 /* Handle non-zero case */
1784 if(!type.sign) {
1785 /* if not zero then sign must be positive */
1786 res = bld->one;
1787 }
1788 else if(type.floating) {
1789 LLVMTypeRef vec_type;
1790 LLVMTypeRef int_type;
1791 LLVMValueRef mask;
1792 LLVMValueRef sign;
1793 LLVMValueRef one;
1794 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1795
1796 int_type = lp_build_int_vec_type(bld->gallivm, type);
1797 vec_type = lp_build_vec_type(bld->gallivm, type);
1798 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1799
1800       /* Take the sign bit and OR it into the constant 1.0 */
1801 sign = LLVMBuildBitCast(builder, a, int_type, "");
1802 sign = LLVMBuildAnd(builder, sign, mask, "");
1803 one = LLVMConstBitCast(bld->one, int_type);
1804 res = LLVMBuildOr(builder, sign, one, "");
1805 res = LLVMBuildBitCast(builder, res, vec_type, "");
1806 }
1807 else
1808 {
1809 /* signed int/norm/fixed point */
1810 /* could use psign with sse3 and appropriate vectors here */
1811 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1812 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1813 res = lp_build_select(bld, cond, bld->one, minus_one);
1814 }
1815
1816 /* Handle zero */
1817 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1818 res = lp_build_select(bld, cond, bld->zero, res);
1819
1820 return res;
1821 }
1822
1823
1824 /**
1825 * Set the sign of float vector 'a' according to 'sign'.
1826 * If sign==0, return abs(a).
1827  * If sign==1, return -abs(a).
1828 * Other values for sign produce undefined results.
1829 */
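/*
 * Scalar C sketch of the bit manipulation below, for 32-bit floats
 * (illustrative only; fui()/uif() stand for hypothetical float<->uint
 * bitcast helpers):
 *
 *    uint32_t val  = fui(a) & 0x7fffffffu;   // drop the old sign bit
 *    uint32_t sbit = (uint32_t)sign << 31;   // 'sign' must be 0 or 1
 *    return uif(val | sbit);                 // e.g. a = 3.0, sign = 1 -> -3.0
 */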
1830 LLVMValueRef
1831 lp_build_set_sign(struct lp_build_context *bld,
1832 LLVMValueRef a, LLVMValueRef sign)
1833 {
1834 LLVMBuilderRef builder = bld->gallivm->builder;
1835 const struct lp_type type = bld->type;
1836 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1837 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1838 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1839 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1840 ~((unsigned long long) 1 << (type.width - 1)));
1841 LLVMValueRef val, res;
1842
1843 assert(type.floating);
1844 assert(lp_check_value(type, a));
1845
1846 /* val = reinterpret_cast<int>(a) */
1847 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1848 /* val = val & mask */
1849 val = LLVMBuildAnd(builder, val, mask, "");
1850 /* sign = sign << shift */
1851 sign = LLVMBuildShl(builder, sign, shift, "");
1852 /* res = val | sign */
1853 res = LLVMBuildOr(builder, val, sign, "");
1854 /* res = reinterpret_cast<float>(res) */
1855 res = LLVMBuildBitCast(builder, res, vec_type, "");
1856
1857 return res;
1858 }
1859
1860
1861 /**
1862 * Convert vector of (or scalar) int to vector of (or scalar) float.
1863 */
1864 LLVMValueRef
1865 lp_build_int_to_float(struct lp_build_context *bld,
1866 LLVMValueRef a)
1867 {
1868 LLVMBuilderRef builder = bld->gallivm->builder;
1869 const struct lp_type type = bld->type;
1870 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1871
1872 assert(type.floating);
1873
1874 return LLVMBuildSIToFP(builder, a, vec_type, "");
1875 }
1876
1877 static boolean
1878 arch_rounding_available(const struct lp_type type)
1879 {
1880 if ((util_get_cpu_caps()->has_sse4_1 &&
1881 (type.length == 1 || type.width*type.length == 128)) ||
1882 (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1883 (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1884 return TRUE;
1885 else if ((util_get_cpu_caps()->has_altivec &&
1886 (type.width == 32 && type.length == 4)))
1887 return TRUE;
1888 else if (util_get_cpu_caps()->has_neon)
1889 return TRUE;
1890 else if (util_get_cpu_caps()->family == CPU_S390X)
1891 return TRUE;
1892
1893 return FALSE;
1894 }
1895
1896 enum lp_build_round_mode
1897 {
1898 LP_BUILD_ROUND_NEAREST = 0,
1899 LP_BUILD_ROUND_FLOOR = 1,
1900 LP_BUILD_ROUND_CEIL = 2,
1901 LP_BUILD_ROUND_TRUNCATE = 3
1902 };
1903
1904 static inline LLVMValueRef
1905 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1906 LLVMValueRef a)
1907 {
1908 LLVMBuilderRef builder = bld->gallivm->builder;
1909 const struct lp_type type = bld->type;
1910 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1911 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1912 const char *intrinsic;
1913 LLVMValueRef res;
1914
1915 assert(type.floating);
1916 /* using the double precision conversions is a bit more complicated */
1917 assert(type.width == 32);
1918
1919 assert(lp_check_value(type, a));
1920 assert(util_get_cpu_caps()->has_sse2);
1921
1922 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1923 if (type.length == 1) {
1924 LLVMTypeRef vec_type;
1925 LLVMValueRef undef;
1926 LLVMValueRef arg;
1927 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1928
1929 vec_type = LLVMVectorType(bld->elem_type, 4);
1930
1931 intrinsic = "llvm.x86.sse.cvtss2si";
1932
1933 undef = LLVMGetUndef(vec_type);
1934
1935 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1936
1937 res = lp_build_intrinsic_unary(builder, intrinsic,
1938 ret_type, arg);
1939 }
1940 else {
1941 if (type.width* type.length == 128) {
1942 intrinsic = "llvm.x86.sse2.cvtps2dq";
1943 }
1944 else {
1945 assert(type.width*type.length == 256);
1946 assert(util_get_cpu_caps()->has_avx);
1947
1948 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1949 }
1950 res = lp_build_intrinsic_unary(builder, intrinsic,
1951 ret_type, a);
1952 }
1953
1954 return res;
1955 }
1956
1957
1958 /* Round a float vector using the AltiVec vrfin/vrfim/vrfip/vrfiz
1959  * intrinsics, selected by the requested rounding mode. */
1960 static inline LLVMValueRef
1961 lp_build_round_altivec(struct lp_build_context *bld,
1962 LLVMValueRef a,
1963 enum lp_build_round_mode mode)
1964 {
1965 LLVMBuilderRef builder = bld->gallivm->builder;
1966 const struct lp_type type = bld->type;
1967 const char *intrinsic = NULL;
1968
1969 assert(type.floating);
1970
1971 assert(lp_check_value(type, a));
1972 assert(util_get_cpu_caps()->has_altivec);
1973
1974 (void)type;
1975
1976 switch (mode) {
1977 case LP_BUILD_ROUND_NEAREST:
1978 intrinsic = "llvm.ppc.altivec.vrfin";
1979 break;
1980 case LP_BUILD_ROUND_FLOOR:
1981 intrinsic = "llvm.ppc.altivec.vrfim";
1982 break;
1983 case LP_BUILD_ROUND_CEIL:
1984 intrinsic = "llvm.ppc.altivec.vrfip";
1985 break;
1986 case LP_BUILD_ROUND_TRUNCATE:
1987 intrinsic = "llvm.ppc.altivec.vrfiz";
1988 break;
1989 }
1990
1991 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1992 }
1993
1994 static inline LLVMValueRef
1995 lp_build_round_arch(struct lp_build_context *bld,
1996 LLVMValueRef a,
1997 enum lp_build_round_mode mode)
1998 {
1999 if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon ||
2000 util_get_cpu_caps()->family == CPU_S390X) {
2001 LLVMBuilderRef builder = bld->gallivm->builder;
2002 const struct lp_type type = bld->type;
2003 const char *intrinsic_root;
2004 char intrinsic[32];
2005
2006 assert(type.floating);
2007 assert(lp_check_value(type, a));
2008 (void)type;
2009
2010 switch (mode) {
2011 case LP_BUILD_ROUND_NEAREST:
2012 intrinsic_root = "llvm.nearbyint";
2013 break;
2014 case LP_BUILD_ROUND_FLOOR:
2015 intrinsic_root = "llvm.floor";
2016 break;
2017 case LP_BUILD_ROUND_CEIL:
2018 intrinsic_root = "llvm.ceil";
2019 break;
2020 case LP_BUILD_ROUND_TRUNCATE:
2021 intrinsic_root = "llvm.trunc";
2022 break;
2023 default:
2024 unreachable("unhandled lp_build_round_mode");
2025 }
2026
2027 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2028 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2029 }
2030 else /* (util_get_cpu_caps()->has_altivec) */
2031 return lp_build_round_altivec(bld, a, mode);
2032 }
2033
2034 /**
2035 * Return the integer part of a float (vector) value (== round toward zero).
2036 * The returned value is a float (vector).
2037 * Ex: trunc(-1.5) = -1.0
2038 */
2039 LLVMValueRef
2040 lp_build_trunc(struct lp_build_context *bld,
2041 LLVMValueRef a)
2042 {
2043 LLVMBuilderRef builder = bld->gallivm->builder;
2044 const struct lp_type type = bld->type;
2045
2046 assert(type.floating);
2047 assert(lp_check_value(type, a));
2048
2049 if (type.width == 16) {
2050 char intrinsic[64];
2051 lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2052 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2053 }
2054
2055 if (arch_rounding_available(type)) {
2056 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2057 }
2058 else {
2059 const struct lp_type type = bld->type;
2060 struct lp_type inttype;
2061 struct lp_build_context intbld;
2062 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2063 LLVMValueRef trunc, res, anosign, mask;
2064 LLVMTypeRef int_vec_type = bld->int_vec_type;
2065 LLVMTypeRef vec_type = bld->vec_type;
2066
2067 inttype = type;
2068 inttype.floating = 0;
2069 lp_build_context_init(&intbld, bld->gallivm, inttype);
2070
2071 /* round by truncation */
2072 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2073 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2074
2075 /* mask out sign bit */
2076 anosign = lp_build_abs(bld, a);
2077 /*
2078 * mask out all values if anosign > 2^24
2079       * This works both for large ints (all rounding is a no-op for them
2080       * because such floats are always exact) and for special cases like
2081       * NaNs and Infs (taking advantage of the fact they use the max exponent).
2082       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2083 */
2084 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2085 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2086 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2087 return lp_build_select(bld, mask, a, res);
2088 }
2089 }
2090
2091
2092 /**
2093 * Return float (vector) rounded to nearest integer (vector). The returned
2094 * value is a float (vector).
2095 * Ex: round(0.9) = 1.0
2096 * Ex: round(-1.5) = -2.0
2097 */
2098 LLVMValueRef
2099 lp_build_round(struct lp_build_context *bld,
2100 LLVMValueRef a)
2101 {
2102 LLVMBuilderRef builder = bld->gallivm->builder;
2103 const struct lp_type type = bld->type;
2104
2105 assert(type.floating);
2106 assert(lp_check_value(type, a));
2107
2108 if (type.width == 16) {
2109 char intrinsic[64];
2110 lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2111 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2112 }
2113
2114 if (arch_rounding_available(type)) {
2115 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2116 }
2117 else {
2118 const struct lp_type type = bld->type;
2119 struct lp_type inttype;
2120 struct lp_build_context intbld;
2121 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2122 LLVMValueRef res, anosign, mask;
2123 LLVMTypeRef int_vec_type = bld->int_vec_type;
2124 LLVMTypeRef vec_type = bld->vec_type;
2125
2126 inttype = type;
2127 inttype.floating = 0;
2128 lp_build_context_init(&intbld, bld->gallivm, inttype);
2129
2130 res = lp_build_iround(bld, a);
2131 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2132
2133 /* mask out sign bit */
2134 anosign = lp_build_abs(bld, a);
2135 /*
2136 * mask out all values if anosign > 2^24
2137       * This works both for large ints (all rounding is a no-op for them
2138       * because such floats are always exact) and for special cases like
2139       * NaNs and Infs (taking advantage of the fact they use the max exponent).
2140       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2141 */
2142 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2143 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2144 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2145 return lp_build_select(bld, mask, a, res);
2146 }
2147 }
2148
2149
2150 /**
2151 * Return floor of float (vector), result is a float (vector)
2152 * Ex: floor(1.1) = 1.0
2153 * Ex: floor(-1.1) = -2.0
2154 */
2155 LLVMValueRef
2156 lp_build_floor(struct lp_build_context *bld,
2157 LLVMValueRef a)
2158 {
2159 LLVMBuilderRef builder = bld->gallivm->builder;
2160 const struct lp_type type = bld->type;
2161
2162 assert(type.floating);
2163 assert(lp_check_value(type, a));
2164
2165 if (arch_rounding_available(type)) {
2166 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2167 }
2168 else {
2169 const struct lp_type type = bld->type;
2170 struct lp_type inttype;
2171 struct lp_build_context intbld;
2172 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173 LLVMValueRef trunc, res, anosign, mask;
2174 LLVMTypeRef int_vec_type = bld->int_vec_type;
2175 LLVMTypeRef vec_type = bld->vec_type;
2176
2177 if (type.width != 32) {
2178 char intrinsic[32];
2179 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2180 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2181 }
2182
2183 assert(type.width == 32); /* might want to handle doubles at some point */
2184
2185 inttype = type;
2186 inttype.floating = 0;
2187 lp_build_context_init(&intbld, bld->gallivm, inttype);
2188
2189 /* round by truncation */
2190 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2191 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2192
2193 if (type.sign) {
2194 LLVMValueRef tmp;
2195
2196 /*
2197 * fix values if rounding is wrong (for non-special cases)
2198 * - this is the case if trunc > a
2199 */
2200 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2201 /* tmp = trunc > a ? 1.0 : 0.0 */
2202 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2203 tmp = lp_build_and(&intbld, mask, tmp);
2204 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2205 res = lp_build_sub(bld, res, tmp);
2206 }
2207
2208 /* mask out sign bit */
2209 anosign = lp_build_abs(bld, a);
2210 /*
2211 * mask out all values if anosign > 2^24
2212       * This works both for large ints (all rounding is a no-op for them
2213       * because such floats are always exact) and for special cases like
2214       * NaNs and Infs (taking advantage of the fact they use the max exponent).
2215       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2216 */
2217 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2218 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2219 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2220 return lp_build_select(bld, mask, a, res);
2221 }
2222 }
2223
2224
2225 /**
2226 * Return ceiling of float (vector), returning float (vector).
2227 * Ex: ceil( 1.1) = 2.0
2228 * Ex: ceil(-1.1) = -1.0
2229 */
2230 LLVMValueRef
2231 lp_build_ceil(struct lp_build_context *bld,
2232 LLVMValueRef a)
2233 {
2234 LLVMBuilderRef builder = bld->gallivm->builder;
2235 const struct lp_type type = bld->type;
2236
2237 assert(type.floating);
2238 assert(lp_check_value(type, a));
2239
2240 if (arch_rounding_available(type)) {
2241 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2242 }
2243 else {
2244 const struct lp_type type = bld->type;
2245 struct lp_type inttype;
2246 struct lp_build_context intbld;
2247 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2248 LLVMValueRef trunc, res, anosign, mask, tmp;
2249 LLVMTypeRef int_vec_type = bld->int_vec_type;
2250 LLVMTypeRef vec_type = bld->vec_type;
2251
2252 if (type.width != 32) {
2253 char intrinsic[32];
2254 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2255 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2256 }
2257
2258 assert(type.width == 32); /* might want to handle doubles at some point */
2259
2260 inttype = type;
2261 inttype.floating = 0;
2262 lp_build_context_init(&intbld, bld->gallivm, inttype);
2263
2264 /* round by truncation */
2265 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2266 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2267
2268 /*
2269 * fix values if rounding is wrong (for non-special cases)
2270 * - this is the case if trunc < a
2271 */
2272 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2273 /* tmp = trunc < a ? 1.0 : 0.0 */
2274 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2275 tmp = lp_build_and(&intbld, mask, tmp);
2276 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2277 res = lp_build_add(bld, trunc, tmp);
2278
2279 /* mask out sign bit */
2280 anosign = lp_build_abs(bld, a);
2281 /*
2282 * mask out all values if anosign > 2^24
2283       * This works both for large ints (all rounding is a no-op for them
2284       * because such floats are always exact) and for special cases like
2285       * NaNs and Infs (taking advantage of the fact they use the max exponent).
2286       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2287 */
2288 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2289 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2290 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2291 return lp_build_select(bld, mask, a, res);
2292 }
2293 }
2294
2295
2296 /**
2297 * Return fractional part of 'a' computed as a - floor(a)
2298 * Typically used in texture coord arithmetic.
2299 */
2300 LLVMValueRef
2301 lp_build_fract(struct lp_build_context *bld,
2302 LLVMValueRef a)
2303 {
2304 assert(bld->type.floating);
2305 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2306 }
2307
2308
2309 /**
2310 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2311 * against 0.99999(9). (Will also return that value for NaNs.)
2312 */
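/*
 * For 32-bit floats (23-bit mantissa) the constant below works out to
 * 1.0 - 2^-24 = 0.99999994f (bit pattern 0x3F7FFFFF), the largest float
 * strictly below 1.0.
 */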
2313 static inline LLVMValueRef
2314 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2315 {
2316 LLVMValueRef max;
2317
2318 /* this is the largest number smaller than 1.0 representable as float */
2319 max = lp_build_const_vec(bld->gallivm, bld->type,
2320 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2321 return lp_build_min_ext(bld, fract, max,
2322 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2323 }
2324
2325
2326 /**
2327 * Same as lp_build_fract, but guarantees that the result is always smaller
2328 * than one. Will also return the smaller-than-one value for infs, NaNs.
2329 */
2330 LLVMValueRef
2331 lp_build_fract_safe(struct lp_build_context *bld,
2332 LLVMValueRef a)
2333 {
2334 return clamp_fract(bld, lp_build_fract(bld, a));
2335 }
2336
2337
2338 /**
2339 * Return the integer part of a float (vector) value (== round toward zero).
2340 * The returned value is an integer (vector).
2341 * Ex: itrunc(-1.5) = -1
2342 */
2343 LLVMValueRef
2344 lp_build_itrunc(struct lp_build_context *bld,
2345 LLVMValueRef a)
2346 {
2347 LLVMBuilderRef builder = bld->gallivm->builder;
2348 const struct lp_type type = bld->type;
2349 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2350
2351 assert(type.floating);
2352 assert(lp_check_value(type, a));
2353
2354 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2355 }
2356
2357
2358 /**
2359 * Return float (vector) rounded to nearest integer (vector). The returned
2360 * value is an integer (vector).
2361 * Ex: iround(0.9) = 1
2362 * Ex: iround(-1.5) = -2
2363 */
2364 LLVMValueRef
2365 lp_build_iround(struct lp_build_context *bld,
2366 LLVMValueRef a)
2367 {
2368 LLVMBuilderRef builder = bld->gallivm->builder;
2369 const struct lp_type type = bld->type;
2370 LLVMTypeRef int_vec_type = bld->int_vec_type;
2371 LLVMValueRef res;
2372
2373 assert(type.floating);
2374
2375 assert(lp_check_value(type, a));
2376
2377 if ((util_get_cpu_caps()->has_sse2 &&
2378 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2379 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2380 return lp_build_iround_nearest_sse2(bld, a);
2381 }
2382 if (arch_rounding_available(type)) {
2383 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2384 }
2385 else {
2386 LLVMValueRef half;
2387
2388 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2389
2390 if (type.sign) {
2391 LLVMTypeRef vec_type = bld->vec_type;
2392 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2393 (unsigned long long)1 << (type.width - 1));
2394 LLVMValueRef sign;
2395
2396 /* get sign bit */
2397 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2398 sign = LLVMBuildAnd(builder, sign, mask, "");
2399
2400 /* sign * 0.5 */
2401 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2402 half = LLVMBuildOr(builder, sign, half, "");
2403 half = LLVMBuildBitCast(builder, half, vec_type, "");
2404 }
2405
2406 res = LLVMBuildFAdd(builder, a, half, "");
2407 }
2408
2409 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2410
2411 return res;
2412 }
2413
2414
2415 /**
2416 * Return floor of float (vector), result is an int (vector)
2417  * Ex: ifloor(1.1) = 1
2418  * Ex: ifloor(-1.1) = -2
2419 */
2420 LLVMValueRef
2421 lp_build_ifloor(struct lp_build_context *bld,
2422 LLVMValueRef a)
2423 {
2424 LLVMBuilderRef builder = bld->gallivm->builder;
2425 const struct lp_type type = bld->type;
2426 LLVMTypeRef int_vec_type = bld->int_vec_type;
2427 LLVMValueRef res;
2428
2429 assert(type.floating);
2430 assert(lp_check_value(type, a));
2431
2432 res = a;
2433 if (type.sign) {
2434 if (arch_rounding_available(type)) {
2435 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2436 }
2437 else {
2438 struct lp_type inttype;
2439 struct lp_build_context intbld;
2440 LLVMValueRef trunc, itrunc, mask;
2441
2442 assert(type.floating);
2443 assert(lp_check_value(type, a));
2444
2445 inttype = type;
2446 inttype.floating = 0;
2447 lp_build_context_init(&intbld, bld->gallivm, inttype);
2448
2449 /* round by truncation */
2450 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2451 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2452
2453 /*
2454 * fix values if rounding is wrong (for non-special cases)
2455 * - this is the case if trunc > a
2456 * The results of doing this with NaNs, very large values etc.
2457 * are undefined but this seems to be the case anyway.
2458 */
2459 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2460       /* cheap subtract-one via the mask, since the mask is -1 or 0 */
2461 return lp_build_add(&intbld, itrunc, mask);
2462 }
2463 }
2464
2465    /* round towards zero (truncate) */
2466 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2467
2468 return res;
2469 }
2470
2471
2472 /**
2473 * Return ceiling of float (vector), returning int (vector).
2474 * Ex: iceil( 1.1) = 2
2475 * Ex: iceil(-1.1) = -1
2476 */
2477 LLVMValueRef
2478 lp_build_iceil(struct lp_build_context *bld,
2479 LLVMValueRef a)
2480 {
2481 LLVMBuilderRef builder = bld->gallivm->builder;
2482 const struct lp_type type = bld->type;
2483 LLVMTypeRef int_vec_type = bld->int_vec_type;
2484 LLVMValueRef res;
2485
2486 assert(type.floating);
2487 assert(lp_check_value(type, a));
2488
2489 if (arch_rounding_available(type)) {
2490 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2491 }
2492 else {
2493 struct lp_type inttype;
2494 struct lp_build_context intbld;
2495 LLVMValueRef trunc, itrunc, mask;
2496
2497 assert(type.floating);
2498 assert(lp_check_value(type, a));
2499
2500 inttype = type;
2501 inttype.floating = 0;
2502 lp_build_context_init(&intbld, bld->gallivm, inttype);
2503
2504 /* round by truncation */
2505 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2506 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2507
2508 /*
2509 * fix values if rounding is wrong (for non-special cases)
2510 * - this is the case if trunc < a
2511 * The results of doing this with NaNs, very large values etc.
2512 * are undefined but this seems to be the case anyway.
2513 */
2514 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2515       /* cheap add-one via the mask, since the mask is -1 or 0 */
2516 return lp_build_sub(&intbld, itrunc, mask);
2517 }
2518
2519    /* round towards zero (truncate) */
2520 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2521
2522 return res;
2523 }
2524
2525
2526 /**
2527 * Combined ifloor() & fract().
2528 *
2529  * Preferable to calling the functions separately, as it will ensure that the
2530  * strategy (floor() vs. ifloor()) that results in less redundant work is used.
2531 */
2532 void
2533 lp_build_ifloor_fract(struct lp_build_context *bld,
2534 LLVMValueRef a,
2535 LLVMValueRef *out_ipart,
2536 LLVMValueRef *out_fpart)
2537 {
2538 LLVMBuilderRef builder = bld->gallivm->builder;
2539 const struct lp_type type = bld->type;
2540 LLVMValueRef ipart;
2541
2542 assert(type.floating);
2543 assert(lp_check_value(type, a));
2544
2545 if (arch_rounding_available(type)) {
2546 /*
2547 * floor() is easier.
2548 */
2549
2550 ipart = lp_build_floor(bld, a);
2551 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2552 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2553 }
2554 else {
2555 /*
2556 * ifloor() is easier.
2557 */
2558
2559 *out_ipart = lp_build_ifloor(bld, a);
2560 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2561 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2562 }
2563 }
2564
2565
2566 /**
2567 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2568 * always smaller than one.
2569 */
2570 void
2571 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2572 LLVMValueRef a,
2573 LLVMValueRef *out_ipart,
2574 LLVMValueRef *out_fpart)
2575 {
2576 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2577 *out_fpart = clamp_fract(bld, *out_fpart);
2578 }
2579
2580
2581 LLVMValueRef
2582 lp_build_sqrt(struct lp_build_context *bld,
2583 LLVMValueRef a)
2584 {
2585 LLVMBuilderRef builder = bld->gallivm->builder;
2586 const struct lp_type type = bld->type;
2587 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2588 char intrinsic[32];
2589
2590 assert(lp_check_value(type, a));
2591
2592 assert(type.floating);
2593 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2594
2595 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2596 }
2597
2598
2599 /**
2600  * Do one Newton-Raphson step to improve reciprocal precision:
2601 *
2602 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2603 *
2604  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2605  * +/-Inf, giving NaN instead.  Certain applications rely on the conformant
2606  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2607  * the Earth's halo. It would be necessary to clamp the argument to prevent this.
2608 *
2609 * See also:
2610 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2612 */
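/*
 * Numeric illustration (follows directly from the formula above): for
 * a = 3.0 and initial guess x0 = 0.3, one step gives
 * x1 = 0.3 + 0.3*(1 - 3.0*0.3) = 0.33, and another gives
 * x2 = 0.33 + 0.33*(1 - 3.0*0.33) = 0.3333, converging quadratically to 1/3.
 */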
2613 static inline LLVMValueRef
2614 lp_build_rcp_refine(struct lp_build_context *bld,
2615 LLVMValueRef a,
2616 LLVMValueRef rcp_a)
2617 {
2618 LLVMBuilderRef builder = bld->gallivm->builder;
2619 LLVMValueRef neg_a;
2620 LLVMValueRef res;
2621
2622 neg_a = LLVMBuildFNeg(builder, a, "");
2623 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2624 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2625
2626 return res;
2627 }
2628
2629
2630 LLVMValueRef
2631 lp_build_rcp(struct lp_build_context *bld,
2632 LLVMValueRef a)
2633 {
2634 LLVMBuilderRef builder = bld->gallivm->builder;
2635 const struct lp_type type = bld->type;
2636
2637 assert(lp_check_value(type, a));
2638
2639 if(a == bld->zero)
2640 return bld->undef;
2641 if(a == bld->one)
2642 return bld->one;
2643 if(a == bld->undef)
2644 return bld->undef;
2645
2646 assert(type.floating);
2647
2648 if(LLVMIsConstant(a))
2649 return LLVMConstFDiv(bld->one, a);
2650
2651 /*
2652 * We don't use RCPPS because:
2653     * - it only has 10 bits of precision
2654     * - it doesn't even get the reciprocal of 1.0 exactly
2655     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2656     * - for recent processors the benefit over DIVPS is marginal, and case
2657     *   dependent
2658     *
2659     * We could still use it on certain processors if benchmarks show that the
2660     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2661     * particular uses that require fewer workarounds.
2662 */
2663
2664 if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2665 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2666 const unsigned num_iterations = 0;
2667 LLVMValueRef res;
2668 unsigned i;
2669 const char *intrinsic = NULL;
2670
2671 if (type.length == 4) {
2672 intrinsic = "llvm.x86.sse.rcp.ps";
2673 }
2674 else {
2675 intrinsic = "llvm.x86.avx.rcp.ps.256";
2676 }
2677
2678 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2679
2680 for (i = 0; i < num_iterations; ++i) {
2681 res = lp_build_rcp_refine(bld, a, res);
2682 }
2683
2684 return res;
2685 }
2686
2687 return LLVMBuildFDiv(builder, bld->one, a, "");
2688 }
2689
2690
2691 /**
2692 * Do one Newton-Raphson step to improve rsqrt precision:
2693 *
2694 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2695 *
2696 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2697 */
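/*
 * Numeric illustration (follows from the formula above): for a = 4.0 and
 * initial guess x0 = 0.49, one step gives
 * x1 = 0.5 * 0.49 * (3.0 - 4.0*0.49*0.49) = 0.499702..., cutting the error
 * from 1e-2 to about 3e-4 (quadratic convergence towards 0.5).
 */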
2698 static inline LLVMValueRef
2699 lp_build_rsqrt_refine(struct lp_build_context *bld,
2700 LLVMValueRef a,
2701 LLVMValueRef rsqrt_a)
2702 {
2703 LLVMBuilderRef builder = bld->gallivm->builder;
2704 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2705 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2706 LLVMValueRef res;
2707
2708 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2709 res = LLVMBuildFMul(builder, a, res, "");
2710 res = LLVMBuildFSub(builder, three, res, "");
2711 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2712 res = LLVMBuildFMul(builder, half, res, "");
2713
2714 return res;
2715 }
2716
2717
2718 /**
2719 * Generate 1/sqrt(a).
2720 * Result is undefined for values < 0, infinity for +0.
2721 */
2722 LLVMValueRef
2723 lp_build_rsqrt(struct lp_build_context *bld,
2724 LLVMValueRef a)
2725 {
2726 const struct lp_type type = bld->type;
2727
2728 assert(lp_check_value(type, a));
2729
2730 assert(type.floating);
2731
2732 /*
2733 * This should be faster but all denormals will end up as infinity.
2734 */
2735 if (0 && lp_build_fast_rsqrt_available(type)) {
2736 const unsigned num_iterations = 1;
2737 LLVMValueRef res;
2738 unsigned i;
2739
2740 /* rsqrt(1.0) != 1.0 here */
2741 res = lp_build_fast_rsqrt(bld, a);
2742
2743 if (num_iterations) {
2744 /*
2745 * Newton-Raphson will result in NaN instead of infinity for zero,
2746 * and NaN instead of zero for infinity.
2747 * Also, need to ensure rsqrt(1.0) == 1.0.
2748 * All numbers smaller than FLT_MIN will result in +infinity
2749 * (rsqrtps treats all denormals as zero).
2750 */
2751 LLVMValueRef cmp;
2752 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2753 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2754
2755 for (i = 0; i < num_iterations; ++i) {
2756 res = lp_build_rsqrt_refine(bld, a, res);
2757 }
2758 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2759 res = lp_build_select(bld, cmp, inf, res);
2760 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2761 res = lp_build_select(bld, cmp, bld->zero, res);
2762 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2763 res = lp_build_select(bld, cmp, bld->one, res);
2764 }
2765
2766 return res;
2767 }
2768
2769 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2770 }
2771
2772 /**
2773  * Report whether a fast (inaccurate) rsqrt instruction is available.
2774  * (The caller may want to avoid calling rsqrt_fast if it isn't; e.g. to
2775  * calculate x^0.5 it may do rsqrt_fast(x) * x, but if the instruction is
2776  * unavailable that would expand to sqrt/div/mul, so it is obviously
2777  * better to just call sqrt, skipping both the div and the mul).
2778 */
2779 boolean
2780 lp_build_fast_rsqrt_available(struct lp_type type)
2781 {
2782 assert(type.floating);
2783
2784 if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2785 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2786 return true;
2787 }
2788 return false;
2789 }
2790
2791
2792 /**
2793 * Generate 1/sqrt(a).
2794 * Result is undefined for values < 0, infinity for +0.
2795 * Precision is limited, only ~10 bits guaranteed
2796  * (rsqrt(1.0) may not be 1.0; denorms may be flushed to 0).
2797 */
2798 LLVMValueRef
2799 lp_build_fast_rsqrt(struct lp_build_context *bld,
2800 LLVMValueRef a)
2801 {
2802 LLVMBuilderRef builder = bld->gallivm->builder;
2803 const struct lp_type type = bld->type;
2804
2805 assert(lp_check_value(type, a));
2806
2807 if (lp_build_fast_rsqrt_available(type)) {
2808 const char *intrinsic = NULL;
2809
2810 if (type.length == 4) {
2811 intrinsic = "llvm.x86.sse.rsqrt.ps";
2812 }
2813 else {
2814 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2815 }
2816 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2817 }
2818 else {
2819 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2820 }
2821 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2822 }
2823
2824
2825 /**
2826 * Generate sin(a) or cos(a) using polynomial approximation.
2827 * TODO: it might be worth recognizing sin and cos using same source
2828 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2829 * would be way cheaper than calculating (nearly) everything twice...
2830  * Not sure it's common enough to be worth bothering with, however; the scs
2831  * opcode could also benefit from calculating both, though.
2832 */
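/*
 * Outline of the method below (the classic cephes/sse_mathfun scheme):
 * take x = |a|, scale by 4/Pi, truncate to an integer j and force it even
 * (j = (j+1) & ~1), reduce the argument as x' = x - j*Pi/4 in extended
 * precision (the DP1..DP3 constants), evaluate the sine or cosine minimax
 * polynomial on x' depending on the quadrant, and finally patch the sign
 * bit back in.
 */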
2833 static LLVMValueRef
2834 lp_build_sin_or_cos(struct lp_build_context *bld,
2835 LLVMValueRef a,
2836 boolean cos)
2837 {
2838 struct gallivm_state *gallivm = bld->gallivm;
2839 LLVMBuilderRef b = gallivm->builder;
2840 struct lp_type int_type = lp_int_type(bld->type);
2841
2842 /*
2843 * take the absolute value,
2844 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2845 */
2846
2847 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2848 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2849
2850 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2851 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2852
2853 /*
2854 * scale by 4/Pi
2855 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2856 */
2857
2858 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2859 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2860
2861 /*
2862 * store the integer part of y in mm0
2863 * emm2 = _mm_cvttps_epi32(y);
2864 */
2865
2866 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2867
2868 /*
2869 * j=(j+1) & (~1) (see the cephes sources)
2870 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2871 */
2872
2873 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2874 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2875 /*
2876 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2877 */
2878 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2879 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2880
2881 /*
2882 * y = _mm_cvtepi32_ps(emm2);
2883 */
2884 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2885
2886 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2887 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2888 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2889 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2890
2891 /*
2892 * Argument used for poly selection and sign bit determination
2893 * is different for sin vs. cos.
2894 */
2895 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2896 emm2_and;
2897
2898 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2899 LLVMBuildNot(b, emm2_2, ""), ""),
2900 const_29, "sign_bit") :
2901 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2902 LLVMBuildShl(b, emm2_add,
2903 const_29, ""), ""),
2904 sign_mask, "sign_bit");
2905
2906 /*
2907     * get the polynomial selection mask
2908     * there is one polynomial for 0 <= x <= Pi/4
2909     * and another one for Pi/4 < x <= Pi/2
2910 * Both branches will be computed.
2911 *
2912 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2913 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2914 */
2915
2916 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2917 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2918 int_type, PIPE_FUNC_EQUAL,
2919 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2920
2921 /*
2922 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2923 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2924 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2925 */
2926 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2927 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2928 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2929
2930 /*
2931 * The magic pass: "Extended precision modular arithmetic"
2932 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2933 */
2934 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2935 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2936 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2937
2938 /*
2939     * Evaluate the first polynomial (0 <= x <= Pi/4)
2940 *
2941 * z = _mm_mul_ps(x,x);
2942 */
2943 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2944
2945 /*
2946 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2947 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2948 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2949 */
2950 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2951 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2952 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2953
2954 /*
2955 * y = *(v4sf*)_ps_coscof_p0;
2956 * y = _mm_mul_ps(y, z);
2957 */
2958 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2959 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2960 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2961 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2962
2963
2964 /*
2965 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2966 * y = _mm_sub_ps(y, tmp);
2967 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2968 */
2969 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2970 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2971    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2972    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2973    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2974
2975 /*
2976 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2977 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2978 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2979 */
2980 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2981 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2982 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2983
2984 /*
2985     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2986 *
2987 * y2 = *(v4sf*)_ps_sincof_p0;
2988 * y2 = _mm_mul_ps(y2, z);
2989 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2990 * y2 = _mm_mul_ps(y2, z);
2991 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2992 * y2 = _mm_mul_ps(y2, z);
2993 * y2 = _mm_mul_ps(y2, x);
2994 * y2 = _mm_add_ps(y2, x);
2995 */
2996
2997 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2998 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2999 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3000 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3001
3002 /*
3003     * select the correct result from the two polynomials
3004 * xmm3 = poly_mask;
3005     * y2 = _mm_and_ps(xmm3, y2);
3006 * y = _mm_andnot_ps(xmm3, y);
3007 * y = _mm_or_ps(y,y2);
3008 */
3009 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3010 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3011 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3012 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3013 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3014 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3015
3016 /*
3017 * update the sign
3018 * y = _mm_xor_ps(y, sign_bit);
3019 */
3020 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3021 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3022
3023 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3024
3025 /* clamp output to be within [-1, 1] */
3026 y_result = lp_build_clamp(bld, y_result,
3027 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3028 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3029 /* If a is -inf, inf or NaN then return NaN */
3030 y_result = lp_build_select(bld, isfinite, y_result,
3031 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3032 return y_result;
3033 }
3034
3035
3036 /**
3037 * Generate sin(a)
3038 */
3039 LLVMValueRef
3040 lp_build_sin(struct lp_build_context *bld,
3041 LLVMValueRef a)
3042 {
3043 const struct lp_type type = bld->type;
3044
3045 if (type.width == 16) {
3046 LLVMBuilderRef builder = bld->gallivm->builder;
3047 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3048 char intrinsic[32];
3049 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3050 LLVMValueRef args[] = { a };
3051 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3052 }
3053
3054 return lp_build_sin_or_cos(bld, a, FALSE);
3055 }
3056
3057
3058 /**
3059 * Generate cos(a)
3060 */
3061 LLVMValueRef
3062 lp_build_cos(struct lp_build_context *bld,
3063 LLVMValueRef a)
3064 {
3065 const struct lp_type type = bld->type;
3066
3067 if (type.width == 16) {
3068 LLVMBuilderRef builder = bld->gallivm->builder;
3069 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3070 char intrinsic[32];
3071 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3072 LLVMValueRef args[] = { a };
3073 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3074 }
3075
3076 return lp_build_sin_or_cos(bld, a, TRUE);
3077 }
3078
3079
3080 /**
3081 * Generate pow(x, y)
3082 */
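/*
 * Implemented via the identity pow(x, y) = exp2(y * log2(x)); the
 * compare/select below forces the result to 0.0 whenever x == 0.0,
 * regardless of y.
 */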
3083 LLVMValueRef
3084 lp_build_pow(struct lp_build_context *bld,
3085 LLVMValueRef x,
3086 LLVMValueRef y)
3087 {
3088 /* TODO: optimize the constant case */
3089 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3090 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3091 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3092 __FUNCTION__);
3093 }
3094
3095 LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3096 LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3097
3098 res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3099 return res;
3100 }
3101
3102
3103 /**
3104 * Generate exp(x)
3105 */
3106 LLVMValueRef
3107 lp_build_exp(struct lp_build_context *bld,
3108 LLVMValueRef x)
3109 {
3110 /* log2(e) = 1/log(2) */
3111 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3112 1.4426950408889634);
3113
3114 assert(lp_check_value(bld->type, x));
3115
3116 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3117 }
3118
3119
3120 /**
3121 * Generate log(x)
3122 * Behavior is undefined with infs, 0s and nans
3123 */
3124 LLVMValueRef
3125 lp_build_log(struct lp_build_context *bld,
3126 LLVMValueRef x)
3127 {
3128 /* log(2) */
3129 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3130 0.69314718055994529);
3131
3132 assert(lp_check_value(bld->type, x));
3133
3134 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3135 }
3136
3137 /**
3138 * Generate log(x) that handles edge cases (infs, 0s and nans)
3139 */
3140 LLVMValueRef
3141 lp_build_log_safe(struct lp_build_context *bld,
3142 LLVMValueRef x)
3143 {
3144 /* log(2) */
3145 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3146 0.69314718055994529);
3147
3148 assert(lp_check_value(bld->type, x));
3149
3150 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3151 }
3152
3153
3154 /**
3155 * Generate polynomial.
3156 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3157 */
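/*
 * Illustration (not part of the original code): for num_coeffs == 4 the
 * loop below effectively computes, with x2 = x*x,
 *
 *    even = c0 + x2*c2,   odd = c1 + x2*c3,   result = odd*x + even
 *
 * i.e. a two-lane Horner scheme that halves the dependency chain.
 */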
3158 LLVMValueRef
3159 lp_build_polynomial(struct lp_build_context *bld,
3160 LLVMValueRef x,
3161 const double *coeffs,
3162 unsigned num_coeffs)
3163 {
3164 const struct lp_type type = bld->type;
3165 LLVMValueRef even = NULL, odd = NULL;
3166 LLVMValueRef x2;
3167 unsigned i;
3168
3169 assert(lp_check_value(bld->type, x));
3170
3171 /* TODO: optimize the constant case */
3172 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3173 LLVMIsConstant(x)) {
3174 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3175 __FUNCTION__);
3176 }
3177
3178 /*
3179     * Calculate odd and even terms separately to decrease data dependency
3180 * Ex:
3181 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3182 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3183 */
3184 x2 = lp_build_mul(bld, x, x);
3185
3186 for (i = num_coeffs; i--; ) {
3187 LLVMValueRef coeff;
3188
3189 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3190
3191 if (i % 2 == 0) {
3192 if (even)
3193 even = lp_build_mad(bld, x2, even, coeff);
3194 else
3195 even = coeff;
3196 } else {
3197 if (odd)
3198 odd = lp_build_mad(bld, x2, odd, coeff);
3199 else
3200 odd = coeff;
3201 }
3202 }
3203
3204 if (odd)
3205 return lp_build_mad(bld, odd, x, even);
3206 else if (even)
3207 return even;
3208 else
3209 return bld->undef;
3210 }
3211
3212
3213 /**
3214 * Minimax polynomial fit of 2**x, in range [0, 1[
3215 */
3216 const double lp_build_exp2_polynomial[] = {
3217 #if EXP_POLY_DEGREE == 5
3218 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3219 0.693153073200168932794,
3220 0.240153617044375388211,
3221 0.0558263180532956664775,
3222 0.00898934009049466391101,
3223 0.00187757667519147912699
3224 #elif EXP_POLY_DEGREE == 4
3225 1.00000259337069434683,
3226 0.693003834469974940458,
3227 0.24144275689150793076,
3228 0.0520114606103070150235,
3229 0.0135341679161270268764
3230 #elif EXP_POLY_DEGREE == 3
3231 0.999925218562710312959,
3232 0.695833540494823811697,
3233 0.226067155427249155588,
3234 0.0780245226406372992967
3235 #elif EXP_POLY_DEGREE == 2
3236 1.00172476321474503578,
3237 0.657636275736077639316,
3238 0.33718943461968720704
3239 #else
3240 #error
3241 #endif
3242 };
3243
3244
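/**
 * Generate 2^x, using the decomposition 2^x = 2^ipart * 2^fpart: the integer
 * part becomes the float's exponent field directly, and the fractional part
 * in [0, 1) goes through the minimax polynomial above.
 */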
3245 LLVMValueRef
3246 lp_build_exp2(struct lp_build_context *bld,
3247 LLVMValueRef x)
3248 {
3249 LLVMBuilderRef builder = bld->gallivm->builder;
3250 const struct lp_type type = bld->type;
3251 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3252 LLVMValueRef ipart = NULL;
3253 LLVMValueRef fpart = NULL;
3254 LLVMValueRef expipart = NULL;
3255 LLVMValueRef expfpart = NULL;
3256 LLVMValueRef res = NULL;
3257
3258 if (type.floating && type.width == 16) {
3259 char intrinsic[32];
3260 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3261 LLVMValueRef args[] = { x };
3262 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3263 }
3264
3265 assert(lp_check_value(bld->type, x));
3266
3267 /* TODO: optimize the constant case */
3268 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3269 LLVMIsConstant(x)) {
3270 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3271 __FUNCTION__);
3272 }
3273
3274 assert(type.floating && type.width == 32);
3275
3276    /* We want to preserve NaN and make sure that for exp2, if x > 128
3277     * the result is INF, and if it's smaller than -126.9 the result is 0. */
3278 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3279 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3280 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3281 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3282
3283 /* ipart = floor(x) */
3284 /* fpart = x - ipart */
3285 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3286
3287 /* expipart = (float) (1 << ipart) */
3288 expipart = LLVMBuildAdd(builder, ipart,
3289 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3290 expipart = LLVMBuildShl(builder, expipart,
3291 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3292 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
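   /*
    * Worked example: ipart = 3 gives (127 + 3) << 23 == 0x41000000, the bit
    * pattern of 8.0f == 2^3.  The clamping above keeps ipart within
    * [-127, 128], so the biased exponent field never wraps.
    */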
3293
3294 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3295 ARRAY_SIZE(lp_build_exp2_polynomial));
3296
3297 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3298
3299 return res;
3300 }
3301
3302
3303
3304 /**
3305  * Extract the exponent of an IEEE-754 floating point value.
3306 *
3307 * Optionally apply an integer bias.
3308 *
3309 * Result is an integer value with
3310 *
3311 * ifloor(log2(x)) + bias
3312 */
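/*
 * E.g. for 32-bit floats and bias == 0: x = 8.0f (bits 0x41000000) has a
 * biased exponent field of 130, so the result is 130 - 127 = 3.
 */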
3313 LLVMValueRef
3314 lp_build_extract_exponent(struct lp_build_context *bld,
3315 LLVMValueRef x,
3316 int bias)
3317 {
3318 LLVMBuilderRef builder = bld->gallivm->builder;
3319 const struct lp_type type = bld->type;
3320 unsigned mantissa = lp_mantissa(type);
3321 LLVMValueRef res;
3322
3323 assert(type.floating);
3324
3325 assert(lp_check_value(bld->type, x));
3326
3327 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3328
3329 res = LLVMBuildLShr(builder, x,
3330 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3331 res = LLVMBuildAnd(builder, res,
3332 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3333 res = LLVMBuildSub(builder, res,
3334 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3335
3336 return res;
3337 }
3338
3339
3340 /**
3341  * Extract the mantissa of a floating point value.
3342 *
3343 * Result is a floating point value with
3344 *
3345  *   x / 2^floor(log2(x))
3346 */
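/*
 * E.g. x = 6.0f = 1.5 * 2^2 yields 1.5: the exponent bits are simply
 * replaced by those of 1.0.
 */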
3347 LLVMValueRef
3348 lp_build_extract_mantissa(struct lp_build_context *bld,
3349 LLVMValueRef x)
3350 {
3351 LLVMBuilderRef builder = bld->gallivm->builder;
3352 const struct lp_type type = bld->type;
3353 unsigned mantissa = lp_mantissa(type);
3354 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3355 (1ULL << mantissa) - 1);
3356 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3357 LLVMValueRef res;
3358
3359 assert(lp_check_value(bld->type, x));
3360
3361 assert(type.floating);
3362
3363 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3364
3365 /* res = x / 2**ipart */
3366 res = LLVMBuildAnd(builder, x, mantmask, "");
3367 res = LLVMBuildOr(builder, res, one, "");
3368 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3369
3370 return res;
3371 }
3372
3373
3374
3375 /**
3376  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3377  * These coefficients can be generated with
3378 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3379 */
3380 const double lp_build_log2_polynomial[] = {
3381 #if LOG_POLY_DEGREE == 5
3382 2.88539008148777786488L,
3383 0.961796878841293367824L,
3384 0.577058946784739859012L,
3385 0.412914355135828735411L,
3386 0.308591899232910175289L,
3387 0.352376952300281371868L,
3388 #elif LOG_POLY_DEGREE == 4
3389 2.88539009343309178325L,
3390 0.961791550404184197881L,
3391 0.577440339438736392009L,
3392 0.403343858251329912514L,
3393 0.406718052498846252698L,
3394 #elif LOG_POLY_DEGREE == 3
3395 2.88538959748872753838L,
3396 0.961932915889597772928L,
3397 0.571118517972136195241L,
3398 0.493997535084709500285L,
3399 #else
3400 #error
3401 #endif
3402 };
3403
3404 /**
3405 * See http://www.devmaster.net/forums/showthread.php?p=43580
3406 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3407 * http://www.nezumi.demon.co.uk/consult/logx.htm
3408 *
3409 * If handle_edge_cases is true the function will perform computations
3410 * to match the required D3D10+ behavior for each of the edge cases.
3411 * That means that if input is:
3412  * - less than zero (down to and including -inf) then NaN will be returned
3413 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3414 * - +infinity, then +infinity will be returned
3415 * - NaN, then NaN will be returned
3416 *
3417 * Those checks are fairly expensive so if you don't need them make sure
3418 * handle_edge_cases is false.
3419 */
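/*
 * Sketch of the math below: with x = 2^exp * mant, mant in [1, 2),
 *
 *    log2(x) = exp + log2(mant)
 *            = exp + y * P(y^2),   y = (mant - 1) / (mant + 1)
 *
 * using log2(m) = 2/ln(2) * atanh((m - 1)/(m + 1)); P is the minimax fit of
 * that series above (note its leading coefficient is 2/ln(2) = 2.88539...).
 */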
3420 void
3421 lp_build_log2_approx(struct lp_build_context *bld,
3422 LLVMValueRef x,
3423 LLVMValueRef *p_exp,
3424 LLVMValueRef *p_floor_log2,
3425 LLVMValueRef *p_log2,
3426 boolean handle_edge_cases)
3427 {
3428 LLVMBuilderRef builder = bld->gallivm->builder;
3429 const struct lp_type type = bld->type;
3430 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3431 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3432
3433 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3434 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3435 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3436
3437 LLVMValueRef i = NULL;
3438 LLVMValueRef y = NULL;
3439 LLVMValueRef z = NULL;
3440 LLVMValueRef exp = NULL;
3441 LLVMValueRef mant = NULL;
3442 LLVMValueRef logexp = NULL;
3443 LLVMValueRef p_z = NULL;
3444 LLVMValueRef res = NULL;
3445
3446 if (bld->type.width == 16) {
3447 char intrinsic[32];
3448 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3449 LLVMValueRef args[] = { x };
3450 if (p_log2)
3451 *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3452 return;
3453 }
3454
3455 assert(lp_check_value(bld->type, x));
3456
3457 if(p_exp || p_floor_log2 || p_log2) {
3458 /* TODO: optimize the constant case */
3459 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3460 LLVMIsConstant(x)) {
3461 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3462 __FUNCTION__);
3463 }
3464
3465 assert(type.floating && type.width == 32);
3466
3467 /*
3468 * We don't explicitly handle denormalized numbers. They will yield a
3469 * result in the neighbourhood of -127, which appears to be adequate
3470 * enough.
3471 */
3472
3473 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3474
3475 /* exp = (float) exponent(x) */
3476 exp = LLVMBuildAnd(builder, i, expmask, "");
3477 }
3478
3479 if(p_floor_log2 || p_log2) {
3480 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3481 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3482 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3483 }
3484
3485 if (p_log2) {
3486 /* mant = 1 + (float) mantissa(x) */
3487 mant = LLVMBuildAnd(builder, i, mantmask, "");
3488 mant = LLVMBuildOr(builder, mant, one, "");
3489 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3490
3491 /* y = (mant - 1) / (mant + 1) */
3492 y = lp_build_div(bld,
3493 lp_build_sub(bld, mant, bld->one),
3494 lp_build_add(bld, mant, bld->one)
3495 );
3496
3497 /* z = y^2 */
3498 z = lp_build_mul(bld, y, y);
3499
3500 /* compute P(z) */
3501 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3502 ARRAY_SIZE(lp_build_log2_polynomial));
3503
3504 /* y * P(z) + logexp */
3505 res = lp_build_mad(bld, y, p_z, logexp);
3506
3507 if (type.floating && handle_edge_cases) {
3508 LLVMValueRef negmask, infmask, zmask;
3509 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3510 lp_build_const_vec(bld->gallivm, type, 0.0f));
3511 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3512 lp_build_const_vec(bld->gallivm, type, 0.0f));
3513 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3514 lp_build_const_vec(bld->gallivm, type, INFINITY));
3515
         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
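
/*
 * Usage sketch (illustrative): several results can be pulled out of a
 * single call, sharing the exponent/mantissa extraction work, e.g.:
 *
 *    LLVMValueRef floor_log2, log2;
 *    lp_build_log2_approx(bld, x, NULL, &floor_log2, &log2, FALSE);
 */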


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * See the documentation of lp_build_log2_approx for a
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**floor(log2(x)), i.e. the mantissa, in [1, 2) */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
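
/*
 * Scalar sketch of the approximation above (illustrative only, assuming
 * 32-bit floats). It is exact for powers of two, where fpart == 1.0 and
 * the sum is exactly floor(log2(x)):
 *
 *    union { float f; uint32_t u; } v = { x };
 *    float ipart = (float)(((v.u >> 23) & 255) - 128);  // floor(log2(x)) - 1
 *    v.u = (v.u & 0x007fffff) | 0x3f800000;
 *    float fpart = v.f;                                 // in [1, 2)
 *    return ipart + fpart;
 */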


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^0.5, i.e., add 0.5 to log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5) */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
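
/*
 * Why the sqrt(2) multiply rounds (a note, not generated code):
 * lp_build_extract_exponent() truncates, yielding floor(log2(x)), and
 * log2(x * sqrt(2)) == log2(x) + 0.5, so
 *
 *    floor(log2(x * sqrt(2))) == floor(log2(x) + 0.5) == iround(log2(x))
 */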

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}
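
/*
 * Note that LLVM's rem instructions follow C semantics: the result takes
 * the sign of the dividend, so for the signed integer path above e.g.
 *
 *    srem -7, 3  ->  -1   (not 2)
 *
 * and frem behaves like fmod() rather than a floored modulo.
 */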


/*
 * For floating point inputs, returns a mask which is all 1's for
 * channels which are NaN and all 0's for channels which are not.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}
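
/*
 * This relies on NaN being the only value that compares unequal to itself:
 * LLVMRealOEQ is an ordered compare, which is false whenever either operand
 * is NaN, so the scalar equivalent is simply
 *
 *    bool isnan = !(x == x);
 */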

/*
 * Returns all 1's for floating point numbers that are finite,
 * and all 0's for -inf, +inf and NaN.
 */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}
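
/*
 * Scalar sketch of the bit test above (assuming 32-bit floats): an
 * IEEE-754 value is non-finite exactly when all of its exponent bits
 * are set, so
 *
 *    union { float f; uint32_t u; } v = { x };
 *    bool finite = (v.u & 0x7f800000) != 0x7f800000;
 */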

/*
 * Returns true if the number is NaN or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
         LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_get_cpu_caps()->has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_get_cpu_caps()->has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_get_cpu_caps()->has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
         LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}
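
/*
 * Typical usage sketch for the three MXCSR helpers above (illustrative):
 * save the caller's FP state, force denormals to zero around a stretch of
 * generated code, then restore it:
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that must not see denormal inputs ...
 *    lp_build_fpstate_set(gallivm, saved);
 */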