1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
19  *
20  * The above copyright notice and this permission notice (including the
21  * next paragraph) shall be included in all copies or substantial portions
22  * of the Software.
23  *
24  */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27 
28 #include "ac_exp_param.h"
29 #include "ac_llvm_util.h"
30 #include "ac_shader_util.h"
31 #include "c11/threads.h"
32 #include "shader_enums.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/macros.h"
36 #include "util/u_atomic.h"
37 #include "util/u_math.h"
38 #include <llvm-c/Core.h>
39 #include <llvm/Config/llvm-config.h>
40 
41 #include <assert.h>
42 #include <stdio.h>
43 
44 #define AC_LLVM_INITIAL_CF_DEPTH 4
45 
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* Head of the loop body; NULL for if/else/endif constructs. */
   LLVMBasicBlockRef loop_entry_block;
};
53 
/* Initialize the context, including a fresh LLVMContext, module and builder.
 *
 * NOTE(review): an older comment here claimed the caller initializes
 * ctx::module and ctx::builder, but this function now creates both itself
 * (see ac_create_module/ac_create_builder below).
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum chip_class chip_class, enum radeon_family family,
                          const struct radeon_info *info,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();

   /* Target/configuration state used by the IR building helpers. */
   ctx->chip_class = chip_class;
   ctx->family = family;
   ctx->info = info;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cache commonly used LLVM types so helpers don't re-query them. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Integer types sized to hold one bit per lane. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Cache commonly used constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs used when annotating loads/ranges later on. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   /* Control-flow stack; grown on demand by the if/loop helpers. */
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}
129 
/* Free the control-flow stack allocated by ac_llvm_context_init.
 *
 * NOTE(review): this does not dispose the LLVMContext/module/builder;
 * presumably the compiler wrapper owns those — confirm against callers.
 */
void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   /* Free the stack array before its owning struct. */
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL;
}
136 
/* Return the number of vector components of a value; scalars count as 1. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type);
   return 1;
}
144 
/* Extract component "index" from a vector, or pass a scalar through. */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   /* Scalars have exactly one component: only index 0 is meaningful. */
   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
      assert(index == 0);
      return value;
   }

   LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
   return LLVMBuildExtractElement(ac->builder, value, idx, "");
}
154 
/* Return the bit width of a scalar type or of a vector's element type. */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   /* For vectors, classify the element type instead. */
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   switch (LLVMGetTypeKind(type)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type);
   case LLVMPointerTypeKind:
      /* Only LDS pointers have a known width here (32 bits). */
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
177 
/* Return the size in bytes of an LLVM type, including vectors and arrays. */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind) {
      /* 32-bit constant pointers are the only 4-byte pointers. */
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   }
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
204 
/* Map a scalar type to the integer type of the same bit width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   /* i1 and i8 are already integers with no float counterpart. */
   if (t == ctx->i1 || t == ctx->i8)
      return t;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;
   unreachable("Unhandled integer size");
}
220 
/* Return the integer type with the same layout as \p t (element-wise for
 * vectors; pointer width for pointers).
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(elem, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      unsigned addr_space = LLVMGetPointerAddressSpace(t);

      if (addr_space == AC_ADDR_SPACE_GLOBAL)
         return ctx->i64;
      if (addr_space == AC_ADDR_SPACE_CONST_32BIT || addr_space == AC_ADDR_SPACE_LDS)
         return ctx->i32;
      unreachable("unhandled address space");
   }

   return to_integer_type_scalar(ctx, t);
}
240 
/* Reinterpret \p v as an integer of the same width (ptrtoint for pointers). */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef dst_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, dst_type, "");
   return LLVMBuildBitCast(ctx->builder, v, dst_type, "");
}
249 
/* Like ac_to_integer, but leave pointers untouched. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   if (LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind)
      return v;

   return ac_to_integer(ctx, v);
}
257 
/* Map a scalar type to the float type of the same bit width.
 * i8 has no float counterpart and is returned unchanged.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return t;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   unreachable("Unhandled float size");
}
271 
/* Return the float type with the same layout as \p t (element-wise for
 * vectors).
 */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem = to_float_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(elem, LLVMGetVectorSize(t));
   }
   return to_float_type_scalar(ctx, t);
}
280 
/* Reinterpret \p v as a float value of the same total bit width. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, LLVMTypeOf(v)), "");
}
286 
/* Build a call to the named intrinsic, declaring it on first use.
 *
 * The function type is derived from the actual argument types, so all
 * \p params must be non-NULL. Attributes from \p attrib_mask are applied
 * at the call site, unless AC_FUNC_ATTR_LEGACY is set, in which case they
 * are applied to the function declaration instead.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef function, call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   function = LLVMGetNamedFunction(ctx->module, name);
   if (!function) {
      LLVMTypeRef param_types[32], function_type;
      unsigned i;

      assert(param_count <= 32);

      /* Infer the declaration's parameter types from the arguments. */
      for (i = 0; i < param_count; ++i) {
         assert(params[i]);
         param_types[i] = LLVMTypeOf(params[i]);
      }
      function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      /* Legacy mode: attributes go on the declaration (once). */
      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}
320 
321 /**
322  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
323  * intrinsic names).
324  */
ac_build_type_name_for_intr(LLVMTypeRef type,char * buf,unsigned bufsize)325 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
326 {
327    LLVMTypeRef elem_type = type;
328 
329    if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
330       unsigned count = LLVMCountStructElementTypes(type);
331       int ret = snprintf(buf, bufsize, "sl_");
332       buf += ret;
333       bufsize -= ret;
334 
335       LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
336       LLVMGetStructElementTypes(type, elems);
337 
338       for (unsigned i = 0; i < count; i++) {
339          ac_build_type_name_for_intr(elems[i], buf, bufsize);
340          ret = strlen(buf);
341          buf += ret;
342          bufsize -= ret;
343       }
344 
345       snprintf(buf, bufsize, "s");
346       return;
347    }
348 
349    assert(bufsize >= 8);
350    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
351       int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
352       if (ret < 0) {
353          char *type_name = LLVMPrintTypeToString(type);
354          fprintf(stderr, "Error building type name for: %s\n", type_name);
355          LLVMDisposeMessage(type_name);
356          return;
357       }
358       elem_type = LLVMGetElementType(type);
359       buf += ret;
360       bufsize -= ret;
361    }
362    switch (LLVMGetTypeKind(elem_type)) {
363    default:
364       break;
365    case LLVMIntegerTypeKind:
366       snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
367       break;
368    case LLVMHalfTypeKind:
369       snprintf(buf, bufsize, "f16");
370       break;
371    case LLVMFloatTypeKind:
372       snprintf(buf, bufsize, "f32");
373       break;
374    case LLVMDoubleTypeKind:
375       snprintf(buf, bufsize, "f64");
376       break;
377    }
378 }
379 
380 /**
381  * Helper function that builds an LLVM IR PHI node and immediately adds
382  * incoming edges.
383  */
ac_build_phi(struct ac_llvm_context * ctx,LLVMTypeRef type,unsigned count_incoming,LLVMValueRef * values,LLVMBasicBlockRef * blocks)384 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
385                           LLVMValueRef *values, LLVMBasicBlockRef *blocks)
386 {
387    LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
388    LLVMAddIncoming(phi, values, blocks, count_incoming);
389    return phi;
390 }
391 
/* Emit a workgroup execution barrier (s_barrier). */
void ac_build_s_barrier(struct ac_llvm_context *ctx)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}
396 
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 *
 * \param pgpr  optional value routed through the asm; replaced in place
 * \param sgpr  constrain the pass-through register to SGPR ("s") vs VGPR ("v")
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   /* A unique comment per barrier keeps LLVM from merging the asm blocks. */
   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall(builder, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      /* Pointers pass through the asm at their own type. */
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall(builder, inlineasm, pgpr, 1, "");
   } else {
      /* General case: route only the first 32 bits of the value through
       * the asm and reassemble the rest around it. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      /* Widen sub-dword values so the i32 asm constraint fits. */
      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      /* View the value as a vector of i32, barrier element 0, restore. */
      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}
468 
ac_build_shader_clock(struct ac_llvm_context * ctx,nir_scope scope)469 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
470 {
471    const char *subgroup = "llvm.readcyclecounter";
472    const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
473 
474    LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
475    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
476 }
477 
/* Return a wave-wide lane mask of the lanes where \p value is non-zero,
 * built via the amdgcn.icmp "NE 0" intrinsic.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   /* icmp takes an i32 operand; widen booleans first. */
   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   /* The barrier may return a non-integer type; cast back before the call. */
   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
503 
/* Turn a divergent i1 into a wave-wide SGPR lane mask via amdgcn.icmp. */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";
   /* icmp(value, false, NE): bit set for every lane where value is true. */
   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
523 
/* True iff every active lane votes yes: the vote mask equals the mask of
 * all active lanes.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef yes_lanes = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, yes_lanes, active_lanes, "");
}
530 
/* True iff at least one active lane votes yes: the vote mask is non-zero. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef yes_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, yes_lanes, zero, "");
}
537 
/* True iff all active lanes agree: everyone voted yes, or nobody did. */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef yes_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, yes_lanes, active_lanes, "");
   LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, yes_lanes, zero, "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}
548 
/* Build a vector from values[component .. component + value_count - 1].
 * A single value is returned as a scalar.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   LLVMValueRef vec =
      LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[component]), value_count));

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + i], index, "");
   }
   return vec;
}
569 
/* Build a vector from value_count entries taken every value_stride slots.
 * If \p load is set, each entry is a pointer that is loaded first. A single
 * value stays scalar unless \p always_vector forces a 1-wide vector.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride, bool load,
                                             bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;

   if (!value_count)
      unreachable("value_count is 0");

   if (value_count == 1 && !always_vector)
      return load ? LLVMBuildLoad(builder, values[0], "") : values[0];

   LLVMValueRef vec = NULL;

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef value = values[i * value_stride];

      if (load)
         value = LLVMBuildLoad(builder, value, "");
      /* The element type is only known after the (optional) load. */
      if (!vec)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      vec = LLVMBuildInsertElement(builder, vec, value, LLVMConstInt(ctx->i32, i, false), "");
   }
   return vec;
}
597 
/* Build a vector from \p value_count contiguous values (no loads, and a
 * single value stays scalar).
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}
603 
/* Concatenate the components of \p a and \p b into one vector. */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   unsigned num_a = ac_get_llvm_num_components(a);
   unsigned num_b = ac_get_llvm_num_components(b);
   unsigned total = num_a + num_b;
   LLVMValueRef *elems = alloca(total * sizeof(LLVMValueRef));

   /* Flatten both operands into one component list: a's first, then b's. */
   for (unsigned i = 0; i < total; i++)
      elems[i] = i < num_a ? ac_llvm_extract_elem(ctx, a, i)
                           : ac_llvm_extract_elem(ctx, b, i - num_a);

   return ac_build_gather_values(ctx, elems, total);
}
617 
/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
 */
LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                             unsigned src_channels, unsigned dst_channels)
{
   LLVMTypeRef elemtype;
   LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      /* Already exactly the requested shape: nothing to do. */
      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      /* Never read more channels than the input actually has. */
      src_channels = MIN2(src_channels, vec_size);

      for (unsigned i = 0; i < src_channels; i++)
         chan[i] = ac_llvm_extract_elem(ctx, value, i);

      elemtype = LLVMGetElementType(LLVMTypeOf(value));
   } else {
      /* Scalar input: at most one channel can come from it. */
      if (src_channels) {
         assert(src_channels == 1);
         chan[0] = value;
      }
      elemtype = LLVMTypeOf(value);
   }

   /* Pad the remaining channels with undef. */
   for (unsigned i = src_channels; i < dst_channels; i++)
      chan[i] = LLVMGetUndef(elemtype);

   return ac_build_gather_values(ctx, chan, dst_channels);
}
652 
653 /* Extract components [start, start + channels) from a vector.
654  */
ac_extract_components(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned start,unsigned channels)655 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
656                                    unsigned channels)
657 {
658    LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
659 
660    for (unsigned i = 0; i < channels; i++)
661       chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
662 
663    return ac_build_gather_values(ctx, chan, channels);
664 }
665 
/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, 4);
}
674 
/* Round to the nearest integer using llvm.rint at the value's precision. */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   unsigned size = ac_get_type_size(type);
   const char *name = size == 2   ? "llvm.rint.f16"
                      : size == 4 ? "llvm.rint.f32"
                                  : "llvm.rint.f64";

   return ac_build_intrinsic(ctx, name, type, &value, 1, AC_FUNC_ATTR_READNONE);
}
689 
/* Build num / den, lowered to num * rcp(den) except where precision rules
 * forbid it.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));

   /* For doubles, we need precise division to pass GLCTS. */
   if (type_size == 8 && ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   const char *name = type_size == 2   ? "llvm.amdgcn.rcp.f16"
                      : type_size == 4 ? "llvm.amdgcn.rcp.f32"
                                       : "llvm.amdgcn.rcp.f64";

   /* num / den == num * (1 / den) */
   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);
   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
711 
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* ((num >> pre_shift) * multiplier + increment) >> 32 >> post_shift,
    * with the multiply done in 64 bits to keep the high half. */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
728 
/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* The increment is added before the multiply (no-unsigned-wrap holds
    * because num != UINT_MAX), saving a 64-bit add. */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
746 
747 /* See fast_idiv_by_const.h. */
748 /* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef b = ctx->builder;

   /* 64-bit multiply, keep the high 32 bits, then apply the post shift.
    * No pre-shift or increment is needed under the u31 / divisor != 1
    * preconditions. */
   LLVMValueRef wide = LLVMBuildMul(b, LLVMBuildZExt(b, num, ctx->i64, ""),
                                    LLVMBuildZExt(b, multiplier, ctx->i64, ""), "");
   wide = LLVMBuildLShr(b, wide, LLVMConstInt(ctx->i64, 32, 0), "");
   LLVMValueRef hi = LLVMBuildTrunc(b, wide, ctx->i32, "");
   return LLVMBuildLShr(b, hi, post_shift, "");
}
760 
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2]; /* stc[0] = sc, stc[1] = tc face coordinates */
   LLVMValueRef ma;     /* major axis value (times two, see above) */
   LLVMValueRef id;     /* cube face index */
};
770 
/* Run the hardware cube intrinsics on a vec3 direction and collect all
 * four selection outputs (tc, sc, ma, face id). */
static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   LLVMTypeRef ret_type = ctx->f32;

   /* Each cube intrinsic consumes the same 3 inputs and yields one scalar. */
   out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", ret_type, in, 3, AC_FUNC_ATTR_READNONE);
   out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", ret_type, in, 3, AC_FUNC_ATTR_READNONE);
   out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", ret_type, in, 3, AC_FUNC_ATTR_READNONE);
   out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", ret_type, in, 3, AC_FUNC_ATTR_READNONE);
}
781 
782 /**
783  * Build a manual selection sequence for cube face sc/tc coordinates and
784  * major axis vector (multiplied by 2 for consistency) for the given
785  * vec3 \p coords, for the face implied by \p selcoords.
786  *
787  * For the major axis, we always adjust the sign to be in the direction of
788  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
789  * the selcoords major axis.
790  */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma; /* +1.0 or -1.0: sign of the selected major axis */
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   /* Decode the major axis from the face id: ids 4-5 are the Z faces,
    * 2-3 the Y faces, 0-1 the X faces. */
   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc: source component and sign depend on the major axis */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma: |major axis component| * 2, matching the convention noted
    * on struct cube_selection_coords */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}
835 
/* Convert vec3 cube-map coordinates (and optionally the derivatives) in
 * place into the 2D face-coordinate + face-id layout consumed by the
 * sampling instructions. */
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{

   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      /* Round the array layer to the nearest integer. */
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below.
       *     Workaround for an issue where the layer is taken from a
       *     helper invocation which happens to fall on a different
       *     layer due to extrapolation."
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->chip_class <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* invma = 1 / |ma|: projects the selection coordinates onto the face. */
   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}
944 
/* Two-step fragment attribute interpolation: interp.p1 with barycentric i,
 * then interp.p2 with barycentric j, yielding an f32. */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef p1_args[4] = {i, llvm_chan, attr_number, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, p1_args, 4,
                                        AC_FUNC_ATTR_READNONE);

   LLVMValueRef p2_args[5] = {p1, j, llvm_chan, attr_number, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, p2_args, 5,
                             AC_FUNC_ATTR_READNONE);
}
968 
/* 16-bit variant of ac_build_fs_interp; high_16bits selects which 16-bit
 * half of the attribute is interpolated. Result is an f16. */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef half_sel = high_16bits ? ctx->i1true : ctx->i1false;

   LLVMValueRef p1_args[5] = {i, llvm_chan, attr_number, half_sel, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, p1_args, 5,
                                        AC_FUNC_ATTR_READNONE);

   LLVMValueRef p2_args[6] = {p1, j, llvm_chan, attr_number, half_sel, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, p2_args, 6,
                             AC_FUNC_ATTR_READNONE);
}
995 
/* Read an attribute without interpolation via interp.mov; `parameter`
 * selects which vertex/value the hardware returns. */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef call_args[4] = {parameter, llvm_chan, attr_number, params};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, call_args, 4,
                             AC_FUNC_ATTR_READNONE);
}
1010 
/* Build &base_ptr[index] as a single-index GEP. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef idx = index;
   return LLVMBuildGEP(ctx->builder, base_ptr, &idx, 1, "");
}
1016 
/* GEP with a leading zero index: steps through the pointer, then indexes
 * into the pointed-to aggregate, i.e. &(*base_ptr)[index]. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[] = {ctx->i32_0, index};
   return LLVMBuildGEP(ctx->builder, base_ptr, gep_indices, 2, "");
}
1025 
/* Advance a pointer by `index` elements, keeping the original pointer type. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef advanced = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
   return LLVMBuildPointerCast(ctx->builder, advanced, LLVMTypeOf(ptr), "");
}
1031 
/* Store `value` to the index-th element of the array behind base_ptr. */
void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef slot = ac_build_gep0(ctx, base_ptr, index);
   LLVMBuildStore(ctx->builder, value, slot);
}
1037 
1038 /**
1039  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1040  * It's equivalent to doing a load from &base_ptr[index].
1041  *
1042  * \param base_ptr  Where the array starts.
1043  * \param index     The element index into the array.
1044  * \param uniform   Whether the base_ptr and index can be assumed to be
1045  *                  dynamically uniform (i.e. load to an SGPR)
1046  * \param invariant Whether the load is invariant (no other opcodes affect it)
1047  * \param no_unsigned_wraparound
1048  *    For all possible re-associations and re-distributions of an expression
1049  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1050  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1051  *    does not result in an unsigned integer wraparound. This is used for
1052  *    optimal code generation of 32-bit pointer arithmetic.
1053  *
1054  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1055  *    integer wraparound can't be an imm offset in s_load_dword, because
1056  *    the instruction performs "addr + offset" in 64 bits.
1057  *
1058  *    Expected usage for bindless textures by chaining GEPs:
1059  *      // possible unsigned wraparound, don't use InBounds:
1060  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1061  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1062  *
1063  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1064  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1065  */
/* Implementation behind the ac_build_load* wrappers; see the block comment
 * above for the parameter semantics. */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   /* Only emit an inbounds GEP when the caller guarantees no unsigned
    * wraparound AND the pointer lives in the 32-bit const address space. */
   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   /* NOTE(review): alignment is forced to 4 regardless of the loaded type —
    * presumably all callers load dword-aligned data; confirm. */
   LLVMSetAlignment(result, 4);
   return result;
}
1086 
/* Plain indexed load: not uniform, not invariant, wraparound allowed. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index,
                               /*uniform=*/false, /*invariant=*/false,
                               /*no_unsigned_wraparound=*/false);
}
1091 
/* Indexed load marked invariant (no other opcode may affect the result). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index,
                               /*uniform=*/false, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
1097 
1098 /* This assumes that there is no unsigned integer wraparound during the address
1099  * computation, excluding all GEPs within base_ptr. */
/* Uniform + invariant load with the no-wraparound guarantee (see the
 * comment above): intended to land in an SGPR. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/true);
}
1105 
1106 /* See ac_build_load_custom() documentation. */
/* Like ac_build_load_to_sgpr, but without the no-wraparound guarantee. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
1112 
get_load_cache_policy(struct ac_llvm_context * ctx,unsigned cache_policy)1113 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1114 {
1115    return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1116 }
1117 
/* Common lowering for buffer stores: emits a
 * llvm.amdgcn.{raw,struct}.buffer.store[.format].<type> call.
 * NULL vindex/voffset/soffset default to 0; `structurized` adds the
 * vindex operand and picks the "struct" intrinsic family. */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   /* Operand order is positional: data, rsrc, [vindex,] voffset, soffset,
    * cache policy. */
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* The intrinsic name is suffixed with the overload type of `data`. */
   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1146 
/* Structurized, format-converting buffer store with no scalar offset. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, /*soffset=*/NULL,
                                cache_policy, /*use_format=*/true, /*structurized=*/true);
}
1152 
1153 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1154  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1155  * or v4i32 (num_channels=3,4).
1156  */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned inst_offset, unsigned cache_policy)
{
   /* Split 3 channel stores: when vec3 isn't supported, store .xy, then .z
    * as a separate dword at offset + 8. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
      LLVMValueRef v[3], v01;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
                                  cache_policy);
      return;
   }

   /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    * (voffset is swizzled, but soffset isn't swizzled).
    * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    */
   if (!(cache_policy & ac_swizzled)) {
      /* Non-swizzled: fold the immediate offset into soffset and use a
       * plain buffer store. */
      LLVMValueRef offset = soffset;

      if (inst_offset)
         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");

      ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
                                   cache_policy, false, false);
      return;
   }

   /* Swizzled: fall back to a typed (tbuffer) store, which keeps voffset
    * and soffset separate. */
   static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
   unsigned dfmt = dfmts[num_channels - 1];
   unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
   LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

   ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
                              nfmt, cache_policy);
}
1201 
/* Common lowering for buffer loads: emits a
 * llvm.amdgcn.{raw,struct}.buffer.load[.format].<type> call and returns the
 * loaded value. NULL vindex/voffset/soffset default to 0. */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Round 3-channel loads up to 4 channels when vec3 isn't supported. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->chip_class >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1238 
/* Load num_channels dwords from a buffer. Uses scalar (s_buffer_load)
 * loads when allow_smem is set and the cache policy permits; otherwise
 * falls back to a vector buffer load. */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, LLVMTypeRef channel_type,
                                  unsigned cache_policy, bool can_speculate, bool allow_smem)
{
   /* Fold the immediate, vector and scalar offsets into one value. */
   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

   /* SMEM path: never with SLC; with GLC only on GFX8+. */
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* One scalar dword load per channel, stepping the offset by 4. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      /* Pad vec3 results to vec4 when vec3 types aren't supported. */
      if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels,
                                      channel_type, cache_policy, can_speculate, false, false);
}
1279 
/* Load a format-converted value from a buffer. With tfe set, the result
 * carries an extra status dword appended after the num_channels data
 * components (d16 is not supported in that case). */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      assert(!d16);

      /* TFE path: emitted as inline assembly producing 5 dwords —
       * 4 data components in v[0:3] plus the status value in v4. */
      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_policy & ac_glc ? "glc" : "",
               cache_policy & ac_slc ? "slc" : "",
               cache_policy & ac_dlc ? "dlc" : "");

      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      /* Pack vindex + voffset into the v2i32 address operand. */
      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall(ctx->builder, inlineasm, args, 2, "");

      /* Keep num_channels data components and append the status dword. */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}
1323 
/* Common lowering for typed buffer loads: emits a
 * llvm.amdgcn.{raw,struct}.tbuffer.load.<type> call using the given
 * data/number formats. */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate,
                                          bool structurized)
{
   /* NOTE(review): assumes voffset and immoffset are non-NULL here
    * (LLVMBuildAdd on NULL would crash) even though voffset is
    * NULL-checked below — confirm all callers pass both. */
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Round 3-channel loads up to 4 channels when vec3 isn't supported. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1354 
/* Structurized (vindex-based) variant of the typed buffer load. */
LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate)
{
   const bool structurized = true;
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, structurized);
}
1364 
/* Load a single 16-bit value from a buffer (raw addressing, no vindex). */
LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         LLVMValueRef immoffset, unsigned cache_policy)
{
   /* Fold the immediate offset into voffset before issuing the load. */
   LLVMValueRef offset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   return ac_build_buffer_load_common(ctx, rsrc, NULL, offset, soffset, 1, ctx->i16, cache_policy,
                                      false, false, false);
}
1374 
/* Load a single 8-bit value from a buffer (raw addressing, no vindex). */
LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        LLVMValueRef immoffset, unsigned cache_policy)
{
   /* Fold the immediate offset into voffset before issuing the load. */
   LLVMValueRef offset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   return ac_build_buffer_load_common(ctx, rsrc, NULL, offset, soffset, 1, ctx->i8, cache_policy,
                                      false, false, false);
}
1384 
1385 /**
1386  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1387  *
1388  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1389  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1390  */
/* Decodes all three encoding classes (normal, nan/inf, denormal) in
 * parallel as i32 bit patterns, then selects the correct one based on the
 * source's exponent/mantissa fields. */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   /* Isolate the mantissa field of the source. */
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   /* The leading 1 sits at bit (31 - ctlz); shifting by (ctlz - 8) moves it
    * to bit 23.  Shift amounts may be negative here, but only for inputs
    * whose results are discarded by the selects below. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   /* Compensate the variable mantissa shift in the exponent field. */
   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result. */
   LLVMValueRef result;

   /* src >= (all-ones exponent << mant_bits) means the exponent field is
    * saturated: nan/inf. */
   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   /* src < (1 << mant_bits) means the exponent field is zero: denormal. */
   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   /* An all-zero source maps exactly to 0.0. */
   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}
1449 
1450 /**
1451  * Generate a fully general open coded buffer format fetch with all required
1452  * fixups suitable for vertex fetch, using non-format buffer loads.
1453  *
1454  * Some combinations of argument values have special interpretations:
1455  * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1456  * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1457  *
1458  * \param log_size log(size of channel in bytes)
1459  * \param num_channels number of channels (1 to 4)
1460  * \param format AC_FETCH_FORMAT_xxx value
1461  * \param reverse whether XYZ channels are reversed
1462  * \param known_aligned whether the source is known to be aligned to hardware's
1463  *                      effective element size for loading the given format
1464  *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1465  * \param rsrc buffer resource descriptor
1466  * \return the resulting vector of floats or integers bitcast to <4 x i32>
1467  */
LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
                                            unsigned num_channels, unsigned format, bool reverse,
                                            bool known_aligned, LLVMValueRef rsrc,
                                            LLVMValueRef vindex, LLVMValueRef voffset,
                                            LLVMValueRef soffset, unsigned cache_policy,
                                            bool can_speculate)
{
   LLVMValueRef tmp;
   /* First decide what is actually loaded from memory; the channel size and
    * count used for the raw loads can differ from the logical format. */
   unsigned load_log_size = log_size;
   unsigned load_num_channels = num_channels;
   if (log_size == 3) {
      /* 8-byte channels: either true 64-bit floats loaded as dword pairs,
       * or one of the packed formats that fits in a single dword. */
      load_log_size = 2;
      if (format == AC_FETCH_FORMAT_FLOAT) {
         load_num_channels = 2 * num_channels;
      } else {
         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
      }
   }

   /* log_recombine > 0: bytes were loaded and must be merged into wider
    * values afterwards; log_recombine < 0: one wide value was loaded and
    * must be split into channels. */
   int log_recombine = 0;
   if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
      /* Avoid alignment restrictions by loading one byte at a time. */
      load_num_channels <<= load_log_size;
      log_recombine = load_log_size;
      load_log_size = 0;
   } else if (load_num_channels == 2 || load_num_channels == 4) {
      /* Load a single wider value and split it below. */
      log_recombine = -util_logbase2(load_num_channels);
      load_num_channels = 1;
      load_log_size += -log_recombine;
   }

   LLVMValueRef loads[32]; /* up to 32 bytes */
   for (unsigned i = 0; i < load_num_channels; ++i) {
      /* Per-load byte offset is added to soffset. */
      tmp =
         LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
      LLVMTypeRef channel_type =
         load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
      /* Loads wider than a dword are expressed as a vector of dwords. */
      unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
      loads[i] =
         ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
                                     cache_policy, can_speculate, false, true);
      if (load_log_size >= 2)
         loads[i] = ac_to_integer(ctx, loads[i]);
   }

   if (log_recombine > 0) {
      /* Recombine bytes if necessary (GFX6 only) */
      LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;

      for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
         LLVMValueRef accum = NULL;
         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
            tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
            if (i == 0) {
               accum = tmp;
            } else {
               /* Later bytes go into higher bits (little-endian merge). */
               tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
               accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
            }
         }
         loads[dst] = accum;
      }
   } else if (log_recombine < 0) {
      /* Split vectors of dwords */
      if (load_log_size > 2) {
         assert(load_num_channels == 1);
         LLVMValueRef loaded = loads[0];
         unsigned log_split = load_log_size - 2;
         log_recombine += log_split;
         load_num_channels = 1 << log_split;
         load_log_size = 2;
         for (unsigned i = 0; i < load_num_channels; ++i) {
            tmp = LLVMConstInt(ctx->i32, i, false);
            loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
         }
      }

      /* Further split dwords and shorts if required */
      if (log_recombine < 0) {
         /* Walk backwards so the narrower pieces can be written into loads[]
          * in place without clobbering unread sources. */
         for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
              --src) {
            unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
            LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
            LLVMValueRef loaded = loads[src - 1];
            LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
            for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
               tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
               tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
               loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
            }
         }
      }
   }

   /* Unpack the special 8-byte-channel cases. */
   if (log_size == 3) {
      if (format == AC_FETCH_FORMAT_FLOAT) {
         /* Reassemble 64-bit floats from pairs of dwords. */
         for (unsigned i = 0; i < num_channels; ++i) {
            tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
            loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
         }
      } else if (format == AC_FETCH_FORMAT_FIXED) {
         /* 10_11_11_FLOAT */
         LLVMValueRef data = loads[0];
         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");

         /* R and G are 11-bit (5e6m), B is a 10-bit (5e5m) unsigned float. */
         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));

         /* From here on, treat the result as three plain f32 channels. */
         num_channels = 3;
         log_size = 2;
         format = AC_FETCH_FORMAT_FLOAT;
      } else {
         /* 2_10_10_10 data formats */
         LLVMValueRef data = loads[0];
         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");

         num_channels = 4;
      }
   }

   /* Convert each channel to its final 32-bit representation. */
   if (format == AC_FETCH_FORMAT_FLOAT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan) {
            tmp = ac_to_float(ctx, loads[chan]);
            if (log_size == 3)
               tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
            else if (log_size == 1)
               tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
            loads[chan] = ac_to_integer(ctx, tmp);
         }
      }
   } else if (format == AC_FETCH_FORMAT_UINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else if (format == AC_FETCH_FORMAT_SINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else {
      /* Normalized/scaled/fixed formats: int-to-float plus a scale factor. */
      bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
                    format == AC_FETCH_FORMAT_UINT;

      for (unsigned chan = 0; chan < num_channels; ++chan) {
         if (unsign) {
            tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
         } else {
            tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
         }

         LLVMValueRef scale = NULL;
         if (format == AC_FETCH_FORMAT_FIXED) {
            /* 16.16 fixed point: divide by 2^16. */
            assert(log_size == 2);
            scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
         } else if (format == AC_FETCH_FORMAT_UNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
         } else if (format == AC_FETCH_FORMAT_SNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
         }
         if (scale)
            tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");

         if (format == AC_FETCH_FORMAT_SNORM) {
            /* Clamp to [-1, 1] */
            LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
            LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
            tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
         }

         loads[chan] = ac_to_integer(ctx, tmp);
      }
   }

   /* Pad missing channels with the (0, 0, 0, 1) defaults. */
   while (num_channels < 4) {
      if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
      } else {
         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
      }
      num_channels++;
   }

   /* Swap X and Z when the channel order is reversed (see \p reverse). */
   if (reverse) {
      tmp = loads[0];
      loads[0] = loads[2];
      loads[2] = tmp;
   }

   return ac_build_gather_values(ctx, loads, 4);
}
1675 
/* Emit a raw or structured llvm.amdgcn.*.tbuffer.store intrinsic call.
 * Mirrors ac_build_tbuffer_load: the immediate offset is folded into
 * voffset and 3-channel stores fall back to vec4 when vec3 intrinsics are
 * unavailable.
 */
static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy, bool structurized)
{
   LLVMBuilderRef builder = ctx->builder;

   voffset = LLVMBuildAdd(builder, voffset ? voffset : ctx->i32_0, immoffset, "");

   LLVMValueRef args[7];
   unsigned num_args = 0;
   args[num_args++] = vdata;
   args[num_args++] = LLVMBuildBitCast(builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[num_args++] = vindex ? vindex : ctx->i32_0;
   args[num_args++] = voffset ? voffset : ctx->i32_0;
   args[num_args++] = soffset ? soffset : ctx->i32_0;
   args[num_args++] =
      LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[num_args++] = LLVMConstInt(ctx->i32, cache_policy, 0);

   /* Use a vec4 payload when a vec3 intrinsic is not available. */
   unsigned data_channels = num_channels;
   if (data_channels == 3 && !ac_has_vec3_support(ctx->chip_class, true))
      data_channels = 4;

   LLVMTypeRef data_type = data_channels > 1 ? LLVMVectorType(ctx->i32, data_channels) : ctx->i32;

   char type_name[8];
   ac_build_type_name_for_intr(data_type, type_name, sizeof(type_name));

   char name[256];
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
            structurized ? "struct" : "raw", type_name);

   ac_build_intrinsic(ctx, name, ctx->voidt, args, num_args, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1706 
/* Structured (per-element indexed) tbuffer store; thin wrapper that forwards
 * to ac_build_tbuffer_store with structurized = true. */
void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy)
{
   ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                          nfmt, cache_policy, true);
}
1716 
/* Raw (no vindex) tbuffer store; thin wrapper that forwards to
 * ac_build_tbuffer_store with structurized = false. */
void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
                                unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                unsigned cache_policy)
{
   ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,
                          nfmt, cache_policy, false);
}
1725 
/* Store a single 16-bit value to a buffer (raw addressing). */
void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                  LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned cache_policy)
{
   /* Reinterpret the payload as i16 for the underlying store. */
   LLVMValueRef data = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   ac_build_buffer_store_common(ctx, rsrc, data, NULL, voffset, soffset, cache_policy, false,
                                false);
}
1735 
/* Store a single 8-bit value to a buffer (raw addressing). */
void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   /* Reinterpret the payload as i8 for the underlying store. */
   LLVMValueRef data = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   ac_build_buffer_store_common(ctx, rsrc, data, NULL, voffset, soffset, cache_policy, false,
                                false);
}
1744 
1745 /**
1746  * Set range metadata on an instruction.  This can only be used on load and
1747  * call instructions.  If you know an instruction can only produce the values
1748  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1749  * \p lo is the minimum value inclusive.
1750  * \p hi is the maximum value exclusive.
1751  */
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                           unsigned hi)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef llvm_ctx = LLVMGetTypeContext(type);

   /* The !range node holds [lo, hi) as two constants of the value's type. */
   LLVMValueRef bounds[2] = {
      LLVMConstInt(type, lo, false),
      LLVMConstInt(type, hi, false),
   };

   LLVMSetMetadata(value, ctx->range_md_kind, LLVMMDNodeInContext(llvm_ctx, bounds, 2));
}
1764 
ac_get_thread_id(struct ac_llvm_context * ctx)1765 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1766 {
1767    return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1768 }
1769 
1770 /*
1771  * AMD GCN implements derivatives using the local data store (LDS)
1772  * All writes to the LDS happen in all executing threads at
1773  * the same time. TID is the Thread ID for the current
1774  * thread and is a value between 0 and 63, representing
1775  * the thread's position in the wavefront.
1776  *
1777  * For the pixel shader threads are grouped into quads of four pixels.
1778  * The TIDs of the pixels of a quad are:
1779  *
1780  *  +------+------+
1781  *  |4n + 0|4n + 1|
1782  *  +------+------+
1783  *  |4n + 2|4n + 3|
1784  *  +------+------+
1785  *
1786  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1787  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1788  * the current pixel's column, and masking with 0xfffffffe yields the TID
1789  * of the left pixel of the current pixel's row.
1790  *
1791  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1792  * adding 2 yields the TID of the pixel below the top pixel.
1793  */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   LLVMTypeRef result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Quad swizzles are done on 32-bit values, so widen 16-bit inputs first. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* Per-lane sources: tl selects the quad's reference pixel (via the mask),
    * trbl selects the neighbor idx lanes away from it. */
   unsigned tl_lanes[4], trbl_lanes[4];
   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;
      trbl_lanes[i] = (i & mask) + idx;
   }

   LLVMValueRef tl =
      ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   LLVMValueRef trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   /* Undo the f16 widening before reinterpreting as floats. */
   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   LLVMValueRef result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   /* Wrap the derivative in WQM so helper lanes contribute correct values. */
   char type[8];
   ac_build_type_name_for_intr(result_type, type, sizeof(type));

   char name[32];
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1832 
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
{
   LLVMValueRef args[2] = {
      LLVMConstInt(ctx->i32, msg, false),
      wave_id,
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}
1840 
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   /* 0 and -1 have no sign-differing bit: return -1 for both. */
   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef no_msb = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, no_msb, all_ones, msb, "");
}
1857 
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   const char *intrin;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;

   /* Pick the ctlz variant matching the argument width. */
   switch (bitsize) {
   case 64:
      intrin = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   /* Second operand: the result may be undef for a zero argument. */
   LLVMValueRef params[2] = {arg, ctx->i1true};

   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but TGSI/NIR wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");

   /* The result is always returned as i32. */
   if (bitsize == 64)
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   else if (bitsize < 32)
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");

   /* check for zero */
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, "");
   return LLVMBuildSelect(ctx->builder, is_zero, LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1918 
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef value_type = LLVMTypeOf(a);

   /* Dispatch to the llvm.minnum overload matching the operand type. */
   char type[64];
   ac_build_type_name_for_intr(value_type, type, sizeof(type));

   char name[64];
   snprintf(name, sizeof(name), "llvm.minnum.%s", type);

   LLVMValueRef operands[2] = {a, b};
   return ac_build_intrinsic(ctx, name, value_type, operands, 2, AC_FUNC_ATTR_READNONE);
}
1928 
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef value_type = LLVMTypeOf(a);

   /* Dispatch to the llvm.maxnum overload matching the operand type. */
   char type[64];
   ac_build_type_name_for_intr(value_type, type, sizeof(type));

   char name[64];
   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);

   LLVMValueRef operands[2] = {a, b};
   return ac_build_intrinsic(ctx, name, value_type, operands, 2, AC_FUNC_ATTR_READNONE);
}
1938 
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* signed min: select(a <= b, a, b) */
   LLVMValueRef a_is_min = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_is_min, a, b, "");
}
1944 
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* signed max: select(a > b, a, b) */
   LLVMValueRef a_is_max = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_is_max, a, b, "");
}
1950 
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* unsigned min: select(a <= b, a, b) */
   LLVMValueRef a_is_min = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_is_min, a, b, "");
}
1956 
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* unsigned max: select(a >= b, a, b) */
   LLVMValueRef a_is_max = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_is_max, a, b, "");
}
1962 
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   /* clamp(v) = min(max(v, 0), 1), built with minnum/maxnum. */
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, zero), one);
}
1969 
ac_build_export(struct ac_llvm_context * ctx,struct ac_export_args * a)1970 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1971 {
1972    LLVMValueRef args[9];
1973 
1974    args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1975    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1976 
1977    if (a->compr) {
1978       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1979       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1980       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1981       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1982 
1983       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1984    } else {
1985       args[2] = a->out[0];
1986       args[3] = a->out[1];
1987       args[4] = a->out[2];
1988       args[5] = a->out[3];
1989       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1990       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1991 
1992       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1993    }
1994 }
1995 
ac_build_export_null(struct ac_llvm_context * ctx)1996 void ac_build_export_null(struct ac_llvm_context *ctx)
1997 {
1998    struct ac_export_args args;
1999 
2000    args.enabled_channels = 0x0; /* enabled channels */
2001    args.valid_mask = 1;         /* whether the EXEC mask is valid */
2002    args.done = 1;               /* DONE bit */
2003    args.target = V_008DFC_SQ_EXP_NULL;
2004    args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
2005    args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2006    args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2007    args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2008    args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2009 
2010    ac_build_export(ctx, &args);
2011 }
2012 
ac_num_coords(enum ac_image_dim dim)2013 static unsigned ac_num_coords(enum ac_image_dim dim)
2014 {
2015    switch (dim) {
2016    case ac_image_1d:
2017       return 1;
2018    case ac_image_2d:
2019    case ac_image_1darray:
2020       return 2;
2021    case ac_image_3d:
2022    case ac_image_cube:
2023    case ac_image_2darray:
2024    case ac_image_2dmsaa:
2025       return 3;
2026    case ac_image_2darraymsaa:
2027       return 4;
2028    default:
2029       unreachable("ac_num_coords: bad dim");
2030    }
2031 }
2032 
ac_num_derivs(enum ac_image_dim dim)2033 static unsigned ac_num_derivs(enum ac_image_dim dim)
2034 {
2035    switch (dim) {
2036    case ac_image_1d:
2037    case ac_image_1darray:
2038       return 2;
2039    case ac_image_2d:
2040    case ac_image_2darray:
2041    case ac_image_cube:
2042       return 4;
2043    case ac_image_3d:
2044       return 6;
2045    case ac_image_2dmsaa:
2046    case ac_image_2darraymsaa:
2047    default:
2048       unreachable("derivatives not supported");
2049    }
2050 }
2051 
get_atomic_name(enum ac_atomic_op op)2052 static const char *get_atomic_name(enum ac_atomic_op op)
2053 {
2054    switch (op) {
2055    case ac_atomic_swap:
2056       return "swap";
2057    case ac_atomic_add:
2058       return "add";
2059    case ac_atomic_sub:
2060       return "sub";
2061    case ac_atomic_smin:
2062       return "smin";
2063    case ac_atomic_umin:
2064       return "umin";
2065    case ac_atomic_smax:
2066       return "smax";
2067    case ac_atomic_umax:
2068       return "umax";
2069    case ac_atomic_and:
2070       return "and";
2071    case ac_atomic_or:
2072       return "or";
2073    case ac_atomic_xor:
2074       return "xor";
2075    case ac_atomic_inc_wrap:
2076       return "inc";
2077    case ac_atomic_dec_wrap:
2078       return "dec";
2079    case ac_atomic_fmin:
2080       return "fmin";
2081    case ac_atomic_fmax:
2082       return "fmax";
2083    }
2084    unreachable("bad atomic op");
2085 }
2086 
/* Build a single llvm.amdgcn.image.* intrinsic call described by *a and
 * return its result.
 *
 * The intrinsic name, operand list and overload suffixes are assembled from
 * the opcode, dimension and the optional modifiers (offset, bias, compare,
 * derivatives, lod, min_lod). The operand order below must match the order
 * the AMDGPU backend expects for these intrinsics.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Validate mutually-exclusive modifier combinations and chip-dependent
    * feature availability (d16, a16, g16).
    */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->chip_class >= GFX9);
   assert(a->g16 == a->a16 || ctx->chip_class >= GFX10);

   /* Check operand bit widths against the a16/g16 modes. */
   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
            ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
          ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* getlod takes fewer coordinates: fold array/cube dims to their base dim. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling ops take float coords, the rest take integer coords. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrinked using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE the intrinsic returns {texel, i32 status code}. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* Operand assembly, in backend-mandated order: data, dmask, offset, bias,
    * compare, derivatives, coordinates, lod/min_lod, descriptors, flags.
    */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   /* Base intrinsic name and (for atomics) the sub-operation suffix. */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* The .l suffix only applies to explicit-lod sampling/gathering. */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s"               /* sample/gather modifiers */
            ".%s.%s%s%s%s",          /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Flatten the {texel, status} struct into one vector for callers. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
2316 
/* Return the sample count of an image as 1 << log2_samples, where
 * log2_samples is read from bits [19:16] of dword 3 of the descriptor.
 */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   /* Read the samples from the descriptor directly.
    * Hardware doesn't have any instruction for this.
    */
   LLVMValueRef dword3 =
      LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef log2_samples =
      LLVMBuildLShr(ctx->builder, dword3, LLVMConstInt(ctx->i32, 16, 0), "");
   log2_samples = LLVMBuildAnd(ctx->builder, log2_samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
   return LLVMBuildShl(ctx->builder, ctx->i32_1, log2_samples, "");
}
2330 
/* Pack two f32 values into one <2 x half> (v_cvt_pkrtz: round toward zero). */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
                                            AC_FUNC_ATTR_READNONE);
   return packed;
}
2336 
/* Pack two floats as signed-normalized 16-bit values, returned as one i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args,
                                            2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2343 
/* Pack two floats as unsigned-normalized 16-bit values, returned as one i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args,
                                            2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2350 
/* Pack two f16 values as signed-normalized 16-bit values into one i32,
 * emitted via inline assembly (v_cvt_pknorm_i16_f16).
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef arg_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, arg_types, 2, false);
   LLVMValueRef asm_code = LLVMConstInlineAsm(fn_type,
                                              "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v",
                                              false, false);
   return LLVMBuildCall(ctx->builder, asm_code, args, 2, "");
}
2361 
/* Pack two f16 values as unsigned-normalized 16-bit values into one i32,
 * emitted via inline assembly (v_cvt_pknorm_u16_f16).
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef arg_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, arg_types, 2, false);
   LLVMValueRef asm_code = LLVMConstInlineAsm(fn_type,
                                              "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v",
                                              false, false);
   return LLVMBuildCall(ctx->builder, asm_code, args, 2, "");
}
2372 
2373 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2374 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2375                                  bool hi)
2376 {
2377    assert(bits == 8 || bits == 10 || bits == 16);
2378 
2379    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2380    LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2381    LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2382    LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2383 
2384    /* Clamp. */
2385    if (bits != 16) {
2386       for (int i = 0; i < 2; i++) {
2387          bool alpha = hi && i == 1;
2388          args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2389          args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2390       }
2391    }
2392 
2393    LLVMValueRef res =
2394       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2395    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2396 }
2397 
2398 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2399 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2400                                  bool hi)
2401 {
2402    assert(bits == 8 || bits == 10 || bits == 16);
2403 
2404    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2405    LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2406 
2407    /* Clamp. */
2408    if (bits != 16) {
2409       for (int i = 0; i < 2; i++) {
2410          bool alpha = hi && i == 1;
2411          args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2412       }
2413    }
2414 
2415    LLVMValueRef res =
2416       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2417    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2418 }
2419 
/* Wrap the i1 condition in llvm.amdgcn.wqm.vote and return the voted i1. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef vote =
      ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
   return vote;
}
2424 
/* Emit llvm.amdgcn.kill: kills the invocation when the i1 condition is false. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef cond = i1;
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &cond, 1, 0);
}
2429 
/* Bitfield extract: pull "width" bits starting at "offset" out of "input",
 * sign- or zero-extended according to is_signed.
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";

   return ac_build_intrinsic(ctx, intr, ctx->i32, (LLVMValueRef[]){input, offset, width}, 3,
                             AC_FUNC_ATTR_READNONE);
}
2442 
/* Integer multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");
   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2448 
/* Float multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->chip_class < GFX10) {
      LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
      return LLVMBuildFAdd(ctx->builder, product, s2, "");
   }

   return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
                             AC_FUNC_ATTR_READNONE);
}
2460 
/* Emit a wait for outstanding memory operations selected by wait_flags
 * (AC_WAIT_LGKM / AC_WAIT_VLOAD / AC_WAIT_VSTORE). No-op when wait_flags
 * is zero.
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* Counters start at their maximum value, which means "don't wait on
    * this counter"; a counter is set to 0 to wait for it to drain.
    */
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* GFX10 tracks VMEM stores in a separate vscnt counter; before that,
       * stores are covered by vmcnt.
       */
      if (ctx->chip_class >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   /* Pack the s_waitcnt immediate: lgkmcnt starting at bit 8, expcnt (not
    * waited on here, kept at its max of 7) in bits [6:4], vmcnt split
    * across bits [3:0] and [15:14].
    */
   unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
                     (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2497 
/* Saturate: clamp "src" (of the given float type) to [0.0, 1.0].
 *
 * Uses v_med3 (via llvm.amdgcn.fmed3.*) where the intrinsic exists, and a
 * plain fmin/fmax sequence otherwise. On pre-GFX9 chips the 32-bit result
 * is additionally canonicalized because those chips do not flush denorms.
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* Renamed from "type" to avoid shadowing the function parameter. */
      LLVMTypeRef intr_type;
      /* String literals must only be referenced through const pointers. */
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         intr_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         intr_type = ctx->f32;
      }

      /* med3(0, 1, src) == clamp(src, 0, 1). */
      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, intr_type, params, 3,
                                  AC_FUNC_ATTR_READNONE);
   }

   if (ctx->chip_class < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2541 
/* Emit llvm.amdgcn.fract.* for the fractional part of src0, selecting the
 * f16/f32/f64 overload by bitsize (any value other than 16/32 selects f64).
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* String literals must only be referenced through const pointers. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
2563 
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2564 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2565 {
2566 
2567    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2568       LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2569       unsigned vec_size = LLVMGetVectorSize(type);
2570       LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2571 
2572       for (unsigned i = 0; i < vec_size; i++)
2573          scalars[i] = scalar;
2574       return LLVMConstVector(scalars, vec_size);
2575    }
2576    return LLVMConstInt(type, value, 0);
2577 }
2578 
/* Integer sign: clamp src0 to [-1, 1], yielding -1, 0, or 1. */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef clamped_low = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, clamped_low, ac_const_uint_vec(ctx, type, 1));
}
2588 
/* Convert a negative-zero input to positive zero; other values pass through
 * unchanged. The signed-zeros mode is enabled around the fadd so LLVM cannot
 * fold "(val + 0) -> val" and keep the negative zero.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2597 
/* Floating-point sign: return -1.0, 0.0, or +1.0 with the same type as src. */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   dw[0] = ctx->i32_0;
   /* Assemble the double from two dwords: 0x3FF00000/0xBFF00000 are the high
    * dwords of the IEEE-754 doubles 1.0 and -1.0; the low dword is 0.
    */
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2637 
/* Population count of src0 (8/16/32/64/128-bit), always returned as i32. */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr;
   LLVMTypeRef type;

   /* Select the llvm.ctpop overload matching the source width. */
   switch (bitsize) {
   case 128:
      intr = "llvm.ctpop.i128";
      type = ctx->i128;
      break;
   case 64:
      intr = "llvm.ctpop.i64";
      type = ctx->i64;
      break;
   case 32:
      intr = "llvm.ctpop.i32";
      type = ctx->i32;
      break;
   case 16:
      intr = "llvm.ctpop.i16";
      type = ctx->i16;
      break;
   case 8:
      intr = "llvm.ctpop.i8";
      type = ctx->i8;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef count =
      ac_build_intrinsic(ctx, intr, type, (LLVMValueRef[]){src0}, 1, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (bitsize > 32)
      count = LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   else if (bitsize < 32)
      count = LLVMBuildZExt(ctx->builder, count, ctx->i32, "");

   return count;
}
2680 
/* Reverse the bits of src0 (8/16/32/64-bit), always returned as i32. */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr;
   LLVMTypeRef type;

   /* Select the llvm.bitreverse overload matching the source width. */
   switch (bitsize) {
   case 64:
      intr = "llvm.bitreverse.i64";
      type = ctx->i64;
      break;
   case 32:
      intr = "llvm.bitreverse.i32";
      type = ctx->i32;
      break;
   case 16:
      intr = "llvm.bitreverse.i16";
      type = ctx->i16;
      break;
   case 8:
      intr = "llvm.bitreverse.i8";
      type = ctx->i8;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef reversed =
      ac_build_intrinsic(ctx, intr, type, (LLVMValueRef[]){src0}, 1, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (bitsize > 32)
      reversed = LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   else if (bitsize < 32)
      reversed = LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");

   return reversed;
}
2718 
/* Operand indices within an export intrinsic call (cf. ac_build_export:
 * args[0] = target, args[1] = enabled channels, args[2..] = channel values).
 */
#define AC_EXP_TARGET           0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0             2

/* Classification of one exported channel value. */
enum ac_ir_type
{
   AC_IR_UNDEF,
   AC_IR_CONST,
   AC_IR_VALUE,
};

/* One channel of an analyzed VS export. */
struct ac_vs_exp_chan {
   LLVMValueRef value;
   float const_float;    /* only meaningful when type == AC_IR_CONST */
   enum ac_ir_type type;
};

/* One PARAM export instruction together with its analyzed channels. */
struct ac_vs_exp_inst {
   unsigned offset;      /* PARAM export offset used to match vs_output_param_offset */
   LLVMValueRef inst;    /* the export intrinsic call instruction */
   struct ac_vs_exp_chan chan[4];
};

/* All PARAM exports collected from a vertex shader. */
struct ac_vs_exports {
   unsigned num;
   struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
2746 
2747 /* Return true if the PARAM export has been eliminated. */
ac_eliminate_const_output(uint8_t * vs_output_param_offset,uint32_t num_outputs,struct ac_vs_exp_inst * exp)2748 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
2749                                       struct ac_vs_exp_inst *exp)
2750 {
2751    unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2752    bool is_zero[4] = {0}, is_one[4] = {0};
2753 
2754    for (i = 0; i < 4; i++) {
2755       /* It's a constant expression. Undef outputs are eliminated too. */
2756       if (exp->chan[i].type == AC_IR_UNDEF) {
2757          is_zero[i] = true;
2758          is_one[i] = true;
2759       } else if (exp->chan[i].type == AC_IR_CONST) {
2760          if (exp->chan[i].const_float == 0)
2761             is_zero[i] = true;
2762          else if (exp->chan[i].const_float == 1)
2763             is_one[i] = true;
2764          else
2765             return false; /* other constant */
2766       } else
2767          return false;
2768    }
2769 
2770    /* Only certain combinations of 0 and 1 can be eliminated. */
2771    if (is_zero[0] && is_zero[1] && is_zero[2])
2772       default_val = is_zero[3] ? 0 : 1;
2773    else if (is_one[0] && is_one[1] && is_one[2])
2774       default_val = is_zero[3] ? 2 : 3;
2775    else
2776       return false;
2777 
2778    /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2779    LLVMInstructionEraseFromParent(exp->inst);
2780 
2781    /* Change OFFSET to DEFAULT_VAL. */
2782    for (i = 0; i < num_outputs; i++) {
2783       if (vs_output_param_offset[i] == exp->offset) {
2784          vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2785          break;
2786       }
2787    }
2788    return true;
2789 }
2790 
/* Try to eliminate a PARAM export that writes the same values as an
 * already-processed export, and redirect its OFFSET to the earlier one.
 *
 * If the earlier export has undef channels where the new one has real
 * values, the real values are copied back into the earlier export and its
 * enabled-channel mask is widened, so the two still describe the same data.
 *
 * Returns true if the export instruction was erased. */
static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
                                           uint8_t *vs_output_param_offset, uint32_t num_outputs,
                                           struct ac_vs_exports *processed,
                                           struct ac_vs_exp_inst *exp)
{
   unsigned p, copy_back_channels = 0;

   /* See if the output is already in the list of processed outputs.
    * The LLVMValueRef comparison relies on SSA.
    */
   for (p = 0; p < processed->num; p++) {
      bool different = false;

      for (unsigned j = 0; j < 4; j++) {
         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
         struct ac_vs_exp_chan *c2 = &exp->chan[j];

         /* Treat undef as a match. */
         if (c2->type == AC_IR_UNDEF)
            continue;

         /* If c1 is undef but c2 isn't, we can copy c2 to c1
          * and consider the instruction duplicated.
          */
         if (c1->type == AC_IR_UNDEF) {
            copy_back_channels |= 1 << j;
            continue;
         }

         /* Test whether the channels are not equal. */
         if (c1->type != c2->type ||
             (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
             (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
            different = true;
            break;
         }
      }
      if (!different)
         break;

      /* Candidate p didn't match; drop any tentative copy-back bits. */
      copy_back_channels = 0;
   }
   if (p == processed->num)
      return false; /* no duplicate found */

   /* If a match was found, but the matching export has undef where the new
    * one has a normal value, copy the normal value to the undef channel.
    */
   struct ac_vs_exp_inst *match = &processed->exp[p];

   /* Get current enabled channels mask. */
   LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
   unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);

   while (copy_back_channels) {
      unsigned chan = u_bit_scan(&copy_back_channels);

      assert(match->chan[chan].type == AC_IR_UNDEF);
      LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
      match->chan[chan] = exp->chan[chan];

      /* Update number of enabled channels because the original mask
       * is not always 0xf.
       */
      enabled_channels |= (1 << chan);
      LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
                     LLVMConstInt(ctx->i32, enabled_channels, 0));
   }

   /* The PARAM export is duplicated. Kill it. */
   LLVMInstructionEraseFromParent(exp->inst);

   /* Change OFFSET to the matching export. */
   for (unsigned i = 0; i < num_outputs; i++) {
      if (vs_output_param_offset[i] == exp->offset) {
         vs_output_param_offset[i] = match->offset;
         break;
      }
   }
   return true;
}
2872 
/* Optimize PARAM exports in the main function:
 *  - eliminate exports of pure 0/1/undef constants (-> DEFAULT_VAL),
 *  - eliminate duplicated exports,
 *  - renumber the survivors so export memory has no holes.
 *
 * vs_output_param_offset[] (num_outputs entries) is rewritten in place.
 * Offsets listed in skip_output_mask are never eliminated.
 * *num_param_exports is updated only if something was removed. */
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
                            uint8_t *vs_output_param_offset, uint32_t num_outputs,
                            uint32_t skip_output_mask, uint8_t *num_param_exports)
{
   LLVMBasicBlockRef bb;
   bool removed_any = false;
   struct ac_vs_exports exports;

   exports.num = 0;

   /* Process all LLVM instructions. */
   bb = LLVMGetFirstBasicBlock(main_fn);
   while (bb) {
      LLVMValueRef inst = LLVMGetFirstInstruction(bb);

      while (inst) {
         /* Advance the iterator first: "cur" may be erased below. */
         LLVMValueRef cur = inst;
         inst = LLVMGetNextInstruction(inst);
         struct ac_vs_exp_inst exp;

         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
            continue;

         LLVMValueRef callee = ac_llvm_get_called_value(cur);

         if (!ac_llvm_is_function(callee))
            continue;

         const char *name = LLVMGetValueName(callee);
         unsigned num_args = LLVMCountParams(callee);

         /* Check if this is an export instruction. */
         if ((num_args != 9 && num_args != 8) ||
             (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
            continue;

         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
         unsigned target = LLVMConstIntGetZExtValue(arg);

         /* Only PARAM exports are optimized; POS/MRT exports are kept. */
         if (target < V_008DFC_SQ_EXP_PARAM)
            continue;

         target -= V_008DFC_SQ_EXP_PARAM;

         /* Parse the instruction. */
         memset(&exp, 0, sizeof(exp));
         exp.offset = target;
         exp.inst = cur;

         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

            exp.chan[i].value = v;

            if (LLVMIsUndef(v)) {
               exp.chan[i].type = AC_IR_UNDEF;
            } else if (LLVMIsAConstantFP(v)) {
               LLVMBool loses_info;
               exp.chan[i].type = AC_IR_CONST;
               exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
            } else {
               exp.chan[i].type = AC_IR_VALUE;
            }
         }

         /* Eliminate constant and duplicated PARAM exports. */
         if (!((1u << target) & skip_output_mask) &&
             (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
              ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
                                             &exp))) {
            removed_any = true;
         } else {
            exports.exp[exports.num++] = exp;
         }
      }
      bb = LLVMGetNextBasicBlock(bb);
   }

   /* Remove holes in export memory due to removed PARAM exports.
    * This is done by renumbering all PARAM exports.
    */
   if (removed_any) {
      uint8_t old_offset[VARYING_SLOT_MAX];
      unsigned out, i;

      /* Make a copy of the offsets. We need the old version while
       * we are modifying some of them. */
      memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));

      for (i = 0; i < exports.num; i++) {
         unsigned offset = exports.exp[i].offset;

         /* Update vs_output_param_offset. Multiple outputs can
          * have the same offset.
          */
         for (out = 0; out < num_outputs; out++) {
            if (old_offset[out] == offset)
               vs_output_param_offset[out] = i;
         }

         /* Change the PARAM offset in the instruction. */
         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
                        LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
      }
      *num_param_exports = exports.num;
   }
}
2980 
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2981 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2982 {
2983    LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2984    ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2985                       AC_FUNC_ATTR_CONVERGENT);
2986 }
2987 
ac_declare_lds_as_pointer(struct ac_llvm_context * ctx)2988 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2989 {
2990    unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
2991    ctx->lds = LLVMBuildIntToPtr(
2992       ctx->builder, ctx->i32_0,
2993       LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2994 }
2995 
/* Load one dword from LDS at the given dword address. */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);

   return LLVMBuildLoad(ctx->builder, ptr, "");
}
3000 
/* Store one dword to LDS at the given dword address (value is bitcast to
 * an integer first). */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, ac_to_integer(ctx, value));
}
3006 
/* GLSL-style findLSB: index of the lowest set bit of src0 as i32,
 * or -1 when src0 is zero. */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin;
   LLVMTypeRef op_type;
   LLVMValueRef zero;

   /* Pick the llvm.cttz overload that matches the source width. */
   switch (bitsize) {
   case 64:
      intrin = "llvm.cttz.i64";
      op_type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin = "llvm.cttz.i32";
      op_type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin = "llvm.cttz.i16";
      op_type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin = "llvm.cttz.i8";
      op_type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
   }

   /* The i1 "true" operand declares cttz(0) undef, so LLVM won't add its
    * own zero check. LLVM's result for 0 also differs from GLSL's
    * ffs(0) == -1, which is why the select below is still required.
    *
    * The hardware already implements the correct behavior.
    */
   LLVMValueRef args[2] = {src0, ctx->i1true};
   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin, op_type, args, 2, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (bitsize == 64)
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   else if (bitsize < 32)
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, "");
   return LLVMBuildSelect(ctx->builder, is_zero, LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
3067 
/* Pointer to elem_type in the constant (read-only) address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
   return ptr_type;
}
3072 
/* Pointer to elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
   return ptr_type;
}
3077 
get_current_flow(struct ac_llvm_context * ctx)3078 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
3079 {
3080    if (ctx->flow->depth > 0)
3081       return &ctx->flow->stack[ctx->flow->depth - 1];
3082    return NULL;
3083 }
3084 
get_innermost_loop(struct ac_llvm_context * ctx)3085 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
3086 {
3087    for (unsigned i = ctx->flow->depth; i > 0; --i) {
3088       if (ctx->flow->stack[i - 1].loop_entry_block)
3089          return &ctx->flow->stack[i - 1];
3090    }
3091    return NULL;
3092 }
3093 
push_flow(struct ac_llvm_context * ctx)3094 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
3095 {
3096    struct ac_llvm_flow *flow;
3097 
3098    if (ctx->flow->depth >= ctx->flow->depth_max) {
3099       unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
3100 
3101       ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
3102       ctx->flow->depth_max = new_max;
3103    }
3104 
3105    flow = &ctx->flow->stack[ctx->flow->depth];
3106    ctx->flow->depth++;
3107 
3108    flow->next_block = NULL;
3109    flow->loop_entry_block = NULL;
3110    return flow;
3111 }
3112 
/* Name a basic block "<base><label_id>", e.g. "loop7". */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
3119 
3120 /* Append a basic block at the level of the parent flow.
3121  */
append_basic_block(struct ac_llvm_context * ctx,const char * name)3122 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
3123 {
3124    assert(ctx->flow->depth >= 1);
3125 
3126    if (ctx->flow->depth >= 2) {
3127       struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
3128 
3129       return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
3130    }
3131 
3132    LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3133    return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3134 }
3135 
3136 /* Emit a branch to the given default target for the current block if
3137  * applicable -- that is, if the current block does not already contain a
3138  * branch from a break or continue.
3139  */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)3140 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
3141 {
3142    if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3143       LLVMBuildBr(builder, target);
3144 }
3145 
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)3146 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3147 {
3148    struct ac_llvm_flow *flow = push_flow(ctx);
3149    flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3150    flow->next_block = append_basic_block(ctx, "ENDLOOP");
3151    set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3152    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3153    LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3154 }
3155 
ac_build_break(struct ac_llvm_context * ctx)3156 void ac_build_break(struct ac_llvm_context *ctx)
3157 {
3158    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3159    LLVMBuildBr(ctx->builder, flow->next_block);
3160 }
3161 
ac_build_continue(struct ac_llvm_context * ctx)3162 void ac_build_continue(struct ac_llvm_context *ctx)
3163 {
3164    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3165    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3166 }
3167 
ac_build_else(struct ac_llvm_context * ctx,int label_id)3168 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3169 {
3170    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3171    LLVMBasicBlockRef endif_block;
3172 
3173    assert(!current_branch->loop_entry_block);
3174 
3175    endif_block = append_basic_block(ctx, "ENDIF");
3176    emit_default_branch(ctx->builder, endif_block);
3177 
3178    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3179    set_basicblock_name(current_branch->next_block, "else", label_id);
3180 
3181    current_branch->next_block = endif_block;
3182 }
3183 
3184 /* Invoked after a branch is exited. */
ac_branch_exited(struct ac_llvm_context * ctx)3185 static void ac_branch_exited(struct ac_llvm_context *ctx)
3186 {
3187    if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
3188       /* The previous conditional branch contained demote. Kill threads
3189        * after all conditional blocks because amdgcn.wqm.vote doesn't
3190        * return usable values inside the blocks.
3191        *
3192        * This is an optional optimization that only kills whole inactive quads.
3193        */
3194       LLVMValueRef cond = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
3195       ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));
3196       ctx->conditional_demote_seen = false;
3197    }
3198 }
3199 
ac_build_endif(struct ac_llvm_context * ctx,int label_id)3200 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3201 {
3202    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3203 
3204    assert(!current_branch->loop_entry_block);
3205 
3206    emit_default_branch(ctx->builder, current_branch->next_block);
3207    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3208    set_basicblock_name(current_branch->next_block, "endif", label_id);
3209 
3210    ctx->flow->depth--;
3211    ac_branch_exited(ctx);
3212 }
3213 
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)3214 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3215 {
3216    struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3217 
3218    assert(current_loop->loop_entry_block);
3219 
3220    emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3221 
3222    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3223    set_basicblock_name(current_loop->next_block, "endloop", label_id);
3224    ctx->flow->depth--;
3225    ac_branch_exited(ctx);
3226 }
3227 
/* Open an if-construct entered when cond (an i1) is true. */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);

   LLVMBasicBlockRef then_block = append_basic_block(ctx, "IF");
   flow->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(then_block, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, then_block, flow->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, then_block);
}
3239 
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3240 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3241 {
3242    LLVMBuilderRef builder = ac->builder;
3243    LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3244    LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3245    LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3246    LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3247    LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3248    LLVMValueRef res;
3249 
3250    if (first_instr) {
3251       LLVMPositionBuilderBefore(first_builder, first_instr);
3252    } else {
3253       LLVMPositionBuilderAtEnd(first_builder, first_block);
3254    }
3255 
3256    res = LLVMBuildAlloca(first_builder, type, name);
3257    LLVMDisposeBuilder(first_builder);
3258    return res;
3259 }
3260 
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3261 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3262 {
3263    LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3264    LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3265    return ptr;
3266 }
3267 
/* Emit an alloca initialized with "val" at the current insert point. */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMValueRef slot = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);

   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
3274 
/* Bitcast a pointer to point at "type", preserving its address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   LLVMTypeRef new_type = LLVMPointerType(type, addr_space);

   return LLVMBuildBitCast(ctx->builder, ptr, new_type, "");
}
3280 
/* Reduce a vector to its first "count" components (or extract a scalar
 * when count == 1). Returns the value unchanged if it already has the
 * requested number of components. */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   if (count == ac_get_llvm_num_components(value))
      return value;

   /* Shuffle indices 0..count-1. At least two slots are allocated because
    * indices 0 and 1 are written unconditionally. */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      indices[i] = LLVMConstInt(ctx->i32, i, false);

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, indices[0], "");

   LLVMValueRef swizzle = LLVMConstVector(indices, count);
   return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
}
3299 
3300 /* If param is i64 and bitwidth <= 32, the return value will be i32. */
ac_unpack_param(struct ac_llvm_context * ctx,LLVMValueRef param,unsigned rshift,unsigned bitwidth)3301 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
3302                              unsigned bitwidth)
3303 {
3304    LLVMValueRef value = param;
3305    if (rshift)
3306       value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
3307 
3308    if (rshift + bitwidth < 32) {
3309       uint64_t mask = (1ull << bitwidth) - 1;
3310       value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
3311    }
3312 
3313    if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
3314       value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
3315    return value;
3316 }
3317 
3318 /* Adjust the sample index according to FMASK.
3319  *
3320  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3321  * which is the identity mapping. Each nibble says which physical sample
3322  * should be fetched to get that sample.
3323  *
3324  * For example, 0x11111100 means there are only 2 samples stored and
3325  * the second sample covers 3/4 of the pixel. When reading samples 0
3326  * and 1, return physical sample 0 (determined by the first two 0s
3327  * in FMASK), otherwise return physical sample 1.
3328  *
3329  * The sample index should be adjusted as follows:
3330  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3331  */
ac_apply_fmask_to_sample(struct ac_llvm_context * ac,LLVMValueRef fmask,LLVMValueRef * addr,bool is_array_tex)3332 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3333                               bool is_array_tex)
3334 {
3335    struct ac_image_args fmask_load = {0};
3336    fmask_load.opcode = ac_image_load;
3337    fmask_load.resource = fmask;
3338    fmask_load.dmask = 0xf;
3339    fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3340    fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3341 
3342    fmask_load.coords[0] = addr[0];
3343    fmask_load.coords[1] = addr[1];
3344    if (is_array_tex)
3345       fmask_load.coords[2] = addr[2];
3346    fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16;
3347 
3348    LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3349    fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3350 
3351    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3352     * resource descriptor is 0 (invalid).
3353     */
3354    LLVMValueRef tmp;
3355    tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3356    tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3357    tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3358    fmask_value =
3359       LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), "");
3360 
3361    /* Apply the formula. */
3362    unsigned sample_chan = is_array_tex ? 3 : 2;
3363    LLVMValueRef final_sample;
3364    final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3365                                LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), "");
3366    final_sample = LLVMBuildLShr(ac->builder, fmask_value,
3367                                 LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), "");
3368    /* Mask the sample index by 0x7, because 0x8 means an unknown value
3369     * with EQAA, so those will map to 0. */
3370    addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3371    if (fmask_load.a16)
3372       addr[sample_chan] = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, "");
3373 }
3374 
/* Emit readlane (lane != NULL) or readfirstlane (lane == NULL) for a value
 * of at most 32 bits, optionally preceded by an optimization barrier. */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   /* The intrinsics only operate on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   const char *intrin = lane ? "llvm.amdgcn.readlane" : "llvm.amdgcn.readfirstlane";
   LLVMValueRef result =
      ac_build_intrinsic(ctx, intrin, ctx->i32, (LLVMValueRef[]){src, lane}, lane ? 2 : 1,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
3395 
/* Read "src" from the given lane (or the first active lane when lane is
 * NULL). Values wider than 32 bits are handled one dword at a time. */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   } else {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");

         comp = _ac_build_readlane(ctx, comp, lane, with_opt_barrier);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }

   /* Pointers must go through inttoptr; everything else is a bitcast. */
   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, result, src_type, "");
   return LLVMBuildBitCast(ctx->builder, result, src_type, "");
}
3427 
3428 /**
3429  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3430  *
3431  * The optimization barrier is not needed if the value is the same in all lanes
3432  * or if this is called in the outermost block.
3433  *
3434  * @param ctx
3435  * @param src
3436  * @param lane - id of the lane or NULL for the first active lane
3437  * @return value of the lane
3438  */
ac_build_readlane_no_opt_barrier(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef lane)3439 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3440                                               LLVMValueRef lane)
3441 {
3442    return ac_build_readlane_common(ctx, src, lane, false);
3443 }
3444 
/* Read "src" from "lane" (or the first active lane when lane is NULL),
 * with an optimization barrier on the source. */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
3449 
/* Write "value" into "lane" of "src" via llvm.amdgcn.writelane. */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef args[] = {value, lane, src};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, args, 3,
                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
3457 
/* Count the bits of "mask" below the current lane and add "add_src".
 * "mask" is i32 on wave32 and i64 on wave64. */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   LLVMValueRef val;

   if (ctx->wave_size == 32) {
      /* Bug fix: ctx->i32_0 used to be passed here instead of add_src,
       * silently dropping the "+ add_src" part on wave32. */
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask, add_src}, 2, AC_FUNC_ATTR_READNONE);
   } else {
      LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");

      /* mbcnt.hi(mask_hi, mbcnt.lo(mask_lo, add_src)) accumulates both
       * halves of the 64-bit mask. */
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask_lo, add_src}, 2, AC_FUNC_ATTR_READNONE);
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
                               (LLVMValueRef[]){mask_hi, val}, 2, AC_FUNC_ATTR_READNONE);
   }

   /* The [0, wave_size) range only holds when nothing is added; with a
    * non-zero add_src the metadata would be incorrect and could miscompile. */
   if (add_src == ctx->i32_0)
      ac_set_range_metadata(ctx, val, 0, ctx->wave_size);

   return val;
}
3477 
/* Count the bits of "mask" below the current lane (no addend). */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
3482 
/* DPP_CTRL encodings for llvm.amdgcn.update.dpp. Entries with a leading
 * underscore are base values that take an immediate operand via the helpers
 * below (dpp_quad_perm, dpp_row_sl, dpp_row_sr). */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000, /* + 2-bit selector per lane of a quad */
   _dpp_row_sl = 0x100,    /* + shift amount 1..15 */
   _dpp_row_sr = 0x110,    /* + shift amount 1..15 */
   _dpp_row_rr = 0x120,
   dpp_wf_sl1 = 0x130,
   dpp_wf_rl1 = 0x134,
   dpp_wf_sr1 = 0x138,
   dpp_wf_rr1 = 0x13C,
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143
};
3498 
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)3499 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3500                                           unsigned lane3)
3501 {
3502    assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3503    return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3504 }
3505 
dpp_row_sl(unsigned amount)3506 static inline enum dpp_ctrl dpp_row_sl(unsigned amount)
3507 {
3508    assert(amount > 0 && amount < 16);
3509    return _dpp_row_sl | amount;
3510 }
3511 
dpp_row_sr(unsigned amount)3512 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3513 {
3514    assert(amount > 0 && amount < 16);
3515    return _dpp_row_sr | amount;
3516 }
3517 
/* Emit a single-dword DPP move via llvm.amdgcn.update.dpp.i32.
 *
 * The operands are widened to i32 around the intrinsic and the result is
 * narrowed back to the original (integer) type of \p src.  \p old supplies
 * the value kept when bound_ctrl/masks disable a lane.
 */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      old,
      src,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef moved = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6,
                                           AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, moved, orig_type, "");
}
3537 
/* Emit a DPP move of a value of arbitrary width.
 *
 * The value is converted to its integer form first; values wider than 32
 * bits are split into i32 components, each component gets its own
 * update.dpp intrinsic, and the result is reassembled.  The result is
 * bitcast back to the original type of \p src.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      /* Apply the DPP op one dword at a time. */
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp =
            _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3566 
/* Emit a single-dword permlane16/permlanex16.
 *
 * \p src is widened to i32, the 64-bit lane-select \p sel is split into two
 * i32 immediates, and the result is narrowed back to the original type.
 * \p exchange_rows selects permlanex16 (cross-row) over permlane16.
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   const char *intr_name =
      exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16";

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6];
   args[0] = src;                                      /* old */
   args[1] = src;                                      /* value to permute */
   args[2] = LLVMConstInt(ctx->i32, sel, false);       /* select, low dword */
   args[3] = LLVMConstInt(ctx->i32, sel >> 32, false); /* select, high dword */
   args[4] = ctx->i1true;                              /* fi */
   args[5] = bound_ctrl ? ctx->i1true : ctx->i1false;

   LLVMValueRef permuted =
      ac_build_intrinsic(ctx, intr_name, ctx->i32, args, 6,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, permuted, orig_type, "");
}
3590 
/* Emit permlane16/permlanex16 for a value of arbitrary width.
 *
 * Like ac_build_dpp: the value is converted to integer form, split into i32
 * components if wider than 32 bits, permuted per component, and bitcast back
 * to the original type.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      /* Permute one dword at a time. */
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3614 
/* Pack the three 5-bit ds_swizzle bit-mode masks into one pattern word:
 * bits [4:0] AND mask, [9:5] OR mask, [14:10] XOR mask.
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   return (xor_mask << 10) | (or_mask << 5) | and_mask;
}
3620 
/* Emit a single-dword ds_swizzle with the given offset \p mask.
 *
 * \p src is widened to i32 around the intrinsic and the result is narrowed
 * back to its original (integer) type.
 */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {src, LLVMConstInt(ctx->i32, mask, 0)};
   LLVMValueRef swizzled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, swizzled, orig_type, "");
}
3635 
/* Emit ds_swizzle for a value of arbitrary width.
 *
 * Like ac_build_dpp: the value is converted to integer form, split into i32
 * components if wider than 32 bits, swizzled per component, and bitcast back
 * to the original type of \p src.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      /* Swizzle one dword at a time. */
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_ds_swizzle(ctx, src, mask);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3658 
/* Wrap \p src in the llvm.amdgcn.wwm intrinsic (whole-wave-mode marker).
 *
 * The value is converted to integer form; values narrower than 32 bits are
 * zero-extended to i32 around the intrinsic and truncated back afterwards.
 * The result is bitcast back to the original type of \p src.
 */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   char name[32], type[8];
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);

   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Pick the intrinsic overload matching the (possibly widened) type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
   ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
                            AC_FUNC_ATTR_READNONE);

   /* Truncate to the integer type of src_type (trunc requires int types). */
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3681 
/* Emit llvm.amdgcn.set.inactive: active lanes keep \p src, inactive lanes
 * are set to \p inactive (typically the reduction identity).
 *
 * Both values are converted to integer form; values narrower than 32 bits
 * are zero-extended to i32 around the intrinsic.  The result stays in
 * integer form — callers bitcast it to the type they need.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* Pick the intrinsic overload matching the (possibly widened) type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   if (bitsize < 32) {
      /* Fix: truncate to the *integer* type of src_type, as ac_build_wwm
       * does.  LLVM's trunc only accepts integer operands/results, so
       * truncating straight to src_type would be invalid IR for a 16-bit
       * float source; callers bitcast the result back anyway.
       */
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");
   }

   return ret;
}
3706 
/* Return the identity element for the given reduction op as an LLVM constant
 * of the requested size: the value e such that op(e, x) == x for every x.
 *
 * \param op         nir reduction opcode (iadd/fadd/imul/fmul/min/max/and/or/xor)
 * \param type_size  element size in bytes; 0 selects i1 (boolean reductions)
 */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   if (type_size == 0) {
      /* Boolean (i1) reductions. */
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return LLVMConstInt(ctx->i1, 0, 0);
      case nir_op_iand:
         return LLVMConstInt(ctx->i1, 1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      /* 8-bit integer reductions. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0); /* min identity = max value */
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0); /* max identity = min value */
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0); /* all bits set */
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      /* 16-bit integer/float reductions. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY); /* fmin identity = +inf */
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY); /* fmax identity = -inf */
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      /* 32-bit integer/float reductions. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3839 
/* Emit the combining step of a reduction: apply the binary operation
 * corresponding to \p op to \p lhs and \p rhs.
 *
 * Integer min/max are lowered to icmp+select; fmin/fmax use
 * llvm.minnum/llvm.maxnum with the overload picked from the operand size
 * (f64 / f32 / f16).
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}
3886 
3887 /**
3888  * \param src The value to shift.
3889  * \param identity The value to use the first lane.
3890  * \param maxprefix specifies that the result only needs to be correct for a
3891  *     prefix of this many threads
3892  * \return src, shifted 1 lane up, and identity shifted into lane 0.
3893  */
ac_wavefront_shift_right_1(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix)3894 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3895                                                LLVMValueRef identity, unsigned maxprefix)
3896 {
3897    if (ctx->chip_class >= GFX10) {
3898       /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3899       LLVMValueRef active, tmp1, tmp2;
3900       LLVMValueRef tid = ac_get_thread_id(ctx);
3901 
3902       tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3903 
3904       tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3905 
3906       if (maxprefix > 32) {
3907          active =
3908             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3909 
3910          tmp2 = LLVMBuildSelect(ctx->builder, active,
3911                                 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3912                                 tmp2, "");
3913 
3914          active = LLVMBuildOr(
3915             ctx->builder, active,
3916             LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3917                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3918                           LLVMConstInt(ctx->i32, 0x10, false), ""),
3919             "");
3920          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3921       } else if (maxprefix > 16) {
3922          active =
3923             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3924 
3925          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3926       }
3927    } else if (ctx->chip_class >= GFX8) {
3928       return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3929    }
3930 
3931    /* wavefront shift_right by 1 on SI/CI */
3932    LLVMValueRef active, tmp1, tmp2;
3933    LLVMValueRef tid = ac_get_thread_id(ctx);
3934    tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3935    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3936    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3937                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3938                           LLVMConstInt(ctx->i32, 0x4, 0), "");
3939    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3940    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3941    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3942                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3943                           LLVMConstInt(ctx->i32, 0x8, 0), "");
3944    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3945    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3946    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3947                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3948                           LLVMConstInt(ctx->i32, 0x10, 0), "");
3949    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3950    tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3951    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3952    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3953    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
3954    return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3955 }
3956 
3957 /**
3958  * \param maxprefix specifies that the result only needs to be correct for a
3959  *     prefix of this many threads
3960  */
ac_build_scan(struct ac_llvm_context * ctx,nir_op op,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix,bool inclusive)3961 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3962                                   LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3963 {
3964    LLVMValueRef result, tmp;
3965 
3966    if (!inclusive)
3967       src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3968 
3969    result = src;
3970 
3971    if (ctx->chip_class <= GFX7) {
3972       assert(maxprefix == 64);
3973       LLVMValueRef tid = ac_get_thread_id(ctx);
3974       LLVMValueRef active;
3975       tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3976       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3977                              LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3978       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3979       result = ac_build_alu_op(ctx, result, tmp, op);
3980       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3981       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3982                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3983                              ctx->i32_0, "");
3984       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3985       result = ac_build_alu_op(ctx, result, tmp, op);
3986       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3987       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3988                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3989                              ctx->i32_0, "");
3990       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3991       result = ac_build_alu_op(ctx, result, tmp, op);
3992       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3993       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3994                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3995                              ctx->i32_0, "");
3996       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3997       result = ac_build_alu_op(ctx, result, tmp, op);
3998       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3999       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
4000                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
4001                              ctx->i32_0, "");
4002       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4003       result = ac_build_alu_op(ctx, result, tmp, op);
4004       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
4005       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
4006                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
4007                              ctx->i32_0, "");
4008       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4009       result = ac_build_alu_op(ctx, result, tmp, op);
4010       return result;
4011    }
4012 
4013    if (maxprefix <= 1)
4014       return result;
4015    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4016    result = ac_build_alu_op(ctx, result, tmp, op);
4017    if (maxprefix <= 2)
4018       return result;
4019    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4020    result = ac_build_alu_op(ctx, result, tmp, op);
4021    if (maxprefix <= 3)
4022       return result;
4023    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4024    result = ac_build_alu_op(ctx, result, tmp, op);
4025    if (maxprefix <= 4)
4026       return result;
4027    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4028    result = ac_build_alu_op(ctx, result, tmp, op);
4029    if (maxprefix <= 8)
4030       return result;
4031    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4032    result = ac_build_alu_op(ctx, result, tmp, op);
4033    if (maxprefix <= 16)
4034       return result;
4035 
4036    if (ctx->chip_class >= GFX10) {
4037       LLVMValueRef tid = ac_get_thread_id(ctx);
4038       LLVMValueRef active;
4039 
4040       tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
4041 
4042       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
4043                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
4044                              ctx->i32_0, "");
4045 
4046       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4047 
4048       result = ac_build_alu_op(ctx, result, tmp, op);
4049 
4050       if (maxprefix <= 32)
4051          return result;
4052 
4053       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4054 
4055       active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
4056 
4057       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
4058 
4059       result = ac_build_alu_op(ctx, result, tmp, op);
4060       return result;
4061    }
4062 
4063    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4064    result = ac_build_alu_op(ctx, result, tmp, op);
4065    if (maxprefix <= 32)
4066       return result;
4067    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4068    result = ac_build_alu_op(ctx, result, tmp, op);
4069    return result;
4070 }
4071 
/* Emit a wave-wide inclusive scan of \p src with reduction op \p op. */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMValueRef result;

   /* Fast path for counting booleans: ballot + mbcnt gives the exclusive
    * count, and adding this lane's own bit makes it inclusive.
    */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      src = LLVMBuildZExt(builder, src, ctx->i32, "");
      result = ac_build_ballot(ctx, src);
      result = ac_build_mbcnt(ctx, result);
      result = LLVMBuildAdd(builder, result, src, "");
      return result;
   }

   /* Keep the optimizer from sinking the scan input across the WWM region. */
   ac_build_optimization_barrier(ctx, &src, false);

   /* Inactive lanes must hold the identity so they don't corrupt the scan;
    * the whole computation runs in WWM (see ac_build_wwm).
    */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, result);
}
4094 
/* Emit a wave-wide exclusive scan of \p src with reduction op \p op. */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMValueRef result;

   /* Fast path for counting booleans: ballot + mbcnt directly yields the
    * number of set bits in lower lanes, i.e. the exclusive sum.
    */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      src = LLVMBuildZExt(builder, src, ctx->i32, "");
      result = ac_build_ballot(ctx, src);
      result = ac_build_mbcnt(ctx, result);
      return result;
   }

   /* Keep the optimizer from sinking the scan input across the WWM region. */
   ac_build_optimization_barrier(ctx, &src, false);

   /* Inactive lanes must hold the identity; the scan runs in WWM and
    * inclusive=false shifts src up one lane first.
    */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, result);
}
4116 
/* Emit a subgroup reduction of \p src over clusters of \p cluster_size lanes
 * (a power of two up to the wave size).  Every lane of a cluster receives the
 * cluster's combined value.
 *
 * Implemented butterfly-style: swap with a partner at doubling distance
 * (quad swizzles, then DPP mirrors / ds_swizzle XOR patterns, then
 * cross-half readlanes) and combine after each step.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   /* Inactive lanes must hold the identity so they don't corrupt the result;
    * everything below runs in WWM (see ac_build_wwm).
    */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Distance 1: swap adjacent lanes within each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Distance 2: swap lane pairs within each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Distance 4. */
   if (ctx->chip_class >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Distance 8. */
   if (ctx->chip_class >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Distance 16: GFX10 uses permlanex16 since row_bcast DPP is gone. */
   if (ctx->chip_class >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->chip_class >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->chip_class >= GFX8) {
      /* Final cross-half step for a full wave64 reduction; the combined
       * value is then broadcast from lane 63 to all lanes.
       */
      if (ctx->wave_size == 64) {
         if (ctx->chip_class >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* SI/CI: combine the two 32-lane halves via readlanes. */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
4181 
4182 /**
4183  * "Top half" of a scan that reduces per-wave values across an entire
4184  * workgroup.
4185  *
4186  * The source value must be present in the highest lane of the wave, and the
4187  * highest lane must be live.
4188  */
void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Single-wave workgroups need no cross-wave scratch traffic; the bottom
    * half handles that case entirely on its own. */
   if (ws->maxwaves <= 1)
      return;

   const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMValueRef tmp;

   /* Only the highest lane of the wave stores its value into this wave's
    * scratch slot (ws->scratch[ws->waveidx]); per the function contract
    * that lane must be live and already hold the per-wave value. */
   tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
   ac_build_ifcc(ctx, tmp, 1000);
   LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
   ac_build_endif(ctx, 1000);
}
4204 
4205 /**
4206  * "Bottom half" of a scan that reduces per-wave values across an entire
4207  * workgroup.
4208  *
4209  * The caller must place a barrier between the top and bottom halves.
4210  */
void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   const LLVMTypeRef type = LLVMTypeOf(ws->src);
   const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));

   /* Single-wave workgroup: the wave's own value is both the reduction and
    * the inclusive result; the exclusive result is the identity. */
   if (ws->maxwaves <= 1) {
      ws->result_reduce = ws->src;
      ws->result_inclusive = ws->src;
      ws->result_exclusive = identity;
      return;
   }
   assert(ws->maxwaves <= 32);

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMBasicBlockRef bbs[2];
   LLVMValueRef phivalues_scan[2];
   LLVMValueRef tmp, tmp2;

   /* The incoming block contributes an undef value to the phi for lanes
    * that skip the conditional scan below. */
   bbs[0] = LLVMGetInsertBlock(builder);
   phivalues_scan[0] = LLVMGetUndef(type);

   /* Pick which lanes take part in the in-wave scan over the per-wave
    * values: all valid wave slots for a reduction, slots up to and
    * including this wave for an inclusive scan, strictly below this wave
    * for an exclusive one. */
   if (ws->enable_reduce)
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
   else if (ws->enable_inclusive)
      tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
   else
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
   ac_build_ifcc(ctx, tmp, 1001);
   {
      /* Load this lane's per-wave value (written by the top half) and scan
       * it across the wave. */
      tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");

      ac_build_optimization_barrier(ctx, &tmp, false);

      bbs[1] = LLVMGetInsertBlock(builder);
      phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
   }
   ac_build_endif(ctx, 1001);

   const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);

   /* Broadcast the interesting scan lanes to every thread via readlane. */
   if (ws->enable_reduce) {
      /* Reduction = inclusive scan value held by the last valid wave. */
      tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
      ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
   }
   if (ws->enable_inclusive)
      ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
   if (ws->enable_exclusive) {
      /* Exclusive = inclusive value of the previous wave, or the identity
       * for wave 0 (the readlane of lane -1 is discarded by the select). */
      tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
      tmp = ac_build_readlane(ctx, scan, tmp);
      tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
      ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
   }
}
4265 
4266 /**
4267  * Inclusive scan of a per-wave value across an entire workgroup.
4268  *
4269  * This implies an s_barrier instruction.
4270  *
4271  * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4272  * of the workgroup are live. (This requirement cannot easily be relaxed in a
4273  * useful manner because of the barrier in the algorithm.)
4274  */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Top half publishes each wave's value to scratch, the barrier makes
    * those stores visible workgroup-wide, and the bottom half scans across
    * the per-wave values. */
   ac_build_wg_wavescan_top(ctx, ws);
   ac_build_s_barrier(ctx);
   ac_build_wg_wavescan_bottom(ctx, ws);
}
4281 
4282 /**
4283  * "Top half" of a scan that reduces per-thread values across an entire
4284  * workgroup.
4285  *
4286  * All lanes must be active when this code runs.
4287  */
void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   if (ws->enable_exclusive) {
      /* Save the per-thread exclusive scan in ws->extra, then rebuild the
       * inclusive value in ws->src by folding the source back in — the top
       * half needs the wave total (inclusive value of the last lane). */
      ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
      /* Widen an i1 source for iadd to i32 before combining — presumably
       * the exclusive scan of i1 iadd already yields i32; verify against
       * ac_build_exclusive_scan. */
      if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
      ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
   } else {
      ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
   }

   /* Across waves only the exclusive result is needed (the per-thread
    * parts are merged back in the bottom half), so temporarily fold an
    * inclusive request into the exclusive flag. Restored below so the
    * caller-visible flags are unchanged. */
   bool enable_inclusive = ws->enable_inclusive;
   bool enable_exclusive = ws->enable_exclusive;
   ws->enable_inclusive = false;
   ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
   ac_build_wg_wavescan_top(ctx, ws);
   ws->enable_inclusive = enable_inclusive;
   ws->enable_exclusive = enable_exclusive;
}
4307 
4308 /**
4309  * "Bottom half" of a scan that reduces per-thread values across an entire
4310  * workgroup.
4311  *
4312  * The caller must place a barrier between the top and bottom halves.
4313  */
ac_build_wg_scan_bottom(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4314 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4315 {
4316    bool enable_inclusive = ws->enable_inclusive;
4317    bool enable_exclusive = ws->enable_exclusive;
4318    ws->enable_inclusive = false;
4319    ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4320    ac_build_wg_wavescan_bottom(ctx, ws);
4321    ws->enable_inclusive = enable_inclusive;
4322    ws->enable_exclusive = enable_exclusive;
4323 
4324    /* ws->result_reduce is already the correct value */
4325    if (ws->enable_inclusive)
4326       ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4327    if (ws->enable_exclusive)
4328       ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4329 }
4330 
4331 /**
4332  * A scan that reduces per-thread values across an entire workgroup.
4333  *
4334  * The caller must ensure that all lanes are active when this code runs
4335  * (WWM is insufficient!), because there is an implied barrier.
4336  */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Top half: in-wave scan + publish per-wave totals to scratch.
    * Barrier: make the scratch stores visible to all waves.
    * Bottom half: combine across waves into the final results. */
   ac_build_wg_scan_top(ctx, ws);
   ac_build_s_barrier(ctx);
   ac_build_wg_scan_bottom(ctx, ws);
}
4343 
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   /* Replicate values within each quad according to the four lane selectors. */
   const unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   if (ctx->chip_class < GFX8) {
      /* No DPP before GFX8: bit 15 selects ds_swizzle's quad-perm mode. */
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);
   }
   return ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false);
}
4354 
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   /* Cross-lane read via ds_bpermute: lane i receives src from lane index[i].
    * The result is truncated back to the source's original type. */
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);

   /* ds_bpermute addresses lanes in bytes, so scale the lane index by 4. */
   LLVMValueRef byte_index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef value = LLVMBuildZExt(builder, src, ctx->i32, "");

   LLVMValueRef call_args[] = {byte_index, value};
   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, call_args, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   return LLVMBuildTrunc(builder, shuffled, src_type, "");
}
4368 
/* Extract the exponent part of a float (frexp semantics) via the amdgcn
 * frexp.exp intrinsic. bitsize selects the source float width (16/32/64);
 * the result is i16 for f16 sources and i32 otherwise. */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const-correctness fix: these point at string literals, which must not
    * be modifiable through the pointer. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      /* Catch bogus bit widths instead of silently using the f64 intrinsic. */
      assert(bitsize == 64);
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
/* Extract the mantissa part of a float (frexp semantics) via the amdgcn
 * frexp.mant intrinsic. bitsize selects the float width (16/32/64); the
 * result has the same float type as the source. */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const-correctness fix: points at string literals. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      /* Catch bogus bit widths instead of silently using the f64 intrinsic. */
      assert(bitsize == 64);
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4411 
/* Canonicalize a float value (flush/normalize per the platform's FP rules)
 * via llvm.canonicalize. bitsize selects the float width (16/32/64). */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const-correctness fix: points at string literals. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      /* Catch bogus bit widths instead of silently using the f64 intrinsic. */
      assert(bitsize == 64);
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4433 
4434 /*
4435  * this takes an I,J coordinate pair,
4436  * and works out the X and Y derivatives.
4437  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4438  */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   /* Split the I,J interpolation pair and compute the screen-space
    * derivatives of each component; the gathered result is
    * DDX(I), DDX(J), DDY(I), DDY(J). */
   LLVMValueRef derivs[4];

   for (unsigned comp = 0; comp < 2; ++comp) {
      LLVMValueRef coord = LLVMBuildExtractElement(ctx->builder, interp_ij,
                                                   LLVMConstInt(ctx->i32, comp, false), "");
      derivs[comp] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, coord);
      derivs[comp + 2] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, coord);
   }
   return ac_build_gather_values(ctx, derivs, 4);
}
4451 
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)4452 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4453 {
4454    LLVMValueRef result;
4455 
4456    if (LLVM_VERSION_MAJOR >= 13) {
4457       result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,
4458                                   AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
4459    } else {
4460       result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,
4461                                   AC_FUNC_ATTR_READNONE);
4462    }
4463    return LLVMBuildNot(ctx->builder, result, "");
4464 }
4465 
LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
{
   /* Without postponed kills, helper-ness is simply the inverted live mask. */
   if (!ctx->postponed_kill)
      return ac_build_load_helper_invocation(ctx);

   /* postponed_kill should be NULL on LLVM 13+ */
   assert(LLVM_VERSION_MAJOR < 13);

   /* A lane counts as a helper unless it is both exact-live and its
    * postponed-kill flag is still set: !(exact && postponed) */
   LLVMValueRef exact =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);

   LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
   return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
}
4481 
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   /* Emit a call and copy the callee's calling convention onto the call
    * instruction so the two agree. */
   LLVMValueRef call = LLVMBuildCall(ctx->builder, func, args, num_args, "");
   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));
   return call;
}
4489 
/* Fill *args with an MRTZ export of depth/stencil/samplemask. Any of the
 * three values may be NULL; at least one must be provided. The export
 * layout depends on the SPI shader Z format chosen for the combination. */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, struct ac_export_args *args)
{
   unsigned mask = 0;
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   args->valid_mask = 1; /* whether the EXEC mask is valid */
   args->done = 1;       /* DONE bit */

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0;                       /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   /* Compressed 16-bit export: stencil and samplemask are packed into the
    * first two channels; depth cannot be exported in this format. */
   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!depth);
      args->compr = 1; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         mask |= 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= 0xc;
      }
   } else {
      /* Uncompressed export: one value per channel. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
4551 
4552 /* Send GS Alloc Req message from the first wave of the group to SPI.
4553  * Message payload is:
4554  * - bits 0..10: vertices in group
4555  * - bits 12..22: primitives in group
4556  */
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp;
   bool export_dummy_prim = false;

   /* HW workaround for a GPU hang with 100% culling.
    * We always have to export at least 1 primitive.
    * Export a degenerate triangle using vertex 0 for all 3 vertices.
    *
    * NOTE: the check only fires when prim_cnt is the literal constant
    * ctx->i32_0 (pointer comparison of LLVM values), i.e. when the caller
    * statically culls everything. */
   if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
      assert(vtx_cnt == ctx->i32_0);
      prim_cnt = ctx->i32_1;
      vtx_cnt = ctx->i32_1;
      export_dummy_prim = true;
   }

   /* Only wave 0 of the group sends the message. */
   ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);

   /* Payload: vertices in bits 0..10, primitives in bits 12..22. */
   tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
   tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);

   if (export_dummy_prim) {
      struct ac_ngg_prim prim = {0};
      /* The vertex indices are 0,0,0. */
      prim.passthrough = ctx->i32_0;

      struct ac_export_args pos = {0};
      /* The hw culls primitives with NaN. */
      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
      pos.target = V_008DFC_SQ_EXP_POS;
      pos.enabled_channels = 0xf;
      pos.done = true;

      /* A single thread (lane 0 of wave 0) exports the dummy primitive. */
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
                    5021);
      ac_build_export_prim(ctx, &prim);
      ac_build_export(ctx, &pos);
      ac_build_endif(ctx, 5021);
   }

   ac_build_endif(ctx, 5020);
}
4602 
4603 
ac_pack_edgeflags_for_export(struct ac_llvm_context * ctx,const struct ac_shader_args * args)4604 LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
4605                                           const struct ac_shader_args *args)
4606 {
4607    /* Use the following trick to extract the edge flags:
4608     *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
4609     *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
4610     *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
4611     */
4612    LLVMValueRef tmp = LLVMBuildAnd(ctx->builder,
4613                                    ac_get_arg(ctx, args->gs_invocation_id),
4614                                    LLVMConstInt(ctx->i32, 0x700, 0), "");
4615    tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), "");
4616    return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), "");
4617 }
4618 
ac_pack_prim_export(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4619 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4620 {
4621    /* The prim export format is:
4622     *  - bits 0..8: index 0
4623     *  - bit 9: edge flag 0
4624     *  - bits 10..18: index 1
4625     *  - bit 19: edge flag 1
4626     *  - bits 20..28: index 2
4627     *  - bit 29: edge flag 2
4628     *  - bit 31: null primitive (skip)
4629     */
4630    LLVMBuilderRef builder = ctx->builder;
4631    LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4632    LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4633    result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, "");
4634 
4635    for (unsigned i = 0; i < prim->num_vertices; ++i) {
4636       tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4637       result = LLVMBuildOr(builder, result, tmp, "");
4638    }
4639    return result;
4640 }
4641 
ac_build_export_prim(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4642 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4643 {
4644    struct ac_export_args args;
4645 
4646    if (prim->passthrough) {
4647       args.out[0] = prim->passthrough;
4648    } else {
4649       args.out[0] = ac_pack_prim_export(ctx, prim);
4650    }
4651 
4652    args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4653    args.out[1] = LLVMGetUndef(ctx->f32);
4654    args.out[2] = LLVMGetUndef(ctx->f32);
4655    args.out[3] = LLVMGetUndef(ctx->f32);
4656 
4657    args.target = V_008DFC_SQ_EXP_PRIM;
4658    args.enabled_channels = 1;
4659    args.done = true;
4660    args.valid_mask = false;
4661    args.compr = false;
4662 
4663    ac_build_export(ctx, &args);
4664 }
4665 
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)4666 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4667 {
4668    if (type == AC_ARG_FLOAT) {
4669       return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4670    } else if (type == AC_ARG_INT) {
4671       return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4672    } else {
4673       LLVMTypeRef ptr_type;
4674       switch (type) {
4675       case AC_ARG_CONST_PTR:
4676          ptr_type = ctx->i8;
4677          break;
4678       case AC_ARG_CONST_FLOAT_PTR:
4679          ptr_type = ctx->f32;
4680          break;
4681       case AC_ARG_CONST_PTR_PTR:
4682          ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4683          break;
4684       case AC_ARG_CONST_DESC_PTR:
4685          ptr_type = ctx->v4i32;
4686          break;
4687       case AC_ARG_CONST_IMAGE_PTR:
4688          ptr_type = ctx->v8i32;
4689          break;
4690       default:
4691          unreachable("unknown arg type");
4692       }
4693       if (size == 1) {
4694          return ac_array_in_const32_addr_space(ptr_type);
4695       } else {
4696          assert(size == 2);
4697          return ac_array_in_const_addr_space(ptr_type);
4698       }
4699    }
4700 }
4701 
/* Create the shader entry-point function: build its type from the ABI
 * argument list, set the calling convention and per-argument attributes,
 * append a "main_body" block, and position ctx->builder there for the
 * IR emission that follows. Returns the new function. */
LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef arg_types[AC_MAX_ARGS];

   for (unsigned i = 0; i < args->arg_count; i++) {
      arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   for (unsigned i = 0; i < args->arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      /* Only SGPR arguments get attributes; VGPR arguments are left as-is. */
      if (args->args[i].file != AC_ARG_SGPR)
         continue;

      /* Attribute index is 1-based (0 is the return value). */
      ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         /* Pointer SGPR args (descriptors etc.) never alias and are always
          * dereferenceable and 4-byte aligned. */
         ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 4);
      }
   }

   ctx->main_function = main_function;

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");
   return main_function;
}
4744 
ac_build_s_endpgm(struct ac_llvm_context * ctx)4745 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4746 {
4747    LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4748    LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4749    LLVMBuildCall(ctx->builder, code, NULL, 0, "");
4750 }
4751 
4752 /**
4753  * Convert triangle strip indices to triangle indices. This is used to decompose
4754  * triangle strips into triangles.
4755  */
void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
                                                 LLVMValueRef flatshade_first,
                                                 LLVMValueRef index[3])
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef out[3];

   /* We need to change the vertex order for odd triangles to get correct
    * front/back facing by swapping 2 vertex indices, but we also have to
    * keep the provoking vertex in the same place.
    *
    * If the first vertex is provoking, swap index 1 and 2.
    * If the last vertex is provoking, swap index 0 and 1.
    *
    * Each output is selected with nested selects: outer select on
    * flatshade_first (which vertex is provoking), inner select on is_odd
    * (whether this strip triangle has reversed winding). */
   out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
                            LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
   out[1] = LLVMBuildSelect(builder, flatshade_first,
                            LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
                            LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
   out[2] = LLVMBuildSelect(builder, flatshade_first,
                            LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
   /* Write the remapped indices back in place. */
   memcpy(index, out, sizeof(out));
}
4779