/*
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ac_nir_to_llvm.h"
#include "ac_gpu_info.h"
#include "ac_binary.h"
#include "ac_llvm_build.h"
#include "ac_llvm_util.h"
#include "ac_shader_abi.h"
#include "ac_shader_util.h"
#include "nir/nir.h"
#include "nir/nir_deref.h"
#include "sid.h"
#include "util/bitscan.h"
#include "util/u_math.h"
#include <llvm/Config/llvm-config.h>

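/* Per-shader state used while translating a NIR shader into LLVM IR. */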
struct ac_nir_context {
   struct ac_llvm_context ac;
   struct ac_shader_abi *abi;
   const struct ac_shader_args *args;

   gl_shader_stage stage;
   shader_info *info;

   LLVMValueRef *ssa_defs;

   LLVMValueRef scratch;
   LLVMValueRef constant_data;

   struct hash_table *defs;
   struct hash_table *phis;
   struct hash_table *vars;
   struct hash_table *verified_interp;

   LLVMValueRef main_function;
   LLVMBasicBlockRef continue_block;
   LLVMBasicBlockRef break_block;

   LLVMValueRef vertex_id_replaced;
   LLVMValueRef instance_id_replaced;
   LLVMValueRef tes_u_replaced;
   LLVMValueRef tes_v_replaced;
   LLVMValueRef tes_rel_patch_id_replaced;
   LLVMValueRef tes_patch_id_replaced;
};

static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
                                           const nir_instr *instr, bool image);

static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
                                     enum ac_descriptor_type desc_type, const nir_instr *instr,
                                     LLVMValueRef index, bool image, bool write);

static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, const nir_ssa_def *def)
{
   LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size);
   if (def->num_components > 1) {
      type = LLVMVectorType(type, def->num_components);
   }
   return type;
}

static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src)
{
   assert(src.is_ssa);
   return nir->ssa_defs[src.ssa->index];
}

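/* Return an LDS pointer with the given element bit size, addressing the byte
 * offset in src plus the constant offset c_off.
 */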
static LLVMValueRef get_memory_ptr(struct ac_nir_context *ctx, nir_src src, unsigned bit_size, unsigned c_off)
{
   LLVMValueRef ptr = get_src(ctx, src);
   LLVMValueRef lds_i8 = ctx->ac.lds;
   if (ctx->stage != MESA_SHADER_COMPUTE)
      lds_i8 = LLVMBuildBitCast(ctx->ac.builder, ctx->ac.lds, LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS), "");

   ptr = LLVMBuildAdd(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i32, c_off, 0), "");
   ptr = LLVMBuildGEP(ctx->ac.builder, lds_i8, &ptr, 1, "");
   int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));

   LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, bit_size);

   return LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(type, addr_space), "");
}

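/* Look up the LLVM basic block that was created for a NIR block. */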
static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, const struct nir_block *b)
{
   struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b);
   return (LLVMBasicBlockRef)entry->data;
}

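/* Fetch an ALU source and apply its swizzle, extracting, splatting or
 * shuffling components as needed to produce num_components components.
 */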
static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, nir_alu_src src,
                                unsigned num_components)
{
   LLVMValueRef value = get_src(ctx, src.src);
   bool need_swizzle = false;

   assert(value);
   unsigned src_components = ac_get_llvm_num_components(value);
   for (unsigned i = 0; i < num_components; ++i) {
      assert(src.swizzle[i] < src_components);
      if (src.swizzle[i] != i)
         need_swizzle = true;
   }

   if (need_swizzle || num_components != src_components) {
      LLVMValueRef masks[] = {LLVMConstInt(ctx->ac.i32, src.swizzle[0], false),
                              LLVMConstInt(ctx->ac.i32, src.swizzle[1], false),
                              LLVMConstInt(ctx->ac.i32, src.swizzle[2], false),
                              LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)};

      if (src_components > 1 && num_components == 1) {
         value = LLVMBuildExtractElement(ctx->ac.builder, value, masks[0], "");
      } else if (src_components == 1 && num_components > 1) {
         LLVMValueRef values[] = {value, value, value, value};
         value = ac_build_gather_values(&ctx->ac, values, num_components);
      } else {
         LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
         value = LLVMBuildShuffleVector(ctx->ac.builder, value, value, swizzle, "");
      }
   }
   assert(!src.negate);
   assert(!src.abs);
   return value;
}

static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, LLVMIntPredicate pred,
                                 LLVMValueRef src0, LLVMValueRef src1)
{
   src0 = ac_to_integer(ctx, src0);
   src1 = ac_to_integer(ctx, src1);
   return LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
}

static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, LLVMRealPredicate pred,
                                   LLVMValueRef src0, LLVMValueRef src1)
{
   src0 = ac_to_float(ctx, src0);
   src1 = ac_to_float(ctx, src1);
   return LLVMBuildFCmp(ctx->builder, pred, src0, src1, "");
}

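/* Call a unary floating-point intrinsic, appending the overloaded type
 * suffix to its name (e.g. "llvm.floor" becomes "llvm.floor.f32").
 */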
static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, const char *intrin,
                                         LLVMTypeRef result_type, LLVMValueRef src0)
{
   char name[64], type[64];
   LLVMValueRef params[] = {
      ac_to_float(ctx, src0),
   };

   ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
   ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
   assert(length < sizeof(name));
   return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
}

static LLVMValueRef emit_intrin_1f_param_scalar(struct ac_llvm_context *ctx, const char *intrin,
                                                LLVMTypeRef result_type, LLVMValueRef src0)
{
   if (LLVMGetTypeKind(result_type) != LLVMVectorTypeKind)
      return emit_intrin_1f_param(ctx, intrin, result_type, src0);

   LLVMTypeRef elem_type = LLVMGetElementType(result_type);
   LLVMValueRef ret = LLVMGetUndef(result_type);

   /* Scalarize the intrinsic, because vectors are not supported. */
   for (unsigned i = 0; i < LLVMGetVectorSize(result_type); i++) {
      char name[64], type[64];
      LLVMValueRef params[] = {
         ac_to_float(ctx, ac_llvm_extract_elem(ctx, src0, i)),
      };

      ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
      ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
      assert(length < sizeof(name));
      ret = LLVMBuildInsertElement(
         ctx->builder, ret,
         ac_build_intrinsic(ctx, name, elem_type, params, 1, AC_FUNC_ATTR_READNONE),
         LLVMConstInt(ctx->i32, i, 0), "");
   }
   return ret;
}

static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, const char *intrin,
                                         LLVMTypeRef result_type, LLVMValueRef src0,
                                         LLVMValueRef src1)
{
   char name[64], type[64];
   LLVMValueRef params[] = {
      ac_to_float(ctx, src0),
      ac_to_float(ctx, src1),
   };

   ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
   ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
   assert(length < sizeof(name));
   return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
}

static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, const char *intrin,
                                         LLVMTypeRef result_type, LLVMValueRef src0,
                                         LLVMValueRef src1, LLVMValueRef src2)
{
   char name[64], type[64];
   LLVMValueRef params[] = {
      ac_to_float(ctx, src0),
      ac_to_float(ctx, src1),
      ac_to_float(ctx, src2),
   };

   ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
   ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
   assert(length < sizeof(name));
   return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
}

static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef src1,
                               LLVMValueRef src2)
{
   LLVMTypeRef src1_type = LLVMTypeOf(src1);
   LLVMTypeRef src2_type = LLVMTypeOf(src2);

   if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind &&
       LLVMGetTypeKind(src2_type) != LLVMPointerTypeKind) {
      src2 = LLVMBuildIntToPtr(ctx->builder, src2, src1_type, "");
   } else if (LLVMGetTypeKind(src2_type) == LLVMPointerTypeKind &&
              LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) {
      src1 = LLVMBuildIntToPtr(ctx->builder, src1, src2_type, "");
   }

   return LLVMBuildSelect(ctx->builder, src0, ac_to_integer_or_pointer(ctx, src1),
                          ac_to_integer_or_pointer(ctx, src2), "");
}

static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, ""));
}

static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, const char *intrin,
                                    LLVMValueRef src0, LLVMValueRef src1)
{
   LLVMTypeRef ret_type;
   LLVMTypeRef types[] = {ctx->i32, ctx->i1};
   LLVMValueRef res;
   LLVMValueRef params[] = {src0, src1};
   ret_type = LLVMStructTypeInContext(ctx->context, types, 2, true);

   res = ac_build_intrinsic(ctx, intrin, ret_type, params, 2, AC_FUNC_ATTR_READNONE);

   res = LLVMBuildExtractValue(ctx->builder, res, 1, "");
   res = LLVMBuildZExt(ctx->builder, res, ctx->i32, "");
   return res;
}

static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   assert(ac_get_elem_bits(ctx, LLVMTypeOf(src0)) == 1);

   switch (bitsize) {
   case 16:
      if (LLVMGetTypeKind(LLVMTypeOf(src0)) == LLVMVectorTypeKind) {
         assert(LLVMGetVectorSize(LLVMTypeOf(src0)) == 2);
         LLVMValueRef f[] = {
            LLVMBuildSelect(ctx->builder, ac_llvm_extract_elem(ctx, src0, 0),
                            ctx->f16_1, ctx->f16_0, ""),
            LLVMBuildSelect(ctx->builder, ac_llvm_extract_elem(ctx, src0, 1),
                            ctx->f16_1, ctx->f16_0, ""),
         };
         return ac_build_gather_values(ctx, f, 2);
      }
      return LLVMBuildSelect(ctx->builder, src0, ctx->f16_1, ctx->f16_0, "");
   case 32:
      return LLVMBuildSelect(ctx->builder, src0, ctx->f32_1, ctx->f32_0, "");
   case 64:
      return LLVMBuildSelect(ctx->builder, src0, ctx->f64_1, ctx->f64_0, "");
   default:
      unreachable("Unsupported bit size.");
   }
}

static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   src0 = ac_to_float(ctx, src0);
   LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0));
   return LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, "");
}

static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   switch (bitsize) {
   case 8:
      return LLVMBuildSelect(ctx->builder, src0, ctx->i8_1, ctx->i8_0, "");
   case 16:
      return LLVMBuildSelect(ctx->builder, src0, ctx->i16_1, ctx->i16_0, "");
   case 32:
      return LLVMBuildSelect(ctx->builder, src0, ctx->i32_1, ctx->i32_0, "");
   case 64:
      return LLVMBuildSelect(ctx->builder, src0, ctx->i64_1, ctx->i64_0, "");
   default:
      unreachable("Unsupported bit size.");
   }
}

static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0));
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, "");
}

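/* Implements fquantize2f16: convert to f16 and back to f32, flushing
 * denormal f16 results to zero.
 */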
static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   LLVMValueRef cond = NULL;

   src0 = ac_to_float(ctx, src0);
   result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, "");

   if (ctx->chip_class >= GFX8) {
      LLVMValueRef args[2];
      /* Check if the result is a denormal - and flush to 0 if so. */
      args[0] = result;
      args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false);
      cond =
         ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE);
   }

   /* need to convert back up to f32 */
   result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, "");

   if (ctx->chip_class >= GFX8)
      result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, "");
   else {
      /* For GFX6-GFX7: 0x38800000 is the smallest half-float value (2^-14)
       * as a 32-bit float, so compare the result against it and flush to 0
       * if it's smaller.
       */
      LLVMValueRef temp, cond2;
      temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result);
      cond = LLVMBuildFCmp(
         ctx->builder, LLVMRealOGT,
         LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""),
         temp, "");
      cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealONE, temp, ctx->f32_0, "");
      cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
      result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, "");
   }
   return result;
}

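/* 32x32 -> 64-bit multiply, returning the high 32 bits of the product. */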
static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, LLVMValueRef src0,
                                   LLVMValueRef src1)
{
   LLVMValueRef dst64, result;
   src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, "");
   src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, "");

   dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
   dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
   result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
   return result;
}

static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, LLVMValueRef src0,
                                   LLVMValueRef src1)
{
   LLVMValueRef dst64, result;
   src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, "");
   src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, "");

   dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
   dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
   result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
   return result;
}

static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, LLVMValueRef bits, LLVMValueRef offset)
{
   /* mask = ((1 << bits) - 1) << offset */
   return LLVMBuildShl(
      ctx->builder,
      LLVMBuildSub(ctx->builder, LLVMBuildShl(ctx->builder, ctx->i32_1, bits, ""), ctx->i32_1, ""),
      offset, "");
}

static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, LLVMValueRef mask,
                                         LLVMValueRef insert, LLVMValueRef base)
{
   /* Calculate:
    *   (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base))
    * Use the right-hand side, which the LLVM backend can convert to V_BFI.
    */
   return LLVMBuildXor(
      ctx->builder, base,
      LLVMBuildAnd(ctx->builder, mask, LLVMBuildXor(ctx->builder, insert, base, ""), ""), "");
}

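/* Pack the first two components of src0 into a 32-bit value using the given
 * conversion helper (e.g. ac_build_cvt_pkrtz_f16).
 */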
static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, LLVMValueRef src0,
                                   LLVMValueRef (*pack)(struct ac_llvm_context *ctx,
                                                        LLVMValueRef args[2]))
{
   LLVMValueRef comp[2];

   src0 = ac_to_float(ctx, src0);
   comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, "");
   comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, "");

   return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, "");
}

static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
   LLVMValueRef temps[2], val;
   int i;

   for (i = 0; i < 2; i++) {
      val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0;
      val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, "");
      val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, "");
      temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
   }
   return ac_build_gather_values(ctx, temps, 2);
}

static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, nir_op op, LLVMValueRef src0)
{
   unsigned mask;
   int idx;
   LLVMValueRef result;

   if (op == nir_op_fddx_fine)
      mask = AC_TID_MASK_LEFT;
   else if (op == nir_op_fddy_fine)
      mask = AC_TID_MASK_TOP;
   else
      mask = AC_TID_MASK_TOP_LEFT;

   /* For DDX we want the next X pixel, for DDY the next Y pixel. */
   if (op == nir_op_fddx_fine || op == nir_op_fddx_coarse || op == nir_op_fddx)
      idx = 1;
   else
      idx = 2;

   result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
   return result;
}

struct waterfall_context {
   LLVMBasicBlockRef phi_bb[2];
   bool use_waterfall;
};

/* To deal with divergent descriptors we can create a loop that handles all
 * lanes with the same descriptor on a given iteration (henceforth a
 * waterfall loop).
 *
 * These helpers create the begin and end of the loop, leaving the caller
 * to implement the body.
 *
 * params:
 *  - ctx is the usual nir context
 *  - wctx is a temporary struct containing some loop info. Can be left uninitialized.
 *  - value is the possibly divergent value for which we build the loop
 *  - divergent is whether value is actually divergent. If false we just pass
 *     things through.
 */
static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx, struct waterfall_context *wctx,
                                    LLVMValueRef value, bool divergent)
{
   /* If the app claims the value is divergent but it is constant we can
    * end up with a dynamic index of NULL. */
   if (!value)
      divergent = false;

   wctx->use_waterfall = divergent;
   if (!divergent)
      return value;

   ac_build_bgnloop(&ctx->ac, 6000);

   LLVMValueRef active = LLVMConstInt(ctx->ac.i1, 1, false);
   LLVMValueRef scalar_value[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < ac_get_llvm_num_components(value); i++) {
      LLVMValueRef comp = ac_llvm_extract_elem(&ctx->ac, value, i);
      scalar_value[i] = ac_build_readlane(&ctx->ac, comp, NULL);
      active = LLVMBuildAnd(ctx->ac.builder, active,
                            LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, comp, scalar_value[i], ""), "");
   }

   wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
   ac_build_ifcc(&ctx->ac, active, 6001);

   return ac_build_gather_values(&ctx->ac, scalar_value, ac_get_llvm_num_components(value));
}

static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx, struct waterfall_context *wctx,
                                   LLVMValueRef value)
{
   LLVMValueRef ret = NULL;
   LLVMValueRef phi_src[2];
   LLVMValueRef cc_phi_src[2] = {
      LLVMConstInt(ctx->ac.i32, 0, false),
      LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
   };

   if (!wctx->use_waterfall)
      return value;

   wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);

   ac_build_endif(&ctx->ac, 6001);

   if (value) {
      phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
      phi_src[1] = value;

      ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
   }

   /*
    * By using the optimization barrier on the exit decision, we decouple
    * the operations from the break, and hence avoid LLVM hoisting the
    * operation into the break block.
    */
   LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb);
   ac_build_optimization_barrier(&ctx->ac, &cc, false);

   LLVMValueRef active =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2");
   ac_build_ifcc(&ctx->ac, active, 6002);
   ac_build_break(&ctx->ac);
   ac_build_endif(&ctx->ac, 6002);

   ac_build_endloop(&ctx->ac, 6000);
   return ret;
}

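/* Translate a single NIR ALU instruction into LLVM IR. */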
static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
{
   LLVMValueRef src[4], result = NULL;
   unsigned num_components = instr->dest.dest.ssa.num_components;
   unsigned src_components;
   LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);

   assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
   switch (instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5:
   case nir_op_unpack_32_2x16:
   case nir_op_unpack_64_2x32:
   case nir_op_unpack_64_4x16:
      src_components = 1;
      break;
   case nir_op_pack_half_2x16:
   case nir_op_pack_snorm_2x16:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_32_2x16:
   case nir_op_pack_64_2x32:
      src_components = 2;
      break;
   case nir_op_unpack_half_2x16:
      src_components = 1;
      break;
   case nir_op_cube_face_coord_amd:
   case nir_op_cube_face_index_amd:
      src_components = 3;
      break;
   case nir_op_pack_32_4x8:
   case nir_op_pack_64_4x16:
      src_components = 4;
      break;
   default:
      src_components = num_components;
      break;
   }
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
      src[i] = get_alu_src(ctx, instr->src[i], src_components);

   switch (instr->op) {
   case nir_op_mov:
      result = src[0];
      break;
   case nir_op_fneg:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = LLVMBuildFNeg(ctx->ac.builder, src[0], "");
      if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) {
         /* fneg will be optimized by the backend compiler with the sign
          * bit removed via XOR. This is probably an LLVM bug.
          */
         result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
      }
      break;
   case nir_op_ineg:
      if (instr->no_unsigned_wrap)
         result = LLVMBuildNUWNeg(ctx->ac.builder, src[0], "");
      else if (instr->no_signed_wrap)
         result = LLVMBuildNSWNeg(ctx->ac.builder, src[0], "");
      else
         result = LLVMBuildNeg(ctx->ac.builder, src[0], "");
      break;
   case nir_op_inot:
      result = LLVMBuildNot(ctx->ac.builder, src[0], "");
      break;
   case nir_op_iadd:
      if (instr->no_unsigned_wrap)
         result = LLVMBuildNUWAdd(ctx->ac.builder, src[0], src[1], "");
      else if (instr->no_signed_wrap)
         result = LLVMBuildNSWAdd(ctx->ac.builder, src[0], src[1], "");
      else
         result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_uadd_sat:
   case nir_op_iadd_sat: {
      char name[64], type[64];
      ac_build_type_name_for_intr(def_type, type, sizeof(type));
      snprintf(name, sizeof(name), "llvm.%cadd.sat.%s",
               instr->op == nir_op_uadd_sat ? 'u' : 's', type);
      result = ac_build_intrinsic(&ctx->ac, name, def_type, src, 2, AC_FUNC_ATTR_READNONE);
      break;
   }
   case nir_op_fadd:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      src[1] = ac_to_float(&ctx->ac, src[1]);
      result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_fsub:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      src[1] = ac_to_float(&ctx->ac, src[1]);
      result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_isub:
      if (instr->no_unsigned_wrap)
         result = LLVMBuildNUWSub(ctx->ac.builder, src[0], src[1], "");
      else if (instr->no_signed_wrap)
         result = LLVMBuildNSWSub(ctx->ac.builder, src[0], src[1], "");
      else
         result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_imul:
      if (instr->no_unsigned_wrap)
         result = LLVMBuildNUWMul(ctx->ac.builder, src[0], src[1], "");
      else if (instr->no_signed_wrap)
         result = LLVMBuildNSWMul(ctx->ac.builder, src[0], src[1], "");
      else
         result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_imod:
      result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_umod:
      result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_irem:
      result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_idiv:
      result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_udiv:
      result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_fmul:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      src[1] = ac_to_float(&ctx->ac, src[1]);
      result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_frcp:
      /* For doubles, we need precise division to pass GLCTS. */
      if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && ac_get_type_size(def_type) == 8) {
         result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1, ac_to_float(&ctx->ac, src[0]), "");
      } else {
         result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rcp",
                                              ac_to_float_type(&ctx->ac, def_type), src[0]);
      }
      if (ctx->abi->clamp_div_by_zero)
         result = ac_build_fmin(&ctx->ac, result,
                                LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
      break;
   case nir_op_iand:
      result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_ior:
      result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_ixor:
      result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_ishl:
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) <
          ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
         src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
      else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) >
               ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
         src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
      result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_ishr:
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) <
          ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
         src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
      else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) >
               ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
         src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
      result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_ushr:
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) <
          ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
         src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
      else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) >
               ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
         src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
      result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], "");
      break;
   case nir_op_ilt:
      result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]);
      break;
   case nir_op_ine:
      result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]);
      break;
   case nir_op_ieq:
      result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]);
      break;
   case nir_op_ige:
      result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]);
      break;
   case nir_op_ult:
      result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]);
      break;
   case nir_op_uge:
      result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]);
      break;
   case nir_op_feq:
      result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]);
      break;
   case nir_op_fneu:
      result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
      break;
   case nir_op_flt:
      result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]);
      break;
   case nir_op_fge:
      result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]);
      break;
   case nir_op_fabs:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.fabs", ac_to_float_type(&ctx->ac, def_type), src[0]);
      if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) {
         /* fabs will be optimized by the backend compiler with the sign
          * bit removed via AND.
          */
         result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
      }
      break;
   case nir_op_fsat:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = ac_build_fsat(&ctx->ac, src[0],
                             ac_to_float_type(&ctx->ac, def_type));
      break;
   case nir_op_iabs:
      result = emit_iabs(&ctx->ac, src[0]);
      break;
   case nir_op_imax:
      result = ac_build_imax(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_imin:
      result = ac_build_imin(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_umax:
      result = ac_build_umax(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_umin:
      result = ac_build_umin(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_isign:
      result = ac_build_isign(&ctx->ac, src[0]);
      break;
   case nir_op_fsign:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = ac_build_fsign(&ctx->ac, src[0]);
      break;
   case nir_op_ffloor:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.floor", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_ftrunc:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.trunc", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_fceil:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.ceil", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_fround_even:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.rint", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_ffract:
      result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.fract",
                                           ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_fsin:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.sin", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_fcos:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.cos", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_fsqrt:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_fexp2:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.exp2", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_flog2:
      result =
         emit_intrin_1f_param(&ctx->ac, "llvm.log2", ac_to_float_type(&ctx->ac, def_type), src[0]);
      break;
   case nir_op_frsq:
      result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rsq",
                                           ac_to_float_type(&ctx->ac, def_type), src[0]);
      if (ctx->abi->clamp_div_by_zero)
         result = ac_build_fmin(&ctx->ac, result,
                                LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
      break;
   case nir_op_frexp_exp:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = ac_build_frexp_exp(&ctx->ac, src[0], ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])));
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16)
         result = LLVMBuildSExt(ctx->ac.builder, result, ctx->ac.i32, "");
      break;
   case nir_op_frexp_sig:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = ac_build_frexp_mant(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
      break;
   case nir_op_fpow:
      if (instr->dest.dest.ssa.bit_size != 32) {
         /* 16 and 64 bits */
         result = emit_intrin_1f_param(&ctx->ac, "llvm.log2",
                                       ac_to_float_type(&ctx->ac, def_type), src[0]);
         result = LLVMBuildFMul(ctx->ac.builder, result, ac_to_float(&ctx->ac, src[1]), "");
         result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2",
                                       ac_to_float_type(&ctx->ac, def_type), result);
         break;
      }
      if (LLVM_VERSION_MAJOR >= 12) {
         result = emit_intrin_1f_param(&ctx->ac, "llvm.log2",
                                       ac_to_float_type(&ctx->ac, def_type), src[0]);
         result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.fmul.legacy", ctx->ac.f32,
                                     (LLVMValueRef[]){result, ac_to_float(&ctx->ac, src[1])},
                                     2, AC_FUNC_ATTR_READNONE);
         result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2",
                                       ac_to_float_type(&ctx->ac, def_type), result);
         break;
      }
      /* Older LLVM doesn't have fmul.legacy. */
      result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", ac_to_float_type(&ctx->ac, def_type),
                                    src[0], src[1]);
      break;
   case nir_op_fmax:
      result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", ac_to_float_type(&ctx->ac, def_type),
                                    src[0], src[1]);
      if (ctx->ac.chip_class < GFX9 && instr->dest.dest.ssa.bit_size == 32) {
         /* Only pre-GFX9 chips do not flush denorms. */
         result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
      }
      break;
   case nir_op_fmin:
      result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", ac_to_float_type(&ctx->ac, def_type),
                                    src[0], src[1]);
      if (ctx->ac.chip_class < GFX9 && instr->dest.dest.ssa.bit_size == 32) {
         /* Only pre-GFX9 chips do not flush denorms. */
         result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
      }
      break;
   case nir_op_ffma:
      /* FMA is slow on gfx6-8, so it shouldn't be used. */
      assert(instr->dest.dest.ssa.bit_size != 32 || ctx->ac.chip_class >= GFX9);
      result = emit_intrin_3f_param(&ctx->ac, "llvm.fma", ac_to_float_type(&ctx->ac, def_type),
                                    src[0], src[1], src[2]);
      break;
   case nir_op_ldexp:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      if (ac_get_elem_bits(&ctx->ac, def_type) == 32)
         result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2,
                                     AC_FUNC_ATTR_READNONE);
      else if (ac_get_elem_bits(&ctx->ac, def_type) == 16)
         result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2,
                                     AC_FUNC_ATTR_READNONE);
      else
         result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2,
                                     AC_FUNC_ATTR_READNONE);
      break;
   case nir_op_bfm:
      result = emit_bfm(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_bitfield_select:
      result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]);
      break;
   case nir_op_ubfe:
      result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false);
      break;
   case nir_op_ibfe:
      result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true);
      break;
   case nir_op_bitfield_reverse:
      result = ac_build_bitfield_reverse(&ctx->ac, src[0]);
      break;
   case nir_op_bit_count:
      result = ac_build_bit_count(&ctx->ac, src[0]);
      break;
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5:
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
         src[i] = ac_to_integer(&ctx->ac, src[i]);
      result = ac_build_gather_values(&ctx->ac, src, num_components);
      break;
   case nir_op_f2i8:
   case nir_op_f2i16:
   case nir_op_f2imp:
   case nir_op_f2i32:
   case nir_op_f2i64:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
      break;
   case nir_op_f2u8:
   case nir_op_f2u16:
   case nir_op_f2ump:
   case nir_op_f2u32:
   case nir_op_f2u64:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, "");
      break;
   case nir_op_i2f16:
   case nir_op_i2fmp:
   case nir_op_i2f32:
   case nir_op_i2f64:
      result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
      break;
   case nir_op_u2f16:
   case nir_op_u2fmp:
   case nir_op_u2f32:
   case nir_op_u2f64:
      result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
      break;
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2fmp:
      src[0] = ac_to_float(&ctx->ac, src[0]);

      /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
       * all f32->f16 conversions have to round towards zero, because both scalar
       * and vec2 down-conversions have to round equally.
       */
      if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || instr->op == nir_op_f2f16_rtz) {
         src[0] = ac_to_float(&ctx->ac, src[0]);

         if (LLVMTypeOf(src[0]) == ctx->ac.f64)
            src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");

         /* Fast path conversion. This only works if NIR is vectorized
          * to vec2 of 16-bit values.
          */
         if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
            LLVMValueRef args[] = {
               ac_llvm_extract_elem(&ctx->ac, src[0], 0),
               ac_llvm_extract_elem(&ctx->ac, src[0], 1),
            };
            result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
            break;
         }

         assert(ac_get_llvm_num_components(src[0]) == 1);
         LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)};
         result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
         result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
      } else {
         if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
            result =
               LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
         else
            result =
               LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
      }
      break;
   case nir_op_f2f16_rtne:
   case nir_op_f2f32:
   case nir_op_f2f64:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
         result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
      else
         result =
            LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
      break;
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
         result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, "");
      else
         result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
      break;
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2imp:
   case nir_op_i2i32:
   case nir_op_i2i64:
      if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
         result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, "");
      else
         result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
      break;
   case nir_op_bcsel:
      result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]);
      break;
   case nir_op_find_lsb:
      result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]);
      break;
   case nir_op_ufind_msb:
      result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32);
      break;
   case nir_op_ifind_msb:
      result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32);
      break;
   case nir_op_uadd_carry:
      result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]);
      break;
   case nir_op_usub_borrow:
      result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]);
      break;
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2f64:
      result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
      break;
   case nir_op_f2b1:
      result = emit_f2b(&ctx->ac, src[0]);
      break;
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2i64:
      result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
      break;
   case nir_op_i2b1:
   case nir_op_b2b1: /* after loads */
      result = emit_i2b(&ctx->ac, src[0]);
      break;
   case nir_op_b2b16: /* before stores */
      result = LLVMBuildZExt(ctx->ac.builder, src[0], ctx->ac.i16, "");
      break;
   case nir_op_b2b32: /* before stores */
      result = LLVMBuildZExt(ctx->ac.builder, src[0], ctx->ac.i32, "");
      break;
   case nir_op_fquantize2f16:
      result = emit_f2f16(&ctx->ac, src[0]);
      break;
   case nir_op_umul_high:
      result = emit_umul_high(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_imul_high:
      result = emit_imul_high(&ctx->ac, src[0], src[1]);
      break;
   case nir_op_pack_half_2x16:
      result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16);
      break;
   case nir_op_pack_half_2x16_split:
      src[0] = ac_to_float(&ctx->ac, src[0]);
      src[1] = ac_to_float(&ctx->ac, src[1]);
      result = LLVMBuildBitCast(ctx->ac.builder,
                                ac_build_cvt_pkrtz_f16(&ctx->ac, src),
                                ctx->ac.i32, "");
      break;
   case nir_op_pack_snorm_2x16:
      result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16);
      break;
   case nir_op_pack_unorm_2x16:
      result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16);
      break;
   case nir_op_unpack_half_2x16:
      result = emit_unpack_half_2x16(&ctx->ac, src[0]);
      break;
   case nir_op_unpack_half_2x16_split_x: {
      assert(ac_get_llvm_num_components(src[0]) == 1);
      LLVMValueRef tmp = emit_unpack_half_2x16(&ctx->ac, src[0]);
      result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, "");
      break;
   }
   case nir_op_unpack_half_2x16_split_y: {
      assert(ac_get_llvm_num_components(src[0]) == 1);
      LLVMValueRef tmp = emit_unpack_half_2x16(&ctx->ac, src[0]);
      result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, "");
      break;
   }
   case nir_op_fddx:
   case nir_op_fddy:
   case nir_op_fddx_fine:
   case nir_op_fddy_fine:
   case nir_op_fddx_coarse:
   case nir_op_fddy_coarse:
      result = emit_ddxy(ctx, instr->op, src[0]);
      break;

   case nir_op_unpack_64_4x16: {
      result = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v4i16, "");
      break;
   }
   case nir_op_pack_64_4x16: {
      result = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.i64, "");
      break;
   }

   case nir_op_unpack_64_2x32: {
      result = LLVMBuildBitCast(ctx->ac.builder, src[0],
            ctx->ac.v2i32, "");
      break;
   }
   case nir_op_unpack_64_2x32_split_x: {
      assert(ac_get_llvm_num_components(src[0]) == 1);
      LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i32, "");
      result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, "");
      break;
   }
   case nir_op_unpack_64_2x32_split_y: {
      assert(ac_get_llvm_num_components(src[0]) == 1);
      LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i32, "");
      result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, "");
      break;
   }

   case nir_op_pack_64_2x32: {
      result = LLVMBuildBitCast(ctx->ac.builder, src[0],
            ctx->ac.i64, "");
      break;
   }
   case nir_op_pack_64_2x32_split: {
      LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
      result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, "");
      break;
   }

   case nir_op_pack_32_4x8:
   case nir_op_pack_32_2x16: {
      result = LLVMBuildBitCast(ctx->ac.builder, src[0],
            ctx->ac.i32, "");
      break;
   }
   case nir_op_pack_32_2x16_split: {
      LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
      result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, "");
      break;
   }

   case nir_op_unpack_32_2x16: {
      result = LLVMBuildBitCast(ctx->ac.builder, src[0],
            ctx->ac.v2i16, "");
      break;
   }
   case nir_op_unpack_32_2x16_split_x: {
      LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, "");
      result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, "");
      break;
   }
   case nir_op_unpack_32_2x16_split_y: {
      LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, "");
      result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, "");
      break;
   }

   case nir_op_cube_face_coord_amd: {
      src[0] = ac_to_float(&ctx->ac, src[0]);
      LLVMValueRef results[2];
      LLVMValueRef in[3];
      for (unsigned chan = 0; chan < 3; chan++)
         in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
      results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3,
                                      AC_FUNC_ATTR_READNONE);
      results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3,
                                      AC_FUNC_ATTR_READNONE);
      LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", ctx->ac.f32, in, 3,
                                           AC_FUNC_ATTR_READNONE);
      results[0] = ac_build_fdiv(&ctx->ac, results[0], ma);
      results[1] = ac_build_fdiv(&ctx->ac, results[1], ma);
      LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5);
      results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, "");
      results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, "");
      result = ac_build_gather_values(&ctx->ac, results, 2);
      break;
   }

   case nir_op_cube_face_index_amd: {
      src[0] = ac_to_float(&ctx->ac, src[0]);
      LLVMValueRef in[3];
      for (unsigned chan = 0; chan < 3; chan++)
         in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
      result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", ctx->ac.f32, in, 3,
                                  AC_FUNC_ATTR_READNONE);
      break;
   }

   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16: {
      bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
      unsigned size = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 8 : 16;
      LLVMValueRef offset = LLVMConstInt(LLVMTypeOf(src[0]), nir_src_as_uint(instr->src[1].src) * size, false);
      result = LLVMBuildLShr(ctx->ac.builder, src[0], offset, "");
      result = LLVMBuildTrunc(ctx->ac.builder, result, LLVMIntTypeInContext(ctx->ac.context, size), "");
      if (is_signed)
         result = LLVMBuildSExt(ctx->ac.builder, result, LLVMTypeOf(src[0]), "");
      else
         result = LLVMBuildZExt(ctx->ac.builder, result, LLVMTypeOf(src[0]), "");
      break;
   }

   case nir_op_insert_u8:
   case nir_op_insert_u16: {
      unsigned size = instr->op == nir_op_insert_u8 ? 8 : 16;
      LLVMValueRef offset = LLVMConstInt(LLVMTypeOf(src[0]), nir_src_as_uint(instr->src[1].src) * size, false);
      LLVMValueRef mask = LLVMConstInt(LLVMTypeOf(src[0]), u_bit_consecutive(0, size), false);
      result = LLVMBuildShl(ctx->ac.builder, LLVMBuildAnd(ctx->ac.builder, src[0], mask, ""), offset, "");
      break;
   }

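   /* Mixed-precision dot products, mapped to the AMD dot-product
    * intrinsics. The trailing i1 operand is the clamp bit, set for
    * the saturating variants.
    */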
   case nir_op_sdot_4x8_iadd:
   case nir_op_udot_4x8_uadd:
   case nir_op_sdot_4x8_iadd_sat:
   case nir_op_udot_4x8_uadd_sat: {
      const char *name = instr->op == nir_op_sdot_4x8_iadd ||
                         instr->op == nir_op_sdot_4x8_iadd_sat
                         ? "llvm.amdgcn.sdot4" : "llvm.amdgcn.udot4";
      src[3] = LLVMConstInt(ctx->ac.i1, instr->op == nir_op_sdot_4x8_iadd_sat ||
                                        instr->op == nir_op_udot_4x8_uadd_sat, false);
      result = ac_build_intrinsic(&ctx->ac, name, def_type, src, 4, AC_FUNC_ATTR_READNONE);
      break;
   }

   case nir_op_sdot_2x16_iadd:
   case nir_op_udot_2x16_uadd:
   case nir_op_sdot_2x16_iadd_sat:
   case nir_op_udot_2x16_uadd_sat: {
      const char *name = instr->op == nir_op_sdot_2x16_iadd ||
                         instr->op == nir_op_sdot_2x16_iadd_sat
                         ? "llvm.amdgcn.sdot2" : "llvm.amdgcn.udot2";
      src[0] = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, "");
      src[1] = LLVMBuildBitCast(ctx->ac.builder, src[1], ctx->ac.v2i16, "");
      src[3] = LLVMConstInt(ctx->ac.i1, instr->op == nir_op_sdot_2x16_iadd_sat ||
                                        instr->op == nir_op_udot_2x16_uadd_sat, false);
      result = ac_build_intrinsic(&ctx->ac, name, def_type, src, 4, AC_FUNC_ATTR_READNONE);
      break;
   }

   case nir_op_sad_u8x4:
      result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
                                  (LLVMValueRef[]){src[0], src[1], src[2]}, 3,
                                  AC_FUNC_ATTR_READNONE);
      break;

   default:
      fprintf(stderr, "Unknown NIR alu instr: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }

   if (result) {
      assert(instr->dest.dest.is_ssa);
      result = ac_to_integer_or_pointer(&ctx->ac, result);
      ctx->ssa_defs[instr->dest.dest.ssa.index] = result;
   }
}

1305 static void visit_load_const(struct ac_nir_context *ctx, const nir_load_const_instr *instr)
1306 {
1307    LLVMValueRef values[4], value = NULL;
1308    LLVMTypeRef element_type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
1309 
1310    for (unsigned i = 0; i < instr->def.num_components; ++i) {
1311       switch (instr->def.bit_size) {
1312       case 1:
1313          values[i] = LLVMConstInt(element_type, instr->value[i].b, false);
1314          break;
1315       case 8:
1316          values[i] = LLVMConstInt(element_type, instr->value[i].u8, false);
1317          break;
1318       case 16:
1319          values[i] = LLVMConstInt(element_type, instr->value[i].u16, false);
1320          break;
1321       case 32:
1322          values[i] = LLVMConstInt(element_type, instr->value[i].u32, false);
1323          break;
1324       case 64:
1325          values[i] = LLVMConstInt(element_type, instr->value[i].u64, false);
1326          break;
1327       default:
1328          fprintf(stderr, "unsupported nir load_const bit_size: %d\n", instr->def.bit_size);
1329          abort();
1330       }
1331    }
1332    if (instr->def.num_components > 1) {
1333       value = LLVMConstVector(values, instr->def.num_components);
1334    } else
1335       value = values[0];
1336 
1337    ctx->ssa_defs[instr->def.index] = value;
1338 }
1339 
1340 static LLVMValueRef get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor,
1341                                     bool in_elements)
1342 {
1343    LLVMValueRef size =
1344       LLVMBuildExtractElement(ctx->ac.builder, descriptor, LLVMConstInt(ctx->ac.i32, 2, false), "");
1345 
1346    /* GFX8 only */
1347    if (ctx->ac.chip_class == GFX8 && in_elements) {
1348       /* On GFX8, the descriptor contains the size in bytes,
1349        * but TXQ must return the size in elements.
1350        * The stride is always non-zero for resources using TXQ.
1351        */
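           /* The stride is the 14-bit field at bits [29:16] of dword 1 of the
            * buffer descriptor, hence the shift and mask below.
            */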
1352       LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, descriptor, ctx->ac.i32_1, "");
1353       stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, false), "");
1354       stride = LLVMBuildAnd(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 0x3fff, false), "");
1355 
1356       size = LLVMBuildUDiv(ctx->ac.builder, size, stride, "");
1357    }
1358    return size;
1359 }
1360 
1361 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
1362  * incorrectly forces nearest filtering if the texture format is integer.
1363  * The only effect it has on Gather4, which always returns 4 texels for
1364  * bilinear filtering, is that the final coordinates are off by 0.5 of
1365  * the texel size.
1366  *
1367  * The workaround is to subtract 0.5 from the unnormalized coordinates,
1368  * or (0.5 / size) from the normalized coordinates.
1369  *
1370  * However, cube textures with 8_8_8_8 data formats require a different
1371  * workaround of overriding the num format to USCALED/SSCALED. This would lose
1372  * precision in 32-bit data formats, so it needs to be applied dynamically at
1373  * runtime. In this case, return an i1 value that indicates whether the
1374  * descriptor was overridden (and hence a fixup of the sampler result is needed).
1375  */
1376 static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, nir_variable *var,
1377                                           struct ac_image_args *args, const nir_tex_instr *instr)
1378 {
1379    const struct glsl_type *type = glsl_without_array(var->type);
1380    enum glsl_base_type stype = glsl_get_sampler_result_type(type);
1381    LLVMValueRef wa_8888 = NULL;
1382    LLVMValueRef half_texel[2];
1383    LLVMValueRef result;
1384 
1385    assert(stype == GLSL_TYPE_INT || stype == GLSL_TYPE_UINT);
1386 
1387    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1388       LLVMValueRef formats;
1389       LLVMValueRef data_format;
1390       LLVMValueRef wa_formats;
1391 
1392       formats = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, "");
1393 
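           /* DATA_FORMAT is the 6-bit field at bits [25:20] of dword 1 of the
            * resource descriptor.
            */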
1394       data_format = LLVMBuildLShr(ctx->builder, formats, LLVMConstInt(ctx->i32, 20, false), "");
1395       data_format =
1396          LLVMBuildAnd(ctx->builder, data_format, LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
1397       wa_8888 = LLVMBuildICmp(ctx->builder, LLVMIntEQ, data_format,
1398                               LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), "");
1399 
1400       uint32_t wa_num_format = stype == GLSL_TYPE_UINT
1401                                   ? S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED)
1402                                   : S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED);
1403       wa_formats = LLVMBuildAnd(ctx->builder, formats,
1404                                 LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), "");
1405       wa_formats =
1406          LLVMBuildOr(ctx->builder, wa_formats, LLVMConstInt(ctx->i32, wa_num_format, false), "");
1407 
1408       formats = LLVMBuildSelect(ctx->builder, wa_8888, wa_formats, formats, "");
1409       args->resource =
1410          LLVMBuildInsertElement(ctx->builder, args->resource, formats, ctx->i32_1, "");
1411    }
1412 
1413    if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
1414       assert(!wa_8888);
1415       half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
1416    } else {
1417       struct ac_image_args resinfo = {0};
1418       LLVMBasicBlockRef bbs[2];
1419 
1420       LLVMValueRef unnorm = NULL;
1421       LLVMValueRef default_offset = ctx->f32_0;
1422       if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
1423          /* In Vulkan, whether the sampler uses unnormalized
1424           * coordinates or not is a dynamic property of the
1425           * sampler. Hence, to figure out whether or not we
1426           * need to divide by the texture size, we need to test
1427           * the sampler at runtime. This tests the bit set by
1428           * radv_init_sampler().
1429           */
1430          LLVMValueRef sampler0 =
1431             LLVMBuildExtractElement(ctx->builder, args->sampler, ctx->i32_0, "");
1432          sampler0 = LLVMBuildLShr(ctx->builder, sampler0, LLVMConstInt(ctx->i32, 15, false), "");
1433          sampler0 = LLVMBuildAnd(ctx->builder, sampler0, ctx->i32_1, "");
1434          unnorm = LLVMBuildICmp(ctx->builder, LLVMIntEQ, sampler0, ctx->i32_1, "");
1435          default_offset = LLVMConstReal(ctx->f32, -0.5);
1436       }
1437 
1438       bbs[0] = LLVMGetInsertBlock(ctx->builder);
1439       if (wa_8888 || unnorm) {
1440          assert(!(wa_8888 && unnorm));
1441          LLVMValueRef not_needed = wa_8888 ? wa_8888 : unnorm;
1442          /* Skip the texture size query entirely if we don't need it. */
1443          ac_build_ifcc(ctx, LLVMBuildNot(ctx->builder, not_needed, ""), 2000);
1444          bbs[1] = LLVMGetInsertBlock(ctx->builder);
1445       }
1446 
1447       /* Query the texture size. */
1448       resinfo.dim = ac_get_sampler_dim(ctx->chip_class, instr->sampler_dim, instr->is_array);
1449       resinfo.opcode = ac_image_get_resinfo;
1450       resinfo.dmask = 0xf;
1451       resinfo.lod = ctx->i32_0;
1452       resinfo.resource = args->resource;
1453       resinfo.attributes = AC_FUNC_ATTR_READNONE;
1454       LLVMValueRef size = ac_build_image_opcode(ctx, &resinfo);
1455 
1456       /* Compute -0.5 / size. */
1457       for (unsigned c = 0; c < 2; c++) {
1458          half_texel[c] =
1459             LLVMBuildExtractElement(ctx->builder, size, LLVMConstInt(ctx->i32, c, 0), "");
1460          half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, "");
1461          half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]);
1462          half_texel[c] =
1463             LLVMBuildFMul(ctx->builder, half_texel[c], LLVMConstReal(ctx->f32, -0.5), "");
1464       }
1465 
1466       if (wa_8888 || unnorm) {
1467          ac_build_endif(ctx, 2000);
1468 
1469          for (unsigned c = 0; c < 2; c++) {
1470             LLVMValueRef values[2] = {default_offset, half_texel[c]};
1471             half_texel[c] = ac_build_phi(ctx, ctx->f32, 2, values, bbs);
1472          }
1473       }
1474    }
1475 
1476    for (unsigned c = 0; c < 2; c++) {
1477       LLVMValueRef tmp;
1478       tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, "");
1479       args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], "");
1480    }
1481 
1482    args->attributes = AC_FUNC_ATTR_READNONE;
1483    result = ac_build_image_opcode(ctx, args);
1484 
1485    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1486       LLVMValueRef tmp, tmp2;
1487 
1488       /* If the cube workaround is in place, convert the result from float to int. */
1489       for (unsigned c = 0; c < 4; c++) {
1490          tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), "");
1491          if (stype == GLSL_TYPE_UINT)
1492             tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, "");
1493          else
1494             tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, "");
1495          tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
1496          tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, "");
1497          tmp = LLVMBuildSelect(ctx->builder, wa_8888, tmp2, tmp, "");
1498          tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
1499          result =
1500             LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), "");
1501       }
1502    }
1503    return result;
1504 }
1505 
1506 static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr)
1507 {
1508    nir_deref_instr *texture_deref_instr = NULL;
1509 
1510    for (unsigned i = 0; i < instr->num_srcs; i++) {
1511       switch (instr->src[i].src_type) {
1512       case nir_tex_src_texture_deref:
1513          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
1514          break;
1515       default:
1516          break;
1517       }
1518    }
1519    return texture_deref_instr;
1520 }
1521 
1522 static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_tex_instr *instr,
1523                                         struct ac_image_args *args)
1524 {
1525    assert((!args->tfe || !args->d16) && "unsupported");
1526 
1527    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
1528       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
1529 
1530       assert(instr->dest.is_ssa);
1531 
1532       /* Buffers don't support A16. */
1533       if (args->a16)
1534          args->coords[0] = LLVMBuildZExt(ctx->ac.builder, args->coords[0], ctx->ac.i32, "");
1535 
1536       return ac_build_buffer_load_format(&ctx->ac, args->resource, args->coords[0], ctx->ac.i32_0,
1537                                          util_last_bit(mask), 0, true,
1538                                          instr->dest.ssa.bit_size == 16,
1539                                          args->tfe);
1540    }
1541 
1542    args->opcode = ac_image_sample;
1543 
1544    switch (instr->op) {
1545    case nir_texop_txf:
1546    case nir_texop_txf_ms:
1547    case nir_texop_samples_identical:
1548       args->opcode = args->level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS
1549                         ? ac_image_load
1550                         : ac_image_load_mip;
1551       args->level_zero = false;
1552       break;
1553    case nir_texop_txs:
1554    case nir_texop_query_levels:
1555       args->opcode = ac_image_get_resinfo;
1556       if (!args->lod)
1557          args->lod = ctx->ac.i32_0;
1558       args->level_zero = false;
1559       break;
1560    case nir_texop_tex:
1561       if (ctx->stage != MESA_SHADER_FRAGMENT &&
1562           (ctx->stage != MESA_SHADER_COMPUTE ||
1563            ctx->info->cs.derivative_group == DERIVATIVE_GROUP_NONE)) {
1564          assert(!args->lod);
1565          args->level_zero = true;
1566       }
1567       break;
1568    case nir_texop_tg4:
1569       args->opcode = ac_image_gather4;
1570       if (!args->lod && !args->bias)
1571          args->level_zero = true;
1572       break;
1573    case nir_texop_lod:
1574       args->opcode = ac_image_get_lod;
1575       break;
1576    case nir_texop_fragment_fetch_amd:
1577    case nir_texop_fragment_mask_fetch_amd:
1578       args->opcode = ac_image_load;
1579       args->level_zero = false;
1580       break;
1581    default:
1582       break;
1583    }
1584 
1585    /* Aldebaran doesn't have image_sample_lz, but image_sample behaves like lz. */
1586    if (!ctx->ac.info->has_3d_cube_border_color_mipmap)
1587       args->level_zero = false;
1588 
1589    if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) {
1590       nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr);
1591       nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr);
1592       const struct glsl_type *type = glsl_without_array(var->type);
1593       enum glsl_base_type stype = glsl_get_sampler_result_type(type);
1594       if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
1595          return lower_gather4_integer(&ctx->ac, var, args, instr);
1596       }
1597    }
1598 
1599    /* Fixup for GFX9 which allocates 1D textures as 2D. */
1600    if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) {
1601       if ((args->dim == ac_image_2darray || args->dim == ac_image_2d) && !args->coords[1]) {
1602          args->coords[1] = ctx->ac.i32_0;
1603       }
1604    }
1605 
1606    args->attributes = AC_FUNC_ATTR_READNONE;
1607    bool cs_derivs =
1608       ctx->stage == MESA_SHADER_COMPUTE && ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE;
1609    if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) {
1610       /* Prevent texture instructions with implicit derivatives from being
1611        * sunk into branches. */
1612       switch (instr->op) {
1613       case nir_texop_tex:
1614       case nir_texop_txb:
1615       case nir_texop_lod:
1616          args->attributes |= AC_FUNC_ATTR_CONVERGENT;
1617          break;
1618       default:
1619          break;
1620       }
1621    }
1622 
1623    return ac_build_image_opcode(&ctx->ac, args);
1624 }
1625 
1626 static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1627 {
1628    LLVMValueRef ptr, addr;
1629    LLVMValueRef src0 = get_src(ctx, instr->src[0]);
1630    unsigned index = nir_intrinsic_base(instr);
1631 
1632    addr = LLVMConstInt(ctx->ac.i32, index, 0);
1633    addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
1634 
1635    /* Load constant values from user SGPRs when possible, otherwise
1636     * fall back to the default path that loads directly from memory.
1637     */
1638    if (LLVMIsConstant(src0) && instr->dest.ssa.bit_size == 32) {
1639       unsigned count = instr->dest.ssa.num_components;
1640       unsigned offset = index;
1641 
1642       offset += LLVMConstIntGetZExtValue(src0);
1643       offset /= 4;
1644 
1645       offset -= ctx->args->base_inline_push_consts;
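           /* "offset" is now counted in dwords, relative to the first inline
            * push constant argument.
            */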
1646 
1647       unsigned num_inline_push_consts = 0;
1648       for (unsigned i = 0; i < ARRAY_SIZE(ctx->args->inline_push_consts); i++) {
1649          if (ctx->args->inline_push_consts[i].used)
1650             num_inline_push_consts++;
1651       }
1652 
1653       if (offset + count <= num_inline_push_consts) {
1654          LLVMValueRef *const push_constants = alloca(num_inline_push_consts * sizeof(LLVMValueRef));
1655          for (unsigned i = 0; i < num_inline_push_consts; i++)
1656             push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[i]);
1657          return ac_build_gather_values(&ctx->ac, push_constants + offset, count);
1658       }
1659    }
1660 
1661    ptr =
1662       LLVMBuildGEP(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->push_constants), &addr, 1, "");
1663 
1664    if (instr->dest.ssa.bit_size == 8) {
1665       unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1;
1666       LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
1667       ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
1668       LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
1669 
1670       LLVMValueRef params[3];
1671       if (load_dwords > 1) {
1672          LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
1673          params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec,
1674                                              LLVMConstInt(ctx->ac.i32, 1, false), "");
1675          params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec,
1676                                              LLVMConstInt(ctx->ac.i32, 0, false), "");
1677       } else {
1678          res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, "");
1679          params[0] = ctx->ac.i32_0;
1680          params[1] = res;
1681       }
1682       params[2] = addr;
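           /* alignbyte concatenates params[0]:params[1] into a 64-bit value
            * and shifts it right by 8 * (addr & 3) bits, yielding the dword
            * that starts at the (possibly unaligned) byte address.
            */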
1683       res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0);
1684 
1685       res = LLVMBuildTrunc(
1686          ctx->ac.builder, res,
1687          LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
1688       if (instr->dest.ssa.num_components > 1)
1689          res = LLVMBuildBitCast(ctx->ac.builder, res,
1690                                 LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
1691       return res;
1692    } else if (instr->dest.ssa.bit_size == 16) {
1693       unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
1694       LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
1695       ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
1696       LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
1697       res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
1698       LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, "");
1699       cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
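           /* Bit 1 of the byte address says whether the first 16-bit element
            * starts in the high half of a dword; pick the shifted shuffle in
            * that case.
            */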
1700       LLVMValueRef mask[] = {
1701          LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
1702          LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
1703          LLVMConstInt(ctx->ac.i32, 4, false)};
1704       LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components);
1705       LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components);
1706       LLVMValueRef shuffle_aligned =
1707          LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, "");
1708       LLVMValueRef shuffle_unaligned =
1709          LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, "");
1710       res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, "");
1711       return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), "");
1712    }
1713 
1714    ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa));
1715 
1716    return LLVMBuildLoad(ctx->ac.builder, ptr, "");
1717 }
1718 
1719 static LLVMValueRef visit_get_ssbo_size(struct ac_nir_context *ctx,
1720                                         const nir_intrinsic_instr *instr)
1721 {
1722    bool non_uniform = nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM;
1723    LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, get_src(ctx, instr->src[0]), false, non_uniform);
1724    return get_buffer_size(ctx, rsrc, false);
1725 }
1726 
1727 static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src,
1728                                          unsigned start, unsigned count)
1729 {
1730    LLVMValueRef mask[] = {ctx->i32_0, ctx->i32_1, LLVMConstInt(ctx->i32, 2, false),
1731                           LLVMConstInt(ctx->i32, 3, false)};
1732 
1733    unsigned src_elements = ac_get_llvm_num_components(src);
1734 
1735    if (count == src_elements) {
1736       assert(start == 0);
1737       return src;
1738    } else if (count == 1) {
1739       assert(start < src_elements);
1740       return LLVMBuildExtractElement(ctx->builder, src, mask[start], "");
1741    } else {
1742       assert(start + count <= src_elements);
1743       assert(count <= 4);
1744       LLVMValueRef swizzle = LLVMConstVector(&mask[start], count);
1745       return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, "");
1746    }
1747 }
1748 
1749 static unsigned get_cache_policy(struct ac_nir_context *ctx, enum gl_access_qualifier access,
1750                                  bool may_store_unaligned, bool writeonly_memory)
1751 {
1752    unsigned cache_policy = 0;
1753 
1754    /* GFX6 has a TC L1 bug causing corruption of 8-bit/16-bit stores.  All
1755     * store opcodes not aligned to a dword are affected. The only way to
1756     * get unaligned stores is through shader images.
1757     */
1758    if (((may_store_unaligned && ctx->ac.chip_class == GFX6) ||
1759         /* If this is write-only, don't keep data in L1 to prevent
1760          * evicting L1 cache lines that may be needed by other
1761          * instructions.
1762          */
1763         writeonly_memory || access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
1764       cache_policy |= ac_glc;
1765    }
1766 
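        /* Streaming accesses additionally set SLC, which marks them as
         * streaming/last-use so the data is not kept resident in the caches.
         */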
1767    if (access & ACCESS_STREAM_CACHE_POLICY)
1768       cache_policy |= ac_slc | ac_glc;
1769 
1770    return cache_policy;
1771 }
1772 
1773 static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, struct waterfall_context *wctx,
1774                                          const nir_intrinsic_instr *instr, nir_src src)
1775 {
1776    return enter_waterfall(ctx, wctx, get_src(ctx, src),
1777                           nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
1778 }
1779 
1780 static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1781 {
1782    if (ctx->ac.postponed_kill) {
1783       LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
1784       ac_build_ifcc(&ctx->ac, cond, 7000);
1785    }
1786 
1787    LLVMValueRef src_data = get_src(ctx, instr->src[0]);
1788    int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8;
1789    unsigned writemask = nir_intrinsic_write_mask(instr);
1790    enum gl_access_qualifier access = nir_intrinsic_access(instr);
1791    bool writeonly_memory = access & ACCESS_NON_READABLE;
1792    unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
1793 
1794    struct waterfall_context wctx;
1795    LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
1796 
1797    LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true, false);
1798    LLVMValueRef base_data = src_data;
1799    base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
1800    LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
1801 
1802    while (writemask) {
1803       int start, count;
1804       LLVMValueRef data, offset;
1805       LLVMTypeRef data_type;
1806 
1807       u_bit_scan_consecutive_range(&writemask, &start, &count);
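           /* Process one contiguous run of enabled components per iteration;
            * the fix-ups below push bits back into the writemask when the run
            * has to be split into smaller stores.
            */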
1808 
1809       if (count == 3 && (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) {
1810          writemask |= 1 << (start + 2);
1811          count = 2;
1812       }
1813       int num_bytes = count * elem_size_bytes; /* size of this store in bytes */
1814 
1815       /* We can only store 4 dwords at a time; exceeding that
1816        * can only happen for 64-bit vectors. */
1817       if (num_bytes > 16) {
1818          writemask |= ((1u << (count - 2)) - 1u) << (start + 2);
1819          count = 2;
1820          num_bytes = 16;
1821       }
1822 
1823       /* Check the alignment of 16-bit stores. */
1824       if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) {
1825          writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
1826          count = 1;
1827          num_bytes = 2;
1828       }
1829 
1830       /* Due to alignment issues, split stores of 8-bit/16-bit
1831        * vectors.
1832        */
1833       if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) {
1834          writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
1835          count = 1;
1836          num_bytes = elem_size_bytes;
1837       }
1838 
1839       data = extract_vector_range(&ctx->ac, base_data, start, count);
1840 
1841       offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
1842                             LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), "");
1843 
1844       if (num_bytes == 1) {
1845          ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
1846       } else if (num_bytes == 2) {
1847          ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
1848       } else {
1849          int num_channels = num_bytes / 4;
1850 
1851          switch (num_bytes) {
1852          case 16: /* v4f32 */
1853             data_type = ctx->ac.v4f32;
1854             break;
1855          case 12: /* v3f32 */
1856             data_type = ctx->ac.v3f32;
1857             break;
1858          case 8: /* v2f32 */
1859             data_type = ctx->ac.v2f32;
1860             break;
1861          case 4: /* f32 */
1862             data_type = ctx->ac.f32;
1863             break;
1864          default:
1865             unreachable("Malformed vector store.");
1866          }
1867          data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, "");
1868 
1869          ac_build_buffer_store_dword(&ctx->ac, rsrc, data, num_channels, offset, ctx->ac.i32_0, 0,
1870                                      cache_policy);
1871       }
1872    }
1873 
1874    exit_waterfall(ctx, &wctx, NULL);
1875 
1876    if (ctx->ac.postponed_kill)
1877       ac_build_endif(&ctx->ac, 7000);
1878 }
1879 
1880 static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, LLVMValueRef descriptor,
1881                                            LLVMValueRef offset, LLVMValueRef compare,
1882                                            LLVMValueRef exchange, bool image)
1883 {
1884    LLVMBasicBlockRef start_block = NULL, then_block = NULL;
1885    if (ctx->abi->robust_buffer_access || image) {
1886       LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2);
1887 
1888       LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, "");
1889       start_block = LLVMGetInsertBlock(ctx->ac.builder);
1890 
1891       ac_build_ifcc(&ctx->ac, cond, -1);
1892 
1893       then_block = LLVMGetInsertBlock(ctx->ac.builder);
1894    }
1895 
1896    if (image)
1897       offset = LLVMBuildMul(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, 8, false), "");
1898 
1899    LLVMValueRef ptr_parts[2] = {
1900       ac_llvm_extract_elem(&ctx->ac, descriptor, 0),
1901       LLVMBuildAnd(ctx->ac.builder, ac_llvm_extract_elem(&ctx->ac, descriptor, 1),
1902                    LLVMConstInt(ctx->ac.i32, 65535, 0), "")};
1903 
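        /* Dwords 0-1 of the descriptor hold a 48-bit base address; sign-extend
         * the top 16 bits to form a canonical 64-bit pointer.
         */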
1904    ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, "");
1905    ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, "");
1906 
1907    offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, "");
1908 
1909    LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2);
1910    ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");
1911    ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, "");
1912    ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL),
1913                            "");
1914 
1915    LLVMValueRef result =
1916       ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as");
1917    result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, "");
1918 
1919    if (ctx->abi->robust_buffer_access || image) {
1920       ac_build_endif(&ctx->ac, -1);
1921 
1922       LLVMBasicBlockRef incoming_blocks[2] = {
1923          start_block,
1924          then_block,
1925       };
1926 
1927       LLVMValueRef incoming_values[2] = {
1928          LLVMConstInt(ctx->ac.i64, 0, 0),
1929          result,
1930       };
1931       LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, "");
1932       LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2);
1933       return ret;
1934    } else {
1935       return result;
1936    }
1937 }
1938 
1939 static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1940 {
1941    if (ctx->ac.postponed_kill) {
1942       LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
1943       ac_build_ifcc(&ctx->ac, cond, 7001);
1944    }
1945 
1946    LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2]));
1947    const char *op;
1948    char name[64], type[8];
1949    LLVMValueRef params[6], descriptor;
1950    LLVMValueRef result;
1951    int arg_count = 0;
1952 
1953    struct waterfall_context wctx;
1954    LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
1955 
1956    switch (instr->intrinsic) {
1957    case nir_intrinsic_ssbo_atomic_add:
1958       op = "add";
1959       break;
1960    case nir_intrinsic_ssbo_atomic_imin:
1961       op = "smin";
1962       break;
1963    case nir_intrinsic_ssbo_atomic_umin:
1964       op = "umin";
1965       break;
1966    case nir_intrinsic_ssbo_atomic_imax:
1967       op = "smax";
1968       break;
1969    case nir_intrinsic_ssbo_atomic_umax:
1970       op = "umax";
1971       break;
1972    case nir_intrinsic_ssbo_atomic_and:
1973       op = "and";
1974       break;
1975    case nir_intrinsic_ssbo_atomic_or:
1976       op = "or";
1977       break;
1978    case nir_intrinsic_ssbo_atomic_xor:
1979       op = "xor";
1980       break;
1981    case nir_intrinsic_ssbo_atomic_exchange:
1982       op = "swap";
1983       break;
1984    case nir_intrinsic_ssbo_atomic_comp_swap:
1985       op = "cmpswap";
1986       break;
1987    case nir_intrinsic_ssbo_atomic_fmin:
1988       op = "fmin";
1989       break;
1990    case nir_intrinsic_ssbo_atomic_fmax:
1991       op = "fmax";
1992       break;
1993    default:
1994       abort();
1995    }
1996 
1997    descriptor = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true, false);
1998 
1999    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && return_type == ctx->ac.i64) {
2000       result = emit_ssbo_comp_swap_64(ctx, descriptor, get_src(ctx, instr->src[1]),
2001                                       get_src(ctx, instr->src[2]), get_src(ctx, instr->src[3]), false);
2002    } else {
2003       LLVMValueRef data = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
2004 
2005       if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
2006          params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
2007       }
2008       if (instr->intrinsic == nir_intrinsic_ssbo_atomic_fmin ||
2009           instr->intrinsic == nir_intrinsic_ssbo_atomic_fmax) {
2010          data = ac_to_float(&ctx->ac, data);
2011          return_type = LLVMTypeOf(data);
2012       }
2013       params[arg_count++] = data;
2014       params[arg_count++] = descriptor;
2015       params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
2016       params[arg_count++] = ctx->ac.i32_0;               /* soffset */
2017       params[arg_count++] = ctx->ac.i32_0;               /* slc */
2018 
2019       ac_build_type_name_for_intr(return_type, type, sizeof(type));
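           /* Produces e.g. "llvm.amdgcn.raw.buffer.atomic.umin.i32" for a
            * 32-bit unsigned minimum.
            */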
2020       snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
2021 
2022       result = ac_build_intrinsic(&ctx->ac, name, return_type, params, arg_count, 0);
2023 
2024       if (instr->intrinsic == nir_intrinsic_ssbo_atomic_fmin ||
2025           instr->intrinsic == nir_intrinsic_ssbo_atomic_fmax) {
2026          result = ac_to_integer(&ctx->ac, result);
2027       }
2028    }
2029 
2030    result = exit_waterfall(ctx, &wctx, result);
2031    if (ctx->ac.postponed_kill)
2032       ac_build_endif(&ctx->ac, 7001);
2033    return result;
2034 }
2035 
2036 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2037 {
2038    struct waterfall_context wctx;
2039    LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
2040 
2041    int elem_size_bytes = instr->dest.ssa.bit_size / 8;
2042    int num_components = instr->num_components;
2043    enum gl_access_qualifier access = nir_intrinsic_access(instr);
2044    unsigned cache_policy = get_cache_policy(ctx, access, false, false);
2045 
2046    LLVMValueRef offset = get_src(ctx, instr->src[1]);
2047    LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false, false);
2048    LLVMValueRef vindex = ctx->ac.i32_0;
2049 
2050    LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa);
2051    LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type;
2052 
2053    LLVMValueRef results[4];
2054    for (int i = 0; i < num_components;) {
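           /* Each iteration loads at most 16 bytes (4 dwords); under-aligned
            * sub-dword types are loaded one element at a time.
            */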
2055       int num_elems = num_components - i;
2056       if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0)
2057          num_elems = 1;
2058       if (num_elems * elem_size_bytes > 16)
2059          num_elems = 16 / elem_size_bytes;
2060       int load_bytes = num_elems * elem_size_bytes;
2061 
2062       LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false);
2063 
2064       LLVMValueRef ret;
2065 
2066       if (load_bytes == 1) {
2067          ret = ac_build_tbuffer_load_byte(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset,
2068                                           cache_policy);
2069       } else if (load_bytes == 2) {
2070          ret = ac_build_tbuffer_load_short(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset,
2071                                            cache_policy);
2072       } else {
2073          int num_channels = util_next_power_of_two(load_bytes) / 4;
2074          bool can_speculate = access & ACCESS_CAN_REORDER;
2075 
2076          ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, vindex, offset, immoffset, 0,
2077                                     ctx->ac.f32, cache_policy, can_speculate, false);
2078       }
2079 
2080       LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret)));
2081       ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, "");
2082       ret = ac_trim_vector(&ctx->ac, ret, load_bytes);
2083 
2084       LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems);
2085       ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, "");
2086 
2087       for (unsigned j = 0; j < num_elems; j++) {
2088          results[i + j] =
2089             LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), "");
2090       }
2091       i += num_elems;
2092    }
2093 
2094    LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components);
2095    return exit_waterfall(ctx, &wctx, ret);
2096 }
2097 
2098 static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx, struct waterfall_context *wctx,
2099                                         const nir_intrinsic_instr *instr)
2100 {
2101    return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]),
2102                           nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
2103 }
2104 
2105 static LLVMValueRef visit_load_global(struct ac_nir_context *ctx,
2106                                       nir_intrinsic_instr *instr)
2107 {
2108    LLVMValueRef addr = get_src(ctx, instr->src[0]);
2109    LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa);
2110    LLVMValueRef val;
2111 
2112    LLVMTypeRef ptr_type = LLVMPointerType(result_type, AC_ADDR_SPACE_GLOBAL);
2113 
2114    addr = LLVMBuildIntToPtr(ctx->ac.builder, addr, ptr_type, "");
2115 
2116    val = LLVMBuildLoad(ctx->ac.builder, addr, "");
2117 
2118    if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) {
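           /* A monotonic (relaxed) atomic load keeps LLVM from caching,
            * merging or splitting the access across synchronization.
            */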
2119       LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
2120       LLVMSetAlignment(val, ac_get_type_size(result_type));
2121    }
2122 
2123    return val;
2124 }
2125 
2126 static void visit_store_global(struct ac_nir_context *ctx,
2127                                nir_intrinsic_instr *instr)
2128 {
2129    if (ctx->ac.postponed_kill) {
2130       LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2131       ac_build_ifcc(&ctx->ac, cond, 7002);
2132    }
2133 
2134    LLVMValueRef data = get_src(ctx, instr->src[0]);
2135    LLVMValueRef addr = get_src(ctx, instr->src[1]);
2136    LLVMTypeRef type = LLVMTypeOf(data);
2137    LLVMValueRef val;
2138 
2139    LLVMTypeRef ptr_type = LLVMPointerType(type, AC_ADDR_SPACE_GLOBAL);
2140 
2141    addr = LLVMBuildIntToPtr(ctx->ac.builder, addr, ptr_type, "");
2142 
2143    val = LLVMBuildStore(ctx->ac.builder, data, addr);
2144 
2145    if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) {
2146       LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
2147       LLVMSetAlignment(val, ac_get_type_size(type));
2148    }
2149 
2150    if (ctx->ac.postponed_kill)
2151       ac_build_endif(&ctx->ac, 7002);
2152 }
2153 
2154 static LLVMValueRef visit_global_atomic(struct ac_nir_context *ctx,
2155                                         nir_intrinsic_instr *instr)
2156 {
2157    if (ctx->ac.postponed_kill) {
2158       LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2159       ac_build_ifcc(&ctx->ac, cond, 7002);
2160    }
2161 
2162    LLVMValueRef addr = get_src(ctx, instr->src[0]);
2163    LLVMValueRef data = get_src(ctx, instr->src[1]);
2164    LLVMAtomicRMWBinOp op;
2165    LLVMValueRef result;
2166 
2167    /* use "singlethread" sync scope to implement relaxed ordering */
2168    const char *sync_scope = "singlethread-one-as";
2169 
2170    if (instr->intrinsic == nir_intrinsic_global_atomic_fmin ||
2171        instr->intrinsic == nir_intrinsic_global_atomic_fmax) {
2172       data = ac_to_float(&ctx->ac, data);
2173    }
2174 
2175    LLVMTypeRef data_type = LLVMTypeOf(data);
2176    LLVMTypeRef ptr_type = LLVMPointerType(data_type, AC_ADDR_SPACE_GLOBAL);
2177 
2178    addr = LLVMBuildIntToPtr(ctx->ac.builder, addr, ptr_type, "");
2179 
2180    if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) {
2181       LLVMValueRef data1 = get_src(ctx, instr->src[2]);
2182       result = ac_build_atomic_cmp_xchg(&ctx->ac, addr, data, data1, sync_scope);
2183       result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, "");
2184    } else if (instr->intrinsic == nir_intrinsic_global_atomic_fmin ||
2185               instr->intrinsic == nir_intrinsic_global_atomic_fmax) {
2186       const char *op = instr->intrinsic == nir_intrinsic_global_atomic_fmin ? "fmin" : "fmax";
2187       char name[64], type[8];
2188       LLVMValueRef params[2];
2189       int arg_count = 0;
2190 
2191       params[arg_count++] = addr;
2192       params[arg_count++] = data;
2193 
2194       ac_build_type_name_for_intr(data_type, type, sizeof(type));
2195       snprintf(name, sizeof(name), "llvm.amdgcn.global.atomic.%s.%s.p1%s.%s", op, type, type, type);
2196 
2197       result = ac_build_intrinsic(&ctx->ac, name, data_type, params, arg_count, 0);
2198       result = ac_to_integer(&ctx->ac, result);
2199    } else {
2200       switch (instr->intrinsic) {
2201       case nir_intrinsic_global_atomic_add:
2202          op = LLVMAtomicRMWBinOpAdd;
2203          break;
2204       case nir_intrinsic_global_atomic_umin:
2205          op = LLVMAtomicRMWBinOpUMin;
2206          break;
2207       case nir_intrinsic_global_atomic_umax:
2208          op = LLVMAtomicRMWBinOpUMax;
2209          break;
2210       case nir_intrinsic_global_atomic_imin:
2211          op = LLVMAtomicRMWBinOpMin;
2212          break;
2213       case nir_intrinsic_global_atomic_imax:
2214          op = LLVMAtomicRMWBinOpMax;
2215          break;
2216       case nir_intrinsic_global_atomic_and:
2217          op = LLVMAtomicRMWBinOpAnd;
2218          break;
2219       case nir_intrinsic_global_atomic_or:
2220          op = LLVMAtomicRMWBinOpOr;
2221          break;
2222       case nir_intrinsic_global_atomic_xor:
2223          op = LLVMAtomicRMWBinOpXor;
2224          break;
2225       case nir_intrinsic_global_atomic_exchange:
2226          op = LLVMAtomicRMWBinOpXchg;
2227          break;
2228       default:
2229          unreachable("Invalid global atomic operation");
2230       }
2231 
2232       result = ac_build_atomic_rmw(&ctx->ac, op, addr, ac_to_integer(&ctx->ac, data), sync_scope);
2233    }
2234 
2235    if (ctx->ac.postponed_kill)
2236       ac_build_endif(&ctx->ac, 7002);
2237 
2238    return result;
2239 }
2240 
2241 static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2242 {
2243    struct waterfall_context wctx;
2244    LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr);
2245 
2246    LLVMValueRef ret;
2247    LLVMValueRef rsrc = rsrc_base;
2248    LLVMValueRef offset = get_src(ctx, instr->src[1]);
2249    int num_components = instr->num_components;
2250 
2251    if (ctx->abi->load_ubo) {
2252       nir_binding binding = nir_chase_binding(instr->src[0]);
2253       rsrc = ctx->abi->load_ubo(ctx->abi, binding.desc_set, binding.binding, binding.success, rsrc);
2254    }
2255 
2256    /* Convert to a scalar 32-bit load. */
2257    if (instr->dest.ssa.bit_size == 64)
2258       num_components *= 2;
2259    else if (instr->dest.ssa.bit_size == 16)
2260       num_components = DIV_ROUND_UP(num_components, 2);
2261    else if (instr->dest.ssa.bit_size == 8)
2262       num_components = DIV_ROUND_UP(num_components, 4);
2263 
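        /* E.g. a 3-component 16-bit load becomes a 2-dword load, which is
         * bitcast to v4i16 and trimmed back to 3 components below.
         */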
2264    ret =
2265       ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, NULL, 0,
2266                            ctx->ac.f32, 0, true, true);
2267 
2268    /* Convert to the original type. */
2269    if (instr->dest.ssa.bit_size == 64) {
2270       ret = LLVMBuildBitCast(ctx->ac.builder, ret,
2271                              LLVMVectorType(ctx->ac.i64, num_components / 2), "");
2272    } else if (instr->dest.ssa.bit_size == 16) {
2273       ret = LLVMBuildBitCast(ctx->ac.builder, ret,
2274                              LLVMVectorType(ctx->ac.i16, num_components * 2), "");
2275    } else if (instr->dest.ssa.bit_size == 8) {
2276       ret = LLVMBuildBitCast(ctx->ac.builder, ret,
2277                              LLVMVectorType(ctx->ac.i8, num_components * 4), "");
2278    }
2279 
2280    ret = ac_trim_vector(&ctx->ac, ret, instr->num_components);
2281    ret = LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
2282 
2283    return exit_waterfall(ctx, &wctx, ret);
2284 }
2285 
2286 static unsigned type_scalar_size_bytes(const struct glsl_type *type)
2287 {
2288    assert(glsl_type_is_vector_or_scalar(type) || glsl_type_is_matrix(type));
2289    return glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
2290 }
2291 
2292 static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2293 {
2294    if (ctx->ac.postponed_kill) {
2295       LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2296       ac_build_ifcc(&ctx->ac, cond, 7002);
2297    }
2298 
2299    unsigned base = nir_intrinsic_base(instr);
2300    unsigned writemask = nir_intrinsic_write_mask(instr);
2301    unsigned component = nir_intrinsic_component(instr);
2302    LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
2303    nir_src offset = *nir_get_io_offset_src(instr);
2304    LLVMValueRef indir_index = NULL;
2305 
2306    if (nir_src_is_const(offset))
2307       assert(nir_src_as_uint(offset) == 0);
2308    else
2309       indir_index = get_src(ctx, offset);
2310 
2311    switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) {
2312    case 16:
2313    case 32:
2314       break;
2315    case 64:
2316       unreachable("64-bit IO should have been lowered to 32 bits");
2317       return;
2318    default:
2319       unreachable("unhandled store_output bit size");
2320       return;
2321    }
2322 
2323    writemask <<= component;
2324 
2325    if (ctx->stage == MESA_SHADER_TESS_CTRL) {
2326       nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
2327       LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL;
2328       unsigned location = nir_intrinsic_io_semantics(instr).location;
2329 
2330       ctx->abi->store_tcs_outputs(ctx->abi, vertex_index, indir_index, src,
2331                                   writemask, component, location, base);
2332       return;
2333    }
2334 
2335    /* No indirect indexing is allowed after this point. */
2336    assert(!indir_index);
2337 
2338    for (unsigned chan = 0; chan < 8; chan++) {
2339       if (!(writemask & (1 << chan)))
2340          continue;
2341 
2342       LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
2343       LLVMValueRef output_addr = ctx->abi->outputs[base * 4 + chan];
2344 
2345       if (LLVMGetElementType(LLVMTypeOf(output_addr)) == ctx->ac.f32 &&
2346           LLVMTypeOf(value) == ctx->ac.f16) {
2347          LLVMValueRef output, index;
2348 
2349          /* Insert the 16-bit value into the low or high bits of the 32-bit output
2350           * using read-modify-write.
2351           */
2352          index = LLVMConstInt(ctx->ac.i32, nir_intrinsic_io_semantics(instr).high_16bits, 0);
2353          output = LLVMBuildLoad(ctx->ac.builder, output_addr, "");
2354          output = LLVMBuildBitCast(ctx->ac.builder, output, ctx->ac.v2f16, "");
2355          output = LLVMBuildInsertElement(ctx->ac.builder, output, value, index, "");
2356          value = LLVMBuildBitCast(ctx->ac.builder, output, ctx->ac.f32, "");
2357       }
2358       LLVMBuildStore(ctx->ac.builder, value, output_addr);
2359    }
2360 
2361    if (ctx->ac.postponed_kill)
2362       ac_build_endif(&ctx->ac, 7002);
2363 }
2364 
2365 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
2366 {
2367    switch (dim) {
2368    case GLSL_SAMPLER_DIM_BUF:
2369       return 1;
2370    case GLSL_SAMPLER_DIM_1D:
2371       return array ? 2 : 1;
2372    case GLSL_SAMPLER_DIM_2D:
2373       return array ? 3 : 2;
2374    case GLSL_SAMPLER_DIM_MS:
2375       return array ? 4 : 3;
2376    case GLSL_SAMPLER_DIM_3D:
2377    case GLSL_SAMPLER_DIM_CUBE:
2378       return 3;
2379    case GLSL_SAMPLER_DIM_RECT:
2380    case GLSL_SAMPLER_DIM_SUBPASS:
2381       return 2;
2382    case GLSL_SAMPLER_DIM_SUBPASS_MS:
2383       return 3;
2384    default:
2385       break;
2386    }
2387    return 0;
2388 }
2389 
2390 static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx,
2391                                                     LLVMValueRef coord_x, LLVMValueRef coord_y,
2392                                                     LLVMValueRef coord_z, LLVMValueRef sample_index,
2393                                                     LLVMValueRef fmask_desc_ptr)
2394 {
2395    if (!fmask_desc_ptr)
2396       return sample_index;
2397 
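        /* FMASK stores, per pixel, a mapping from sample index to the fragment
         * slot that covers it; rewrite the sample channel accordingly.
         */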
2398    unsigned sample_chan = coord_z ? 3 : 2;
2399    LLVMValueRef addr[4] = {coord_x, coord_y, coord_z};
2400    addr[sample_chan] = sample_index;
2401 
2402    ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL);
2403    return addr[sample_chan];
2404 }
2405 
2406 static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr)
2407 {
2408    assert(instr->src[0].is_ssa);
2409    return nir_instr_as_deref(instr->src[0].ssa->parent_instr);
2410 }
2411 
2412 static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
2413                                          const nir_intrinsic_instr *instr,
2414                                          LLVMValueRef dynamic_index,
2415                                          enum ac_descriptor_type desc_type, bool write)
2416 {
2417    nir_deref_instr *deref_instr = instr->src[0].ssa->parent_instr->type == nir_instr_type_deref
2418                                      ? nir_instr_as_deref(instr->src[0].ssa->parent_instr)
2419                                      : NULL;
2420 
2421    return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write);
2422 }
2423 
2424 static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
2425                              LLVMValueRef dynamic_desc_index, struct ac_image_args *args,
2426                              enum glsl_sampler_dim dim, bool is_array)
2427 {
2428    LLVMValueRef src0 = get_src(ctx, instr->src[1]);
2429    LLVMValueRef masks[] = {
2430       LLVMConstInt(ctx->ac.i32, 0, false),
2431       LLVMConstInt(ctx->ac.i32, 1, false),
2432       LLVMConstInt(ctx->ac.i32, 2, false),
2433       LLVMConstInt(ctx->ac.i32, 3, false),
2434    };
2435    LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
2436 
2437    int count;
2438    ASSERTED bool add_frag_pos =
2439       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
2440    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
2441    bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
2442    assert(!add_frag_pos && "Input attachments should be lowered by this point.");
2443    count = image_type_to_components_count(dim, is_array);
2444 
2445    if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load ||
2446                  instr->intrinsic == nir_intrinsic_bindless_image_load ||
2447                  instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
2448                  instr->intrinsic == nir_intrinsic_bindless_image_sparse_load)) {
2449       LLVMValueRef fmask_load_address[3];
2450 
2451       fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
2452       fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], "");
2453       if (is_array)
2454          fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], "");
2455       else
2456          fmask_load_address[2] = NULL;
2457 
2458       sample_index = adjust_sample_index_using_fmask(
2459          &ctx->ac, fmask_load_address[0], fmask_load_address[1], fmask_load_address[2],
2460          sample_index,
2461          get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), AC_DESC_FMASK,
2462                           &instr->instr, dynamic_desc_index, true, false));
2463    }
2464    if (count == 1 && !gfx9_1d) {
2465       if (instr->src[1].ssa->num_components)
2466          args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
2467       else
2468          args->coords[0] = src0;
2469    } else {
2470       int chan;
2471       if (is_ms)
2472          count--;
2473       for (chan = 0; chan < count; ++chan) {
2474          args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan);
2475       }
2476 
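           /* GFX9 allocates 1D textures as 2D, so supply a zero Y coordinate
            * (and move the array slice up one slot).
            */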
2477       if (gfx9_1d) {
2478          if (is_array) {
2479             args->coords[2] = args->coords[1];
2480             args->coords[1] = ctx->ac.i32_0;
2481          } else
2482             args->coords[1] = ctx->ac.i32_0;
2483          count++;
2484       }
2485       if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
2486          /* The hw can't bind a slice of a 3D image as a 2D
2487           * image, because it ignores BASE_ARRAY if the target
2488           * is 3D. The workaround is to read BASE_ARRAY and set
2489           * it as the 3rd address operand for all 2D images.
2490           */
2491          LLVMValueRef first_layer, const5, mask;
2492 
2493          const5 = LLVMConstInt(ctx->ac.i32, 5, 0);
2494          mask = LLVMConstInt(ctx->ac.i32, S_008F24_BASE_ARRAY(~0), 0);
2495          first_layer = LLVMBuildExtractElement(ctx->ac.builder, args->resource, const5, "");
2496          first_layer = LLVMBuildAnd(ctx->ac.builder, first_layer, mask, "");
2497 
2498          args->coords[count] = first_layer;
2499          count++;
2500       }
2501 
2502       if (is_ms) {
2503          args->coords[count] = sample_index;
2504          count++;
2505       }
2506    }
2507 }
2508 
2509 static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx,
2510                                           struct waterfall_context *wctx,
2511                                           const nir_intrinsic_instr *instr)
2512 {
2513    nir_deref_instr *deref_instr = NULL;
2514 
2515    if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref)
2516       deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
2517 
2518    LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true);
2519    return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
2520 }
2521 
static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
                                     bool bindless)
{
   LLVMValueRef res;

   enum glsl_sampler_dim dim;
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   bool is_array;
   if (bindless) {
      dim = nir_intrinsic_image_dim(instr);
      is_array = nir_intrinsic_image_array(instr);
   } else {
      const nir_deref_instr *image_deref = get_image_deref(instr);
      const struct glsl_type *type = image_deref->type;
      const nir_variable *var = nir_deref_instr_get_variable(image_deref);
      dim = glsl_get_sampler_dim(type);
      access |= var->data.access;
      is_array = glsl_sampler_type_is_array(type);
   }

   struct waterfall_context wctx;
   LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);

   struct ac_image_args args = {0};

   args.cache_policy = get_cache_policy(ctx, access, false, false);
   args.tfe = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;

   if (dim == GLSL_SAMPLER_DIM_BUF) {
      unsigned num_channels = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
      if (instr->dest.ssa.bit_size == 64)
         num_channels = num_channels < 4 ? 2 : 4;
      LLVMValueRef rsrc, vindex;

      rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false);
      vindex =
         LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, "");

      assert(instr->dest.is_ssa);
      bool can_speculate = access & ACCESS_CAN_REORDER;
      res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels,
                                        args.cache_policy, can_speculate,
                                        instr->dest.ssa.bit_size == 16,
                                        args.tfe);
      res = ac_build_expand(&ctx->ac, res, num_channels, args.tfe ? 5 : 4);

      res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
      res = ac_to_integer(&ctx->ac, res);
   } else {
      bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;

      args.opcode = level_zero ? ac_image_load : ac_image_load_mip;
      args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
      get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
      args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
      if (!level_zero)
         args.lod = get_src(ctx, instr->src[3]);
      args.dmask = 15;
      args.attributes = AC_FUNC_ATTR_READONLY;

      assert(instr->dest.is_ssa);
      args.d16 = instr->dest.ssa.bit_size == 16;

      res = ac_build_image_opcode(&ctx->ac, &args);
   }

   if (instr->dest.ssa.bit_size == 64) {
      LLVMValueRef code = NULL;
      if (args.tfe) {
         code = ac_llvm_extract_elem(&ctx->ac, res, 4);
         res = ac_trim_vector(&ctx->ac, res, 4);
      }

      res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i64, 2), "");
      LLVMValueRef x = LLVMBuildExtractElement(ctx->ac.builder, res, ctx->ac.i32_0, "");
      LLVMValueRef w = LLVMBuildExtractElement(ctx->ac.builder, res, ctx->ac.i32_1, "");

      if (code)
         code = LLVMBuildZExt(ctx->ac.builder, code, ctx->ac.i64, "");
      LLVMValueRef values[5] = {x, ctx->ac.i64_0, ctx->ac.i64_0, w, code};
      res = ac_build_gather_values(&ctx->ac, values, 4 + args.tfe);
   }

   return exit_waterfall(ctx, &wctx, res);
}

static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
                              bool bindless)
{
   if (ctx->ac.postponed_kill) {
      LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
      ac_build_ifcc(&ctx->ac, cond, 7003);
   }

   enum glsl_sampler_dim dim;
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   bool is_array;

   if (bindless) {
      dim = nir_intrinsic_image_dim(instr);
      is_array = nir_intrinsic_image_array(instr);
   } else {
      const nir_deref_instr *image_deref = get_image_deref(instr);
      const struct glsl_type *type = image_deref->type;
      const nir_variable *var = nir_deref_instr_get_variable(image_deref);
      dim = glsl_get_sampler_dim(type);
      access |= var->data.access;
      is_array = glsl_sampler_type_is_array(type);
   }

   struct waterfall_context wctx;
   LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);

   bool writeonly_memory = access & ACCESS_NON_READABLE;
   struct ac_image_args args = {0};

   args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);

   LLVMValueRef src = get_src(ctx, instr->src[3]);
   if (instr->src[3].ssa->bit_size == 64) {
      /* only R64_UINT and R64_SINT supported */
      src = ac_llvm_extract_elem(&ctx->ac, src, 0);
      src = LLVMBuildBitCast(ctx->ac.builder, src, ctx->ac.v2f32, "");
   } else {
      src = ac_to_float(&ctx->ac, src);
   }

   if (dim == GLSL_SAMPLER_DIM_BUF) {
      LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, true);
      unsigned src_channels = ac_get_llvm_num_components(src);
      LLVMValueRef vindex;

      if (src_channels == 3)
         src = ac_build_expand_to_vec4(&ctx->ac, src, 3);

      vindex =
         LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, "");

      ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.cache_policy);
   } else {
      bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;

      args.opcode = level_zero ? ac_image_store : ac_image_store_mip;
      args.data[0] = src;
      args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
      get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
      args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
      if (!level_zero)
         args.lod = get_src(ctx, instr->src[4]);
      args.dmask = 15;
      args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16;

      ac_build_image_opcode(&ctx->ac, &args);
   }

   exit_waterfall(ctx, &wctx, NULL);
   if (ctx->ac.postponed_kill)
      ac_build_endif(&ctx->ac, 7003);
}

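/* Image atomics take two paths: buffer images are lowered to the
 * llvm.amdgcn.struct.buffer.atomic.* intrinsics (the name is assembled with
 * snprintf below), while all other dimensions go through
 * ac_build_image_opcode with an ac_atomic_op subopcode.
 */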
static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
                                       bool bindless)
{
   if (ctx->ac.postponed_kill) {
      LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
      ac_build_ifcc(&ctx->ac, cond, 7004);
   }

   LLVMValueRef params[7];
   int param_count = 0;

   bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap ||
                  instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap;
   const char *atomic_name;
   char intrinsic_name[64];
   enum ac_atomic_op atomic_subop;
   ASSERTED int length;

   enum glsl_sampler_dim dim;
   bool is_array;
   if (bindless) {
      dim = nir_intrinsic_image_dim(instr);
      is_array = nir_intrinsic_image_array(instr);
   } else {
      const struct glsl_type *type = get_image_deref(instr)->type;
      dim = glsl_get_sampler_dim(type);
      is_array = glsl_sampler_type_is_array(type);
   }

   struct waterfall_context wctx;
   LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_bindless_image_atomic_add:
   case nir_intrinsic_image_deref_atomic_add:
      atomic_name = "add";
      atomic_subop = ac_atomic_add;
      break;
   case nir_intrinsic_bindless_image_atomic_imin:
   case nir_intrinsic_image_deref_atomic_imin:
      atomic_name = "smin";
      atomic_subop = ac_atomic_smin;
      break;
   case nir_intrinsic_bindless_image_atomic_umin:
   case nir_intrinsic_image_deref_atomic_umin:
      atomic_name = "umin";
      atomic_subop = ac_atomic_umin;
      break;
   case nir_intrinsic_bindless_image_atomic_imax:
   case nir_intrinsic_image_deref_atomic_imax:
      atomic_name = "smax";
      atomic_subop = ac_atomic_smax;
      break;
   case nir_intrinsic_bindless_image_atomic_umax:
   case nir_intrinsic_image_deref_atomic_umax:
      atomic_name = "umax";
      atomic_subop = ac_atomic_umax;
      break;
   case nir_intrinsic_bindless_image_atomic_and:
   case nir_intrinsic_image_deref_atomic_and:
      atomic_name = "and";
      atomic_subop = ac_atomic_and;
      break;
   case nir_intrinsic_bindless_image_atomic_or:
   case nir_intrinsic_image_deref_atomic_or:
      atomic_name = "or";
      atomic_subop = ac_atomic_or;
      break;
   case nir_intrinsic_bindless_image_atomic_xor:
   case nir_intrinsic_image_deref_atomic_xor:
      atomic_name = "xor";
      atomic_subop = ac_atomic_xor;
      break;
   case nir_intrinsic_bindless_image_atomic_exchange:
   case nir_intrinsic_image_deref_atomic_exchange:
      atomic_name = "swap";
      atomic_subop = ac_atomic_swap;
      break;
   case nir_intrinsic_bindless_image_atomic_comp_swap:
   case nir_intrinsic_image_deref_atomic_comp_swap:
      atomic_name = "cmpswap";
      atomic_subop = 0; /* not used */
      break;
   case nir_intrinsic_bindless_image_atomic_inc_wrap:
   case nir_intrinsic_image_deref_atomic_inc_wrap:
      atomic_name = "inc";
      atomic_subop = ac_atomic_inc_wrap;
      break;
   case nir_intrinsic_bindless_image_atomic_dec_wrap:
   case nir_intrinsic_image_deref_atomic_dec_wrap:
      atomic_name = "dec";
      atomic_subop = ac_atomic_dec_wrap;
      break;
   case nir_intrinsic_image_deref_atomic_fmin:
      atomic_name = "fmin";
      atomic_subop = ac_atomic_fmin;
      break;
   case nir_intrinsic_image_deref_atomic_fmax:
      atomic_name = "fmax";
      atomic_subop = ac_atomic_fmax;
      break;
   default:
      abort();
   }

   if (cmpswap)
      params[param_count++] = get_src(ctx, instr->src[4]);
   params[param_count++] = get_src(ctx, instr->src[3]);

   if (atomic_subop == ac_atomic_fmin || atomic_subop == ac_atomic_fmax)
      params[0] = ac_to_float(&ctx->ac, params[0]);

   LLVMValueRef result;
   if (dim == GLSL_SAMPLER_DIM_BUF) {
      params[param_count++] = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, true);
      params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                      ctx->ac.i32_0, ""); /* vindex */
      params[param_count++] = ctx->ac.i32_0;                              /* voffset */
      if (cmpswap && instr->dest.ssa.bit_size == 64) {
         result = emit_ssbo_comp_swap_64(ctx, params[2], params[3], params[1], params[0], true);
      } else {
         LLVMTypeRef data_type = LLVMTypeOf(params[0]);
         char type[8];

         params[param_count++] = ctx->ac.i32_0; /* soffset */
         params[param_count++] = ctx->ac.i32_0; /* slc */

         ac_build_type_name_for_intr(data_type, type, sizeof(type));
         length = snprintf(intrinsic_name, sizeof(intrinsic_name),
                           "llvm.amdgcn.struct.buffer.atomic.%s.%s",
                           atomic_name, type);

         assert(length < sizeof(intrinsic_name));
         result = ac_build_intrinsic(&ctx->ac, intrinsic_name, LLVMTypeOf(params[0]), params, param_count, 0);
      }
   } else {
      struct ac_image_args args = {0};
      args.opcode = cmpswap ? ac_image_atomic_cmpswap : ac_image_atomic;
      args.atomic = atomic_subop;
      args.data[0] = params[0];
      if (cmpswap)
         args.data[1] = params[1];
      args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
      get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
      args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);

      result = ac_build_image_opcode(&ctx->ac, &args);
   }

   result = exit_waterfall(ctx, &wctx, result);
   if (ctx->ac.postponed_kill)
      ac_build_endif(&ctx->ac, 7004);
   return result;
}

static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
{
   struct waterfall_context wctx;
   LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
   LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);

   LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc);
   if (ctx->abi->robust_buffer_access) {
      LLVMValueRef dword1, is_null_descriptor;

      /* Extract the second dword of the descriptor; if it's
       * all zero, then it's a null descriptor.
       */
      dword1 =
         LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, false), "");
      is_null_descriptor = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
                                         LLVMConstInt(ctx->ac.i32, 0, false), "");
      ret = LLVMBuildSelect(ctx->ac.builder, is_null_descriptor, ctx->ac.i32_0, ret, "");
   }

   return exit_waterfall(ctx, &wctx, ret);
}

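/* Query the image size via resinfo. On GFX9 a 1D array is laid out as a 2D
 * image, so the layer count comes back in the third component and has to be
 * moved into the second one, where NIR expects it.
 */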
static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
                                     bool bindless)
{
   LLVMValueRef res;

   enum glsl_sampler_dim dim;
   bool is_array;
   if (bindless) {
      dim = nir_intrinsic_image_dim(instr);
      is_array = nir_intrinsic_image_array(instr);
   } else {
      const struct glsl_type *type = get_image_deref(instr)->type;
      dim = glsl_get_sampler_dim(type);
      is_array = glsl_sampler_type_is_array(type);
   }

   struct waterfall_context wctx;
   LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);

   if (dim == GLSL_SAMPLER_DIM_BUF) {
      res = get_buffer_size(
         ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true);
   } else {
      struct ac_image_args args = {0};

      args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
      args.dmask = 0xf;
      args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
      args.opcode = ac_image_get_resinfo;
      assert(nir_src_as_uint(instr->src[1]) == 0);
      args.lod = ctx->ac.i32_0;
      args.attributes = AC_FUNC_ATTR_READNONE;

      res = ac_build_image_opcode(&ctx->ac, &args);

      if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
         LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
         LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
         res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, ctx->ac.i32_1, "");
      }
   }
   return exit_waterfall(ctx, &wctx, res);
}

static void emit_membar(struct ac_llvm_context *ac, const nir_intrinsic_instr *instr)
{
   unsigned wait_flags = 0;

   switch (instr->intrinsic) {
   case nir_intrinsic_memory_barrier:
   case nir_intrinsic_group_memory_barrier:
      wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE;
      break;
   case nir_intrinsic_memory_barrier_buffer:
   case nir_intrinsic_memory_barrier_image:
      wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE;
      break;
   case nir_intrinsic_memory_barrier_shared:
      wait_flags = AC_WAIT_LGKM;
      break;
   default:
      break;
   }

   ac_build_waitcnt(ac, wait_flags);
}

void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage)
{
   /* GFX6 only (thanks to a hw bug workaround):
    * The real barrier instruction isn't needed, because an entire patch
    * always fits into a single wave.
    */
   if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) {
      ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
      return;
   }
   ac_build_s_barrier(ac);
}

static void emit_discard(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
{
   LLVMValueRef cond;

   if (instr->intrinsic == nir_intrinsic_discard_if ||
       instr->intrinsic == nir_intrinsic_terminate_if) {
      cond = LLVMBuildNot(ctx->ac.builder, get_src(ctx, instr->src[0]), "");
   } else {
      assert(instr->intrinsic == nir_intrinsic_discard ||
             instr->intrinsic == nir_intrinsic_terminate);
      cond = ctx->ac.i1false;
   }

   ac_build_kill_if_false(&ctx->ac, cond);
}

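/* Demote differs from discard/terminate: a demoted invocation stops writing
 * outputs but keeps running as a helper invocation, so derivatives and other
 * quad operations later in the shader remain well-defined.
 */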
static void emit_demote(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
{
   LLVMValueRef cond;

   if (instr->intrinsic == nir_intrinsic_demote_if) {
      cond = LLVMBuildNot(ctx->ac.builder, get_src(ctx, instr->src[0]), "");
   } else {
      assert(instr->intrinsic == nir_intrinsic_demote);
      cond = ctx->ac.i1false;
   }

   if (LLVM_VERSION_MAJOR >= 13) {
      /* This demotes the pixel if the condition is false. */
      ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.wqm.demote", ctx->ac.voidt, &cond, 1, 0);
      return;
   }

   LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
   mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, "");
   LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill);

   if (!ctx->info->fs.needs_all_helper_invocations) {
      /* This is an optional optimization that only kills whole inactive quads.
       * It's not used when subgroup operations can possibly use all helper
       * invocations.
       */
      if (ctx->ac.flow->depth == 0) {
         ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond));
      } else {
         /* amdgcn.wqm.vote doesn't work inside conditional blocks. Here's why.
          *
          * The problem is that kill(wqm.vote(0)) kills all active threads within
          * the block, which breaks the whole quad mode outside the block if
          * the conditional block has partially active quads (2x2 pixel blocks).
          * E.g. threads 0-3 are active outside the block, but only thread 0 is
          * active inside the block. Thread 0 shouldn't be killed by demote,
          * because threads 1-3 are still active outside the block.
          *
          * The fix for amdgcn.wqm.vote would be to return S_WQM((live & ~exec) | cond)
          * instead of S_WQM(cond).
          *
          * The less efficient workaround we do here is to save the kill condition
          * to a temporary (postponed_kill) and do kill(wqm.vote(cond)) after we
          * exit the conditional block.
          */
         ctx->ac.conditional_demote_seen = true;
      }
   }
}

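/* tg_size packs the wave count of the threadgroup in bits [0:5] and the
 * index of the current wave in bits [6:11] (see visit_load_subgroup_id and
 * visit_load_num_subgroups below). Masking with 0xfc0 thus yields
 * wave_id * 64, the flat index of this wave's first thread on wave64;
 * for wave32 it is halved before the thread id is added.
 */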
static LLVMValueRef visit_load_local_invocation_index(struct ac_nir_context *ctx)
{
   if (ctx->args->vs_rel_patch_id.used) {
      return ac_get_arg(&ctx->ac, ctx->args->vs_rel_patch_id);
   } else if (ctx->args->merged_wave_info.used) {
      /* Thread ID within the threadgroup for merged ESGS shaders. */
      LLVMValueRef wave_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4);
      LLVMValueRef wave_size = LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false);
      LLVMValueRef threads_before = LLVMBuildMul(ctx->ac.builder, wave_id, wave_size, "");
      return LLVMBuildAdd(ctx->ac.builder, threads_before, ac_get_thread_id(&ctx->ac), "");
   }

   LLVMValueRef result;
   LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);
   result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size),
                         LLVMConstInt(ctx->ac.i32, 0xfc0, false), "");

   if (ctx->ac.wave_size == 32)
      result = LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 1, false), "");

   return LLVMBuildAdd(ctx->ac.builder, result, thread_id, "");
}

static LLVMValueRef visit_load_subgroup_id(struct ac_nir_context *ctx)
{
   if (ctx->stage == MESA_SHADER_COMPUTE) {
      LLVMValueRef result;
      result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size),
                            LLVMConstInt(ctx->ac.i32, 0xfc0, false), "");
      return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), "");
   } else if (ctx->args->merged_wave_info.used) {
      return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4);
   } else {
      return LLVMConstInt(ctx->ac.i32, 0, false);
   }
}

static LLVMValueRef visit_load_num_subgroups(struct ac_nir_context *ctx)
{
   if (ctx->stage == MESA_SHADER_COMPUTE) {
      return LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size),
                          LLVMConstInt(ctx->ac.i32, 0x3f, false), "");
   } else {
      return LLVMConstInt(ctx->ac.i32, 1, false);
   }
}

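/* The first active invocation is the lowest set bit of the active-lane
 * ballot, i.e. its count of trailing zeros.
 */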
static LLVMValueRef visit_first_invocation(struct ac_nir_context *ctx)
{
   LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1);
   const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64";

   /* The second argument is whether cttz(0) should be defined, but we do not care. */
   LLVMValueRef args[] = {active_set, ctx->ac.i1false};
   LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, ctx->ac.iN_wavemask, args, 2,
                                            AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE);

   return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, "");
}

static LLVMValueRef visit_load_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
{
   LLVMValueRef values[4], derived_ptr, index, ret;
   unsigned const_off = nir_intrinsic_base(instr);

   LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->dest.ssa.bit_size, const_off);

   for (int chan = 0; chan < instr->num_components; chan++) {
      index = LLVMConstInt(ctx->ac.i32, chan, 0);
      derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
      values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
   }

   ret = ac_build_gather_values(&ctx->ac, values, instr->num_components);

   return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
}

static void visit_store_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
{
   LLVMValueRef derived_ptr, data, index;
   LLVMBuilderRef builder = ctx->ac.builder;

   unsigned const_off = nir_intrinsic_base(instr);
   LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1], instr->src[0].ssa->bit_size, const_off);
   LLVMValueRef src = get_src(ctx, instr->src[0]);

   int writemask = nir_intrinsic_write_mask(instr);
   for (int chan = 0; chan < 4; chan++) {
      if (!(writemask & (1 << chan))) {
         continue;
      }
      data = ac_llvm_extract_elem(&ctx->ac, src, chan);
      index = LLVMConstInt(ctx->ac.i32, chan, 0);
      derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
      LLVMBuildStore(builder, data, derived_ptr);
   }
}

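/* LDS atomics use the "workgroup-one-as" sync scope: workgroup scope in
 * LLVM's AMDGPU memory model, with the "one-as" suffix restricting the
 * ordering to the single address space being accessed.
 */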
static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
                                     LLVMValueRef ptr, int src_idx)
{
   if (ctx->ac.postponed_kill) {
      LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
      ac_build_ifcc(&ctx->ac, cond, 7005);
   }

   LLVMValueRef result;
   LLVMValueRef src = get_src(ctx, instr->src[src_idx]);

   const char *sync_scope = "workgroup-one-as";

   if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap) {
      LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]);
      result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope);
      result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, "");
   } else if (instr->intrinsic == nir_intrinsic_shared_atomic_fmin ||
              instr->intrinsic == nir_intrinsic_shared_atomic_fmax) {
      const char *op = instr->intrinsic == nir_intrinsic_shared_atomic_fmin ? "fmin" : "fmax";
      char name[64], type[8];
      LLVMValueRef params[5];
      LLVMTypeRef src_type;
      int arg_count = 0;

      src = ac_to_float(&ctx->ac, src);
      src_type = LLVMTypeOf(src);

      LLVMTypeRef ptr_type =
         LLVMPointerType(src_type, LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)));
      ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, "");

      /* llvm.amdgcn.ds.{fmin,fmax}(ptr, value, ordering, scope, isVolatile) */
      params[arg_count++] = ptr;
      params[arg_count++] = src;
      params[arg_count++] = ctx->ac.i32_0;   /* ordering */
      params[arg_count++] = ctx->ac.i32_0;   /* scope */
      params[arg_count++] = ctx->ac.i1false; /* isVolatile */

      ac_build_type_name_for_intr(src_type, type, sizeof(type));
      snprintf(name, sizeof(name), "llvm.amdgcn.ds.%s.%s", op, type);

      result = ac_build_intrinsic(&ctx->ac, name, src_type, params, arg_count, 0);
      result = ac_to_integer(&ctx->ac, result);
   } else {
      LLVMAtomicRMWBinOp op;
      switch (instr->intrinsic) {
      case nir_intrinsic_shared_atomic_add:
         op = LLVMAtomicRMWBinOpAdd;
         break;
      case nir_intrinsic_shared_atomic_umin:
         op = LLVMAtomicRMWBinOpUMin;
         break;
      case nir_intrinsic_shared_atomic_umax:
         op = LLVMAtomicRMWBinOpUMax;
         break;
      case nir_intrinsic_shared_atomic_imin:
         op = LLVMAtomicRMWBinOpMin;
         break;
      case nir_intrinsic_shared_atomic_imax:
         op = LLVMAtomicRMWBinOpMax;
         break;
      case nir_intrinsic_shared_atomic_and:
         op = LLVMAtomicRMWBinOpAnd;
         break;
      case nir_intrinsic_shared_atomic_or:
         op = LLVMAtomicRMWBinOpOr;
         break;
      case nir_intrinsic_shared_atomic_xor:
         op = LLVMAtomicRMWBinOpXor;
         break;
      case nir_intrinsic_shared_atomic_exchange:
         op = LLVMAtomicRMWBinOpXchg;
         break;
      case nir_intrinsic_shared_atomic_fadd:
         op = LLVMAtomicRMWBinOpFAdd;
         break;
      default:
         return NULL;
      }

      LLVMValueRef val;

      if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd) {
         val = ac_to_float(&ctx->ac, src);

         LLVMTypeRef ptr_type =
            LLVMPointerType(LLVMTypeOf(val), LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)));
         ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, "");
      } else {
         val = ac_to_integer(&ctx->ac, src);
      }

      result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope);

      if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd ||
          instr->intrinsic == nir_intrinsic_deref_atomic_fadd) {
         result = ac_to_integer(&ctx->ac, result);
      }
   }

   if (ctx->ac.postponed_kill)
      ac_build_endif(&ctx->ac, 7005);
   return result;
}

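/* With per-sample shading, frag_pos holds the absolute position of the
 * current sample, so its fractional part is the sample position within the
 * pixel.
 */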
static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx)
{
   LLVMValueRef values[2];
   LLVMValueRef pos[2];

   pos[0] = ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]));
   pos[1] = ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]));

   values[0] = ac_build_fract(&ctx->ac, pos[0], 32);
   values[1] = ac_build_fract(&ctx->ac, pos[1], 32);
   return ac_build_gather_values(&ctx->ac, values, 2);
}

static LLVMValueRef lookup_interp_param(struct ac_nir_context *ctx, enum glsl_interp_mode interp,
                                        unsigned location)
{
   switch (interp) {
   case INTERP_MODE_FLAT:
   default:
      return NULL;
   case INTERP_MODE_SMOOTH:
   case INTERP_MODE_NONE:
      if (location == INTERP_CENTER)
         return ac_get_arg(&ctx->ac, ctx->args->persp_center);
      else if (location == INTERP_CENTROID)
         return ctx->abi->persp_centroid;
      else if (location == INTERP_SAMPLE)
         return ac_get_arg(&ctx->ac, ctx->args->persp_sample);
      break;
   case INTERP_MODE_NOPERSPECTIVE:
      if (location == INTERP_CENTER)
         return ac_get_arg(&ctx->ac, ctx->args->linear_center);
      else if (location == INTERP_CENTROID)
         return ctx->abi->linear_centroid;
      else if (location == INTERP_SAMPLE)
         return ac_get_arg(&ctx->ac, ctx->args->linear_sample);
      break;
   }
   return NULL;
}

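/* The barycentric_* helpers below return the hardware (i, j) interpolation
 * coefficients bitcast to v2i32, so NIR can move them around as ordinary
 * integer data; load_interpolated_input casts them back to floats.
 */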
static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, unsigned mode)
{
   LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER);
   return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
}

static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, unsigned mode,
                                       LLVMValueRef offset)
{
   LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER);
   LLVMValueRef src_c0 =
      ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, ""));
   LLVMValueRef src_c1 =
      ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, ""));

   LLVMValueRef ij_out[2];
   LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param);

   /*
    * Take the I then J parameters, and the DDX/Y for each, and
    * calculate the IJ inputs for the interpolator:
    * temp1 = ddx * offset/sample.x + I;
    * interp_param.I = ddy * offset/sample.y + temp1;
    * temp1 = ddx * offset/sample.x + J;
    * interp_param.J = ddy * offset/sample.y + temp1;
    */
   for (unsigned i = 0; i < 2; i++) {
      LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false);
      LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false);
      LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, ix_ll, "");
      LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, iy_ll, "");
      LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ix_ll, "");
      LLVMValueRef temp1, temp2;

      interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, ctx->ac.f32, "");

      temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el);
      temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1);

      ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, temp2, ctx->ac.i32, "");
   }
   interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
   return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
}

static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, unsigned mode)
{
   LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTROID);
   return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
}

static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, unsigned mode,
                                          LLVMValueRef sample_id)
{
   if (ctx->abi->interp_at_sample_force_center)
      return barycentric_center(ctx, mode);

   LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f);

   /* fetch the position of the given sample ID */
   LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id);

   LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, "");
   src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, "");
   LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, "");
   src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, "");
   LLVMValueRef coords[] = {src_c0, src_c1};
   LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2);

   return barycentric_offset(ctx, mode, offset);
}

static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, unsigned mode)
{
   LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_SAMPLE);
   return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
}

static LLVMValueRef barycentric_model(struct ac_nir_context *ctx)
{
   return LLVMBuildBitCast(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->pull_model),
                           ctx->ac.v3i32, "");
}

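/* Interpolate an input attribute at the given (i, j) coordinates using the
 * per-channel fs-interp (v_interp_p1/v_interp_p2) sequence built by
 * ac_build_fs_interp.
 */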
static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, LLVMValueRef interp_param,
                                            unsigned index, unsigned comp_start,
                                            unsigned num_components, unsigned bitsize,
                                            bool high_16bits)
{
   LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
   LLVMValueRef interp_param_f;

   interp_param_f = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, "");
   LLVMValueRef i = LLVMBuildExtractElement(ctx->ac.builder, interp_param_f, ctx->ac.i32_0, "");
   LLVMValueRef j = LLVMBuildExtractElement(ctx->ac.builder, interp_param_f, ctx->ac.i32_1, "");

   /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */
   if (ctx->verified_interp && !_mesa_hash_table_search(ctx->verified_interp, interp_param)) {
      LLVMValueRef args[2];
      args[0] = i;
      args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false);
      LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, args, 2,
                                             AC_FUNC_ATTR_READNONE);
      ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, ""));
      _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param);
   }

   LLVMValueRef values[4];
   assert(bitsize == 16 || bitsize == 32);
   for (unsigned comp = 0; comp < num_components; comp++) {
      LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false);
      if (bitsize == 16) {
         values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number,
                                               ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j,
                                               high_16bits);
      } else {
         values[comp] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number,
                                           ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j);
      }
   }

   return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components));
}

static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr *instr,
                               bool is_output)
{
   LLVMValueRef values[8];
   LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa);
   LLVMTypeRef component_type;
   unsigned base = nir_intrinsic_base(instr);
   unsigned component = nir_intrinsic_component(instr);
   unsigned count = instr->dest.ssa.num_components;
   nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
   LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL;
   nir_src offset = *nir_get_io_offset_src(instr);
   LLVMValueRef indir_index = NULL;

   switch (instr->dest.ssa.bit_size) {
   case 16:
   case 32:
      break;
   case 64:
      unreachable("64-bit IO should have been lowered");
      return NULL;
   default:
      unreachable("unhandled load type");
      return NULL;
   }

   if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind)
      component_type = LLVMGetElementType(dest_type);
   else
      component_type = dest_type;

   if (nir_src_is_const(offset))
      assert(nir_src_as_uint(offset) == 0);
   else
      indir_index = get_src(ctx, offset);

   if (ctx->stage == MESA_SHADER_TESS_CTRL ||
       (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) {
      bool vertex_index_is_invoc_id =
         vertex_index_src &&
         vertex_index_src->ssa->parent_instr->type == nir_instr_type_intrinsic &&
         nir_instr_as_intrinsic(vertex_index_src->ssa->parent_instr)->intrinsic ==
         nir_intrinsic_load_invocation_id;

      LLVMValueRef result = ctx->abi->load_tess_varyings(ctx->abi, component_type,
                                                         vertex_index, indir_index,
                                                         base, component,
                                                         count, !is_output,
                                                         vertex_index_is_invoc_id);
      if (instr->dest.ssa.bit_size == 16) {
         result = ac_to_integer(&ctx->ac, result);
         result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, "");
      }
      return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
   }

   /* No indirect indexing is allowed after this point. */
   assert(!indir_index);

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      assert(nir_src_is_const(*vertex_index_src));

      return ctx->abi->load_inputs(ctx->abi, base, component, count,
                                   nir_src_as_uint(*vertex_index_src), component_type);
   }

   if (ctx->stage == MESA_SHADER_FRAGMENT && is_output &&
       nir_intrinsic_io_semantics(instr).fb_fetch_output)
      return ctx->abi->emit_fbfetch(ctx->abi);

   if (ctx->stage == MESA_SHADER_VERTEX && !is_output)
      return ctx->abi->load_inputs(ctx->abi, base, component, count, 0, component_type);

   /* Other non-fragment cases have outputs in temporaries. */
   if (is_output && (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
      assert(is_output);

      for (unsigned chan = component; chan < count + component; chan++)
         values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->abi->outputs[base * 4 + chan], "");

      LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component);
      return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
   }

   /* Fragment shader inputs. */
   assert(ctx->stage == MESA_SHADER_FRAGMENT);
   unsigned vertex_id = 2; /* P0 */

   if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
      nir_const_value *src0 = nir_src_as_const_value(instr->src[0]);

      switch (src0[0].i32) {
      case 0:
         vertex_id = 2;
         break;
      case 1:
         vertex_id = 0;
         break;
      case 2:
         vertex_id = 1;
         break;
      default:
         unreachable("Invalid vertex index");
      }
   }

   LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false);

   for (unsigned chan = 0; chan < count; chan++) {
      LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false);
      values[chan] =
         ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, vertex_id, false), llvm_chan,
                                attr_number, ac_get_arg(&ctx->ac, ctx->args->prim_mask));
      values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, "");
      if (instr->dest.ssa.bit_size == 16 &&
          nir_intrinsic_io_semantics(instr).high_16bits)
         values[chan] = LLVMBuildLShr(ctx->ac.builder, values[chan], LLVMConstInt(ctx->ac.i32, 16, 0), "");
      values[chan] =
         LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan],
                                 instr->dest.ssa.bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, "");
   }

   LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, count);
   return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
}

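/* Translate the VRS rates from the ancillary VGPR into the SPIR-V
 * FragmentShadingRateKHR mask encoding (Vertical2Pixels = 1 << 0,
 * Horizontal2Pixels = 1 << 2).
 */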
static LLVMValueRef
emit_load_frag_shading_rate(struct ac_nir_context *ctx)
{
   LLVMValueRef x_rate, y_rate, cond;

   /* VRS Rate X = Ancillary[2:3]
    * VRS Rate Y = Ancillary[4:5]
    */
   x_rate = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ancillary), 2, 2);
   y_rate = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ancillary), 4, 2);

   /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
   cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, x_rate, ctx->ac.i32_1, "");
   x_rate = LLVMBuildSelect(ctx->ac.builder, cond,
                            LLVMConstInt(ctx->ac.i32, 4, false), ctx->ac.i32_0, "");

   /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
   cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, y_rate, ctx->ac.i32_1, "");
   y_rate = LLVMBuildSelect(ctx->ac.builder, cond,
                            LLVMConstInt(ctx->ac.i32, 1, false), ctx->ac.i32_0, "");

   return LLVMBuildOr(ctx->ac.builder, x_rate, y_rate, "");
}

static LLVMValueRef
emit_load_frag_coord(struct ac_nir_context *ctx)
{
   LLVMValueRef values[4] = {
      ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]), ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]),
      ac_get_arg(&ctx->ac, ctx->args->frag_pos[2]),
      ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ac_get_arg(&ctx->ac, ctx->args->frag_pos[3]))};

   if (ctx->abi->adjust_frag_coord_z) {
      /* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
      LLVMValueRef frag_z = values[2];

      /* dFdx fine */
      LLVMValueRef adjusted_frag_z = emit_ddxy(ctx, nir_op_fddx_fine, frag_z);

      /* adjusted_frag_z * 0.0625 + frag_z */
      adjusted_frag_z = LLVMBuildFAdd(ctx->ac.builder, frag_z,
                                      LLVMBuildFMul(ctx->ac.builder, adjusted_frag_z,
                                                    LLVMConstReal(ctx->ac.f32, 0.0625), ""), "");

      /* VRS Rate X = Ancillary[2:3] */
      LLVMValueRef x_rate = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ancillary), 2, 2);

      /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. */
      LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, x_rate, ctx->ac.i32_1, "");
      values[2] = LLVMBuildSelect(ctx->ac.builder, cond, adjusted_frag_z, frag_z, "");
   }

   return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
}

static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
{
   LLVMValueRef result = NULL;

   switch (instr->intrinsic) {
   case nir_intrinsic_ballot:
      result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0]));
      if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size)
         result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, "");
      break;
   case nir_intrinsic_read_invocation:
      result =
         ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1]));
      break;
   case nir_intrinsic_read_first_invocation:
      result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL);
      break;
   case nir_intrinsic_load_subgroup_invocation:
      result = ac_get_thread_id(&ctx->ac);
      break;
   case nir_intrinsic_load_workgroup_id: {
      LLVMValueRef values[3];

      for (int i = 0; i < 3; i++) {
         values[i] = ctx->args->workgroup_ids[i].used
                        ? ac_get_arg(&ctx->ac, ctx->args->workgroup_ids[i])
                        : ctx->ac.i32_0;
      }

      result = ac_build_gather_values(&ctx->ac, values, 3);
      break;
   }
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
      result = ctx->abi->load_base_vertex(ctx->abi,
                                          instr->intrinsic == nir_intrinsic_load_base_vertex);
      break;
   case nir_intrinsic_load_workgroup_size:
      result = ctx->abi->load_local_group_size(ctx->abi);
      break;
   case nir_intrinsic_load_vertex_id:
      result = LLVMBuildAdd(ctx->ac.builder,
                            ctx->vertex_id_replaced ? ctx->vertex_id_replaced :
                                                      ac_get_arg(&ctx->ac, ctx->args->vertex_id),
                            ac_get_arg(&ctx->ac, ctx->args->base_vertex), "");
      break;
   case nir_intrinsic_load_vertex_id_zero_base:
      result = ctx->vertex_id_replaced ? ctx->vertex_id_replaced : ctx->abi->vertex_id;
      break;
   case nir_intrinsic_load_local_invocation_id: {
      LLVMValueRef ids = ac_get_arg(&ctx->ac, ctx->args->local_invocation_ids);

      if (LLVMGetTypeKind(LLVMTypeOf(ids)) == LLVMIntegerTypeKind) {
         /* Thread IDs are packed in VGPR0, 10 bits per component. */
         LLVMValueRef id[3];

         for (unsigned i = 0; i < 3; i++)
            id[i] = ac_unpack_param(&ctx->ac, ids, i * 10, 10);

         result = ac_build_gather_values(&ctx->ac, id, 3);
      } else {
         result = ids;
      }
      break;
   }
   case nir_intrinsic_load_base_instance:
      result = ac_get_arg(&ctx->ac, ctx->args->start_instance);
      break;
   case nir_intrinsic_load_draw_id:
      result = ac_get_arg(&ctx->ac, ctx->args->draw_id);
      break;
   case nir_intrinsic_load_view_index:
      result = ac_get_arg(&ctx->ac, ctx->args->view_index);
      break;
   case nir_intrinsic_load_invocation_id:
      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), 8, 5);
      } else {
         if (ctx->ac.chip_class >= GFX10) {
            result =
               LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id),
                            LLVMConstInt(ctx->ac.i32, 127, 0), "");
         } else {
            result = ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id);
         }
      }
      break;
   case nir_intrinsic_load_primitive_id:
      if (ctx->stage == MESA_SHADER_GEOMETRY) {
         result = ac_get_arg(&ctx->ac, ctx->args->gs_prim_id);
      } else if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         result = ac_get_arg(&ctx->ac, ctx->args->tcs_patch_id);
      } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
         result = ctx->tes_patch_id_replaced ? ctx->tes_patch_id_replaced
                                             : ac_get_arg(&ctx->ac, ctx->args->tes_patch_id);
      } else
         fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage);
      break;
   case nir_intrinsic_load_sample_id:
      result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ancillary), 8, 4);
      break;
   case nir_intrinsic_load_sample_pos:
      result = load_sample_pos(ctx);
      break;
   case nir_intrinsic_load_sample_mask_in:
      result = ctx->abi->load_sample_mask_in(ctx->abi);
      break;
   case nir_intrinsic_load_frag_coord:
      result = emit_load_frag_coord(ctx);
      break;
   case nir_intrinsic_load_frag_shading_rate:
      result = emit_load_frag_shading_rate(ctx);
      break;
   case nir_intrinsic_load_front_face:
      result = emit_i2b(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->front_face));
      break;
   case nir_intrinsic_load_helper_invocation:
      result = ac_build_load_helper_invocation(&ctx->ac);
      break;
   case nir_intrinsic_is_helper_invocation:
      result = ac_build_is_helper_invocation(&ctx->ac);
      break;
   case nir_intrinsic_load_color0:
      result = ctx->abi->color0;
      break;
   case nir_intrinsic_load_color1:
      result = ctx->abi->color1;
      break;
   case nir_intrinsic_load_user_data_amd:
      assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32);
      result = ctx->abi->user_data;
      break;
   case nir_intrinsic_load_instance_id:
      result = ctx->instance_id_replaced ? ctx->instance_id_replaced : ctx->abi->instance_id;
      break;
   case nir_intrinsic_load_num_workgroups:
      result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
      break;
   case nir_intrinsic_load_local_invocation_index:
      result = visit_load_local_invocation_index(ctx);
      break;
   case nir_intrinsic_load_subgroup_id:
      result = visit_load_subgroup_id(ctx);
      break;
   case nir_intrinsic_load_num_subgroups:
      result = visit_load_num_subgroups(ctx);
      break;
   case nir_intrinsic_first_invocation:
      result = visit_first_invocation(ctx);
      break;
   case nir_intrinsic_load_push_constant:
      result = visit_load_push_constant(ctx, instr);
      break;
   case nir_intrinsic_vulkan_resource_index: {
      LLVMValueRef index = get_src(ctx, instr->src[0]);
      unsigned desc_set = nir_intrinsic_desc_set(instr);
      unsigned binding = nir_intrinsic_binding(instr);

      result = ctx->abi->load_resource(ctx->abi, index, desc_set, binding);
      break;
   }
   case nir_intrinsic_store_ssbo:
      visit_store_ssbo(ctx, instr);
      break;
   case nir_intrinsic_load_ssbo:
      result = visit_load_buffer(ctx, instr);
      break;
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_global:
      result = visit_load_global(ctx, instr);
      break;
   case nir_intrinsic_store_global:
      visit_store_global(ctx, instr);
      break;
   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
   case nir_intrinsic_global_atomic_fmin:
   case nir_intrinsic_global_atomic_fmax:
      result = visit_global_atomic(ctx, instr);
      break;
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_ssbo_atomic_fmin:
   case nir_intrinsic_ssbo_atomic_fmax:
      result = visit_atomic_ssbo(ctx, instr);
      break;
   case nir_intrinsic_load_ubo:
      result = visit_load_ubo_buffer(ctx, instr);
      break;
   case nir_intrinsic_get_ssbo_size:
      result = visit_get_ssbo_size(ctx, instr);
      break;
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_input_vertex:
   case nir_intrinsic_load_per_vertex_input:
      result = visit_load(ctx, instr, false);
      break;
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
      result = visit_load(ctx, instr, true);
      break;
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      visit_store_output(ctx, instr);
      break;
   case nir_intrinsic_load_shared:
      result = visit_load_shared(ctx, instr);
      break;
   case nir_intrinsic_store_shared:
      visit_store_shared(ctx, instr);
      break;
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_deref_samples:
      result = visit_image_samples(ctx, instr);
      break;
   case nir_intrinsic_bindless_image_load:
      result = visit_image_load(ctx, instr, true);
      break;
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_deref_sparse_load:
      result = visit_image_load(ctx, instr, false);
      break;
   case nir_intrinsic_bindless_image_store:
      visit_image_store(ctx, instr, true);
      break;
   case nir_intrinsic_image_deref_store:
      visit_image_store(ctx, instr, false);
      break;
   case nir_intrinsic_bindless_image_atomic_add:
   case nir_intrinsic_bindless_image_atomic_imin:
   case nir_intrinsic_bindless_image_atomic_umin:
   case nir_intrinsic_bindless_image_atomic_imax:
   case nir_intrinsic_bindless_image_atomic_umax:
   case nir_intrinsic_bindless_image_atomic_and:
   case nir_intrinsic_bindless_image_atomic_or:
   case nir_intrinsic_bindless_image_atomic_xor:
   case nir_intrinsic_bindless_image_atomic_exchange:
   case nir_intrinsic_bindless_image_atomic_comp_swap:
   case nir_intrinsic_bindless_image_atomic_inc_wrap:
   case nir_intrinsic_bindless_image_atomic_dec_wrap:
      result = visit_image_atomic(ctx, instr, true);
      break;
   case nir_intrinsic_image_deref_atomic_add:
   case nir_intrinsic_image_deref_atomic_imin:
   case nir_intrinsic_image_deref_atomic_umin:
   case nir_intrinsic_image_deref_atomic_imax:
   case nir_intrinsic_image_deref_atomic_umax:
   case nir_intrinsic_image_deref_atomic_and:
   case nir_intrinsic_image_deref_atomic_or:
   case nir_intrinsic_image_deref_atomic_xor:
   case nir_intrinsic_image_deref_atomic_exchange:
   case nir_intrinsic_image_deref_atomic_comp_swap:
   case nir_intrinsic_image_deref_atomic_inc_wrap:
   case nir_intrinsic_image_deref_atomic_dec_wrap:
   case nir_intrinsic_image_deref_atomic_fmin:
   case nir_intrinsic_image_deref_atomic_fmax:
3831       result = visit_image_atomic(ctx, instr, false);
3832       break;
3833    case nir_intrinsic_bindless_image_size:
3834       result = visit_image_size(ctx, instr, true);
3835       break;
3836    case nir_intrinsic_image_deref_size:
3837       result = visit_image_size(ctx, instr, false);
3838       break;
3839    case nir_intrinsic_shader_clock:
3840       result = ac_build_shader_clock(&ctx->ac, nir_intrinsic_memory_scope(instr));
3841       break;
3842    case nir_intrinsic_discard:
3843    case nir_intrinsic_discard_if:
3844    case nir_intrinsic_terminate:
3845    case nir_intrinsic_terminate_if:
3846       emit_discard(ctx, instr);
3847       break;
3848    case nir_intrinsic_demote:
3849    case nir_intrinsic_demote_if:
3850       emit_demote(ctx, instr);
3851       break;
3852    case nir_intrinsic_memory_barrier:
3853    case nir_intrinsic_group_memory_barrier:
3854    case nir_intrinsic_memory_barrier_buffer:
3855    case nir_intrinsic_memory_barrier_image:
3856    case nir_intrinsic_memory_barrier_shared:
3857       emit_membar(&ctx->ac, instr);
3858       break;
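   /* A scoped barrier splits into two parts: a waitcnt covering the
    * memory semantics (vector-memory counters for global/SSBO traffic,
    * lgkmcnt for LDS), and an s_barrier when the execution scope is the
    * workgroup. */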
3859    case nir_intrinsic_scoped_barrier: {
3860       assert(!(nir_intrinsic_memory_semantics(instr) &
3861                (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
3862 
3863       nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
3864 
3865       unsigned wait_flags = 0;
3866       if (modes & (nir_var_mem_global | nir_var_mem_ssbo))
3867          wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
3868       if (modes & nir_var_mem_shared)
3869          wait_flags |= AC_WAIT_LGKM;
3870 
3871       if (wait_flags)
3872          ac_build_waitcnt(&ctx->ac, wait_flags);
3873 
3874       if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP)
3875          ac_emit_barrier(&ctx->ac, ctx->stage);
3876       break;
3877    }
3878    case nir_intrinsic_memory_barrier_tcs_patch:
3879       break;
3880    case nir_intrinsic_control_barrier:
3881       ac_emit_barrier(&ctx->ac, ctx->stage);
3882       break;
3883    case nir_intrinsic_shared_atomic_add:
3884    case nir_intrinsic_shared_atomic_imin:
3885    case nir_intrinsic_shared_atomic_umin:
3886    case nir_intrinsic_shared_atomic_imax:
3887    case nir_intrinsic_shared_atomic_umax:
3888    case nir_intrinsic_shared_atomic_and:
3889    case nir_intrinsic_shared_atomic_or:
3890    case nir_intrinsic_shared_atomic_xor:
3891    case nir_intrinsic_shared_atomic_exchange:
3892    case nir_intrinsic_shared_atomic_comp_swap:
3893    case nir_intrinsic_shared_atomic_fadd:
3894    case nir_intrinsic_shared_atomic_fmin:
3895    case nir_intrinsic_shared_atomic_fmax: {
3896       LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->src[1].ssa->bit_size, 0);
3897       result = visit_var_atomic(ctx, instr, ptr, 1);
3898       break;
3899    }
3900    case nir_intrinsic_deref_atomic_add:
3901    case nir_intrinsic_deref_atomic_imin:
3902    case nir_intrinsic_deref_atomic_umin:
3903    case nir_intrinsic_deref_atomic_imax:
3904    case nir_intrinsic_deref_atomic_umax:
3905    case nir_intrinsic_deref_atomic_and:
3906    case nir_intrinsic_deref_atomic_or:
3907    case nir_intrinsic_deref_atomic_xor:
3908    case nir_intrinsic_deref_atomic_exchange:
3909    case nir_intrinsic_deref_atomic_comp_swap:
3910    case nir_intrinsic_deref_atomic_fadd: {
3911       LLVMValueRef ptr = get_src(ctx, instr->src[0]);
3912       result = visit_var_atomic(ctx, instr, ptr, 1);
3913       break;
3914    }
3915    case nir_intrinsic_load_barycentric_pixel:
3916       result = barycentric_center(ctx, nir_intrinsic_interp_mode(instr));
3917       break;
3918    case nir_intrinsic_load_barycentric_centroid:
3919       result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr));
3920       break;
3921    case nir_intrinsic_load_barycentric_sample:
3922       result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr));
3923       break;
3924    case nir_intrinsic_load_barycentric_model:
3925       result = barycentric_model(ctx);
3926       break;
3927    case nir_intrinsic_load_barycentric_at_offset: {
3928       LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
3929       result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset);
3930       break;
3931    }
3932    case nir_intrinsic_load_barycentric_at_sample: {
3933       LLVMValueRef sample_id = get_src(ctx, instr->src[0]);
3934       result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id);
3935       break;
3936    }
3937    case nir_intrinsic_load_interpolated_input: {
3938       /* We assume any indirect loads have been lowered away */
3939       ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]);
3940       assert(offset);
3941       assert(offset[0].i32 == 0);
3942 
3943       LLVMValueRef interp_param = get_src(ctx, instr->src[0]);
3944       unsigned index = nir_intrinsic_base(instr);
3945       unsigned component = nir_intrinsic_component(instr);
3946       result = load_interpolated_input(ctx, interp_param, index, component,
3947                                        instr->dest.ssa.num_components, instr->dest.ssa.bit_size,
3948                                        nir_intrinsic_io_semantics(instr).high_16bits);
3949       break;
3950    }
3951    case nir_intrinsic_emit_vertex:
3952       ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs);
3953       break;
3954    case nir_intrinsic_emit_vertex_with_counter: {
3955       unsigned stream = nir_intrinsic_stream_id(instr);
3956       LLVMValueRef next_vertex = get_src(ctx, instr->src[0]);
3957       ctx->abi->emit_vertex_with_counter(ctx->abi, stream, next_vertex, ctx->abi->outputs);
3958       break;
3959    }
3960    case nir_intrinsic_end_primitive:
3961    case nir_intrinsic_end_primitive_with_counter:
3962       ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
3963       break;
3964    case nir_intrinsic_load_tess_coord: {
3965       LLVMValueRef coord[] = {
3966          ctx->tes_u_replaced ? ctx->tes_u_replaced : ac_get_arg(&ctx->ac, ctx->args->tes_u),
3967          ctx->tes_v_replaced ? ctx->tes_v_replaced : ac_get_arg(&ctx->ac, ctx->args->tes_v),
3968          ctx->ac.f32_0,
3969       };
3970 
3971       /* For triangles, the vector should be (u, v, 1-u-v). */
3972       if (ctx->info->tess.primitive_mode == GL_TRIANGLES) {
3973          coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
3974                                   LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
3975       }
3976       result = ac_build_gather_values(&ctx->ac, coord, 3);
3977       break;
3978    }
3979    case nir_intrinsic_load_tess_level_outer:
3980       result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false);
3981       break;
3982    case nir_intrinsic_load_tess_level_inner:
3983       result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false);
3984       break;
3985    case nir_intrinsic_load_tess_level_outer_default:
3986       result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true);
3987       break;
3988    case nir_intrinsic_load_tess_level_inner_default:
3989       result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true);
3990       break;
3991    case nir_intrinsic_load_patch_vertices_in:
3992       result = ctx->abi->load_patch_vertices_in(ctx->abi);
3993       break;
3994    case nir_intrinsic_load_tess_rel_patch_id_amd:
3995       if (ctx->stage == MESA_SHADER_TESS_CTRL)
3996          result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), 0, 8);
3997       else if (ctx->stage == MESA_SHADER_TESS_EVAL)
3998          result = ctx->tes_rel_patch_id_replaced ? ctx->tes_rel_patch_id_replaced
3999                                                  : ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id);
4000       else
4001          unreachable("tess_rel_patch_id_amd is only supported by tessellation shaders");
4002       break;
4003    case nir_intrinsic_load_ring_tess_factors_amd:
4004       result = ctx->abi->load_ring_tess_factors(ctx->abi);
4005       break;
4006    case nir_intrinsic_load_ring_tess_factors_offset_amd:
4007       result = ac_get_arg(&ctx->ac, ctx->args->tcs_factor_offset);
4008       break;
4009    case nir_intrinsic_load_ring_tess_offchip_amd:
4010       result = ctx->abi->load_ring_tess_offchip(ctx->abi);
4011       break;
4012    case nir_intrinsic_load_ring_tess_offchip_offset_amd:
4013       result = ac_get_arg(&ctx->ac, ctx->args->tess_offchip_offset);
4014       break;
4015    case nir_intrinsic_load_ring_esgs_amd:
4016       result = ctx->abi->load_ring_esgs(ctx->abi);
4017       break;
4018    case nir_intrinsic_load_ring_es2gs_offset_amd:
4019       result = ac_get_arg(&ctx->ac, ctx->args->es2gs_offset);
4020       break;
4021    case nir_intrinsic_load_gs_vertex_offset_amd:
4022       result = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[nir_intrinsic_base(instr)]);
4023       break;
4024    case nir_intrinsic_vote_all: {
4025       result = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0]));
4026       break;
4027    }
4028    case nir_intrinsic_vote_any: {
4029       result = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0]));
4030       break;
4031    }
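   /* ac_build_shuffle lowers to ds_bpermute, which is missing on GFX6-7
    * and, on GFX10+ with wave64, only swizzles within 32-lane halves.
    * Everywhere else, emulate the shuffle with a waterfall loop that
    * does a uniform readlane per distinct index value. */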
4032    case nir_intrinsic_shuffle:
4033       if (ctx->ac.chip_class == GFX8 || ctx->ac.chip_class == GFX9 ||
4034           (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) {
4035          result =
4036             ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1]));
4037       } else {
4038          LLVMValueRef src = get_src(ctx, instr->src[0]);
4039          LLVMValueRef index = get_src(ctx, instr->src[1]);
4040          LLVMTypeRef type = LLVMTypeOf(src);
4041          struct waterfall_context wctx;
4042          LLVMValueRef index_val;
4043 
4044          index_val = enter_waterfall(ctx, &wctx, index, true);
4045 
4046          src = LLVMBuildZExt(ctx->ac.builder, src, ctx->ac.i32, "");
4047 
4048          result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane", ctx->ac.i32,
4049                                      (LLVMValueRef[]){src, index_val}, 2,
4050                                      AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4051 
4052          result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
4053 
4054          result = exit_waterfall(ctx, &wctx, result);
4055       }
4056       break;
4057    case nir_intrinsic_reduce:
4058       result = ac_build_reduce(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0],
4059                                instr->const_index[1]);
4060       break;
4061    case nir_intrinsic_inclusive_scan:
4062       result =
4063          ac_build_inclusive_scan(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0]);
4064       break;
4065    case nir_intrinsic_exclusive_scan:
4066       result =
4067          ac_build_exclusive_scan(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0]);
4068       break;
4069    case nir_intrinsic_quad_broadcast: {
4070       unsigned lane = nir_src_as_uint(instr->src[1]);
4071       result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), lane, lane, lane, lane);
4072       break;
4073    }
4074    case nir_intrinsic_quad_swap_horizontal:
4075       result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3, 2);
4076       break;
4077    case nir_intrinsic_quad_swap_vertical:
4078       result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0, 1);
4079       break;
4080    case nir_intrinsic_quad_swap_diagonal:
4081       result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1, 0);
4082       break;
4083    case nir_intrinsic_quad_swizzle_amd: {
4084       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
4085       result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask & 0x3,
4086                                      (mask >> 2) & 0x3, (mask >> 4) & 0x3, (mask >> 6) & 0x3);
4087       break;
4088    }
4089    case nir_intrinsic_masked_swizzle_amd: {
4090       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
4091       result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask);
4092       break;
4093    }
4094    case nir_intrinsic_write_invocation_amd:
4095       result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]),
4096                                   get_src(ctx, instr->src[1]), get_src(ctx, instr->src[2]));
4097       break;
4098    case nir_intrinsic_mbcnt_amd:
4099       result = ac_build_mbcnt_add(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1]));
4100       break;
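   /* Scratch is modeled as a byte array in the private address space;
    * the sequence below is roughly: GEP to the byte offset, bitcast the
    * pointer to the destination vector type, then load. */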
4101    case nir_intrinsic_load_scratch: {
4102       LLVMValueRef offset = get_src(ctx, instr->src[0]);
4103       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, offset);
4104       LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
4105       LLVMTypeRef vec_type = instr->dest.ssa.num_components == 1
4106                                 ? comp_type
4107                                 : LLVMVectorType(comp_type, instr->dest.ssa.num_components);
4108       unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
4109       ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(vec_type, addr_space), "");
4110       result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
4111       break;
4112    }
4113    case nir_intrinsic_store_scratch: {
4114       LLVMValueRef offset = get_src(ctx, instr->src[1]);
4115       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, offset);
4116       LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size);
4117       unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
4118       ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(comp_type, addr_space), "");
4119       LLVMValueRef src = get_src(ctx, instr->src[0]);
4120       unsigned wrmask = nir_intrinsic_write_mask(instr);
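      /* Split the write mask into runs of consecutive components and
       * emit one store per run; e.g. mask 0b1011 becomes a 2-component
       * store at component 0 plus a scalar store at component 3. */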
4121       while (wrmask) {
4122          int start, count;
4123          u_bit_scan_consecutive_range(&wrmask, &start, &count);
4124 
4125          LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, start, false);
4126          LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &offset, 1, "");
4127          LLVMTypeRef vec_type = count == 1 ? comp_type : LLVMVectorType(comp_type, count);
4128          offset_ptr = LLVMBuildBitCast(ctx->ac.builder, offset_ptr,
4129                                        LLVMPointerType(vec_type, addr_space), "");
4130          LLVMValueRef offset_src = ac_extract_components(&ctx->ac, src, start, count);
4131          LLVMBuildStore(ctx->ac.builder, offset_src, offset_ptr);
4132       }
4133       break;
4134    }
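   /* load_constant reads the shader's embedded constant data. base and
    * range describe the compile-time window of the access, which is what
    * makes the clamp to base + range below safe. */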
4135    case nir_intrinsic_load_constant: {
4136       unsigned base = nir_intrinsic_base(instr);
4137       unsigned range = nir_intrinsic_range(instr);
4138 
4139       LLVMValueRef offset = get_src(ctx, instr->src[0]);
4140       offset = LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, base, false), "");
4141 
4142       /* Clamp the offset to avoid out-of-bounds accesses, which global
4143        * instructions can't handle.
4144        */
4145       LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false);
4146       LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, "");
4147       offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, "");
4148 
4149       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data, offset);
4150       LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
4151       LLVMTypeRef vec_type = instr->dest.ssa.num_components == 1
4152                                 ? comp_type
4153                                 : LLVMVectorType(comp_type, instr->dest.ssa.num_components);
4154       unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
4155       ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(vec_type, addr_space), "");
4156       result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
4157       break;
4158    }
4159    case nir_intrinsic_set_vertex_and_primitive_count:
4160       /* Currently ignored. */
4161       break;
4162    case nir_intrinsic_load_buffer_amd: {
4163       LLVMValueRef descriptor = get_src(ctx, instr->src[0]);
4164       LLVMValueRef addr_voffset = get_src(ctx, instr->src[1]);
4165       LLVMValueRef addr_soffset = get_src(ctx, instr->src[2]);
4166       unsigned num_components = instr->dest.ssa.num_components;
4167       unsigned const_offset = nir_intrinsic_base(instr);
4168       bool swizzled = nir_intrinsic_is_swizzled(instr);
4169       bool reorder = nir_intrinsic_can_reorder(instr);
4170       bool slc = nir_intrinsic_slc_amd(instr);
4171 
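      /* GLC gives device-scope coherence; on GFX10+ DLC is presumably
       * needed as well because of the split L0/L1 cache hierarchy. */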
4172       enum ac_image_cache_policy cache_policy = ac_glc;
4173       if (swizzled)
4174          cache_policy |= ac_swizzled;
4175       if (slc)
4176          cache_policy |= ac_slc;
4177       if (ctx->ac.chip_class >= GFX10)
4178          cache_policy |= ac_dlc;
4179 
4180       LLVMTypeRef channel_type;
4181       if (instr->dest.ssa.bit_size == 8)
4182          channel_type = ctx->ac.i8;
4183       else if (instr->dest.ssa.bit_size == 16)
4184          channel_type = ctx->ac.i16;
4185       else if (instr->dest.ssa.bit_size == 32)
4186          channel_type = ctx->ac.i32;
4187       else if (instr->dest.ssa.bit_size == 64)
4188          channel_type = ctx->ac.i64;
4189       else if (instr->dest.ssa.bit_size == 128)
4190          channel_type = ctx->ac.i128;
4191       else
4192          unreachable("Unsupported channel type for load_buffer_amd");
4193 
4194       result = ac_build_buffer_load(&ctx->ac, descriptor, num_components, NULL,
4195                                     addr_voffset, addr_soffset, const_offset,
4196                                     channel_type, cache_policy, reorder, false);
4197       result = ac_to_integer(&ctx->ac, ac_trim_vector(&ctx->ac, result, num_components));
4198       break;
4199    }
4200    case nir_intrinsic_store_buffer_amd: {
4201       LLVMValueRef store_data = get_src(ctx, instr->src[0]);
4202       LLVMValueRef descriptor = get_src(ctx, instr->src[1]);
4203       LLVMValueRef addr_voffset = get_src(ctx, instr->src[2]);
4204       LLVMValueRef addr_soffset = get_src(ctx, instr->src[3]);
4205       unsigned num_components = instr->src[0].ssa->num_components;
4206       unsigned const_offset = nir_intrinsic_base(instr);
4207       bool swizzled = nir_intrinsic_is_swizzled(instr);
4208       bool slc = nir_intrinsic_slc_amd(instr);
4209 
4210       enum ac_image_cache_policy cache_policy = ac_glc;
4211       if (swizzled)
4212          cache_policy |= ac_swizzled;
4213       if (slc)
4214          cache_policy |= ac_slc;
4215 
4216       ac_build_buffer_store_dword(&ctx->ac, descriptor, store_data, num_components,
4217                                   addr_voffset, addr_soffset, const_offset,
4218                                   cache_policy);
4219       break;
4220    }
4221    case nir_intrinsic_load_packed_passthrough_primitive_amd:
4222       result = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]);
4223       break;
4224    case nir_intrinsic_load_initial_edgeflags_amd:
4225       if (ctx->stage == MESA_SHADER_VERTEX && !ctx->info->vs.blit_sgprs_amd)
4226          result = ac_pack_edgeflags_for_export(&ctx->ac, ctx->args);
4227       else
4228          result = ctx->ac.i32_0;
4229       break;
4230    case nir_intrinsic_has_input_vertex_amd: {
4231       LLVMValueRef num =
4232          ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 0, 8);
4233       result = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), num, "");
4234       break;
4235    }
4236    case nir_intrinsic_has_input_primitive_amd: {
4237       LLVMValueRef num =
4238          ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8);
4239       result = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), num, "");
4240       break;
4241    }
4242    case nir_intrinsic_load_workgroup_num_input_vertices_amd:
4243       result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), 12, 9);
4244       break;
4245    case nir_intrinsic_load_workgroup_num_input_primitives_amd:
4246       result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), 22, 9);
4247       break;
4248    case nir_intrinsic_alloc_vertices_and_primitives_amd:
4249       /* The caller is expected to execute this only on wave 0, so assume
4250        * that the current wave is always wave 0.
4251        */
4252       ac_build_sendmsg_gs_alloc_req(&ctx->ac, ctx->ac.i32_0,
4253                                     get_src(ctx, instr->src[0]),
4254                                     get_src(ctx, instr->src[1]));
4255       break;
4256    case nir_intrinsic_overwrite_vs_arguments_amd:
4257       ctx->vertex_id_replaced = get_src(ctx, instr->src[0]);
4258       ctx->instance_id_replaced = get_src(ctx, instr->src[1]);
4259       break;
4260    case nir_intrinsic_overwrite_tes_arguments_amd:
4261       ctx->tes_u_replaced = get_src(ctx, instr->src[0]);
4262       ctx->tes_v_replaced = get_src(ctx, instr->src[1]);
4263       ctx->tes_rel_patch_id_replaced = get_src(ctx, instr->src[2]);
4264       ctx->tes_patch_id_replaced = get_src(ctx, instr->src[3]);
4265       break;
4266    case nir_intrinsic_export_primitive_amd: {
4267       struct ac_ngg_prim prim = {0};
4268       prim.passthrough = get_src(ctx, instr->src[0]);
4269       ac_build_export_prim(&ctx->ac, &prim);
4270       break;
4271    }
4272    case nir_intrinsic_export_vertex_amd:
4273       ctx->abi->export_vertex(ctx->abi);
4274       break;
4275    case nir_intrinsic_elect:
4276       result = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, visit_first_invocation(ctx),
4277                              ac_get_thread_id(&ctx->ac), "");
4278       break;
4279    case nir_intrinsic_byte_permute_amd:
4280       if (LLVM_VERSION_MAJOR < 13) {
4281          assert(!"unimplemented byte_permute, LLVM 12 doesn't have amdgcn.perm");
4282          break;
4283       }
4284       result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.perm", ctx->ac.i32,
4285                                   (LLVMValueRef[]){get_src(ctx, instr->src[0]),
4286                                                    get_src(ctx, instr->src[1]),
4287                                                    get_src(ctx, instr->src[2])},
4288                                   3, AC_FUNC_ATTR_READNONE);
4289       break;
4290    case nir_intrinsic_lane_permute_16_amd:
4291       result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.permlane16", ctx->ac.i32,
4292                                   (LLVMValueRef[]){get_src(ctx, instr->src[0]),
4293                                                    get_src(ctx, instr->src[0]),
4294                                                    get_src(ctx, instr->src[1]),
4295                                                    get_src(ctx, instr->src[2]),
4296                                                    ctx->ac.i1false,
4297                                                    ctx->ac.i1false},
4298                                   6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4299       break;
4300    default:
4301       fprintf(stderr, "Unknown intrinsic: ");
4302       nir_print_instr(&instr->instr, stderr);
4303       fprintf(stderr, "\n");
4304       abort();
4305       break;
4306    }
4307    if (result) {
4308       ctx->ssa_defs[instr->dest.ssa.index] = result;
4309    }
4310 }
4311 
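/* Bindless handles live in a driver-reserved UBO. The byte offset
 * computed below is base_index * 4 plus (dynamic + constant) * 8,
 * since each 64-bit handle occupies 8 bytes. */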
4312 static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, unsigned base_index,
4313                                                     unsigned constant_index,
4314                                                     LLVMValueRef dynamic_index)
4315 {
4316    LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0);
4317    LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index,
4318                                      LLVMConstInt(ctx->ac.i32, constant_index, 0), "");
4319 
4320    /* Bindless uniforms are 64-bit, so multiply the index by 8. */
4321    index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), "");
4322    offset = LLVMBuildAdd(ctx->ac.builder, offset, index, "");
4323 
4324    LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, 0, 0, false, ctx->ac.i32_0);
4325 
4326    LLVMValueRef ret =
4327       ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, NULL, 0, ctx->ac.f32, 0, true, true);
4328 
4329    return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, "");
4330 }
4331 
4332 struct sampler_desc_address {
4333    unsigned descriptor_set;
4334    unsigned base_index; /* binding in vulkan */
4335    unsigned constant_index;
4336    LLVMValueRef dynamic_index;
4337    bool image;
4338    bool bindless;
4339 };
4340 
4341 static struct sampler_desc_address get_sampler_desc_internal(struct ac_nir_context *ctx,
4342                                                              nir_deref_instr *deref_instr,
4343                                                              const nir_instr *instr, bool image)
4344 {
4345    LLVMValueRef index = NULL;
4346    unsigned constant_index = 0;
4347    unsigned descriptor_set;
4348    unsigned base_index;
4349    bool bindless = false;
4350 
4351    if (!deref_instr) {
4352       descriptor_set = 0;
4353       if (image) {
4354          nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr);
4355          base_index = 0;
4356          bindless = true;
4357          index = get_src(ctx, img_instr->src[0]);
4358       } else {
4359          nir_tex_instr *tex_instr = nir_instr_as_tex(instr);
4360          int sampSrcIdx = nir_tex_instr_src_index(tex_instr, nir_tex_src_sampler_handle);
4361          if (sampSrcIdx != -1) {
4362             base_index = 0;
4363             bindless = true;
4364             index = get_src(ctx, tex_instr->src[sampSrcIdx].src);
4365          } else {
4366             assert(tex_instr && !image);
4367             base_index = tex_instr->sampler_index;
4368          }
4369       }
4370    } else {
4371       while (deref_instr->deref_type != nir_deref_type_var) {
4372          if (deref_instr->deref_type == nir_deref_type_array) {
4373             unsigned array_size = glsl_get_aoa_size(deref_instr->type);
4374             if (!array_size)
4375                array_size = 1;
4376 
4377             if (nir_src_is_const(deref_instr->arr.index)) {
4378                constant_index += array_size * nir_src_as_uint(deref_instr->arr.index);
4379             } else {
4380                LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index);
4381 
4382                indirect = LLVMBuildMul(ctx->ac.builder, indirect,
4383                                        LLVMConstInt(ctx->ac.i32, array_size, false), "");
4384 
4385                if (!index)
4386                   index = indirect;
4387                else
4388                   index = LLVMBuildAdd(ctx->ac.builder, index, indirect, "");
4389             }
4390 
4391             deref_instr = nir_src_as_deref(deref_instr->parent);
4392          } else if (deref_instr->deref_type == nir_deref_type_struct) {
4393             unsigned sidx = deref_instr->strct.index;
4394             deref_instr = nir_src_as_deref(deref_instr->parent);
4395             constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx);
4396          } else {
4397             unreachable("Unsupported deref type");
4398          }
4399       }
4400       descriptor_set = deref_instr->var->data.descriptor_set;
4401 
4402       if (deref_instr->var->data.bindless) {
4403          /* For now just assert on unhandled variable types */
4404          assert(deref_instr->var->data.mode == nir_var_uniform);
4405 
4406          base_index = deref_instr->var->data.driver_location;
4407          bindless = true;
4408 
4409          index = index ? index : ctx->ac.i32_0;
4410          index = get_bindless_index_from_uniform(ctx, base_index, constant_index, index);
4411       } else
4412          base_index = deref_instr->var->data.binding;
4413    }
4414    return (struct sampler_desc_address){
4415       .descriptor_set = descriptor_set,
4416       .base_index = base_index,
4417       .constant_index = constant_index,
4418       .dynamic_index = index,
4419       .image = image,
4420       .bindless = bindless,
4421    };
4422 }
4423 
4424 /* Extract any possibly divergent index into a separate value that can be fed
4425  * into get_sampler_desc with the same arguments. */
4426 static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
4427                                            const nir_instr *instr, bool image)
4428 {
4429    struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
4430    return addr.dynamic_index;
4431 }
4432 
4433 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
4434                                      enum ac_descriptor_type desc_type, const nir_instr *instr,
4435                                      LLVMValueRef index, bool image, bool write)
4436 {
4437    struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
4438    return ctx->abi->load_sampler_desc(ctx->abi, addr.descriptor_set, addr.base_index,
4439                                       addr.constant_index, index, desc_type, addr.image, write,
4440                                       addr.bindless);
4441 }
4442 
4443 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4444  *
4445  * GFX6-GFX7:
4446  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4447  *   filtering manually. The driver sets img7 to a mask clearing
4448  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4449  *     s_and_b32 samp0, samp0, img7
4450  *
4451  * GFX8:
4452  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
4453  */
4454 static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, LLVMValueRef res,
4455                                            LLVMValueRef samp)
4456 {
4457    LLVMBuilderRef builder = ctx->ac.builder;
4458    LLVMValueRef img7, samp0;
4459 
4460    if (ctx->ac.chip_class >= GFX8)
4461       return samp;
4462 
4463    img7 = LLVMBuildExtractElement(builder, res, LLVMConstInt(ctx->ac.i32, 7, 0), "");
4464    samp0 = LLVMBuildExtractElement(builder, samp, LLVMConstInt(ctx->ac.i32, 0, 0), "");
4465    samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4466    return LLVMBuildInsertElement(builder, samp, samp0, LLVMConstInt(ctx->ac.i32, 0, 0), "");
4467 }
4468 
4469 static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr,
4470                            struct waterfall_context *wctx, LLVMValueRef *res_ptr,
4471                            LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4472 {
4473    nir_deref_instr *texture_deref_instr = NULL;
4474    nir_deref_instr *sampler_deref_instr = NULL;
4475    int plane = -1;
4476 
4477    for (unsigned i = 0; i < instr->num_srcs; i++) {
4478       switch (instr->src[i].src_type) {
4479       case nir_tex_src_texture_deref:
4480          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
4481          break;
4482       case nir_tex_src_sampler_deref:
4483          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
4484          break;
4485       case nir_tex_src_plane:
4486          plane = nir_src_as_int(instr->src[i].src);
4487          break;
4488       default:
4489          break;
4490       }
4491    }
4492 
4493    LLVMValueRef texture_dynamic_index =
4494       get_sampler_desc_index(ctx, texture_deref_instr, &instr->instr, false);
4495    if (!sampler_deref_instr)
4496       sampler_deref_instr = texture_deref_instr;
4497 
4498    LLVMValueRef sampler_dynamic_index =
4499       get_sampler_desc_index(ctx, sampler_deref_instr, &instr->instr, false);
4500    if (instr->texture_non_uniform)
4501       texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true);
4502 
4503    if (instr->sampler_non_uniform)
4504       sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true);
4505 
4506    enum ac_descriptor_type main_descriptor =
4507       instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
4508 
4509    if (plane >= 0) {
4510       assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
4511       assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
4512 
4513       main_descriptor = AC_DESC_PLANE_0 + plane;
4514    }
4515 
4516    if (instr->op == nir_texop_fragment_mask_fetch_amd) {
4517       /* The fragment mask is fetched from the compressed
4518        * multisampled surface.
4519        */
4520       main_descriptor = AC_DESC_FMASK;
4521    }
4522 
4523    *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr,
4524                                texture_dynamic_index, false, false);
4525 
4526    if (samp_ptr) {
4527       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr,
4528                                    sampler_dynamic_index, false, false);
4529       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
4530          *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4531    }
4532    if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical))
4533       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr,
4534                                     texture_dynamic_index, false, false);
4535 }
4536 
4537 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, LLVMValueRef coord)
4538 {
4539    coord = ac_to_float(ctx, coord);
4540    coord = ac_build_round(ctx, coord);
4541    coord = ac_to_integer(ctx, coord);
4542    return coord;
4543 }
4544 
4545 static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
4546 {
4547    LLVMValueRef result = NULL;
4548    struct ac_image_args args = {0};
4549    LLVMValueRef fmask_ptr = NULL, sample_index = NULL;
4550    LLVMValueRef ddx = NULL, ddy = NULL;
4551    unsigned offset_src = 0;
4552    struct waterfall_context wctx[2] = {{{0}}};
4553 
4554    tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr);
4555 
4556    for (unsigned i = 0; i < instr->num_srcs; i++) {
4557       switch (instr->src[i].src_type) {
4558       case nir_tex_src_coord: {
4559          LLVMValueRef coord = get_src(ctx, instr->src[i].src);
4560          args.a16 = instr->src[i].src.ssa->bit_size == 16;
4561          for (unsigned chan = 0; chan < instr->coord_components; ++chan)
4562             args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan);
4563          break;
4564       }
4565       case nir_tex_src_projector:
4566          break;
4567       case nir_tex_src_comparator:
4568          if (instr->is_shadow) {
4569             args.compare = get_src(ctx, instr->src[i].src);
4570             args.compare = ac_to_float(&ctx->ac, args.compare);
4571             assert(instr->src[i].src.ssa->bit_size == 32);
4572          }
4573          break;
4574       case nir_tex_src_offset:
4575          args.offset = get_src(ctx, instr->src[i].src);
4576          offset_src = i;
4577          /* We pack it with bit shifts, so we need it to be 32-bit. */
4578          assert(ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.offset)) == 32);
4579          break;
4580       case nir_tex_src_bias:
4581          args.bias = get_src(ctx, instr->src[i].src);
4582          assert(ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.bias)) == 32);
4583          break;
4584       case nir_tex_src_lod:
4585          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0)
4586             args.level_zero = true;
4587          else
4588             args.lod = get_src(ctx, instr->src[i].src);
4589          break;
4590       case nir_tex_src_ms_index:
4591          sample_index = get_src(ctx, instr->src[i].src);
4592          break;
4593       case nir_tex_src_ddx:
4594          ddx = get_src(ctx, instr->src[i].src);
4595          args.g16 = instr->src[i].src.ssa->bit_size == 16;
4596          break;
4597       case nir_tex_src_ddy:
4598          ddy = get_src(ctx, instr->src[i].src);
4599          assert(LLVMTypeOf(ddy) == LLVMTypeOf(ddx));
4600          break;
4601       case nir_tex_src_min_lod:
4602          args.min_lod = get_src(ctx, instr->src[i].src);
4603          break;
4604       case nir_tex_src_texture_offset:
4605       case nir_tex_src_sampler_offset:
4606       case nir_tex_src_plane:
4607       default:
4608          break;
4609       }
4610    }
4611 
4612    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
4613       result = get_buffer_size(ctx, args.resource, true);
4614       goto write_result;
4615    }
4616 
4617    if (instr->op == nir_texop_texture_samples) {
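      /* Decode the sample count from the resource descriptor (per the
       * GCN descriptor layout): dword 3 has TYPE in bits [31:28], where
       * 0xe/0xf are the 2D MSAA types, and for MSAA resources the
       * LAST_LEVEL field in bits [19:16] holds log2(samples). */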
4618       LLVMValueRef res, samples, is_msaa;
4619       LLVMValueRef default_sample;
4620 
4621       res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
4622       samples =
4623          LLVMBuildExtractElement(ctx->ac.builder, res, LLVMConstInt(ctx->ac.i32, 3, false), "");
4624       is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 28, false), "");
4625       is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, LLVMConstInt(ctx->ac.i32, 0xe, false), "");
4626       is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa,
4627                               LLVMConstInt(ctx->ac.i32, 0xe, false), "");
4628 
4629       samples = LLVMBuildLShr(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 16, false), "");
4630       samples = LLVMBuildAnd(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 0xf, false), "");
4631       samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, samples, "");
4632 
4633       if (ctx->abi->robust_buffer_access) {
4634          LLVMValueRef dword1, is_null_descriptor;
4635 
4636          /* Extract the second dword of the descriptor; if it's
4637           * all zero, then it's a null descriptor.
4638           */
4639          dword1 =
4640             LLVMBuildExtractElement(ctx->ac.builder, res, LLVMConstInt(ctx->ac.i32, 1, false), "");
4641          is_null_descriptor = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
4642                                             LLVMConstInt(ctx->ac.i32, 0, false), "");
4643          default_sample =
4644             LLVMBuildSelect(ctx->ac.builder, is_null_descriptor, ctx->ac.i32_0, ctx->ac.i32_1, "");
4645       } else {
4646          default_sample = ctx->ac.i32_1;
4647       }
4648 
4649       samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, default_sample, "");
4650       result = samples;
4651       goto write_result;
4652    }
4653 
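   /* For sampling ops, the immediate texel offset is packed into a
    * single dword as 6-bit fields: x in bits [5:0], y in [13:8] and
    * z in [21:16]. */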
4654    if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
4655       LLVMValueRef offset[3], pack;
4656       for (unsigned chan = 0; chan < 3; ++chan)
4657          offset[chan] = ctx->ac.i32_0;
4658 
4659       unsigned num_components = ac_get_llvm_num_components(args.offset);
4660       for (unsigned chan = 0; chan < num_components; chan++) {
4661          offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan);
4662          offset[chan] =
4663             LLVMBuildAnd(ctx->ac.builder, offset[chan], LLVMConstInt(ctx->ac.i32, 0x3f, false), "");
4664          if (chan)
4665             offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
4666                                         LLVMConstInt(ctx->ac.i32, chan * 8, false), "");
4667       }
4668       pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
4669       pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
4670       args.offset = pack;
4671    }
4672 
4673    /* Section 8.23.1 (Depth Texture Comparison Mode) of the
4674     * OpenGL 4.5 spec says:
4675     *
4676     *    "If the texture’s internal format indicates a fixed-point
4677     *     depth texture, then D_t and D_ref are clamped to the
4678     *     range [0, 1]; otherwise no clamping is performed."
4679     *
4680     * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4681     * so the depth comparison value isn't clamped for Z16 and
4682     * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
4683     * an explicitly clamped 32-bit float format.
4684     */
4685    if (args.compare && ctx->ac.chip_class >= GFX8 && ctx->ac.chip_class <= GFX9 &&
4686        ctx->abi->clamp_shadow_reference) {
4687       LLVMValueRef upgraded, clamped;
4688 
4689       upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
4690                                          LLVMConstInt(ctx->ac.i32, 3, false), "");
4691       upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, LLVMConstInt(ctx->ac.i32, 29, false), "");
4692       upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->ac.i1, "");
4693       clamped = ac_build_clamp(&ctx->ac, args.compare);
4694       args.compare = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, args.compare, "");
4695    }
4696 
4697    /* pack derivatives */
4698    if (ddx || ddy) {
4699       int num_src_deriv_channels, num_dest_deriv_channels;
4700       switch (instr->sampler_dim) {
4701       case GLSL_SAMPLER_DIM_3D:
4702       case GLSL_SAMPLER_DIM_CUBE:
4703          num_src_deriv_channels = 3;
4704          num_dest_deriv_channels = 3;
4705          break;
4706       case GLSL_SAMPLER_DIM_2D:
4707       default:
4708          num_src_deriv_channels = 2;
4709          num_dest_deriv_channels = 2;
4710          break;
4711       case GLSL_SAMPLER_DIM_1D:
4712          num_src_deriv_channels = 1;
4713          if (ctx->ac.chip_class == GFX9) {
4714             num_dest_deriv_channels = 2;
4715          } else {
4716             num_dest_deriv_channels = 1;
4717          }
4718          break;
4719       }
4720 
4721       for (unsigned i = 0; i < num_src_deriv_channels; i++) {
4722          args.derivs[i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddx, i));
4723          args.derivs[num_dest_deriv_channels + i] =
4724             ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i));
4725       }
4726       for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) {
4727          LLVMValueRef zero = args.g16 ? ctx->ac.f16_0 : ctx->ac.f32_0;
4728          args.derivs[i] = zero;
4729          args.derivs[num_dest_deriv_channels + i] = zero;
4730       }
4731    }
4732 
4733    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) {
4734       for (unsigned chan = 0; chan < instr->coord_components; chan++)
4735          args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]);
4736       if (instr->coord_components == 3)
4737          args.coords[3] = LLVMGetUndef(args.a16 ? ctx->ac.f16 : ctx->ac.f32);
4738       ac_prepare_cube_coords(&ctx->ac, instr->op == nir_texop_txd, instr->is_array,
4739                              instr->op == nir_texop_lod, args.coords, args.derivs);
4740    }
4741 
4742    /* Texture coordinate fixups */
4743    if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
4744        instr->is_array && instr->op != nir_texop_txf) {
4745       args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]);
4746    }
4747 
4748    if (instr->coord_components > 2 &&
4749        (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
4750         instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
4751         instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
4752        instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms &&
4753        instr->op != nir_texop_fragment_fetch_amd && instr->op != nir_texop_fragment_mask_fetch_amd) {
4754       args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]);
4755    }
4756 
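   /* GFX9 addresses 1D textures as 2D internally, so a dummy second
    * coordinate is inserted: 0 for integer fetches, 0.5 for filtered
    * ops so sampling stays centered on the texel row. */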
4757    if (ctx->ac.chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
4758        instr->op != nir_texop_lod) {
4759       LLVMValueRef filler;
4760       if (instr->op == nir_texop_txf)
4761          filler = args.a16 ? ctx->ac.i16_0 : ctx->ac.i32_0;
4762       else
4763          filler = LLVMConstReal(args.a16 ? ctx->ac.f16 : ctx->ac.f32, 0.5);
4764 
4765       if (instr->is_array)
4766          args.coords[2] = args.coords[1];
4767       args.coords[1] = filler;
4768    }
4769 
4770    /* Pack sample index */
4771    if (sample_index && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_fragment_fetch_amd))
4772       args.coords[instr->coord_components] = sample_index;
4773 
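   /* samples_identical fetches the FMASK word itself: a value of 0
    * means every sample still maps to fragment 0, i.e. all samples of
    * the pixel are identical. */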
4774    if (instr->op == nir_texop_samples_identical) {
4775       struct ac_image_args txf_args = {0};
4776       memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords));
4777 
4778       txf_args.dmask = 0xf;
4779       txf_args.resource = fmask_ptr;
4780       txf_args.dim = instr->is_array ? ac_image_2darray : ac_image_2d;
4781       result = build_tex_intrinsic(ctx, instr, &txf_args);
4782 
4783       result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
4784       result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0);
4785       goto write_result;
4786    }
4787 
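   /* For compressed MSAA surfaces, the incoming sample index is virtual
    * and must be remapped through the FMASK before fetching. */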
4788    if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ||
4789         instr->sampler_dim == GLSL_SAMPLER_DIM_MS) &&
4790        instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch_amd &&
4791        instr->op != nir_texop_fragment_mask_fetch_amd) {
4792       unsigned sample_chan = instr->is_array ? 3 : 2;
4793       args.coords[sample_chan] = adjust_sample_index_using_fmask(
4794          &ctx->ac, args.coords[0], args.coords[1], instr->is_array ? args.coords[2] : NULL,
4795          args.coords[sample_chan], fmask_ptr);
4796    }
4797 
4798    if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
4799       int num_offsets = instr->src[offset_src].src.ssa->num_components;
4800       num_offsets = MIN2(num_offsets, instr->coord_components);
4801       for (unsigned i = 0; i < num_offsets; ++i) {
4802          LLVMValueRef off = ac_llvm_extract_elem(&ctx->ac, args.offset, i);
4803          if (args.a16)
4804             off = LLVMBuildTrunc(ctx->ac.builder, off, ctx->ac.i16, "");
4805          args.coords[i] = LLVMBuildAdd(ctx->ac.builder, args.coords[i], off, "");
4806       }
4807       args.offset = NULL;
4808    }
4809 
4810    /* DMASK was repurposed for GATHER4. 4 components are always
4811     * returned and DMASK works like a swizzle - it selects
4812     * the component to fetch. The only valid DMASK values are
4813     * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4814     * (red,red,red,red) etc.) The ISA document doesn't mention
4815     * this.
4816     */
4817    args.dmask = 0xf;
4818    if (instr->op == nir_texop_tg4) {
4819       if (instr->is_shadow)
4820          args.dmask = 1;
4821       else
4822          args.dmask = 1 << instr->component;
4823    }
4824 
4825    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
4826       args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array);
4827       args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
4828    }
4829 
4830    /* Adjust the number of coordinates because we only need (x,y) for 2D
4831     * multisampled images and (x,y,layer) for 2D multisampled layered
4832     * images or for multisampled input attachments.
4833     */
4834    if (instr->op == nir_texop_fragment_mask_fetch_amd) {
4835       if (args.dim == ac_image_2dmsaa) {
4836          args.dim = ac_image_2d;
4837       } else {
4838          assert(args.dim == ac_image_2darraymsaa);
4839          args.dim = ac_image_2darray;
4840       }
4841    }
4842 
4843    /* Set TRUNC_COORD=0 for textureGather(). */
4844    if (instr->op == nir_texop_tg4) {
4845       LLVMValueRef dword0 = LLVMBuildExtractElement(ctx->ac.builder, args.sampler, ctx->ac.i32_0, "");
4846       dword0 = LLVMBuildAnd(ctx->ac.builder, dword0, LLVMConstInt(ctx->ac.i32, C_008F30_TRUNC_COORD, 0), "");
4847       args.sampler = LLVMBuildInsertElement(ctx->ac.builder, args.sampler, dword0, ctx->ac.i32_0, "");
4848    }
4849 
4850    assert(instr->dest.is_ssa);
4851    args.d16 = instr->dest.ssa.bit_size == 16;
4852    args.tfe = instr->is_sparse;
4853 
4854    result = build_tex_intrinsic(ctx, instr, &args);
4855 
4856    LLVMValueRef code = NULL;
4857    if (instr->is_sparse) {
4858       code = ac_llvm_extract_elem(&ctx->ac, result, 4);
4859       result = ac_trim_vector(&ctx->ac, result, 4);
4860    }
4861 
4862    if (instr->op == nir_texop_query_levels)
4863       result =
4864          LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), "");
4865    else if (instr->is_shadow && instr->is_new_style_shadow && instr->op != nir_texop_txs &&
4866             instr->op != nir_texop_lod && instr->op != nir_texop_tg4)
4867       result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
4868    else if (ctx->ac.chip_class == GFX9 && instr->op == nir_texop_txs &&
4869               instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
4870       LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
4871       LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
4872       result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, ctx->ac.i32_1, "");
4873    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
4874       /* Use 0x76543210 if the image doesn't have FMASK. */
4875       LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
4876       tmp = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, "");
4877       tmp = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, tmp, ctx->ac.i32_0, "");
4878       result = LLVMBuildSelect(ctx->ac.builder, tmp,
4879                                LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""),
4880                                LLVMConstInt(ctx->ac.i32, 0x76543210, false), "");
4881    } else if (nir_tex_instr_result_size(instr) != 4)
4882       result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components);
4883 
4884    if (instr->is_sparse)
4885       result = ac_build_concat(&ctx->ac, result, code);
4886 
4887 write_result:
4888    if (result) {
4889       assert(instr->dest.is_ssa);
4890       result = ac_to_integer(&ctx->ac, result);
4891 
4892       for (int i = ARRAY_SIZE(wctx); --i >= 0;) {
4893          result = exit_waterfall(ctx, wctx + i, result);
4894       }
4895 
4896       ctx->ssa_defs[instr->dest.ssa.index] = result;
4897    }
4898 }
4899 
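/* Phis are built in two passes: visit_phi creates an empty LLVM phi so
 * that forward references resolve, and visit_post_phi fills in the
 * incoming edges once all predecessor blocks have been translated. */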
4900 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
4901 {
4902    LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
4903    LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, "");
4904 
4905    ctx->ssa_defs[instr->dest.ssa.index] = result;
4906    _mesa_hash_table_insert(ctx->phis, instr, result);
4907 }
4908 
4909 static void visit_post_phi(struct ac_nir_context *ctx, nir_phi_instr *instr, LLVMValueRef llvm_phi)
4910 {
4911    nir_foreach_phi_src (src, instr) {
4912       LLVMBasicBlockRef block = get_block(ctx, src->pred);
4913       LLVMValueRef llvm_src = get_src(ctx, src->src);
4914 
4915       LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1);
4916    }
4917 }
4918 
4919 static void phi_post_pass(struct ac_nir_context *ctx)
4920 {
4921    hash_table_foreach(ctx->phis, entry)
4922    {
4923       visit_post_phi(ctx, (nir_phi_instr *)entry->key, (LLVMValueRef)entry->data);
4924    }
4925 }
4926 
is_def_used_in_an_export(const nir_ssa_def * def)4927 static bool is_def_used_in_an_export(const nir_ssa_def *def)
4928 {
4929    nir_foreach_use (use_src, def) {
4930       if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
4931          nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr);
4932          if (instr->intrinsic == nir_intrinsic_store_deref)
4933             return true;
4934       } else if (use_src->parent_instr->type == nir_instr_type_alu) {
4935          nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr);
4936          if (instr->op == nir_op_vec4 && is_def_used_in_an_export(&instr->dest.dest.ssa)) {
4937             return true;
4938          }
4939       }
4940    }
4941    return false;
4942 }
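
/* For illustration: with
 *    vec4 v = vec4(a, b, undef, undef);
 *    gl_Position = v;   // becomes a store_deref
 * the undef channels reach the store through nir_op_vec4, so the recursive
 * check above returns true for them and visit_ssa_undef() below keeps them
 * as real LLVM undefs rather than folding them to zero.
 */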

static void visit_ssa_undef(struct ac_nir_context *ctx, const nir_ssa_undef_instr *instr)
{
   unsigned num_components = instr->def.num_components;
   LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);

   if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) {
      LLVMValueRef undef;

      if (num_components == 1)
         undef = LLVMGetUndef(type);
      else {
         undef = LLVMGetUndef(LLVMVectorType(type, num_components));
      }
      ctx->ssa_defs[instr->def.index] = undef;
   } else {
      LLVMValueRef zero = LLVMConstInt(type, 0, false);
      if (num_components > 1) {
         /* Replicate the scalar zero into a vector of the right width. */
         zero = ac_build_gather_values_extended(&ctx->ac, &zero, num_components, 0, false, false);
      }
      ctx->ssa_defs[instr->def.index] = zero;
   }
}
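
/* Sketch of the zero path above (illustrative): a 3-component, 32-bit undef
 * with convert_undef_to_zero set and no export use is materialized as
 *    <3 x i32> zeroinitializer
 * while a def that feeds an export is left as a genuine LLVM undef.
 */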

static void visit_jump(struct ac_llvm_context *ctx, const nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      ac_build_break(ctx);
      break;
   case nir_jump_continue:
      ac_build_continue(ctx);
      break;
   default:
      fprintf(stderr, "Unknown NIR jump instr: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static LLVMTypeRef glsl_base_to_llvm_type(struct ac_llvm_context *ac, enum glsl_base_type type)
{
   switch (type) {
   case GLSL_TYPE_INT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_SUBROUTINE:
      return ac->i32;
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT8:
      return ac->i8;
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT16:
      return ac->i16;
   case GLSL_TYPE_FLOAT:
      return ac->f32;
   case GLSL_TYPE_FLOAT16:
      return ac->f16;
   case GLSL_TYPE_INT64:
   case GLSL_TYPE_UINT64:
      return ac->i64;
   case GLSL_TYPE_DOUBLE:
      return ac->f64;
   default:
      unreachable("unknown GLSL type");
   }
}

static LLVMTypeRef glsl_to_llvm_type(struct ac_llvm_context *ac, const struct glsl_type *type)
{
   if (glsl_type_is_scalar(type)) {
      return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
   }

   if (glsl_type_is_vector(type)) {
      return LLVMVectorType(glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
                            glsl_get_vector_elements(type));
   }

   if (glsl_type_is_matrix(type)) {
      return LLVMArrayType(glsl_to_llvm_type(ac, glsl_get_column_type(type)),
                           glsl_get_matrix_columns(type));
   }

   if (glsl_type_is_array(type)) {
      return LLVMArrayType(glsl_to_llvm_type(ac, glsl_get_array_element(type)),
                           glsl_get_length(type));
   }

   assert(glsl_type_is_struct_or_ifc(type));

   LLVMTypeRef *const member_types = alloca(glsl_get_length(type) * sizeof(LLVMTypeRef));

   for (unsigned i = 0; i < glsl_get_length(type); i++) {
      member_types[i] = glsl_to_llvm_type(ac, glsl_get_struct_field(type, i));
   }

   return LLVMStructTypeInContext(ac->context, member_types, glsl_get_length(type), false);
}
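
/* Example mappings produced by glsl_to_llvm_type() (illustrative):
 *    float     -> float
 *    vec3      -> <3 x float>
 *    mat2x3    -> [2 x <3 x float>]   (one array element per column)
 *    float[8]  -> [8 x float]
 *    struct    -> a non-packed LLVM struct, one member per field
 */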

static void visit_deref(struct ac_nir_context *ctx, nir_deref_instr *instr)
{
   if (!nir_deref_mode_is_one_of(instr, nir_var_mem_shared | nir_var_mem_global))
      return;

   LLVMValueRef result = NULL;
   switch (instr->deref_type) {
   case nir_deref_type_var: {
      struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var);
      result = entry->data;
      break;
   }
   case nir_deref_type_struct:
      if (nir_deref_mode_is(instr, nir_var_mem_global)) {
         nir_deref_instr *parent = nir_deref_instr_parent(instr);
         uint64_t offset = glsl_get_struct_field_offset(parent->type, instr->strct.index);
         result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent),
                                   LLVMConstInt(ctx->ac.i32, offset, 0));
      } else {
         result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
                                LLVMConstInt(ctx->ac.i32, instr->strct.index, 0));
      }
      break;
   case nir_deref_type_array:
      if (nir_deref_mode_is(instr, nir_var_mem_global)) {
         nir_deref_instr *parent = nir_deref_instr_parent(instr);
         unsigned stride = glsl_get_explicit_stride(parent->type);

         if ((glsl_type_is_matrix(parent->type) && glsl_matrix_type_is_row_major(parent->type)) ||
             (glsl_type_is_vector(parent->type) && stride == 0))
            stride = type_scalar_size_bytes(parent->type);

         assert(stride > 0);
         LLVMValueRef index = get_src(ctx, instr->arr.index);
         if (LLVMTypeOf(index) != ctx->ac.i64)
            index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, "");

         LLVMValueRef offset =
            LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), "");

         result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset);
      } else {
         result =
            ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index));
      }
      break;
   case nir_deref_type_ptr_as_array:
      if (nir_deref_mode_is(instr, nir_var_mem_global)) {
         unsigned stride = nir_deref_instr_array_stride(instr);

         LLVMValueRef index = get_src(ctx, instr->arr.index);
         if (LLVMTypeOf(index) != ctx->ac.i64)
            index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, "");

         LLVMValueRef offset =
            LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), "");

         result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset);
      } else {
         result =
            ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index));
      }
      break;
   case nir_deref_type_cast: {
      result = get_src(ctx, instr->parent);

      /* We can't use the structs from LLVM because the shader
       * specifies its own offsets. */
      LLVMTypeRef pointee_type = ctx->ac.i8;
      if (nir_deref_mode_is(instr, nir_var_mem_shared))
         pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type);

      unsigned address_space;

      switch (instr->modes) {
      case nir_var_mem_shared:
         address_space = AC_ADDR_SPACE_LDS;
         break;
      case nir_var_mem_global:
         address_space = AC_ADDR_SPACE_GLOBAL;
         break;
      default:
         unreachable("Unhandled address space");
      }

      LLVMTypeRef type = LLVMPointerType(pointee_type, address_space);

      if (LLVMTypeOf(result) != type) {
         if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) {
            result = LLVMBuildBitCast(ctx->ac.builder, result, type, "");
         } else {
            result = LLVMBuildIntToPtr(ctx->ac.builder, result, type, "");
         }
      }
      break;
   }
   default:
      unreachable("Unhandled deref_instr deref type");
   }

   ctx->ssa_defs[instr->dest.ssa.index] = result;
}
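
/* Worked example for the global-memory array case above (illustrative):
 * indexing an array with explicit stride 16 at index %i emits
 *    %idx = zext i32 %i to i64
 *    %off = mul i64 %idx, 16
 * followed by a byte-granular GEP on the parent pointer; the cast case
 * keeps the pointee as i8 precisely so these shader-specified byte
 * offsets can be used instead of LLVM's own struct layout.
 */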

static void visit_cf_list(struct ac_nir_context *ctx, struct exec_list *list);

static void visit_block(struct ac_nir_context *ctx, nir_block *block)
{
   LLVMBasicBlockRef blockref = LLVMGetInsertBlock(ctx->ac.builder);
   LLVMValueRef first = LLVMGetFirstInstruction(blockref);
   if (first) {
      /* ac_branch_exited() might have already inserted non-phis */
      LLVMPositionBuilderBefore(ctx->ac.builder, first);
   }

   nir_foreach_instr (instr, block) {
      if (instr->type != nir_instr_type_phi)
         break;
      visit_phi(ctx, nir_instr_as_phi(instr));
   }

   LLVMPositionBuilderAtEnd(ctx->ac.builder, blockref);

   nir_foreach_instr (instr, block) {
      switch (instr->type) {
      case nir_instr_type_alu:
         visit_alu(ctx, nir_instr_as_alu(instr));
         break;
      case nir_instr_type_load_const:
         visit_load_const(ctx, nir_instr_as_load_const(instr));
         break;
      case nir_instr_type_intrinsic:
         visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
         break;
      case nir_instr_type_tex:
         visit_tex(ctx, nir_instr_as_tex(instr));
         break;
      case nir_instr_type_phi:
         break;
      case nir_instr_type_ssa_undef:
         visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr));
         break;
      case nir_instr_type_jump:
         visit_jump(&ctx->ac, nir_instr_as_jump(instr));
         break;
      case nir_instr_type_deref:
         visit_deref(ctx, nir_instr_as_deref(instr));
         break;
      default:
         fprintf(stderr, "Unknown NIR instr type: ");
         nir_print_instr(instr, stderr);
         fprintf(stderr, "\n");
         abort();
      }
   }

   _mesa_hash_table_insert(ctx->defs, block, LLVMGetInsertBlock(ctx->ac.builder));
}

static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt)
{
   LLVMValueRef value = get_src(ctx, if_stmt->condition);

   nir_block *then_block = (nir_block *)exec_list_get_head(&if_stmt->then_list);

   ac_build_ifcc(&ctx->ac, value, then_block->index);

   visit_cf_list(ctx, &if_stmt->then_list);

   if (!exec_list_is_empty(&if_stmt->else_list)) {
      nir_block *else_block = (nir_block *)exec_list_get_head(&if_stmt->else_list);

      ac_build_else(&ctx->ac, else_block->index);
      visit_cf_list(ctx, &if_stmt->else_list);
   }

   ac_build_endif(&ctx->ac, then_block->index);
}
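
/* For illustration, the NIR control flow
 *    if (cond) { A } else { B }
 * is emitted above as
 *    ac_build_ifcc(cond) -> A -> ac_build_else() -> B -> ac_build_endif()
 * with then_block->index serving as the label that pairs up the
 * corresponding flow-stack push and pop.
 */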

static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop)
{
   nir_block *first_loop_block = (nir_block *)exec_list_get_head(&loop->body);

   ac_build_bgnloop(&ctx->ac, first_loop_block->index);

   visit_cf_list(ctx, &loop->body);

   ac_build_endloop(&ctx->ac, first_loop_block->index);
}

static void visit_cf_list(struct ac_nir_context *ctx, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list)
   {
      switch (node->type) {
      case nir_cf_node_block:
         visit_block(ctx, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         visit_if(ctx, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         visit_loop(ctx, nir_cf_node_as_loop(node));
         break;

      default:
         assert(0);
      }
   }
}

void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi,
                                  struct nir_shader *nir, struct nir_variable *variable,
                                  gl_shader_stage stage)
{
   unsigned output_loc = variable->data.driver_location;
   unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);

   /* tess ctrl has its own load/store paths for outputs */
   if (stage == MESA_SHADER_TESS_CTRL)
      return;

   if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL ||
       stage == MESA_SHADER_GEOMETRY) {
      int idx = variable->data.location + variable->data.index;
      if (idx == VARYING_SLOT_CLIP_DIST0) {
         int length = nir->info.clip_distance_array_size + nir->info.cull_distance_array_size;

         if (length > 4)
            attrib_count = 2;
         else
            attrib_count = 1;
      }
   }

   bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
   LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
   for (unsigned i = 0; i < attrib_count; ++i) {
      for (unsigned chan = 0; chan < 4; chan++) {
         abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] =
            ac_build_alloca_undef(ctx, type, "");
      }
   }
}
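
/* Example for the clip-distance special case above (illustrative): with
 * gl_ClipDistance[5] and no cull distances, length = 5 > 4, so the array
 * occupies two vec4 varying slots (CLIP_DIST0 and CLIP_DIST1) and
 * attrib_count = 2, i.e. 2 * 4 = 8 channel allocas are created.
 */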

static void setup_scratch(struct ac_nir_context *ctx, struct nir_shader *shader)
{
   if (shader->scratch_size == 0)
      return;

   ctx->scratch =
      ac_build_alloca_undef(&ctx->ac, LLVMArrayType(ctx->ac.i8, shader->scratch_size), "scratch");
}

static void setup_constant_data(struct ac_nir_context *ctx, struct nir_shader *shader)
{
   if (!shader->constant_data)
      return;

   LLVMValueRef data = LLVMConstStringInContext(ctx->ac.context, shader->constant_data,
                                                shader->constant_data_size, true);
   LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, shader->constant_data_size);
   LLVMValueRef global =
      LLVMAddGlobalInAddressSpace(ctx->ac.module, type, "const_data", AC_ADDR_SPACE_CONST);

   LLVMSetInitializer(global, data);
   LLVMSetGlobalConstant(global, true);
   LLVMSetVisibility(global, LLVMHiddenVisibility);
   ctx->constant_data = global;
}

static void setup_shared(struct ac_nir_context *ctx, struct nir_shader *nir)
{
   if (ctx->ac.lds)
      return;

   LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, nir->info.shared_size);

   LLVMValueRef lds =
      LLVMAddGlobalInAddressSpace(ctx->ac.module, type, "compute_lds", AC_ADDR_SPACE_LDS);
   LLVMSetAlignment(lds, 64 * 1024);

   ctx->ac.lds =
      LLVMBuildBitCast(ctx->ac.builder, lds, LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS), "");
}

void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
                      const struct ac_shader_args *args, struct nir_shader *nir)
{
   struct ac_nir_context ctx = {0};
   struct nir_function *func;

   ctx.ac = *ac;
   ctx.abi = abi;
   ctx.args = args;

   ctx.stage = nir->info.stage;
   ctx.info = &nir->info;

   ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));

   /* TODO: remove this after RADV switches to lowered IO */
   if (!nir->info.io_lowered) {
      nir_foreach_shader_out_variable(variable, nir)
      {
         ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, ctx.stage);
      }
   }

   ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
   ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
   ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);

   if (ctx.abi->kill_ps_if_inf_interp)
      ctx.verified_interp =
         _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);

   func = (struct nir_function *)exec_list_get_head(&nir->functions);

   nir_index_ssa_defs(func->impl);
   ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef));

   setup_scratch(&ctx, nir);
   setup_constant_data(&ctx, nir);

   if (gl_shader_stage_is_compute(nir->info.stage))
      setup_shared(&ctx, nir);

   if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.fs.uses_demote &&
       LLVM_VERSION_MAJOR < 13) {
      /* true = don't kill. */
      ctx.ac.postponed_kill = ac_build_alloca_init(&ctx.ac, ctx.ac.i1true, "");
   }

   visit_cf_list(&ctx, &func->impl->body);
   phi_post_pass(&ctx);

   if (ctx.ac.postponed_kill)
      ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder, ctx.ac.postponed_kill, ""));

   if (!gl_shader_stage_is_compute(nir->info.stage))
      ctx.abi->emit_outputs(ctx.abi);

   free(ctx.ssa_defs);
   ralloc_free(ctx.defs);
   ralloc_free(ctx.phis);
   ralloc_free(ctx.vars);
   if (ctx.abi->kill_ps_if_inf_interp)
      ralloc_free(ctx.verified_interp);
}

static unsigned get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return 0;

   unsigned writemask = nir_intrinsic_write_mask(intrin) << nir_intrinsic_component(intrin);
   unsigned location = nir_intrinsic_io_semantics(intrin).location;

   if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
      return writemask << 4;
   else if (location == VARYING_SLOT_TESS_LEVEL_INNER)
      return writemask;

   return 0;
}
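
/* Resulting bit layout (illustrative): inner tess factors occupy bits 0-3
 * and outer factors bits 4-7, so e.g. a store of gl_TessLevelOuter[0..1]
 * (write mask 0x3 at component 0) contributes 0x3 << 4 = 0x30.
 */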

static void scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask,
                           unsigned *cond_block_tf_writemask,
                           bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf)
{
   switch (cf_node->type) {
   case nir_cf_node_block: {
      nir_block *block = nir_cf_node_as_block(cf_node);
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic == nir_intrinsic_control_barrier) {

            /* If we find a barrier in nested control flow, put this in the
             * too-hard basket. In GLSL this is not possible, but it is in
             * SPIR-V.
             */
            if (is_nested_cf) {
               *tessfactors_are_def_in_all_invocs = false;
               return;
            }

            /* The following case must be prevented:
             *    gl_TessLevelInner = ...;
             *    barrier();
             *    if (gl_InvocationID == 1)
             *       gl_TessLevelInner = ...;
             *
             * If you consider disjoint code segments separated by barriers, each
             * such segment that writes tess factor channels should write the same
             * channels in all codepaths within that segment.
             */
            if (*upper_block_tf_writemask || *cond_block_tf_writemask) {
               /* Accumulate the result: */
               *tessfactors_are_def_in_all_invocs &=
                  !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask));

               /* Analyze the next code segment from scratch. */
               *upper_block_tf_writemask = 0;
               *cond_block_tf_writemask = 0;
            }
         } else
            *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin);
      }

      break;
   }
   case nir_cf_node_if: {
      unsigned then_tessfactor_writemask = 0;
      unsigned else_tessfactor_writemask = 0;

      nir_if *if_stmt = nir_cf_node_as_if(cf_node);
      foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list)
      {
         scan_tess_ctrl(nested_node, &then_tessfactor_writemask, cond_block_tf_writemask,
                        tessfactors_are_def_in_all_invocs, true);
      }

      foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list)
      {
         scan_tess_ctrl(nested_node, &else_tessfactor_writemask, cond_block_tf_writemask,
                        tessfactors_are_def_in_all_invocs, true);
      }

      if (then_tessfactor_writemask || else_tessfactor_writemask) {
         /* If both branches write the same tess factor channels,
          * we can say that the upper block writes them too.
          */
         *upper_block_tf_writemask |= then_tessfactor_writemask & else_tessfactor_writemask;
         *cond_block_tf_writemask |= then_tessfactor_writemask | else_tessfactor_writemask;
      }

      break;
   }
   case nir_cf_node_loop: {
      nir_loop *loop = nir_cf_node_as_loop(cf_node);
      foreach_list_typed(nir_cf_node, nested_node, node, &loop->body)
      {
         scan_tess_ctrl(nested_node, cond_block_tf_writemask, cond_block_tf_writemask,
                        tessfactors_are_def_in_all_invocs, true);
      }

      break;
   }
   default:
      unreachable("unknown cf node type");
   }
}
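
/* Worked example (illustrative):
 *    if (gl_InvocationID == 0)
 *       gl_TessLevelInner[0] = 1.0;   // cond_block mask |= 0x1
 * Nothing is written unconditionally, so the upper-block mask stays 0 and
 * the final accumulation below computes !(0x1 & ~0x0) = false: the tess
 * factors are not defined in all invocations.
 */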

bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir)
{
   assert(nir->info.stage == MESA_SHADER_TESS_CTRL);

   /* The pass works as follows:
    * If all codepaths write tess factors, we can say that all
    * invocations define tess factors.
    *
    * Each tess factor channel is tracked separately.
    */
   unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */
   unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */

   /* Initial value = true. Here the pass will accumulate results from
    * multiple segments surrounded by barriers. If tess factors aren't
    * written at all, it's a shader bug and we don't care if this will be
    * true.
    */
   bool tessfactors_are_def_in_all_invocs = true;

   nir_foreach_function (function, nir) {
      if (function->impl) {
         foreach_list_typed(nir_cf_node, node, node, &function->impl->body)
         {
            scan_tess_ctrl(node, &main_block_tf_writemask, &cond_block_tf_writemask,
                           &tessfactors_are_def_in_all_invocs, false);
         }
      }
   }

   /* Accumulate the result for the last code segment separated by a
    * barrier.
    */
   if (main_block_tf_writemask || cond_block_tf_writemask) {
      tessfactors_are_def_in_all_invocs &= !(cond_block_tf_writemask & ~main_block_tf_writemask);
   }

   return tessfactors_are_def_in_all_invocs;
}