/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

/* 500 gets us LDIB but doesn't change any other a4xx instructions */
#define GPU 500

#include "ir3_context.h"
#include "ir3_image.h"

/*
 * Handlers for instructions changed/added in a4xx:
 */
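/*
 * These handlers lower NIR SSBO/image intrinsics onto the cat6 global
 * load/store instructions (ldgb/stgb, ldib/stib and the atomic_*_g
 * family), with the buffer/image source converted to an IBO via
 * ir3_ssbo_to_ibo()/ir3_image_to_ibo() first.
 */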

/* src[] = { buffer_index, byte_offset, dword_offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;

   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);

   byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
   offset = ir3_get_src(ctx, &intr->src[2])[0];

   /* src0 is uvec2(byte_offset, 0), src1 is the dword offset.. NIR has
    * already done the *= 4:
    */
   src0 = ir3_collect(b, byte_offset, create_immed(b, 0));
   src1 = offset;

   ldgb = ir3_LDGB(b, ssbo, 0, src0, 0, src1, 0);
   ldgb->dsts[0]->wrmask = MASK(intr->num_components);
   ldgb->cat6.iim_val = intr->num_components;
   ldgb->cat6.d = 4;
   ldgb->cat6.type = TYPE_U32;
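   /* barrier_class/barrier_conflict don't emit anything themselves;
    * they describe, for the ir3 scheduler, which other memory accesses
    * this one must stay ordered against: a buffer read only conflicts
    * with buffer writes, not with other reads:
    */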
   ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
   ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;

   ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
}

/* src[] = { value, block_index, byte_offset, dword_offset }.
 * const_index[] = { write_mask }
 */
static void
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   unsigned ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));
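   /* ffs(~wrmask) - 1 counts the contiguous set bits starting at bit 0,
    * and the assert above guarantees the writemask is dense, so ncomp
    * is simply the number of components being stored.
    */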

   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);

   byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
   offset = ir3_get_src(ctx, &intr->src[3])[0];

   /* src0 is the value, src1 is the dword offset, src2 is
    * uvec2(byte_offset, 0).. NIR has already done the *= 4:
    */
   src0 = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);
   src1 = offset;
   src2 = ir3_collect(b, byte_offset, create_immed(b, 0));

   stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
   stgb->cat6.iim_val = ncomp;
   stgb->cat6.d = 4;
   stgb->cat6.type = TYPE_U32;
   stgb->barrier_class = IR3_BARRIER_BUFFER_W;
   stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

   /* the store is a side effect with no consumed result, so make sure
    * it isn't DCE'd:
    */
   array_insert(b, b->keeps, stgb);
}

/*
 * SSBO atomic intrinsics
 *
 * All of the SSBO atomic memory operations read a value from memory,
 * compute a new value using one of the operations below, write the new
 * value to memory, and return the original value read.
 *
 * All operations take 4 sources except CompSwap, which takes 5. These
 * sources represent:
 *
 * 0: The SSBO buffer index.
 * 1: The byte offset into the SSBO buffer of the variable that the atomic
 *    operation will operate on.
 * 2: The data parameter to the atomic function (i.e. the value to add
 *    in ssbo_atomic_add, etc).
 * 3: CompSwap: the second data parameter.
 *    Non-CompSwap: The dword offset into the SSBO buffer variable.
 * 4: CompSwap: The dword offset into the SSBO buffer variable.
 *
 * We use custom ssbo_*_ir3 intrinsics generated by ir3_nir_lower_io_offsets()
 * so we can have the dword offset generated in NIR.
 */
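/*
 * For example, a (lowered) NIR
 *
 *    ssbo_atomic_add_ir3 buf, byte_off, data, dword_off
 *
 * ends up as an atomic.add.g with sources (data, dword_off,
 * uvec2(byte_off, 0)), which is what the switch below emits for each
 * operation.  (Illustrative only; see ir3_nir_lower_io_offsets() for
 * the actual lowering.)
 */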
static struct ir3_instruction *
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic;
   type_t type = TYPE_U32;

   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);

   struct ir3_instruction *data = ir3_get_src(ctx, &intr->src[2])[0];
   /* 64b byte offset */
   struct ir3_instruction *byte_offset =
      ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0], create_immed(b, 0));
   /* dword offset for everything but comp_swap */
   struct ir3_instruction *src3 = ir3_get_src(ctx, &intr->src[3])[0];

   switch (intr->intrinsic) {
   case nir_intrinsic_ssbo_atomic_add_ir3:
      atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_and_ir3:
      atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_or_ir3:
      atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_xor_ir3:
      atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_exchange_ir3:
      atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
      /* for cmpxchg, src0 is [ui]vec2(data, compare): */
      data = ir3_collect(b, src3, data);
      struct ir3_instruction *dword_offset = ir3_get_src(ctx, &intr->src[4])[0];
      atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, data, 0, dword_offset, 0,
                                    byte_offset, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 4;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}

static struct ir3_instruction *
get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
                 struct ir3_instruction *const *coords, bool byteoff)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *offset;
   unsigned index = nir_src_as_uint(instr->src[0]);
   unsigned ncoords = ir3_get_image_coords(instr, NULL);

   /* to calculate the byte offset (yes, uggg) we need (up to) three
    * const values to know the bytes per pixel, and y and z stride:
    */
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned cb = regid(const_state->offsets.image_dims, 0) +
                 const_state->image_dims.off[index];

   debug_assert(const_state->image_dims.mask & (1 << index));

   /* offset = coords.x * bytes_per_pixel: */
   offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
   if (ncoords > 1) {
      /* offset += coords.y * y_pitch: */
      offset =
         ir3_MAD_S24(b, create_uniform(b, cb + 1), 0, coords[1], 0, offset, 0);
   }
   if (ncoords > 2) {
      /* offset += coords.z * z_pitch: */
      offset =
         ir3_MAD_S24(b, create_uniform(b, cb + 2), 0, coords[2], 0, offset, 0);
   }

   if (!byteoff) {
      /* Some cases, like atomics, seem to use a dword offset instead
       * of a byte offset.. the blob just puts an extra shr.b in there
       * in those cases:
       */
      offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
   }

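   /* the cat6 offset source is 64b, so pair the computed offset with a
    * zero high dword:
    */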
   return ir3_collect(b, offset, create_immed(b, 0));
}

/* src[] = { index, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
   struct ir3_instruction *offset = get_image_offset(ctx, intr, coords, true);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   unsigned ncomp =
      ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));

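   /* ldib is the instruction that GPU=500 above buys us: a typed image
    * load taking the coords plus the computed byte offset:
    */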
   struct ir3_instruction *ldib = ir3_LDIB(
      b, ibo, 0, offset, 0, ir3_create_collect(b, coords, ncoords), 0);
   ldib->dsts[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = ncomp;
   ldib->cat6.d = ncoords;
   ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   ldib->cat6.typed = true;
   ldib->barrier_class = IR3_BARRIER_IMAGE_R;
   ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}

/* src[] = { index, coord, sample_index, value }. const_index[] = {} */
static void
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stib, *offset;
   struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   unsigned ncomp =
      ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));

   /* src0 is value
    * src1 is coords
    * src2 is 64b byte offset
    */

   offset = get_image_offset(ctx, intr, coords, true);

   /* NOTE: stib seems to take a byte offset, but stgb.typed can be used
    * too and takes a dword offset.. not quite sure yet why the blob uses
    * one over the other in various cases.
    */

   stib = ir3_STIB(b, ibo, 0, ir3_create_collect(b, value, ncomp), 0,
                   ir3_create_collect(b, coords, ncoords), 0, offset, 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = ncoords;
   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   stib->cat6.typed = true;
   stib->barrier_class = IR3_BARRIER_IMAGE_W;
   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;

   array_insert(b, b->keeps, stib);
}

/* src[] = { index, coord, sample_index, value, compare }. const_index[] = {} */
static struct ir3_instruction *
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *src0, *src1, *src2;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *image = ir3_image_to_ibo(ctx, intr->src[0]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   /* src0 is value (or uvec2(value, compare))
    * src1 is coords
    * src2 is 64b dword offset
    */
   src0 = ir3_get_src(ctx, &intr->src[3])[0];
   src1 = ir3_create_collect(b, coords, ncoords);
   src2 = get_image_offset(ctx, intr, coords, false);

   switch (intr->intrinsic) {
   case nir_intrinsic_image_atomic_add:
      atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
      atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
      atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_and:
      atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_or:
      atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_xor:
      atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_exchange:
      atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      /* for cmpxchg, src0 is [ui]vec2(data, compare): */
      src0 = ir3_collect(b, ir3_get_src(ctx, &intr->src[4])[0], src0);
      atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = ncoords;
   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   atomic->cat6.typed = true;
   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;

   /* even if nothing consumes the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}

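/* vtable plugged into ir3_context; entries left NULL have no
 * a4xx-specific lowering:
 */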
const struct ir3_context_funcs ir3_a4xx_funcs = {
   .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
   .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
   .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
   .emit_intrinsic_load_image = emit_intrinsic_load_image,
   .emit_intrinsic_store_image = emit_intrinsic_store_image,
   .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
   .emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
   .emit_intrinsic_load_global_ir3 = NULL,
   .emit_intrinsic_store_global_ir3 = NULL,
};