/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"
/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where pipe_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct pipe_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   pipe_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
}

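/* Allocate a GPU buffer for the variant and copy in the assembled
 * instructions, so the command stream can reference them directly:
 */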
static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct shader_info *info = &v->shader->nir->info;
   struct ir3_compiler *compiler = v->shader->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, 0,
                "%s:%s", ir3_shader_stage(v), info->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
}

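/* Look up (compiling if necessary) the variant matching 'key'.  A variant
 * created after the initial-variants stage means a draw-time recompile,
 * which is reported via perf_debug:
 */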
struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct pipe_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

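/* Translate gallium's stream-output (transform feedback) info into the
 * equivalent ir3 representation; the two layouts match field-for-field:
 */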
static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
      i->stride[n] = p->stride[n];

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct pipe_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that we shader-db something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

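   /* If the variant uses more const space than is safe in all cases, also
    * pre-compile a safe_constlen fallback variant, since it may be needed
    * at draw time:
    */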
   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

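/* Entry points for the screen's compile queue.  Jobs run on a queue thread
 * where the context's debug callback isn't available, so a zeroed callback
 * is passed instead:
 */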
static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct pipe_debug_callback debug = {};

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct pipe_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of the kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else {
      debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants in our
    * shaders that doing so almost eliminates draw-time recompiles.  (This
    * is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

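/* Get the ir3_shader for a CSO, blocking until any outstanding async
 * initial-variant compile has finished; slow waits are reported as perf
 * warnings:
 */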
struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   struct ir3_shader *shader = hwcso->shader;
   perf_time(1000, "waited for %s:%s:%s variants",
             _mesa_shader_stage_to_abbrev(shader->type),
             shader->nir->info.name,
             shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

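/* screen->finalize_nir() hook: run the variant-independent parts of ir3's
 * NIR lowering up front, before any variants are compiled:
 */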
static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

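/* All graphics shader stages share the same CSO create/delete
 * implementation:
 */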
void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id, false);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores?  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

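/* Recompute the maximum number of vertices that can be emitted with the
 * currently bound stream-output buffers before one of them would overflow:
 */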
void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   struct ir3_stream_output_info *info = &v->shader->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   /* Clamp the working value, not ctx->streamout.max_tf_vtx directly,
    * since the final assignment below would otherwise clobber it:
    */
   if (v->shader->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}