1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 #include <pthread.h>
33 #include "main/glspirv.h"
34 #include "program/prog_parameter.h"
35 #include "program/prog_print.h"
36 #include "program/prog_to_nir.h"
37 #include "program/program.h"
38 #include "program/programopt.h"
39 #include "tnl/tnl.h"
40 #include "util/ralloc.h"
41 #include "compiler/glsl/ir.h"
42 #include "compiler/glsl/program.h"
43 #include "compiler/glsl/gl_nir.h"
44 #include "compiler/glsl/glsl_to_nir.h"
45 
46 #include "brw_program.h"
47 #include "brw_context.h"
48 #include "compiler/brw_nir.h"
49 #include "brw_defines.h"
50 #include "intel_batchbuffer.h"
51 
52 #include "brw_cs.h"
53 #include "brw_gs.h"
54 #include "brw_vs.h"
55 #include "brw_wm.h"
56 #include "brw_state.h"
57 
58 #include "main/shaderapi.h"
59 #include "main/shaderobj.h"
60 
61 static bool
brw_nir_lower_uniforms(nir_shader * nir,bool is_scalar)62 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
63 {
64    if (is_scalar) {
65       nir_assign_var_locations(nir, nir_var_uniform, &nir->num_uniforms,
66                                type_size_scalar_bytes);
67       return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
68    } else {
69       nir_assign_var_locations(nir, nir_var_uniform, &nir->num_uniforms,
70                                type_size_vec4_bytes);
71       return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
72    }
73 }
74 
75 static struct gl_program *brwNewProgram(struct gl_context *ctx,
76                                         gl_shader_stage stage,
77                                         GLuint id, bool is_arb_asm);
78 
/**
 * Translate one stage of a program to NIR and run brw's common lowering.
 *
 * \param shader_prog  the linked GLSL/SPIR-V shader program, or NULL for
 *                     ARB assembly and fixed-function programs
 * \param prog         the gl_program for this stage
 * \param is_scalar    NOTE(review): not referenced anywhere in this
 *                     function body — confirm whether it is still needed
 *                     or is kept only for signature compatibility.
 */
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(ctx, shader_prog, stage, options);

         /* Remap the locations to slots so those requiring two slots will
          * occupy two locations. For instance, if we have in the IR code a
          * dvec3 attr0 in location 0 and vec4 attr1 in location 1, in NIR attr0
          * will use locations/slots 0 and 1, and attr1 will use location/slot 2
          */
         if (nir->info.stage == MESA_SHADER_VERTEX)
            nir_remap_dual_slot_attributes(nir, &prog->DualSlotInputs);
      }
      assert (nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out,
                                NULL);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      /* ARB assembly / fixed-function path: translate Mesa IR instead. */
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Build the software-fp64 helper shader lazily, the first time a shader
    * that needs full software fp64 lowering shows up.
    */
   if (!ctx->SoftFP64 && nir->info.uses_64bit &&
       (options->lower_doubles_options & nir_lower_fp64_full_software)) {
      ctx->SoftFP64 = glsl_float64_funcs_to_nir(ctx, options);
   }

   brw_preprocess_nir(brw->screen->compiler, nir, ctx->SoftFP64);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       *
       * NOTE(review): shader_prog is dereferenced here without a NULL
       * check; this relies on tess-eval shaders never coming through the
       * ARB (shader_prog == NULL) path above — confirm.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         /* The pass now reads window-position transform state; record a
          * reference to it in the program's parameter list.
          */
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   return nir;
}
166 
/* Size/alignment callback for lowering shared-memory variables: reports the
 * byte size and alignment of a scalar or vector type.  Booleans are lowered
 * to 32 bits; three-component vectors are aligned like four-component ones.
 */
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);

   /* The two outputs were previously joined by a stray comma operator;
    * behavior was the same, but make them separate statements.
    */
   *size = comp_size * length;
   *align = comp_size * (length == 3 ? 4 : length);
}
178 
/* Lower uniform, sampler, image, shared-memory, and buffer access in the
 * program's NIR to the forms the i965 backend consumes, finishing with a
 * constant-folding pass to clean up the generated address math.
 *
 * NOTE(review): the first pass runs on the \p nir parameter while every
 * later pass uses prog->nir — this assumes callers always pass prog->nir
 * as \p nir; verify at call sites.
 */
void
brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
                        struct gl_program *prog,
                        const struct gen_device_info *devinfo)
{
   NIR_PASS_V(nir, brw_nir_lower_uniforms, nir->options->lower_to_scalar);
   NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
   /* Mirror sampler usage gathered during lowering into the gl_program. */
   prog->info.textures_used = prog->nir->info.textures_used;
   prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;

   NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo, NULL);

   /* SPIR-V shared variables arrive without an explicit layout: assign
    * explicit offsets (via shared_type_info) and lower access to
    * explicit-offset intrinsics.  shader_prog may be NULL for ARB programs,
    * but those are never compute, so the stage check short-circuits the
    * dereference.
    */
   if (prog->nir->info.stage == MESA_SHADER_COMPUTE &&
       shader_prog->data->spirv) {
      NIR_PASS_V(prog->nir, nir_lower_vars_to_explicit_types,
                 nir_var_mem_shared, shared_type_info);
      NIR_PASS_V(prog->nir, nir_lower_explicit_io,
                 nir_var_mem_shared, nir_address_format_32bit_offset);
   }

   NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
   /* Do a round of constant folding to clean up address calculations */
   NIR_PASS_V(prog->nir, nir_opt_constant_folding);
}
203 
204 void
brw_shader_gather_info(nir_shader * nir,struct gl_program * prog)205 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
206 {
207    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
208 
209    /* Copy the info we just generated back into the gl_program */
210    const char *prog_name = prog->info.name;
211    const char *prog_label = prog->info.label;
212    prog->info = nir->info;
213    prog->info.name = prog_name;
214    prog->info.label = prog_label;
215 }
216 
/* Hand out a screen-unique, monotonically increasing program id.  Atomic so
 * multiple contexts sharing one screen can compile concurrently.
 */
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}
222 
/* dd_function_table::NewProgram hook: allocate a brw_program wrapper, give
 * it a fresh screen-unique id, and initialize the embedded gl_program.
 * Returns NULL on allocation failure.
 */
static struct gl_program *brwNewProgram(struct gl_context *ctx,
                                        gl_shader_stage stage,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (!prog)
      return NULL;

   prog->id = get_new_program_id(brw->screen);

   return _mesa_init_gl_program(&prog->program, stage, id, is_arb_asm);
}
238 
brwDeleteProgram(struct gl_context * ctx,struct gl_program * prog)239 static void brwDeleteProgram( struct gl_context *ctx,
240 			      struct gl_program *prog )
241 {
242    struct brw_context *brw = brw_context(ctx);
243 
244    /* Beware!  prog's refcount has reached zero, and it's about to be freed.
245     *
246     * In brw_upload_pipeline_state(), we compare brw->programs[i] to
247     * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
248     * pointer has changed.
249     *
250     * We cannot leave brw->programs[i] as a dangling pointer to the dead
251     * program.  malloc() may allocate the same memory for a new gl_program,
252     * causing us to see matching pointers...but totally different programs.
253     *
254     * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
255     * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
256     * would cause us to see matching pointers (NULL == NULL), and fail to
257     * detect that a program has changed since our last draw.
258     *
259     * So, set it to a bogus gl_program pointer that will never match,
260     * causing us to properly reevaluate the state on our next draw.
261     *
262     * Getting this wrong causes heisenbugs which are very hard to catch,
263     * as you need a very specific allocation pattern to hit the problem.
264     */
265    static const struct gl_program deleted_program;
266 
267    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
268       if (brw->programs[i] == prog)
269          brw->programs[i] = (struct gl_program *) &deleted_program;
270    }
271 
272    _mesa_delete_program( ctx, prog );
273 }
274 
275 
/* dd_function_table::ProgramStringNotify hook.
 *
 * Called when the source of an ARB vertex/fragment program (or a
 * TNL-generated fixed-function program) changes: regenerates the NIR, runs
 * resource lowering, refreshes the gathered info, and precompiles.
 */
static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
		       GLenum target,
		       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      /* If the currently-bound program was edited, its state must be
       * re-uploaded on the next draw.
       */
      if (newFP == curFP)
	 brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      _mesa_program_fragment_position_to_sysval(&newFP->program);
      newFP->id = get_new_program_id(brw->screen);

      /* Fragment shaders always use the scalar backend (is_scalar = true). */
      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
	 brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      /* Position-invariant programs get the MVP transform inserted before
       * translation, so the generated NIR already contains it.
       */
      if (newVP->program.arb.IsPositionInvariant) {
	 _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end).  Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}
345 
346 static void
brw_memory_barrier(struct gl_context * ctx,GLbitfield barriers)347 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
348 {
349    struct brw_context *brw = brw_context(ctx);
350    const struct gen_device_info *devinfo = &brw->screen->devinfo;
351    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
352    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
353 
354    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
355                    GL_ELEMENT_ARRAY_BARRIER_BIT |
356                    GL_COMMAND_BARRIER_BIT))
357       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
358 
359    if (barriers & GL_UNIFORM_BARRIER_BIT)
360       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
361                PIPE_CONTROL_CONST_CACHE_INVALIDATE);
362 
363    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
364       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
365 
366    if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
367                    GL_PIXEL_BUFFER_BARRIER_BIT))
368       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
369                PIPE_CONTROL_RENDER_TARGET_FLUSH);
370 
371    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
372       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
373                PIPE_CONTROL_RENDER_TARGET_FLUSH);
374 
375    /* Typed surface messages are handled by the render cache on IVB, so we
376     * need to flush it too.
377     */
378    if (devinfo->gen == 7 && !devinfo->is_haswell)
379       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
380 
381    brw_emit_pipe_control_flush(brw, bits);
382 }
383 
384 static void
brw_framebuffer_fetch_barrier(struct gl_context * ctx)385 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
386 {
387    struct brw_context *brw = brw_context(ctx);
388    const struct gen_device_info *devinfo = &brw->screen->devinfo;
389 
390    if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
391       if (devinfo->gen >= 6) {
392          brw_emit_pipe_control_flush(brw,
393                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
394                                      PIPE_CONTROL_CS_STALL);
395          brw_emit_pipe_control_flush(brw,
396                                      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
397       } else {
398          brw_emit_pipe_control_flush(brw,
399                                      PIPE_CONTROL_RENDER_TARGET_FLUSH);
400       }
401    }
402 }
403 
404 void
brw_get_scratch_bo(struct brw_context * brw,struct brw_bo ** scratch_bo,int size)405 brw_get_scratch_bo(struct brw_context *brw,
406 		   struct brw_bo **scratch_bo, int size)
407 {
408    struct brw_bo *old_bo = *scratch_bo;
409 
410    if (old_bo && old_bo->size < size) {
411       brw_bo_unreference(old_bo);
412       old_bo = NULL;
413    }
414 
415    if (!old_bo) {
416       *scratch_bo =
417          brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
418    }
419 }
420 
/**
 * Reserve enough scratch space for the given stage to hold
 * \p per_thread_size bytes times the maximum number of hardware threads the
 * stage can have in flight (derived from the device info below).
 *
 * Grows only: if the current per-thread allocation is already large enough,
 * the existing BO is reused.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   /* Scratch is addressed per hardware thread; figure out how many threads
    * this stage could run at once.
    */
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch(stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
       *  allocate scratch space enough so that each slice has 4 slices
       *  allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices
       * and we wish to view that there are 4 subslices per slice
       * instead of the actual number of subslices per slice.
       *
       * For, ICL, scratch space allocation is based on the number of threads
       * in the base configuration.
       */
      if (devinfo->gen == 11)
         subslices = 8;
      else if (devinfo->gen >= 9 && devinfo->gen < 11)
         subslices = 4 * brw->screen->devinfo.num_slices;

      /* Thread-ID address calculation may be sparser than the real EU and
       * thread counts on some parts; pick the effective IDs per subslice.
       */
      unsigned scratch_ids_per_subslice;
      if (devinfo->gen >= 11) {
         /* The MEDIA_VFE_STATE docs say:
          *
          *    "Starting with this configuration, the Maximum Number of
          *     Threads must be set to (#EU * 8) for GPGPU dispatches.
          *
          *     Although there are only 7 threads per EU in the configuration,
          *     the FFTID is calculated as if there are 8 threads per EU,
          *     which in turn requires a larger amount of Scratch Space to be
          *     allocated by the driver."
          */
         scratch_ids_per_subslice = 8 * 8;
      } else if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed. The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is. There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit. Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs. Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
          * as if it had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}
531 
/* Install the i965 program hooks in the driver function table.  Expects the
 * TNL default ProgramStringNotify to already be installed (asserted), since
 * brwProgramStringNotify chains to _tnl_program_string for vertex programs.
 */
void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}
545 
/* Running totals for one shader_time entry, accumulated from the three
 * BRW_SHADER_TIME_STRIDE-spaced counters read back from the shader_time BO
 * in brw_collect_shader_time().
 */
struct shader_times {
   uint64_t time;     /* accumulated cycle count */
   uint64_t written;  /* number of reports successfully written */
   uint64_t reset;    /* number of reports lost to resets */
};
551 
552 void
brw_init_shader_time(struct brw_context * brw)553 brw_init_shader_time(struct brw_context *brw)
554 {
555    const int max_entries = 2048;
556    brw->shader_time.bo =
557       brw_bo_alloc(brw->bufmgr, "shader time",
558                    max_entries * BRW_SHADER_TIME_STRIDE * 3,
559                    BRW_MEMZONE_OTHER);
560    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
561    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
562    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
563                                           max_entries);
564    brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
565                                                max_entries);
566    brw->shader_time.max_entries = max_entries;
567 }
568 
/* qsort comparator for an array of uint64_t pointers, ordering by the
 * pointed-to values ascending.
 */
static int
compare_time(const void *a, const void *b)
{
   uint64_t lhs = **(uint64_t * const *) a;
   uint64_t rhs = **(uint64_t * const *) b;

   /* Compare explicitly rather than subtracting: the 64-bit difference
    * would not survive truncation to int.
    */
   if (lhs < rhs)
      return -1;
   return (lhs > rhs) ? 1 : 0;
}
583 
/* Emit one formatted row of the shader-time report to stderr: stage, name,
 * optional shader number, raw cycles, Gcycles, and percentage of \p total.
 */
static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   /* Shader id 0 means "no id" (e.g. fixed function): print blanks. */
   if (shader_num == 0)
      fprintf(stderr, "    : ");
   else
      fprintf(stderr, "%4d: ", shader_num);

   fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}
600 
/* Print a sorted breakdown of accumulated shader execution times to stderr,
 * followed by per-stage totals.  No-op until at least one entry exists.
 */
static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   /* NOTE(review): VLAs sized by num_entries (bounded at 2048 by
    * brw_init_shader_time), so up to ~24 KiB of stack here.
    */
   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      /* If some reports were lost to resets, extrapolate the measured
       * average over the full written + reset count.
       */
      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   /* Sort by time ascending; sorted[] holds pointers into scaled[]. */
   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointers times to a time to print. */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}
729 
/* Read back the shader_time BO, fold each entry's three counters into
 * brw->shader_time.cumulative, and zero the BO for the next interval.
 */
static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering.  We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      /* Each entry owns three BRW_SHADER_TIME_STRIDE-spaced 32-bit slots:
       * time, written, reset — in that order.
       */
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO out to clear it out for our next collection.
    */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}
755 
756 void
brw_collect_and_report_shader_time(struct brw_context * brw)757 brw_collect_and_report_shader_time(struct brw_context *brw)
758 {
759    brw_collect_shader_time(brw);
760 
761    if (brw->shader_time.report_time == 0 ||
762        get_time() - brw->shader_time.report_time >= 1.0) {
763       brw_report_shader_time(brw);
764       brw->shader_time.report_time = get_time();
765    }
766 }
767 
768 /**
769  * Chooses an index in the shader_time buffer and sets up tracking information
770  * for our printouts.
771  *
772  * Note that this holds on to references to the underlying programs, which may
773  * change their lifetimes compared to normal operation.
774  */
775 int
brw_get_shader_time_index(struct brw_context * brw,struct gl_program * prog,enum shader_time_shader_type type,bool is_glsl_sh)776 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
777                           enum shader_time_shader_type type, bool is_glsl_sh)
778 {
779    int shader_time_index = brw->shader_time.num_entries++;
780    assert(shader_time_index < brw->shader_time.max_entries);
781    brw->shader_time.types[shader_time_index] = type;
782 
783    const char *name;
784    if (prog->Id == 0) {
785       name = "ff";
786    } else if (is_glsl_sh) {
787       name = prog->info.label ?
788          ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
789    } else {
790       name = "prog";
791    }
792 
793    brw->shader_time.names[shader_time_index] = name;
794    brw->shader_time.ids[shader_time_index] = prog->Id;
795 
796    return shader_time_index;
797 }
798 
/* Drop our reference to the shader_time BO.  The names/ids/types/cumulative
 * arrays are ralloc'ed against the brw_context, so they are freed with it.
 */
void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}
805 
/* Free the ralloc'ed arrays hanging off a brw_stage_prog_data.  Note this
 * does NOT free the prog_data struct itself — presumably the caller (e.g.
 * the program cache) owns that storage; verify at call sites.
 */
void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}
814 
815 void
brw_dump_arb_asm(const char * stage,struct gl_program * prog)816 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
817 {
818    fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
819            stage, prog->Id, stage);
820    _mesa_print_program(prog);
821 }
822 
823 void
brw_setup_tex_for_precompile(const struct gen_device_info * devinfo,struct brw_sampler_prog_key_data * tex,const struct gl_program * prog)824 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
825                              struct brw_sampler_prog_key_data *tex,
826                              const struct gl_program *prog)
827 {
828    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
829    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
830    for (unsigned i = 0; i < sampler_count; i++) {
831       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
832          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
833          tex->swizzles[i] =
834             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
835       } else {
836          /* Color sampler: assume no swizzling. */
837          tex->swizzles[i] = SWIZZLE_XYZW;
838       }
839    }
840 }
841 
842 /**
843  * Sets up the starting offsets for the groups of binding table entries
844  * common to all pipeline stages.
845  *
846  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
847  * unused but also make sure that addition of small offsets to them will
848  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
849  */
850 uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info * devinfo,const struct gl_program * prog,struct brw_stage_prog_data * stage_prog_data,uint32_t next_binding_table_offset)851 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
852                                         const struct gl_program *prog,
853                                         struct brw_stage_prog_data *stage_prog_data,
854                                         uint32_t next_binding_table_offset)
855 {
856    int num_textures = util_last_bit(prog->SamplersUsed);
857 
858    stage_prog_data->binding_table.texture_start = next_binding_table_offset;
859    next_binding_table_offset += num_textures;
860 
861    if (prog->info.num_ubos) {
862       assert(prog->info.num_ubos <= BRW_MAX_UBO);
863       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
864       next_binding_table_offset += prog->info.num_ubos;
865    } else {
866       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
867    }
868 
869    if (prog->info.num_ssbos || prog->info.num_abos) {
870       assert(prog->info.num_abos <= BRW_MAX_ABO);
871       assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
872       stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
873       next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
874    } else {
875       stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
876    }
877 
878    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
879       stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
880       next_binding_table_offset++;
881    } else {
882       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
883    }
884 
885    if (prog->info.uses_texture_gather) {
886       if (devinfo->gen >= 8) {
887          stage_prog_data->binding_table.gather_texture_start =
888             stage_prog_data->binding_table.texture_start;
889       } else {
890          stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
891          next_binding_table_offset += num_textures;
892       }
893    } else {
894       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
895    }
896 
897    if (prog->info.num_images) {
898       stage_prog_data->binding_table.image_start = next_binding_table_offset;
899       next_binding_table_offset += prog->info.num_images;
900    } else {
901       stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
902    }
903 
904    /* This may or may not be used depending on how the compile goes. */
905    stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
906    next_binding_table_offset++;
907 
908    /* Plane 0 is just the regular texture section */
909    stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
910 
911    stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
912    next_binding_table_offset += num_textures;
913 
914    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
915    next_binding_table_offset += num_textures;
916 
917    /* Set the binding table size.  Some callers may append new entries
918     * and increase this accordingly.
919     */
920    stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
921 
922    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
923    return next_binding_table_offset;
924 }
925 
926 void
brw_populate_default_key(const struct brw_compiler * compiler,union brw_any_prog_key * prog_key,struct gl_shader_program * sh_prog,struct gl_program * prog)927 brw_populate_default_key(const struct brw_compiler *compiler,
928                          union brw_any_prog_key *prog_key,
929                          struct gl_shader_program *sh_prog,
930                          struct gl_program *prog)
931 {
932    switch (prog->info.stage) {
933    case MESA_SHADER_VERTEX:
934       brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
935       break;
936    case MESA_SHADER_TESS_CTRL:
937       brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
938       break;
939    case MESA_SHADER_TESS_EVAL:
940       brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
941       break;
942    case MESA_SHADER_GEOMETRY:
943       brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
944       break;
945    case MESA_SHADER_FRAGMENT:
946       brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
947       break;
948    case MESA_SHADER_COMPUTE:
949       brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
950       break;
951    default:
952       unreachable("Unsupported stage!");
953    }
954 }
955 
956 void
brw_debug_recompile(struct brw_context * brw,gl_shader_stage stage,unsigned api_id,struct brw_base_prog_key * key)957 brw_debug_recompile(struct brw_context *brw,
958                     gl_shader_stage stage,
959                     unsigned api_id,
960                     struct brw_base_prog_key *key)
961 {
962    const struct brw_compiler *compiler = brw->screen->compiler;
963    enum brw_cache_id cache_id = brw_stage_cache_id(stage);
964 
965    compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
966                              _mesa_shader_stage_to_string(stage), api_id);
967 
968    const void *old_key =
969       brw_find_previous_compile(&brw->cache, cache_id, key->program_string_id);
970 
971    brw_debug_key_recompile(compiler, brw, stage, old_key, key);
972 }
973