1 /*
2  * Copyright (C) 2018 Alyssa Rosenzweig
3  * Copyright (C) 2020 Collabora Ltd.
4  * Copyright © 2017 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  */
25 
26 #include "util/macros.h"
27 #include "util/u_prim.h"
28 #include "util/u_vbuf.h"
29 #include "util/u_helpers.h"
30 #include "util/u_draw.h"
31 #include "util/u_memory.h"
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "gallium/auxiliary/util/u_blend.h"
35 
36 #include "panfrost-quirks.h"
37 #include "genxml/gen_macros.h"
38 
39 #include "pan_pool.h"
40 #include "pan_bo.h"
41 #include "pan_blend.h"
42 #include "pan_context.h"
43 #include "pan_job.h"
44 #include "pan_shader.h"
45 #include "pan_texture.h"
46 #include "pan_util.h"
47 #include "pan_indirect_draw.h"
48 #include "pan_indirect_dispatch.h"
49 #include "pan_blitter.h"
50 
51 struct panfrost_rasterizer {
52         struct pipe_rasterizer_state base;
53 
54         /* Partially packed RSD words */
55         struct mali_multisample_misc_packed multisample;
56         struct mali_stencil_mask_misc_packed stencil_misc;
57 };
58 
59 struct panfrost_zsa_state {
60         struct pipe_depth_stencil_alpha_state base;
61 
62         /* Is any depth, stencil, or alpha testing enabled? */
63         bool enabled;
64 
65         /* Mask of PIPE_CLEAR_{DEPTH,STENCIL} written */
66         unsigned draws;
67 
68         /* Prepacked words from the RSD */
69         struct mali_multisample_misc_packed rsd_depth;
70         struct mali_stencil_mask_misc_packed rsd_stencil;
71         struct mali_stencil_packed stencil_front, stencil_back;
72 };
73 
74 struct panfrost_sampler_state {
75         struct pipe_sampler_state base;
76         struct mali_sampler_packed hw;
77 };
78 
79 /* Misnomer: Sampler view corresponds to textures, not samplers */
80 
81 struct panfrost_sampler_view {
82         struct pipe_sampler_view base;
83         struct panfrost_pool_ref state;
84         struct mali_texture_packed bifrost_descriptor;
85         mali_ptr texture_bo;
86         uint64_t modifier;
87 };
88 
89 /* Statically assert that PIPE_* enums match the hardware enums.
90  * (As long as they match, we don't need to translate them.)
91  */
92 UNUSED static void
pan_pipe_asserts()93 pan_pipe_asserts()
94 {
95 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
96 
97         /* Compare functions are natural in both Gallium and Mali */
98         PIPE_ASSERT(PIPE_FUNC_NEVER    == MALI_FUNC_NEVER);
99         PIPE_ASSERT(PIPE_FUNC_LESS     == MALI_FUNC_LESS);
100         PIPE_ASSERT(PIPE_FUNC_EQUAL    == MALI_FUNC_EQUAL);
101         PIPE_ASSERT(PIPE_FUNC_LEQUAL   == MALI_FUNC_LEQUAL);
102         PIPE_ASSERT(PIPE_FUNC_GREATER  == MALI_FUNC_GREATER);
103         PIPE_ASSERT(PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL);
104         PIPE_ASSERT(PIPE_FUNC_GEQUAL   == MALI_FUNC_GEQUAL);
105         PIPE_ASSERT(PIPE_FUNC_ALWAYS   == MALI_FUNC_ALWAYS);
106 }
107 
108 static inline enum mali_sample_pattern
panfrost_sample_pattern(unsigned samples)109 panfrost_sample_pattern(unsigned samples)
110 {
111         switch (samples) {
112         case 1:  return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
113         case 4:  return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
114         case 8:  return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
115         case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
116         default: unreachable("Unsupported sample count");
117         }
118 }
119 
120 static unsigned
translate_tex_wrap(enum pipe_tex_wrap w,bool using_nearest)121 translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
122 {
123         /* Bifrost doesn't support the GL_CLAMP wrap mode, so instead use
124          * CLAMP_TO_EDGE and CLAMP_TO_BORDER. On Midgard, CLAMP is broken for
125          * nearest filtering, so use CLAMP_TO_EDGE in that case. */
126 
127         switch (w) {
128         case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
129         case PIPE_TEX_WRAP_CLAMP:
130                 return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :
131 #if PAN_ARCH <= 5
132                      MALI_WRAP_MODE_CLAMP;
133 #else
134                      MALI_WRAP_MODE_CLAMP_TO_BORDER;
135 #endif
136         case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
137         case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
138         case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
139         case PIPE_TEX_WRAP_MIRROR_CLAMP:
140                 return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :
141 #if PAN_ARCH <= 5
142                      MALI_WRAP_MODE_MIRRORED_CLAMP;
143 #else
144                      MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
145 #endif
146         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
147         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
148         default: unreachable("Invalid wrap");
149         }
150 }
151 
152 /* The hardware compares in the wrong order order, so we have to flip before
153  * encoding. Yes, really. */
154 
155 static enum mali_func
panfrost_sampler_compare_func(const struct pipe_sampler_state * cso)156 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
157 {
158         return !cso->compare_mode ? MALI_FUNC_NEVER :
159                 panfrost_flip_compare_func((enum mali_func) cso->compare_func);
160 }
161 
162 static enum mali_mipmap_mode
pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)163 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
164 {
165         switch (f) {
166         case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
167         case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
168 #if PAN_ARCH >= 6
169         case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
170 #else
171         case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST;
172 #endif
173         default: unreachable("Invalid");
174         }
175 }
176 
177 
178 static void *
panfrost_create_sampler_state(struct pipe_context * pctx,const struct pipe_sampler_state * cso)179 panfrost_create_sampler_state(
180         struct pipe_context *pctx,
181         const struct pipe_sampler_state *cso)
182 {
183         struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
184         so->base = *cso;
185 
186         bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;
187 
188         pan_pack(&so->hw, SAMPLER, cfg) {
189                 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
190                 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
191 
192                 cfg.normalized_coordinates = cso->normalized_coords;
193                 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
194                 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
195                 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
196 
197                 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
198                 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
199                 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);
200 
201                 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
202                 cfg.compare_function = panfrost_sampler_compare_func(cso);
203                 cfg.seamless_cube_map = cso->seamless_cube_map;
204 
205                 cfg.border_color_r = cso->border_color.ui[0];
206                 cfg.border_color_g = cso->border_color.ui[1];
207                 cfg.border_color_b = cso->border_color.ui[2];
208                 cfg.border_color_a = cso->border_color.ui[3];
209 
210 #if PAN_ARCH >= 6
211                 if (cso->max_anisotropy > 1) {
212                         cfg.maximum_anisotropy = cso->max_anisotropy;
213                         cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
214                 }
215 #else
216                 /* Emulate disabled mipmapping by clamping the LOD as tight as
217                  * possible (from 0 to epsilon = 1/256) */
218                 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
219                         cfg.maximum_lod = cfg.minimum_lod + 1;
220 #endif
221         }
222 
223         return so;
224 }
225 
226 static bool
panfrost_fs_required(struct panfrost_shader_state * fs,struct panfrost_blend_state * blend,struct pipe_framebuffer_state * state,const struct panfrost_zsa_state * zsa)227 panfrost_fs_required(
228                 struct panfrost_shader_state *fs,
229                 struct panfrost_blend_state *blend,
230                 struct pipe_framebuffer_state *state,
231                 const struct panfrost_zsa_state *zsa)
232 {
233         /* If we generally have side effects. This inclues use of discard,
234          * which can affect the results of an occlusion query. */
235         if (fs->info.fs.sidefx)
236                 return true;
237 
238         /* Using an empty FS requires early-z to be enabled, but alpha test
239          * needs it disabled */
240         if ((enum mali_func) zsa->base.alpha_func != MALI_FUNC_ALWAYS)
241                 return true;
242 
243         /* If colour is written we need to execute */
244         for (unsigned i = 0; i < state->nr_cbufs; ++i) {
245                 if (state->cbufs[i] && !blend->info[i].no_colour)
246                         return true;
247         }
248 
249         /* If depth is written and not implied we need to execute.
250          * TODO: Predicate on Z/S writes being enabled */
251         return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);
252 }
253 
254 #if PAN_ARCH >= 5
255 UNUSED static uint16_t
pack_blend_constant(enum pipe_format format,float cons)256 pack_blend_constant(enum pipe_format format, float cons)
257 {
258         const struct util_format_description *format_desc =
259                 util_format_description(format);
260 
261         unsigned chan_size = 0;
262 
263         for (unsigned i = 0; i < format_desc->nr_channels; i++)
264                 chan_size = MAX2(format_desc->channel[0].size, chan_size);
265 
266         uint16_t unorm = (cons * ((1 << chan_size) - 1));
267         return unorm << (16 - chan_size);
268 }
269 
270 static void
panfrost_emit_blend(struct panfrost_batch * batch,void * rts,mali_ptr * blend_shaders)271 panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)
272 {
273         unsigned rt_count = batch->key.nr_cbufs;
274         struct panfrost_context *ctx = batch->ctx;
275         const struct panfrost_blend_state *so = ctx->blend;
276         bool dithered = so->base.dither;
277 
278         /* Always have at least one render target for depth-only passes */
279         for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
280                 struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));
281 
282                 /* Disable blending for unbacked render targets */
283                 if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
284                         pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
285                                 cfg.enable = false;
286 #if PAN_ARCH >= 6
287                                 cfg.internal.mode = MALI_BLEND_MODE_OFF;
288 #endif
289                         }
290 
291                         continue;
292                 }
293 
294                 struct pan_blend_info info = so->info[i];
295                 enum pipe_format format = batch->key.cbufs[i]->format;
296                 float cons = pan_blend_get_constant(info.constant_mask,
297                                                     ctx->blend_color.color);
298 
299                 /* Word 0: Flags and constant */
300                 pan_pack(packed, BLEND, cfg) {
301                         cfg.srgb = util_format_is_srgb(format);
302                         cfg.load_destination = info.load_dest;
303                         cfg.round_to_fb_precision = !dithered;
304                         cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
305 #if PAN_ARCH >= 6
306                         cfg.constant = pack_blend_constant(format, cons);
307 #else
308                         cfg.blend_shader = (blend_shaders[i] != 0);
309 
310                         if (blend_shaders[i])
311                                 cfg.shader_pc = blend_shaders[i];
312                         else
313                                 cfg.constant = cons;
314 #endif
315                 }
316 
317                 if (!blend_shaders[i]) {
318                         /* Word 1: Blend Equation */
319                         STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
320                         packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
321                 }
322 
323 #if PAN_ARCH >= 6
324                 const struct panfrost_device *dev = pan_device(ctx->base.screen);
325                 struct panfrost_shader_state *fs =
326                         panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
327 
328                 /* Words 2 and 3: Internal blend */
329                 if (blend_shaders[i]) {
330                         /* The blend shader's address needs to be at
331                          * the same top 32 bit as the fragment shader.
332                          * TODO: Ensure that's always the case.
333                          */
334                         assert(!fs->bin.bo ||
335                                         (blend_shaders[i] & (0xffffffffull << 32)) ==
336                                         (fs->bin.gpu & (0xffffffffull << 32)));
337 
338                         unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
339                         assert(!(ret_offset & 0x7));
340 
341                         pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
342                                 cfg.mode = MALI_BLEND_MODE_SHADER;
343                                 cfg.shader.pc = (u32) blend_shaders[i];
344                                 cfg.shader.return_value = ret_offset ?
345                                         fs->bin.gpu + ret_offset : 0;
346                         }
347                 } else {
348                         pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
349                                 cfg.mode = info.opaque ?
350                                         MALI_BLEND_MODE_OPAQUE :
351                                         MALI_BLEND_MODE_FIXED_FUNCTION;
352 
353                                 /* If we want the conversion to work properly,
354                                  * num_comps must be set to 4
355                                  */
356                                 cfg.fixed_function.num_comps = 4;
357                                 cfg.fixed_function.conversion.memory_format =
358                                         panfrost_format_to_bifrost_blend(dev, format, dithered);
359                                 cfg.fixed_function.conversion.register_format =
360                                         fs->info.bifrost.blend[i].format;
361                                 cfg.fixed_function.rt = i;
362                         }
363                 }
364 #endif
365         }
366 
367         for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
368                 if (!so->info[i].no_colour && batch->key.cbufs[i]) {
369                         batch->draws |= (PIPE_CLEAR_COLOR0 << i);
370                         batch->resolve |= (PIPE_CLEAR_COLOR0 << i);
371                 }
372         }
373 }
374 #endif
375 
376 /* Construct a partial RSD corresponding to no executed fragment shader, and
377  * merge with the existing partial RSD. */
378 
379 static void
pan_merge_empty_fs(struct mali_renderer_state_packed * rsd)380 pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
381 {
382         struct mali_renderer_state_packed empty_rsd;
383 
384         pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
385 #if PAN_ARCH >= 6
386                 cfg.properties.shader_modifies_coverage = true;
387                 cfg.properties.allow_forward_pixel_to_kill = true;
388                 cfg.properties.allow_forward_pixel_to_be_killed = true;
389                 cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
390 #else
391                 cfg.shader.shader = 0x1;
392                 cfg.properties.work_register_count = 1;
393                 cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
394                 cfg.properties.force_early_z = true;
395 #endif
396         }
397 
398         pan_merge((*rsd), empty_rsd, RENDERER_STATE);
399 }
400 
401 static void
panfrost_prepare_fs_state(struct panfrost_context * ctx,mali_ptr * blend_shaders,struct mali_renderer_state_packed * rsd)402 panfrost_prepare_fs_state(struct panfrost_context *ctx,
403                           mali_ptr *blend_shaders,
404                           struct mali_renderer_state_packed *rsd)
405 {
406         struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
407         const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
408         struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
409         struct panfrost_blend_state *so = ctx->blend;
410         bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
411         bool msaa = rast->multisample;
412 
413         unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
414 
415         bool has_blend_shader = false;
416 
417         for (unsigned c = 0; c < rt_count; ++c)
418                 has_blend_shader |= (blend_shaders[c] != 0);
419 
420         pan_pack(rsd, RENDERER_STATE, cfg) {
421                 if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
422 #if PAN_ARCH >= 6
423                         /* Track if any colour buffer is reused across draws, either
424                          * from reading it directly, or from failing to write it */
425                         unsigned rt_mask = ctx->fb_rt_mask;
426                         uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0);
427                         bool blend_reads_dest = (so->load_dest_mask & rt_mask);
428 
429                         cfg.properties.allow_forward_pixel_to_kill =
430                                 fs->info.fs.can_fpk &&
431                                 !(rt_mask & ~rt_written) &&
432                                 !alpha_to_coverage &&
433                                 !blend_reads_dest;
434 #else
435                         cfg.properties.force_early_z =
436                                 fs->info.fs.can_early_z && !alpha_to_coverage &&
437                                 ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS);
438 
439                         /* TODO: Reduce this limit? */
440                         if (has_blend_shader)
441                                 cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8);
442                         else
443                                 cfg.properties.work_register_count = fs->info.work_reg_count;
444 
445                         /* Hardware quirks around early-zs forcing without a
446                          * depth buffer. Note this breaks occlusion queries. */
447                         bool has_oq = ctx->occlusion_query && ctx->active_queries;
448                         bool force_ez_with_discard = !zsa->enabled && !has_oq;
449 
450                         cfg.properties.shader_reads_tilebuffer =
451                                 force_ez_with_discard && fs->info.fs.can_discard;
452                         cfg.properties.shader_contains_discard =
453                                 !force_ez_with_discard && fs->info.fs.can_discard;
454 #endif
455                 }
456 
457 #if PAN_ARCH == 4
458                 if (rt_count > 0) {
459                         cfg.multisample_misc.load_destination = so->info[0].load_dest;
460                         cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
461                         cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour;
462                         cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
463                         cfg.stencil_mask_misc.dither_disable = !so->base.dither;
464                         cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;
465 
466                         if (blend_shaders[0]) {
467                                 cfg.blend_shader = blend_shaders[0];
468                         } else {
469                                 cfg.blend_constant = pan_blend_get_constant(
470                                                 so->info[0].constant_mask,
471                                                 ctx->blend_color.color);
472                         }
473                 } else {
474                         /* If there is no colour buffer, leaving fields default is
475                          * fine, except for blending which is nonnullable */
476                         cfg.blend_equation.color_mask = 0xf;
477                         cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
478                         cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
479                         cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
480                         cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
481                         cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
482                         cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
483                 }
484 #elif PAN_ARCH == 5
485                 /* Workaround */
486                 cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
487 #endif
488 
489                 cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
490 
491                 cfg.multisample_misc.evaluate_per_sample =
492                         msaa && (ctx->min_samples > 1);
493 
494 #if PAN_ARCH >= 6
495                 /* MSAA blend shaders need to pass their sample ID to
496                  * LD_TILE/ST_TILE, so we must preload it. Additionally, we
497                  * need per-sample shading for the blend shader, accomplished
498                  * by forcing per-sample shading for the whole program. */
499 
500                 if (msaa && has_blend_shader) {
501                         cfg.multisample_misc.evaluate_per_sample = true;
502                         cfg.preload.fragment.sample_mask_id = true;
503                 }
504 #endif
505 
506                 cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
507                 cfg.depth_units = rast->offset_units * 2.0f;
508                 cfg.depth_factor = rast->offset_scale;
509 
510                 bool back_enab = zsa->base.stencil[1].enabled;
511                 cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
512                 cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
513 
514 #if PAN_ARCH <= 5
515                 /* v6+ fits register preload here, no alpha testing */
516                 cfg.alpha_reference = zsa->base.alpha_ref_value;
517 #endif
518         }
519 }
520 
521 static void
panfrost_emit_frag_shader(struct panfrost_context * ctx,struct mali_renderer_state_packed * fragmeta,mali_ptr * blend_shaders)522 panfrost_emit_frag_shader(struct panfrost_context *ctx,
523                           struct mali_renderer_state_packed *fragmeta,
524                           mali_ptr *blend_shaders)
525 {
526         const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
527         const struct panfrost_rasterizer *rast = ctx->rasterizer;
528         struct panfrost_shader_state *fs =
529                 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
530 
531         /* We need to merge several several partial renderer state descriptors,
532          * so stage to temporary storage rather than reading back write-combine
533          * memory, which will trash performance. */
534         struct mali_renderer_state_packed rsd;
535         panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
536 
537 #if PAN_ARCH == 4
538         if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
539                 /* Word 14: SFBD Blend Equation */
540                 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
541                 rsd.opaque[14] = ctx->blend->equation[0];
542         }
543 #endif
544 
545         /* Merge with CSO state and upload */
546         if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
547                 struct mali_renderer_state_packed *partial_rsd =
548                         (struct mali_renderer_state_packed *)&fs->partial_rsd;
549                 STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
550                 pan_merge(rsd, *partial_rsd, RENDERER_STATE);
551         } else {
552                 pan_merge_empty_fs(&rsd);
553         }
554 
555         /* Word 8, 9 Misc state */
556         rsd.opaque[8] |= zsa->rsd_depth.opaque[0]
557                        | rast->multisample.opaque[0];
558 
559         rsd.opaque[9] |= zsa->rsd_stencil.opaque[0]
560                        | rast->stencil_misc.opaque[0];
561 
562         /* Word 10, 11 Stencil Front and Back */
563         rsd.opaque[10] |= zsa->stencil_front.opaque[0];
564         rsd.opaque[11] |= zsa->stencil_back.opaque[0];
565 
566         memcpy(fragmeta, &rsd, sizeof(rsd));
567 }
568 
569 static mali_ptr
panfrost_emit_compute_shader_meta(struct panfrost_batch * batch,enum pipe_shader_type stage)570 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
571 {
572         struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
573 
574         panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
575         panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
576 
577         return ss->state.gpu;
578 }
579 
580 static mali_ptr
panfrost_emit_frag_shader_meta(struct panfrost_batch * batch)581 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
582 {
583         struct panfrost_context *ctx = batch->ctx;
584         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
585 
586         panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
587 
588         struct panfrost_ptr xfer;
589 
590 #if PAN_ARCH == 4
591         xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
592 #else
593         unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
594 
595         xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base,
596                                              PAN_DESC(RENDERER_STATE),
597                                              PAN_DESC_ARRAY(rt_count, BLEND));
598 #endif
599 
600         mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
601         unsigned shader_offset = 0;
602         struct panfrost_bo *shader_bo = NULL;
603 
604         for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c) {
605                 if (ctx->pipe_framebuffer.cbufs[c]) {
606                         blend_shaders[c] = panfrost_get_blend(batch,
607                                         c, &shader_bo, &shader_offset);
608                 }
609         }
610 
611         panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders);
612 
613 #if PAN_ARCH >= 5
614         panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders);
615 #else
616         batch->draws |= PIPE_CLEAR_COLOR0;
617         batch->resolve |= PIPE_CLEAR_COLOR0;
618 #endif
619 
620         if (ctx->depth_stencil->base.depth_enabled)
621                 batch->read |= PIPE_CLEAR_DEPTH;
622 
623         if (ctx->depth_stencil->base.stencil[0].enabled)
624                 batch->read |= PIPE_CLEAR_STENCIL;
625 
626         return xfer.gpu;
627 }
628 
629 static mali_ptr
panfrost_emit_viewport(struct panfrost_batch * batch)630 panfrost_emit_viewport(struct panfrost_batch *batch)
631 {
632         struct panfrost_context *ctx = batch->ctx;
633         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
634         const struct pipe_scissor_state *ss = &ctx->scissor;
635         const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
636 
637         /* Derive min/max from translate/scale. Note since |x| >= 0 by
638          * definition, we have that -|x| <= |x| hence translate - |scale| <=
639          * translate + |scale|, so the ordering is correct here. */
640         float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
641         float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
642         float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
643         float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
644         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
645         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
646 
647         /* Scissor to the intersection of viewport and to the scissor, clamped
648          * to the framebuffer */
649 
650         unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));
651         unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));
652         unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));
653         unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));
654 
655         if (ss && rast->scissor) {
656                 minx = MAX2(ss->minx, minx);
657                 miny = MAX2(ss->miny, miny);
658                 maxx = MIN2(ss->maxx, maxx);
659                 maxy = MIN2(ss->maxy, maxy);
660         }
661 
662         /* Set the range to [1, 1) so max values don't wrap round */
663         if (maxx == 0 || maxy == 0)
664                 maxx = maxy = minx = miny = 1;
665 
666         struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
667 
668         pan_pack(T.cpu, VIEWPORT, cfg) {
669                 /* [minx, maxx) and [miny, maxy) are exclusive ranges, but
670                  * these are inclusive */
671                 cfg.scissor_minimum_x = minx;
672                 cfg.scissor_minimum_y = miny;
673                 cfg.scissor_maximum_x = maxx - 1;
674                 cfg.scissor_maximum_y = maxy - 1;
675 
676                 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
677                 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
678         }
679 
680         panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
681         batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
682 
683         return T.gpu;
684 }
685 
686 static mali_ptr
panfrost_map_constant_buffer_gpu(struct panfrost_batch * batch,enum pipe_shader_type st,struct panfrost_constant_buffer * buf,unsigned index)687 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
688                                  enum pipe_shader_type st,
689                                  struct panfrost_constant_buffer *buf,
690                                  unsigned index)
691 {
692         struct pipe_constant_buffer *cb = &buf->cb[index];
693         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
694 
695         if (rsrc) {
696                 panfrost_batch_read_rsrc(batch, rsrc, st);
697 
698                 /* Alignment gauranteed by
699                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
700                 return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;
701         } else if (cb->user_buffer) {
702                 return pan_pool_upload_aligned(&batch->pool.base,
703                                                cb->user_buffer +
704                                                cb->buffer_offset,
705                                                cb->buffer_size, 16);
706         } else {
707                 unreachable("No constant buffer");
708         }
709 }
710 
711 struct sysval_uniform {
712         union {
713                 float f[4];
714                 int32_t i[4];
715                 uint32_t u[4];
716                 uint64_t du[2];
717         };
718 };
719 
720 static void
panfrost_upload_viewport_scale_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)721 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
722                                       struct sysval_uniform *uniform)
723 {
724         struct panfrost_context *ctx = batch->ctx;
725         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
726 
727         uniform->f[0] = vp->scale[0];
728         uniform->f[1] = vp->scale[1];
729         uniform->f[2] = vp->scale[2];
730 }
731 
732 static void
panfrost_upload_viewport_offset_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)733 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
734                                        struct sysval_uniform *uniform)
735 {
736         struct panfrost_context *ctx = batch->ctx;
737         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
738 
739         uniform->f[0] = vp->translate[0];
740         uniform->f[1] = vp->translate[1];
741         uniform->f[2] = vp->translate[2];
742 }
743 
panfrost_upload_txs_sysval(struct panfrost_batch * batch,enum pipe_shader_type st,unsigned int sysvalid,struct sysval_uniform * uniform)744 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
745                                        enum pipe_shader_type st,
746                                        unsigned int sysvalid,
747                                        struct sysval_uniform *uniform)
748 {
749         struct panfrost_context *ctx = batch->ctx;
750         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
751         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
752         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
753         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
754 
755         assert(dim);
756 
757         if (tex->target == PIPE_BUFFER) {
758                 assert(dim == 1);
759                 uniform->i[0] =
760                         tex->u.buf.size / util_format_get_blocksize(tex->format);
761                 return;
762         }
763 
764         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
765 
766         if (dim > 1)
767                 uniform->i[1] = u_minify(tex->texture->height0,
768                                          tex->u.tex.first_level);
769 
770         if (dim > 2)
771                 uniform->i[2] = u_minify(tex->texture->depth0,
772                                          tex->u.tex.first_level);
773 
774         if (is_array)
775                 uniform->i[dim] = tex->texture->array_size;
776 }
777 
panfrost_upload_image_size_sysval(struct panfrost_batch * batch,enum pipe_shader_type st,unsigned int sysvalid,struct sysval_uniform * uniform)778 static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
779                                               enum pipe_shader_type st,
780                                               unsigned int sysvalid,
781                                               struct sysval_uniform *uniform)
782 {
783         struct panfrost_context *ctx = batch->ctx;
784         unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
785         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
786         unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
787 
788         assert(dim && dim < 4);
789 
790         struct pipe_image_view *image = &ctx->images[st][idx];
791 
792         if (image->resource->target == PIPE_BUFFER) {
793                 unsigned blocksize = util_format_get_blocksize(image->format);
794                 uniform->i[0] = image->resource->width0 / blocksize;
795                 return;
796         }
797 
798         uniform->i[0] = u_minify(image->resource->width0,
799                                  image->u.tex.level);
800 
801         if (dim > 1)
802                 uniform->i[1] = u_minify(image->resource->height0,
803                                          image->u.tex.level);
804 
805         if (dim > 2)
806                 uniform->i[2] = u_minify(image->resource->depth0,
807                                          image->u.tex.level);
808 
809         if (is_array)
810                 uniform->i[dim] = image->resource->array_size;
811 }
812 
813 static void
panfrost_upload_ssbo_sysval(struct panfrost_batch * batch,enum pipe_shader_type st,unsigned ssbo_id,struct sysval_uniform * uniform)814 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
815                             enum pipe_shader_type st,
816                             unsigned ssbo_id,
817                             struct sysval_uniform *uniform)
818 {
819         struct panfrost_context *ctx = batch->ctx;
820 
821         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
822         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
823 
824         /* Compute address */
825         struct panfrost_resource *rsrc = pan_resource(sb.buffer);
826         struct panfrost_bo *bo = rsrc->image.data.bo;
827 
828         panfrost_batch_write_rsrc(batch, rsrc, st);
829 
830         util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
831                         sb.buffer_offset, sb.buffer_size);
832 
833         /* Upload address and size as sysval */
834         uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
835         uniform->u[2] = sb.buffer_size;
836 }
837 
838 static void
panfrost_upload_sampler_sysval(struct panfrost_batch * batch,enum pipe_shader_type st,unsigned samp_idx,struct sysval_uniform * uniform)839 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
840                                enum pipe_shader_type st,
841                                unsigned samp_idx,
842                                struct sysval_uniform *uniform)
843 {
844         struct panfrost_context *ctx = batch->ctx;
845         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
846 
847         uniform->f[0] = sampl->min_lod;
848         uniform->f[1] = sampl->max_lod;
849         uniform->f[2] = sampl->lod_bias;
850 
851         /* Even without any errata, Midgard represents "no mipmapping" as
852          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
853          * panfrost_create_sampler_state which also explains our choice of
854          * epsilon value (again to keep behaviour consistent) */
855 
856         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
857                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
858 }
859 
860 static void
panfrost_upload_num_work_groups_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)861 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
862                                        struct sysval_uniform *uniform)
863 {
864         struct panfrost_context *ctx = batch->ctx;
865 
866         uniform->u[0] = ctx->compute_grid->grid[0];
867         uniform->u[1] = ctx->compute_grid->grid[1];
868         uniform->u[2] = ctx->compute_grid->grid[2];
869 }
870 
871 static void
panfrost_upload_local_group_size_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)872 panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
873                                         struct sysval_uniform *uniform)
874 {
875         struct panfrost_context *ctx = batch->ctx;
876 
877         uniform->u[0] = ctx->compute_grid->block[0];
878         uniform->u[1] = ctx->compute_grid->block[1];
879         uniform->u[2] = ctx->compute_grid->block[2];
880 }
881 
882 static void
panfrost_upload_work_dim_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)883 panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
884                                 struct sysval_uniform *uniform)
885 {
886         struct panfrost_context *ctx = batch->ctx;
887 
888         uniform->u[0] = ctx->compute_grid->work_dim;
889 }
890 
891 /* Sample positions are pushed in a Bifrost specific format on Bifrost. On
892  * Midgard, we emulate the Bifrost path with some extra arithmetic in the
893  * shader, to keep the code as unified as possible. */
894 
895 static void
panfrost_upload_sample_positions_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)896 panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
897                                 struct sysval_uniform *uniform)
898 {
899         struct panfrost_context *ctx = batch->ctx;
900         struct panfrost_device *dev = pan_device(ctx->base.screen);
901 
902         unsigned samples = util_framebuffer_get_num_samples(&batch->key);
903         uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples));
904 }
905 
906 static void
panfrost_upload_multisampled_sysval(struct panfrost_batch * batch,struct sysval_uniform * uniform)907 panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
908                                 struct sysval_uniform *uniform)
909 {
910         unsigned samples = util_framebuffer_get_num_samples(&batch->key);
911         uniform->u[0] = samples > 1;
912 }
913 
914 #if PAN_ARCH >= 6
915 static void
panfrost_upload_rt_conversion_sysval(struct panfrost_batch * batch,unsigned size_and_rt,struct sysval_uniform * uniform)916 panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
917                 unsigned size_and_rt, struct sysval_uniform *uniform)
918 {
919         struct panfrost_context *ctx = batch->ctx;
920         struct panfrost_device *dev = pan_device(ctx->base.screen);
921         unsigned rt = size_and_rt & 0xF;
922         unsigned size = size_and_rt >> 4;
923 
924         if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
925                 enum pipe_format format = batch->key.cbufs[rt]->format;
926                 uniform->u[0] =
927                         GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32;
928         } else {
929                 pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
930                         cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw;
931         }
932 }
933 #endif
934 
935 static void
panfrost_upload_sysvals(struct panfrost_batch * batch,const struct panfrost_ptr * ptr,struct panfrost_shader_state * ss,enum pipe_shader_type st)936 panfrost_upload_sysvals(struct panfrost_batch *batch,
937                         const struct panfrost_ptr *ptr,
938                         struct panfrost_shader_state *ss,
939                         enum pipe_shader_type st)
940 {
941         struct sysval_uniform *uniforms = ptr->cpu;
942 
943         for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {
944                 int sysval = ss->info.sysvals.sysvals[i];
945 
946                 switch (PAN_SYSVAL_TYPE(sysval)) {
947                 case PAN_SYSVAL_VIEWPORT_SCALE:
948                         panfrost_upload_viewport_scale_sysval(batch,
949                                                               &uniforms[i]);
950                         break;
951                 case PAN_SYSVAL_VIEWPORT_OFFSET:
952                         panfrost_upload_viewport_offset_sysval(batch,
953                                                                &uniforms[i]);
954                         break;
955                 case PAN_SYSVAL_TEXTURE_SIZE:
956                         panfrost_upload_txs_sysval(batch, st,
957                                                    PAN_SYSVAL_ID(sysval),
958                                                    &uniforms[i]);
959                         break;
960                 case PAN_SYSVAL_SSBO:
961                         panfrost_upload_ssbo_sysval(batch, st,
962                                                     PAN_SYSVAL_ID(sysval),
963                                                     &uniforms[i]);
964                         break;
965                 case PAN_SYSVAL_NUM_WORK_GROUPS:
966                         for (unsigned j = 0; j < 3; j++) {
967                                 batch->num_wg_sysval[j] =
968                                         ptr->gpu + (i * sizeof(*uniforms)) + (j * 4);
969                         }
970                         panfrost_upload_num_work_groups_sysval(batch,
971                                                                &uniforms[i]);
972                         break;
973                 case PAN_SYSVAL_LOCAL_GROUP_SIZE:
974                         panfrost_upload_local_group_size_sysval(batch,
975                                                                 &uniforms[i]);
976                         break;
977                 case PAN_SYSVAL_WORK_DIM:
978                         panfrost_upload_work_dim_sysval(batch,
979                                                         &uniforms[i]);
980                         break;
981                 case PAN_SYSVAL_SAMPLER:
982                         panfrost_upload_sampler_sysval(batch, st,
983                                                        PAN_SYSVAL_ID(sysval),
984                                                        &uniforms[i]);
985                         break;
986                 case PAN_SYSVAL_IMAGE_SIZE:
987                         panfrost_upload_image_size_sysval(batch, st,
988                                                           PAN_SYSVAL_ID(sysval),
989                                                           &uniforms[i]);
990                         break;
991                 case PAN_SYSVAL_SAMPLE_POSITIONS:
992                         panfrost_upload_sample_positions_sysval(batch,
993                                                         &uniforms[i]);
994                         break;
995                 case PAN_SYSVAL_MULTISAMPLED:
996                         panfrost_upload_multisampled_sysval(batch,
997                                                                &uniforms[i]);
998                         break;
999 #if PAN_ARCH >= 6
1000                 case PAN_SYSVAL_RT_CONVERSION:
1001                         panfrost_upload_rt_conversion_sysval(batch,
1002                                         PAN_SYSVAL_ID(sysval), &uniforms[i]);
1003                         break;
1004 #endif
1005                 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1006                         batch->ctx->first_vertex_sysval_ptr =
1007                                 ptr->gpu + (i * sizeof(*uniforms));
1008                         batch->ctx->base_vertex_sysval_ptr =
1009                                 batch->ctx->first_vertex_sysval_ptr + 4;
1010                         batch->ctx->base_instance_sysval_ptr =
1011                                 batch->ctx->first_vertex_sysval_ptr + 8;
1012 
1013                         uniforms[i].u[0] = batch->ctx->offset_start;
1014                         uniforms[i].u[1] = batch->ctx->base_vertex;
1015                         uniforms[i].u[2] = batch->ctx->base_instance;
1016                         break;
1017                 case PAN_SYSVAL_DRAWID:
1018                         uniforms[i].u[0] = batch->ctx->drawid;
1019                         break;
1020                 default:
1021                         assert(0);
1022                 }
1023         }
1024 }
1025 
1026 static const void *
panfrost_map_constant_buffer_cpu(struct panfrost_context * ctx,struct panfrost_constant_buffer * buf,unsigned index)1027 panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1028                                  struct panfrost_constant_buffer *buf,
1029                                  unsigned index)
1030 {
1031         struct pipe_constant_buffer *cb = &buf->cb[index];
1032         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1033 
1034         if (rsrc) {
1035                 panfrost_bo_mmap(rsrc->image.data.bo);
1036                 panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
1037                 panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);
1038 
1039                 return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;
1040         } else if (cb->user_buffer) {
1041                 return cb->user_buffer + cb->buffer_offset;
1042         } else
1043                 unreachable("No constant buffer");
1044 }
1045 
1046 static mali_ptr
panfrost_emit_const_buf(struct panfrost_batch * batch,enum pipe_shader_type stage,mali_ptr * push_constants)1047 panfrost_emit_const_buf(struct panfrost_batch *batch,
1048                         enum pipe_shader_type stage,
1049                         mali_ptr *push_constants)
1050 {
1051         struct panfrost_context *ctx = batch->ctx;
1052         struct panfrost_shader_variants *all = ctx->shader[stage];
1053 
1054         if (!all)
1055                 return 0;
1056 
1057         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1058         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1059 
1060         /* Allocate room for the sysval and the uniforms */
1061         size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;
1062         struct panfrost_ptr transfer =
1063                 pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1064 
1065         /* Upload sysvals requested by the shader */
1066         panfrost_upload_sysvals(batch, &transfer, ss, stage);
1067 
1068         /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1069         struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
1070         unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1071         unsigned sysval_ubo = sys_size ? ubo_count : ~0;
1072 
1073         struct panfrost_ptr ubos =
1074                 pan_pool_alloc_desc_array(&batch->pool.base,
1075                                           ubo_count + 1,
1076                                           UNIFORM_BUFFER);
1077 
1078         uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1079 
1080         /* Upload sysval as a final UBO */
1081 
1082         if (sys_size) {
1083                 pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) {
1084                         cfg.entries = DIV_ROUND_UP(sys_size, 16);
1085                         cfg.pointer = transfer.gpu;
1086                 }
1087         }
1088 
1089         /* The rest are honest-to-goodness UBOs */
1090 
1091         u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1092                 size_t usz = buf->cb[ubo].buffer_size;
1093 
1094                 if (usz == 0) {
1095                         ubo_ptr[ubo] = 0;
1096                         continue;
1097                 }
1098 
1099                 /* Issue (57) for the ARB_uniform_buffer_object spec says that
1100                  * the buffer can be larger than the uniform data inside it,
1101                  * so clamp ubo size to what hardware supports. */
1102 
1103                 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1104                         cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12);
1105                         cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1106                                         stage, buf, ubo);
1107                 }
1108         }
1109 
1110         if (ss->info.push.count == 0)
1111                 return ubos.gpu;
1112 
1113         /* Copy push constants required by the shader */
1114         struct panfrost_ptr push_transfer =
1115                 pan_pool_alloc_aligned(&batch->pool.base,
1116                                        ss->info.push.count * 4, 16);
1117 
1118         uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;
1119         *push_constants = push_transfer.gpu;
1120 
1121         for (unsigned i = 0; i < ss->info.push.count; ++i) {
1122                 struct panfrost_ubo_word src = ss->info.push.words[i];
1123 
1124                 if (src.ubo == sysval_ubo) {
1125                         unsigned sysval_idx = src.offset / 16;
1126                         unsigned sysval_comp = (src.offset % 16) / 4;
1127                         unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]);
1128                         mali_ptr ptr = push_transfer.gpu + (4 * i);
1129 
1130                         switch (sysval_type) {
1131                         case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1132                                 switch (sysval_comp) {
1133                                 case 0:
1134                                         batch->ctx->first_vertex_sysval_ptr = ptr;
1135                                         break;
1136                                 case 1:
1137                                         batch->ctx->base_vertex_sysval_ptr = ptr;
1138                                         break;
1139                                 case 2:
1140                                         batch->ctx->base_instance_sysval_ptr = ptr;
1141                                         break;
1142                                 case 3:
1143                                         /* Spurious (Midgard doesn't pack) */
1144                                         break;
1145                                 default:
1146                                         unreachable("Invalid vertex/instance offset component\n");
1147                                 }
1148                                 break;
1149 
1150                         case PAN_SYSVAL_NUM_WORK_GROUPS:
1151                                 batch->num_wg_sysval[sysval_comp] = ptr;
1152                                 break;
1153 
1154                         default:
1155                                 break;
1156                         }
1157                 }
1158                 /* Map the UBO, this should be cheap. However this is reading
1159                  * from write-combine memory which is _very_ slow. It might pay
1160                  * off to upload sysvals to a staging buffer on the CPU on the
1161                  * assumption sysvals will get pushed (TODO) */
1162 
1163                 const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :
1164                         panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1165 
1166                 /* TODO: Is there any benefit to combining ranges */
1167                 memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);
1168         }
1169 
1170         return ubos.gpu;
1171 }
1172 
1173 static mali_ptr
panfrost_emit_shared_memory(struct panfrost_batch * batch,const struct pipe_grid_info * info)1174 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1175                             const struct pipe_grid_info *info)
1176 {
1177         struct panfrost_context *ctx = batch->ctx;
1178         struct panfrost_device *dev = pan_device(ctx->base.screen);
1179         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1180         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1181         struct panfrost_ptr t =
1182                 pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1183 
1184         pan_pack(t.cpu, LOCAL_STORAGE, ls) {
1185                 unsigned wls_single_size =
1186                         util_next_power_of_two(MAX2(ss->info.wls_size, 128));
1187 
1188                 if (ss->info.wls_size) {
1189                         ls.wls_instances =
1190                                 util_next_power_of_two(info->grid[0]) *
1191                                 util_next_power_of_two(info->grid[1]) *
1192                                 util_next_power_of_two(info->grid[2]);
1193 
1194                         ls.wls_size_scale = util_logbase2(wls_single_size) + 1;
1195 
1196                         unsigned wls_size = wls_single_size * ls.wls_instances * dev->core_count;
1197 
1198                         ls.wls_base_pointer =
1199                                 (panfrost_batch_get_shared_memory(batch,
1200                                                                   wls_size,
1201                                                                   1))->ptr.gpu;
1202                 } else {
1203                         ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
1204                 }
1205 
1206                 if (ss->info.tls_size) {
1207                         unsigned shift =
1208                                 panfrost_get_stack_shift(ss->info.tls_size);
1209                         struct panfrost_bo *bo =
1210                                 panfrost_batch_get_scratchpad(batch,
1211                                                               ss->info.tls_size,
1212                                                               dev->thread_tls_alloc,
1213                                                               dev->core_count);
1214 
1215                         ls.tls_size = shift;
1216                         ls.tls_base_pointer = bo->ptr.gpu;
1217                 }
1218         };
1219 
1220         return t.gpu;
1221 }
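
/* Illustrative sizing example for the workgroup local storage math above,
 * with assumed values: a compute shader using wls_size = 300 bytes dispatched
 * as a 3x5x1 grid on a 6-core device. wls_single_size rounds 300 up to 512;
 * wls_instances = next_pow2(3) * next_pow2(5) * next_pow2(1) = 4 * 8 * 1 = 32;
 * wls_size_scale = log2(512) + 1 = 10; and the BO backing wls_base_pointer is
 * sized 512 * 32 * 6 = 98304 bytes.
 */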
1222 
1223 #if PAN_ARCH <= 5
1224 static mali_ptr
1225 panfrost_get_tex_desc(struct panfrost_batch *batch,
1226                       enum pipe_shader_type st,
1227                       struct panfrost_sampler_view *view)
1228 {
1229         if (!view)
1230                 return (mali_ptr) 0;
1231 
1232         struct pipe_sampler_view *pview = &view->base;
1233         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1234 
1235         panfrost_batch_read_rsrc(batch, rsrc, st);
1236         panfrost_batch_add_bo(batch, view->state.bo, st);
1237 
1238         return view->state.gpu;
1239 }
1240 #endif
1241 
1242 static void
1243 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1244                                 struct pipe_context *pctx,
1245                                 struct pipe_resource *texture)
1246 {
1247         struct panfrost_device *device = pan_device(pctx->screen);
1248         struct panfrost_context *ctx = pan_context(pctx);
1249         struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1250         enum pipe_format format = so->base.format;
1251         assert(prsrc->image.data.bo);
1252 
1253         /* Format to access the stencil/depth portion of a Z32_S8 texture */
1254         if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1255                 assert(prsrc->separate_stencil);
1256                 texture = &prsrc->separate_stencil->base;
1257                 prsrc = (struct panfrost_resource *)texture;
1258                 format = texture->format;
1259         } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1260                 format = PIPE_FORMAT_Z32_FLOAT;
1261         }
1262 
1263         const struct util_format_description *desc = util_format_description(format);
1264 
1265         bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM);
1266 
1267         if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) {
1268                 if (desc->is_snorm)
1269                         format = PIPE_FORMAT_R8G8B8A8_SNORM;
1270                 else
1271                         format = PIPE_FORMAT_R8G8B8A8_UNORM;
1272                 desc = util_format_description(format);
1273         }
1274 
1275         so->texture_bo = prsrc->image.data.bo->ptr.gpu;
1276         so->modifier = prsrc->image.layout.modifier;
1277 
1278         /* MSAA only supported for 2D textures */
1279 
1280         assert(texture->nr_samples <= 1 ||
1281                so->base.target == PIPE_TEXTURE_2D ||
1282                so->base.target == PIPE_TEXTURE_2D_ARRAY);
1283 
1284         enum mali_texture_dimension type =
1285                 panfrost_translate_texture_dimension(so->base.target);
1286 
1287         bool is_buffer = (so->base.target == PIPE_BUFFER);
1288 
1289         unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1290         unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1291         unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1292         unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1293         unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
1294         unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) /
1295                             util_format_get_blocksize(format);
1296 
1297         if (so->base.target == PIPE_TEXTURE_3D) {
1298                 first_layer /= prsrc->image.layout.depth;
1299                 last_layer /= prsrc->image.layout.depth;
1300                 assert(!first_layer && !last_layer);
1301         }
1302 
1303         struct pan_image_view iview = {
1304                 .format = format,
1305                 .dim = type,
1306                 .first_level = first_level,
1307                 .last_level = last_level,
1308                 .first_layer = first_layer,
1309                 .last_layer = last_layer,
1310                 .swizzle = {
1311                         so->base.swizzle_r,
1312                         so->base.swizzle_g,
1313                         so->base.swizzle_b,
1314                         so->base.swizzle_a,
1315                 },
1316                 .image = &prsrc->image,
1317 
1318                 .buf.offset = buf_offset,
1319                 .buf.size = buf_size,
1320         };
1321 
1322         unsigned size =
1323                 (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1324                 GENX(panfrost_estimate_texture_payload_size)(&iview);
1325 
1326         struct panfrost_ptr payload = pan_pool_alloc_aligned(&ctx->descs.base, size, 64);
1327         so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1328 
1329         void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1330 
1331         if (PAN_ARCH <= 5) {
1332                 payload.cpu += pan_size(TEXTURE);
1333                 payload.gpu += pan_size(TEXTURE);
1334         }
1335 
1336         GENX(panfrost_new_texture)(device, &iview, tex, &payload);
1337 }
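
/* Rough layout of the allocation above (sketch, not normative): on v5 and
 * older the pool allocation starts with the TEXTURE descriptor itself,
 * followed by the payload of surface pointers, which is why payload.cpu and
 * payload.gpu are advanced by pan_size(TEXTURE) before filling. On v6+ the
 * TEXTURE descriptor lives CPU-side in so->bifrost_descriptor and the pool
 * allocation holds only the payload.
 */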
1338 
1339 static void
1340 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1341                              struct pipe_context *pctx)
1342 {
1343         struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1344         if (view->texture_bo != rsrc->image.data.bo->ptr.gpu ||
1345             view->modifier != rsrc->image.layout.modifier) {
1346                 panfrost_bo_unreference(view->state.bo);
1347                 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1348         }
1349 }
1350 
1351 static mali_ptr
1352 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1353                                   enum pipe_shader_type stage)
1354 {
1355         struct panfrost_context *ctx = batch->ctx;
1356 
1357         if (!ctx->sampler_view_count[stage])
1358                 return 0;
1359 
1360 #if PAN_ARCH >= 6
1361         struct panfrost_ptr T =
1362                 pan_pool_alloc_desc_array(&batch->pool.base,
1363                                           ctx->sampler_view_count[stage],
1364                                           TEXTURE);
1365         struct mali_texture_packed *out =
1366                 (struct mali_texture_packed *) T.cpu;
1367 
1368         for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1369                 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1370                 struct pipe_sampler_view *pview = &view->base;
1371                 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1372 
1373                 panfrost_update_sampler_view(view, &ctx->base);
1374                 out[i] = view->bifrost_descriptor;
1375 
1376                 panfrost_batch_read_rsrc(batch, rsrc, stage);
1377                 panfrost_batch_add_bo(batch, view->state.bo, stage);
1378         }
1379 
1380         return T.gpu;
1381 #else
1382         uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1383 
1384         for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1385                 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1386 
1387                 panfrost_update_sampler_view(view, &ctx->base);
1388 
1389                 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1390         }
1391 
1392         return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1393                                        sizeof(uint64_t) *
1394                                        ctx->sampler_view_count[stage],
1395                                        sizeof(uint64_t));
1396 #endif
1397 }
1398 
1399 static mali_ptr
1400 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1401                                   enum pipe_shader_type stage)
1402 {
1403         struct panfrost_context *ctx = batch->ctx;
1404 
1405         if (!ctx->sampler_count[stage])
1406                 return 0;
1407 
1408         struct panfrost_ptr T =
1409                 pan_pool_alloc_desc_array(&batch->pool.base,
1410                                           ctx->sampler_count[stage],
1411                                           SAMPLER);
1412         struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu;
1413 
1414         for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1415                 out[i] = ctx->samplers[stage][i]->hw;
1416 
1417         return T.gpu;
1418 }
1419 
1420 /* Packs all image attribute descriptors; the matching buffer descriptors are
1421  * emitted by emit_image_bufs(). first_buf is the index of the first attribute
1422  * buffer descriptor reserved for images (two buffer slots per image). */
1423 static void
1424 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1425                    struct mali_attribute_packed *attribs, unsigned first_buf)
1426 {
1427         struct panfrost_device *dev = pan_device(ctx->base.screen);
1428         unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1429 
1430         for (unsigned i = 0; i < last_bit; ++i) {
1431                 enum pipe_format format = ctx->images[shader][i].format;
1432 
1433                 pan_pack(attribs + i, ATTRIBUTE, cfg) {
1434                         /* Continuation record means 2 buffers per image */
1435                         cfg.buffer_index = first_buf + (i * 2);
1436                         cfg.offset_enable = (PAN_ARCH <= 5);
1437                         cfg.format = dev->formats[format].hw;
1438                 }
1439         }
1440 }
1441 
1442 static enum mali_attribute_type
1443 pan_modifier_to_attr_type(uint64_t modifier)
1444 {
1445         switch (modifier) {
1446         case DRM_FORMAT_MOD_LINEAR:
1447                 return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1448         case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1449                 return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1450         default:
1451                 unreachable("Invalid modifier for attribute record");
1452         }
1453 }
1454 
1455 static void
1456 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1457                 struct mali_attribute_buffer_packed *bufs,
1458                 unsigned first_image_buf_index)
1459 {
1460         struct panfrost_context *ctx = batch->ctx;
1461         unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1462 
1463         for (unsigned i = 0; i < last_bit; ++i) {
1464                 struct pipe_image_view *image = &ctx->images[shader][i];
1465 
1466                 if (!(ctx->image_mask[shader] & (1 << i)) ||
1467                     !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1468                         /* Unused image bindings */
1469                         pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg);
1470                         pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg);
1471                         continue;
1472                 }
1473 
1474                 struct panfrost_resource *rsrc = pan_resource(image->resource);
1475 
1476                 /* TODO: MSAA */
1477                 assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported");
1478 
1479                 bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1480                 bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1481 
1482                 unsigned offset = is_buffer ? image->u.buf.offset :
1483                         panfrost_texture_offset(&rsrc->image.layout,
1484                                                 image->u.tex.level,
1485                                                 is_3d ? 0 : image->u.tex.first_layer,
1486                                                 is_3d ? image->u.tex.first_layer : 0);
1487 
1488                 if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) {
1489                         panfrost_batch_write_rsrc(batch, rsrc, shader);
1490 
1491                         unsigned level = is_buffer ? 0 : image->u.tex.level;
1492                         BITSET_SET(rsrc->valid.data, level);
1493 
1494                         if (is_buffer) {
1495                                 util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1496                                                 0, rsrc->base.width0);
1497                         }
1498                 } else {
1499                         panfrost_batch_read_rsrc(batch, rsrc, shader);
1500                 }
1501 
1502                 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1503                         cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1504                         cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset;
1505                         cfg.stride = util_format_get_blocksize(image->format);
1506                         cfg.size = rsrc->image.data.bo->size - offset;
1507                 }
1508 
1509                 if (is_buffer) {
1510                         pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1511                                 cfg.s_dimension = rsrc->base.width0 /
1512                                         util_format_get_blocksize(image->format);
1513                                 cfg.t_dimension = cfg.r_dimension = 1;
1514                         }
1515 
1516                         continue;
1517                 }
1518 
1519                 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1520                         unsigned level = image->u.tex.level;
1521 
1522                         cfg.s_dimension = u_minify(rsrc->base.width0, level);
1523                         cfg.t_dimension = u_minify(rsrc->base.height0, level);
1524                         cfg.r_dimension = is_3d ?
1525                                 u_minify(rsrc->base.depth0, level) :
1526                                 image->u.tex.last_layer - image->u.tex.first_layer + 1;
1527 
1528                         cfg.row_stride =
1529                                 rsrc->image.layout.slices[level].row_stride;
1530 
1531                         if (rsrc->base.target != PIPE_TEXTURE_2D) {
1532                                 cfg.slice_stride =
1533                                         panfrost_get_layer_stride(&rsrc->image.layout,
1534                                                                   level);
1535                         }
1536                 }
1537         }
1538 }
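
/* Illustrative example of the two-record-per-image layout above, with assumed
 * values: binding i = 1 of a 256x128 2D array image (layers 4..7, level 1)
 * fills bufs[2] with the base pointer/stride/size and bufs[3] with the 3D
 * continuation: s_dimension = 128, t_dimension = 64, r_dimension =
 * 7 - 4 + 1 = 4, plus the level-1 row stride (and the slice stride, since the
 * target is not PIPE_TEXTURE_2D).
 */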
1539 
1540 static mali_ptr
1541 panfrost_emit_image_attribs(struct panfrost_batch *batch,
1542                             mali_ptr *buffers,
1543                             enum pipe_shader_type type)
1544 {
1545         struct panfrost_context *ctx = batch->ctx;
1546         struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);
1547 
1548         if (!shader->info.attribute_count) {
1549                 *buffers = 0;
1550                 return 0;
1551         }
1552 
1553         /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1554         unsigned attr_count = shader->info.attribute_count;
1555         unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
1556 
1557         struct panfrost_ptr bufs =
1558                 pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1559 
1560         struct panfrost_ptr attribs =
1561                 pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1562 
1563         emit_image_attribs(ctx, type, attribs.cpu, 0);
1564         emit_image_bufs(batch, type, bufs.cpu, 0);
1565 
1566         /* We need an empty attrib buf to stop the prefetching on Bifrost */
1567 #if PAN_ARCH >= 6
1568         pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
1569                  ATTRIBUTE_BUFFER, cfg);
1570 #endif
1571 
1572         *buffers = bufs.gpu;
1573         return attribs.gpu;
1574 }
1575 
1576 static mali_ptr
1577 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1578                           mali_ptr *buffers)
1579 {
1580         struct panfrost_context *ctx = batch->ctx;
1581         struct panfrost_vertex_state *so = ctx->vertex;
1582         struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1583         bool instanced = ctx->indirect_draw || ctx->instance_count > 1;
1584         uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1585         unsigned nr_images = util_last_bit(image_mask);
1586 
1587         /* Worst case: everything is NPOT, which is only possible if instancing
1588          * is enabled. Otherwise a single record is guaranteed. Also, we allocate
1589          * more memory than needed here if either instancing is enabled or images
1590          * are present; this could be improved. */
1591         unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
1592         unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
1593                            (PAN_ARCH >= 6 ? 1 : 0);
1594 
1595 #if PAN_ARCH <= 5
1596         /* Midgard needs vertexid/instanceid handled specially */
1597         bool special_vbufs = vs->info.attribute_count >= PAN_VERTEX_ID;
1598 
1599         if (special_vbufs)
1600                 nr_bufs += 2;
1601 #endif
1602 
1603         if (!nr_bufs) {
1604                 *buffers = 0;
1605                 return 0;
1606         }
1607 
1608         struct panfrost_ptr S =
1609                 pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
1610                                           ATTRIBUTE_BUFFER);
1611         struct panfrost_ptr T =
1612                 pan_pool_alloc_desc_array(&batch->pool.base,
1613                                           vs->info.attribute_count,
1614                                           ATTRIBUTE);
1615 
1616         struct mali_attribute_buffer_packed *bufs =
1617                 (struct mali_attribute_buffer_packed *) S.cpu;
1618 
1619         struct mali_attribute_packed *out =
1620                 (struct mali_attribute_packed *) T.cpu;
1621 
1622         unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1623         unsigned k = 0;
1624 
1625         for (unsigned i = 0; i < so->nr_bufs; ++i) {
1626                 unsigned vbi = so->buffers[i].vbi;
1627                 unsigned divisor = so->buffers[i].divisor;
1628                 attrib_to_buffer[i] = k;
1629 
1630                 if (!(ctx->vb_mask & (1 << vbi)))
1631                         continue;
1632 
1633                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1634                 struct panfrost_resource *rsrc;
1635 
1636                 rsrc = pan_resource(buf->buffer.resource);
1637                 if (!rsrc)
1638                         continue;
1639 
1640                 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1641 
1642                 /* Mask off lower bits, see offset fixup below */
1643                 mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset;
1644                 mali_ptr addr = raw_addr & ~63;
1645 
1646                 /* Since we advanced the base pointer, we shrink the buffer
1647                  * size, but add the offset we subtracted */
1648                 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1649                         - buf->buffer_offset;
1650 
1651                 /* When there is a divisor, the hardware-level divisor is
1652                  * the product of the instance divisor and the padded count */
1653                 unsigned stride = buf->stride;
1654 
1655                 if (ctx->indirect_draw) {
1656                         /* We allocated 2 records for each attribute buffer */
1657                         assert((k & 1) == 0);
1658 
1659                         /* With indirect draws we can't guess the vertex_count.
1660                          * Pre-set the address, stride and size fields; the
1661                          * compute shader does the rest.
1662                          */
1663                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1664                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D;
1665                                 cfg.pointer = addr;
1666                                 cfg.stride = stride;
1667                                 cfg.size = size;
1668                         }
1669 
1670                         /* We store the unmodified divisor in the continuation
1671                          * slot so the compute shader can retrieve it.
1672                          */
1673                         pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1674                                 cfg.divisor = divisor;
1675                         }
1676 
1677                         k += 2;
1678                         continue;
1679                 }
1680 
1681                 unsigned hw_divisor = ctx->padded_count * divisor;
1682 
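                /* For example (illustrative numbers): with an instance divisor
                 * of 3 and a padded_count of 4, hw_divisor = 12, which is not a
                 * power of two, so the NPOT path below encodes it as a magic
                 * multiply-shift (divisor_numerator/divisor_r/divisor_e) and
                 * keeps the raw divisor in the continuation record. A divisor
                 * of 4 with the same padding gives hw_divisor = 16 and takes
                 * the cheaper POT path (divisor_r = ctz(16) = 4).
                 */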
1683                 if (ctx->instance_count <= 1) {
1684                         /* With only one instance, per-instance attributes are constant, so zero the stride */
1685                         if (divisor)
1686                                 stride = 0;
1687 
1688                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1689                                 cfg.pointer = addr;
1690                                 cfg.stride = stride;
1691                                 cfg.size = size;
1692                         }
1693                 } else if (!divisor) {
1694                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1695                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1696                                 cfg.pointer = addr;
1697                                 cfg.stride = stride;
1698                                 cfg.size = size;
1699                                 cfg.divisor = ctx->padded_count;
1700                         }
1701                 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1702                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1703                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1704                                 cfg.pointer = addr;
1705                                 cfg.stride = stride;
1706                                 cfg.size = size;
1707                                 cfg.divisor_r = __builtin_ctz(hw_divisor);
1708                         }
1709 
1710                 } else {
1711                         unsigned shift = 0, extra_flags = 0;
1712 
1713                         unsigned magic_divisor =
1714                                 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1715 
1716                         /* Records with continuations must be aligned */
1717                         k = ALIGN_POT(k, 2);
1718                         attrib_to_buffer[i] = k;
1719 
1720                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1721                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1722                                 cfg.pointer = addr;
1723                                 cfg.stride = stride;
1724                                 cfg.size = size;
1725 
1726                                 cfg.divisor_r = shift;
1727                                 cfg.divisor_e = extra_flags;
1728                         }
1729 
1730                         pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1731                                 cfg.divisor_numerator = magic_divisor;
1732                                 cfg.divisor = divisor;
1733                         }
1734 
1735                         ++k;
1736                 }
1737 
1738                 ++k;
1739         }
1740 
1741 #if PAN_ARCH <= 5
1742         /* Add special gl_VertexID/gl_InstanceID buffers */
1743         if (special_vbufs) {
1744                 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1745 
1746                 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1747                         cfg.buffer_index = k++;
1748                         cfg.format = so->formats[PAN_VERTEX_ID];
1749                 }
1750 
1751                 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1752 
1753                 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1754                         cfg.buffer_index = k++;
1755                         cfg.format = so->formats[PAN_INSTANCE_ID];
1756                 }
1757         }
1758 #endif
1759 
1760         k = ALIGN_POT(k, 2);
1761         emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
1762         emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
1763         k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
1764 
1765 #if PAN_ARCH >= 6
1766         /* We need an empty attrib buf to stop the prefetching on Bifrost */
1767         pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);
1768 #endif
1769 
1770         /* Attribute addresses require 64-byte alignment, so let:
1771          *
1772          *      base' = base & ~63 = base - (base & 63)
1773          *      offset' = offset + (base & 63)
1774          *
1775          * Since base' + offset' = base + offset, these are equivalent
1776          * addressing modes and now base is 64 aligned.
1777          */
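        /* Worked example with an assumed address: if base = 0x10007 and
         * offset = 12, then base & 63 = 7, so base' = 0x10000 and
         * offset' = 12 + 7 = 19; base' + offset' = 0x10013 = base + offset,
         * and base' is now 64-byte aligned as the hardware requires.
         */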
1778 
1779         for (unsigned i = 0; i < so->num_elements; ++i) {
1780                 unsigned vbi = so->pipe[i].vertex_buffer_index;
1781                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1782 
1783                 /* BOs are aligned; just fixup for buffer_offset */
1784                 signed src_offset = so->pipe[i].src_offset;
1785                 src_offset += (buf->buffer_offset & 63);
1786 
1787                 /* Base instance offset */
1788                 if (ctx->base_instance && so->pipe[i].instance_divisor) {
1789                         src_offset += (ctx->base_instance * buf->stride) /
1790                                       so->pipe[i].instance_divisor;
1791                 }
1792 
1793                 /* Also, somewhat obscurely, per-instance data needs to be
1794                  * offset in response to a delayed start in an indexed draw */
1795 
1796                 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1797                         src_offset -= buf->stride * ctx->offset_start;
1798 
1799                 pan_pack(out + i, ATTRIBUTE, cfg) {
1800                         cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
1801                         cfg.format = so->formats[i];
1802                         cfg.offset = src_offset;
1803                 }
1804         }
1805 
1806         *buffers = S.gpu;
1807         return T.gpu;
1808 }
1809 
1810 static mali_ptr
1811 panfrost_emit_varyings(struct panfrost_batch *batch,
1812                 struct mali_attribute_buffer_packed *slot,
1813                 unsigned stride, unsigned count)
1814 {
1815         unsigned size = stride * count;
1816         mali_ptr ptr =
1817                 batch->ctx->indirect_draw ? 0 :
1818                 pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
1819 
1820         pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1821                 cfg.stride = stride;
1822                 cfg.size = size;
1823                 cfg.pointer = ptr;
1824         }
1825 
1826         return ptr;
1827 }
1828 
1829 static unsigned
1830 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1831 {
1832         return target->buffer_offset + (pan_so_target(target)->offset * stride);
1833 }
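
/* For example (illustrative values): a stream-out target created with
 * buffer_offset = 256 whose internal offset records 100 vertices already
 * written, bound with a 16-byte stride, yields 256 + 100 * 16 = 1856 bytes
 * into the buffer for the next captured vertex.
 */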
1834 
1835 static void
1836 panfrost_emit_streamout(struct panfrost_batch *batch,
1837                         struct mali_attribute_buffer_packed *slot,
1838                         unsigned stride, unsigned count,
1839                         struct pipe_stream_output_target *target)
1840 {
1841         unsigned max_size = target->buffer_size;
1842         unsigned expected_size = stride * count;
1843 
1844         /* Grab the BO and bind it to the batch */
1845         struct panfrost_resource *rsrc = pan_resource(target->buffer);
1846         struct panfrost_bo *bo = rsrc->image.data.bo;
1847 
1848         panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1849         panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT);
1850 
1851         unsigned offset = panfrost_xfb_offset(stride, target);
1852 
1853         pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1854                 cfg.pointer = bo->ptr.gpu + (offset & ~63);
1855                 cfg.stride = stride;
1856                 cfg.size = MIN2(max_size, expected_size) + (offset & 63);
1857 
1858                 util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1859                                 offset, cfg.size);
1860         }
1861 }
1862 
1863 /* Helpers for manipulating stream out information so we can pack varyings
1864  * accordingly. Compute the src_offset for a given captured varying */
1865 
1866 static struct pipe_stream_output *
1867 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1868 {
1869         for (unsigned i = 0; i < info->num_outputs; ++i) {
1870                 if (info->output[i].register_index == loc)
1871                         return &info->output[i];
1872         }
1873 
1874         unreachable("Varying not captured");
1875 }
1876 
1877 /* Given a varying, figure out which index it corresponds to */
1878 
1879 static inline unsigned
1880 pan_varying_index(unsigned present, enum pan_special_varying v)
1881 {
1882         return util_bitcount(present & BITFIELD_MASK(v));
1883 }
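
/* For example, assuming the enum order GENERAL, POSITION, PSIZ: if present =
 * GENERAL | POSITION | PSIZ, buffers are packed in bit order, so
 * pan_varying_index(present, PAN_VARY_PSIZ) counts the two lower set bits and
 * returns 2, while pan_xfb_base(present) below returns 3, the first slot after
 * all enabled special buffers.
 */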
1884 
1885 /* Get the base offset for XFB buffers, which by convention come after
1886  * everything else. Wrapper function for semantic reasons; by construction this
1887  * is just popcount. */
1888 
1889 static inline unsigned
1890 pan_xfb_base(unsigned present)
1891 {
1892         return util_bitcount(present);
1893 }
1894 
1895 /* Determines which varying buffers are required */
1896 
1897 static inline unsigned
1898 pan_varying_present(const struct panfrost_device *dev,
1899                     struct pan_shader_info *producer,
1900                     struct pan_shader_info *consumer,
1901                     uint16_t point_coord_mask)
1902 {
1903         /* At the moment we always emit general and position buffers. Not
1904          * strictly necessary but usually harmless */
1905 
1906         unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
1907 
1908         /* Enable special buffers by the shader info */
1909 
1910         if (producer->vs.writes_point_size)
1911                 present |= BITFIELD_BIT(PAN_VARY_PSIZ);
1912 
1913 #if PAN_ARCH <= 5
1914         /* On Midgard, these exist as real varyings. Later architectures use
1915          * LD_VAR_SPECIAL reads instead. */
1916 
1917         if (consumer->fs.reads_point_coord)
1918                 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
1919 
1920         if (consumer->fs.reads_face)
1921                 present |= BITFIELD_BIT(PAN_VARY_FACE);
1922 
1923         if (consumer->fs.reads_frag_coord)
1924                 present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
1925 
1926         /* Also, if we have a point sprite, we need a point coord buffer */
1927 
1928         for (unsigned i = 0; i < consumer->varyings.input_count; i++)  {
1929                 gl_varying_slot loc = consumer->varyings.input[i].location;
1930 
1931                 if (util_varying_is_point_coord(loc, point_coord_mask))
1932                         present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
1933         }
1934 #endif
1935 
1936         return present;
1937 }
1938 
1939 /* Emitters for varying records */
1940 
1941 static void
1942 pan_emit_vary(const struct panfrost_device *dev,
1943               struct mali_attribute_packed *out,
1944               unsigned buffer_index,
1945               mali_pixel_format format, unsigned offset)
1946 {
1947         pan_pack(out, ATTRIBUTE, cfg) {
1948                 cfg.buffer_index = buffer_index;
1949                 cfg.offset_enable = (PAN_ARCH <= 5);
1950                 cfg.format = format;
1951                 cfg.offset = offset;
1952         }
1953 }
1954 
1955 /* Special records */
1956 
1957 static const struct {
1958        unsigned components;
1959        enum mali_format format;
1960 } pan_varying_formats[PAN_VARY_MAX] = {
1961         [PAN_VARY_POSITION]     = { 4, MALI_SNAP_4 },
1962         [PAN_VARY_PSIZ]         = { 1, MALI_R16F },
1963         [PAN_VARY_PNTCOORD]     = { 1, MALI_R16F },
1964         [PAN_VARY_FACE]         = { 1, MALI_R32I },
1965         [PAN_VARY_FRAGCOORD]    = { 4, MALI_RGBA32F },
1966 };
1967 
1968 static mali_pixel_format
1969 pan_special_format(const struct panfrost_device *dev,
1970                 enum pan_special_varying buf)
1971 {
1972         assert(buf < PAN_VARY_MAX);
1973         mali_pixel_format format = (pan_varying_formats[buf].format << 12);
1974 
1975 #if PAN_ARCH <= 6
1976         unsigned nr = pan_varying_formats[buf].components;
1977         format |= panfrost_get_default_swizzle(nr);
1978 #endif
1979 
1980         return format;
1981 }
1982 
1983 static void
1984 pan_emit_vary_special(const struct panfrost_device *dev,
1985                       struct mali_attribute_packed *out,
1986                       unsigned present, enum pan_special_varying buf)
1987 {
1988         pan_emit_vary(dev, out, pan_varying_index(present, buf),
1989                         pan_special_format(dev, buf), 0);
1990 }
1991 
1992 /* Negative indicates a varying is not found */
1993 
1994 static signed
1995 pan_find_vary(const struct pan_shader_varying *vary,
1996                 unsigned vary_count, unsigned loc)
1997 {
1998         for (unsigned i = 0; i < vary_count; ++i) {
1999                 if (vary[i].location == loc)
2000                         return i;
2001         }
2002 
2003         return -1;
2004 }
2005 
2006 /* Assign varying locations for the general buffer. Returns the calculated
2007  * per-vertex stride, and outputs offsets into the passed array. Negative
2008  * offset indicates a varying is not used. */
2009 
2010 static unsigned
2011 pan_assign_varyings(const struct panfrost_device *dev,
2012                     struct pan_shader_info *producer,
2013                     struct pan_shader_info *consumer,
2014                     signed *offsets)
2015 {
2016         unsigned producer_count = producer->varyings.output_count;
2017         unsigned consumer_count = consumer->varyings.input_count;
2018 
2019         const struct pan_shader_varying *producer_vars = producer->varyings.output;
2020         const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2021 
2022         unsigned stride = 0;
2023 
2024         for (unsigned i = 0; i < producer_count; ++i) {
2025                 signed loc = pan_find_vary(consumer_vars, consumer_count,
2026                                 producer_vars[i].location);
2027 
2028                 if (loc >= 0) {
2029                         offsets[i] = stride;
2030 
2031                         enum pipe_format format = consumer_vars[loc].format;
2032                         stride += util_format_get_blocksize(format);
2033                 } else {
2034                         offsets[i] = -1;
2035                 }
2036         }
2037 
2038         return stride;
2039 }
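
/* Small illustrative layout: if the producer writes VAR0 (consumed as a
 * 32-bit float vec4) and VAR1 (consumed as an fp16 vec2), and the consumer
 * reads both, offsets[] becomes { 0, 16 } and the returned per-vertex stride
 * is 16 + 4 = 20 bytes; a producer output the consumer never reads gets
 * offset -1 and occupies no space in the general buffer.
 */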
2040 
2041 /* Emitter for a single varying (attribute) descriptor */
2042 
2043 static void
2044 panfrost_emit_varying(const struct panfrost_device *dev,
2045                       struct mali_attribute_packed *out,
2046                       const struct pan_shader_varying varying,
2047                       enum pipe_format pipe_format,
2048                       unsigned present,
2049                       uint16_t point_sprite_mask,
2050                       struct pipe_stream_output_info *xfb,
2051                       uint64_t xfb_loc_mask,
2052                       unsigned max_xfb,
2053                       unsigned *xfb_offsets,
2054                       signed offset,
2055                       enum pan_special_varying pos_varying)
2056 {
2057         /* Note: varying.format != pipe_format in some obscure cases due to a
2058          * limitation of the NIR linker. This should be fixed in the future to
2059          * eliminate the additional lookups. See:
2060          * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2061          */
2062         gl_varying_slot loc = varying.location;
2063         mali_pixel_format format = dev->formats[pipe_format].hw;
2064 
2065         struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?
2066                 pan_get_so(xfb, loc) : NULL;
2067 
2068         if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2069                 pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2070         } else if (o && o->output_buffer < max_xfb) {
2071                 unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63;
2072 
2073                 pan_emit_vary(dev, out,
2074                                 pan_xfb_base(present) + o->output_buffer,
2075                                 format, (o->dst_offset * 4) + fixup_offset);
2076         } else if (loc == VARYING_SLOT_POS) {
2077                 pan_emit_vary_special(dev, out, present, pos_varying);
2078         } else if (loc == VARYING_SLOT_PSIZ) {
2079                 pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2080         } else if (loc == VARYING_SLOT_FACE) {
2081                 pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2082         } else if (offset < 0) {
2083                 pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2084         } else {
2085                 STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2086                 pan_emit_vary(dev, out, 0, format, offset);
2087         }
2088 }
2089 
2090 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2091  * rather than draw time (under good conditions). */
2092 
2093 static void
2094 panfrost_emit_varying_descs(
2095                 struct panfrost_pool *pool,
2096                 struct panfrost_shader_state *producer,
2097                 struct panfrost_shader_state *consumer,
2098                 struct panfrost_streamout *xfb,
2099                 uint16_t point_coord_mask,
2100                 struct pan_linkage *out)
2101 {
2102         struct panfrost_device *dev = pool->base.dev;
2103         struct pipe_stream_output_info *xfb_info = &producer->stream_output;
2104         unsigned producer_count = producer->info.varyings.output_count;
2105         unsigned consumer_count = consumer->info.varyings.input_count;
2106 
2107         /* Offsets within the general varying buffer, indexed by location */
2108         signed offsets[PAN_MAX_VARYINGS];
2109         assert(producer_count <= ARRAY_SIZE(offsets));
2110         assert(consumer_count <= ARRAY_SIZE(offsets));
2111 
2112         /* Allocate enough descriptors for both shader stages */
2113         struct panfrost_ptr T =
2114                 pan_pool_alloc_desc_array(&pool->base,
2115                                           producer_count + consumer_count,
2116                                           ATTRIBUTE);
2117 
2118         /* Take a reference if we're being put on the CSO */
2119         if (!pool->owned) {
2120                 out->bo = pool->transient_bo;
2121                 panfrost_bo_reference(out->bo);
2122         }
2123 
2124         struct mali_attribute_packed *descs = T.cpu;
2125         out->producer = producer_count ? T.gpu : 0;
2126         out->consumer = consumer_count ? T.gpu +
2127                 (pan_size(ATTRIBUTE) * producer_count) : 0;
2128 
2129         /* Lay out the varyings. Must use producer to lay out, in order to
2130          * respect transform feedback precisions. */
2131         out->present = pan_varying_present(dev, &producer->info,
2132                         &consumer->info, point_coord_mask);
2133 
2134         out->stride = pan_assign_varyings(dev, &producer->info,
2135                         &consumer->info, offsets);
2136 
2137         unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS];
2138 
2139         for (unsigned i = 0; i < xfb->num_targets; ++i) {
2140                 xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4,
2141                                 xfb->targets[i]);
2142         }
2143 
2144         for (unsigned i = 0; i < producer_count; ++i) {
2145                 signed j = pan_find_vary(consumer->info.varyings.input,
2146                                 consumer->info.varyings.input_count,
2147                                 producer->info.varyings.output[i].location);
2148 
2149                 enum pipe_format format = (j >= 0) ?
2150                         consumer->info.varyings.input[j].format :
2151                         producer->info.varyings.output[i].format;
2152 
2153                 panfrost_emit_varying(dev, descs + i,
2154                                 producer->info.varyings.output[i], format,
2155                                 out->present, 0, &producer->stream_output,
2156                                 producer->so_mask, xfb->num_targets,
2157                                 xfb_offsets, offsets[i], PAN_VARY_POSITION);
2158         }
2159 
2160         for (unsigned i = 0; i < consumer_count; ++i) {
2161                 signed j = pan_find_vary(producer->info.varyings.output,
2162                                 producer->info.varyings.output_count,
2163                                 consumer->info.varyings.input[i].location);
2164 
2165                 signed offset = (j >= 0) ? offsets[j] : -1;
2166 
2167                 panfrost_emit_varying(dev, descs + producer_count + i,
2168                                 consumer->info.varyings.input[i],
2169                                 consumer->info.varyings.input[i].format,
2170                                 out->present, point_coord_mask,
2171                                 &producer->stream_output, producer->so_mask,
2172                                 xfb->num_targets, xfb_offsets, offset,
2173                                 PAN_VARY_FRAGCOORD);
2174         }
2175 }
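
/* Sketch of the resulting allocation: descs[0 .. producer_count - 1] are the
 * producer's ATTRIBUTE records and descs[producer_count ..] the consumer's,
 * so out->producer points at the start of the array and out->consumer at
 * T.gpu + pan_size(ATTRIBUTE) * producer_count, matching the code above.
 */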
2176 
2177 #if PAN_ARCH <= 5
2178 static void
2179 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2180                 unsigned present,
2181                 enum pan_special_varying v,
2182                 unsigned special)
2183 {
2184         if (present & BITFIELD_BIT(v)) {
2185                 unsigned idx = pan_varying_index(present, v);
2186 
2187                 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2188                         cfg.special = special;
2189                         cfg.type = 0;
2190                 }
2191         }
2192 }
2193 #endif
2194 
2195 static void
2196 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2197                                  unsigned vertex_count,
2198                                  mali_ptr *vs_attribs,
2199                                  mali_ptr *fs_attribs,
2200                                  mali_ptr *buffers,
2201                                  unsigned *buffer_count,
2202                                  mali_ptr *position,
2203                                  mali_ptr *psiz,
2204                                  bool point_coord_replace)
2205 {
2206         /* Load the shaders */
2207         struct panfrost_context *ctx = batch->ctx;
2208         struct panfrost_shader_state *vs, *fs;
2209 
2210         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2211         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2212 
2213         uint16_t point_coord_mask = 0;
2214 
2215 #if PAN_ARCH <= 5
2216         /* Point sprites are lowered on Bifrost and newer */
2217         if (point_coord_replace)
2218                 point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2219 #endif
2220 
2221         /* In good conditions, we only need to link varyings once */
2222         bool prelink =
2223                 (point_coord_mask == 0) &&
2224                 (ctx->streamout.num_targets == 0) &&
2225                 !vs->info.separable &&
2226                 !fs->info.separable;
2227 
2228         /* Try to reduce copies */
2229         struct pan_linkage _linkage;
2230         struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2231 
2232         /* Emit ATTRIBUTE descriptors if needed */
2233         if (!prelink || vs->linkage.bo == NULL) {
2234                 struct panfrost_pool *pool =
2235                         prelink ? &ctx->descs : &batch->pool;
2236 
2237                 panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);
2238         }
2239 
2240         struct pipe_stream_output_info *so = &vs->stream_output;
2241         unsigned present = linkage->present, stride = linkage->stride;
2242         unsigned xfb_base = pan_xfb_base(present);
2243         struct panfrost_ptr T =
2244                 pan_pool_alloc_desc_array(&batch->pool.base,
2245                                           xfb_base +
2246                                           ctx->streamout.num_targets + 1,
2247                                           ATTRIBUTE_BUFFER);
2248         struct mali_attribute_buffer_packed *varyings =
2249                 (struct mali_attribute_buffer_packed *) T.cpu;
2250 
2251         if (buffer_count)
2252                 *buffer_count = xfb_base + ctx->streamout.num_targets;
2253 
2254 #if PAN_ARCH >= 6
2255         /* Suppress prefetch on Bifrost by zeroing the trailing buffer descriptor */
2256         memset(varyings + xfb_base + ctx->streamout.num_targets, 0, sizeof(*varyings));
2257 #endif
2258 
2259         /* Emit the stream out buffers. We need enough room for all the
2260          * vertices we emit across all instances */
2261 
2262         unsigned out_count = ctx->instance_count *
2263                 u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2264 
2265         for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2266                 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2267                                         so->stride[i] * 4,
2268                                         out_count,
2269                                         ctx->streamout.targets[i]);
2270         }
2271 
2272         if (stride) {
2273                 panfrost_emit_varyings(batch,
2274                                 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2275                                 stride, vertex_count);
2276         }
2277 
2278         /* fp32 vec4 gl_Position */
2279         *position = panfrost_emit_varyings(batch,
2280                         &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2281                         sizeof(float) * 4, vertex_count);
2282 
2283         if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2284                 *psiz = panfrost_emit_varyings(batch,
2285                                 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2286                                 2, vertex_count);
2287         }
2288 
2289 #if PAN_ARCH <= 5
2290         pan_emit_special_input(varyings, present,
2291                         PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2292         pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2293                         MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2294         pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2295                         MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2296 #endif
2297 
2298         *buffers = T.gpu;
2299         *vs_attribs = linkage->producer;
2300         *fs_attribs = linkage->consumer;
2301 }
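
/* Overall buffer layout produced above (sketch): slots [0 .. popcount(present)
 * - 1] hold the special varying buffers in PAN_VARY_* bit order (general,
 * position, point size, ...), the next ctx->streamout.num_targets slots hold
 * the stream-out buffers, and one final zeroed slot terminates the list for
 * Bifrost prefetching.
 */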
2302 
2303 static void
2304 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2305                                 const struct panfrost_ptr *vertex_job,
2306                                 const struct panfrost_ptr *tiler_job)
2307 {
2308         struct panfrost_context *ctx = batch->ctx;
2309 
2310         /* If rasterizer discard is enabled, only submit the vertex job. XXX - set
2311          * job_barrier in case buffers get ping-ponged and we need to enforce
2312          * ordering; this has a perf hit! See
2313          * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */
2314 
2315         unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2316                                            MALI_JOB_TYPE_VERTEX, true, false,
2317                                            ctx->indirect_draw ?
2318                                            batch->indirect_draw_job_id : 0,
2319                                            0, vertex_job, false);
2320 
2321         if (ctx->rasterizer->base.rasterizer_discard || batch->scissor_culls_everything)
2322                 return;
2323 
2324         panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2325                          MALI_JOB_TYPE_TILER, false, false,
2326                          vertex, 0, tiler_job, false);
2327 }
2328 
2329 static void
2330 emit_tls(struct panfrost_batch *batch)
2331 {
2332         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2333 
2334         /* Emitted with the FB descriptor on Midgard. */
2335         if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2336                 return;
2337 
2338         struct panfrost_bo *tls_bo =
2339                 batch->stack_size ?
2340                 panfrost_batch_get_scratchpad(batch,
2341                                               batch->stack_size,
2342                                               dev->thread_tls_alloc,
2343                                               dev->core_count):
2344                 NULL;
2345         struct pan_tls_info tls = {
2346                 .tls = {
2347                         .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2348                         .size = batch->stack_size,
2349                 },
2350         };
2351 
2352         assert(batch->tls.cpu);
2353         GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2354 }
2355 
2356 static void
2357 emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
2358 {
2359         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2360         struct panfrost_bo *tls_bo =
2361                 batch->stack_size ?
2362                 panfrost_batch_get_scratchpad(batch,
2363                                               batch->stack_size,
2364                                               dev->thread_tls_alloc,
2365                                               dev->core_count):
2366                 NULL;
2367         struct pan_tls_info tls = {
2368                 .tls = {
2369                         .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2370                         .size = batch->stack_size,
2371                 },
2372         };
2373 
2374         batch->framebuffer.gpu |=
2375                 GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx,
2376                                    batch->framebuffer.cpu);
2377 }
2378 
2379 /* Mark a surface as written */
2380 
2381 static void
2382 panfrost_initialize_surface(struct panfrost_batch *batch,
2383                             struct pipe_surface *surf)
2384 {
2385         if (surf) {
2386                 struct panfrost_resource *rsrc = pan_resource(surf->texture);
2387                 BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2388         }
2389 }
2390 
2391 /* Generate a fragment job. This should be called once per frame. (According to
2392  * presentations, this is supposed to correspond to eglSwapBuffers) */
2393 
2394 static mali_ptr
2395 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2396 {
2397         /* Mark the affected buffers as initialized, since we're writing to
2398          * them. Also, add the surfaces being written to the batch. */
2399 
2400         struct pipe_framebuffer_state *fb = &batch->key;
2401 
2402         for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2403                 panfrost_initialize_surface(batch, fb->cbufs[i]);
2404 
2405         panfrost_initialize_surface(batch, fb->zsbuf);
2406 
2407         /* The passed tile coords can be out of range in some cases, so we need
2408          * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2409          * Theoretically we also need to clamp the coordinates positive, but we
2410          * avoid that edge case as all four values are unsigned. Also,
2411          * theoretically we could clamp the minima, but if that has to happen
2412          * the asserts would fail anyway (since the maxima would get clamped
2413          * and then be smaller than the minima). An edge case of sorts occurs
2414          * when no scissors are added to the draw, so by default min=~0 and max=0.
2415          * But that can't happen if any actual drawing occurs (beyond a
2416          * wallpaper reload), so this is again irrelevant in practice. */
2417 
2418         batch->maxx = MIN2(batch->maxx, fb->width);
2419         batch->maxy = MIN2(batch->maxy, fb->height);
2420 
2421         /* Rendering region must be at least 1x1; otherwise, there is nothing
2422          * to do and the whole job chain should have been discarded. */
2423 
2424         assert(batch->maxx > batch->minx);
2425         assert(batch->maxy > batch->miny);
2426 
2427         struct panfrost_ptr transfer =
2428                 pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);
2429 
2430         GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu,
2431                                     transfer.cpu);
2432 
2433         return transfer.gpu;
2434 }
2435 
2436 #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c;
2437 
2438 static uint8_t
2439 pan_draw_mode(enum pipe_prim_type mode)
2440 {
2441         switch (mode) {
2442                 DEFINE_CASE(POINTS);
2443                 DEFINE_CASE(LINES);
2444                 DEFINE_CASE(LINE_LOOP);
2445                 DEFINE_CASE(LINE_STRIP);
2446                 DEFINE_CASE(TRIANGLES);
2447                 DEFINE_CASE(TRIANGLE_STRIP);
2448                 DEFINE_CASE(TRIANGLE_FAN);
2449                 DEFINE_CASE(QUADS);
2450                 DEFINE_CASE(POLYGON);
2451 #if PAN_ARCH <= 6
2452                 DEFINE_CASE(QUAD_STRIP);
2453 #endif
2454 
2455         default:
2456                 unreachable("Invalid draw mode");
2457         }
2458 }
2459 
2460 #undef DEFINE_CASE
2461 
2462 /* Count generated primitives (when there are no geom/tess shaders) for
2463  * transform feedback */
2464 
2465 static void
2466 panfrost_statistics_record(
2467                 struct panfrost_context *ctx,
2468                 const struct pipe_draw_info *info,
2469                 const struct pipe_draw_start_count_bias *draw)
2470 {
2471         if (!ctx->active_queries)
2472                 return;
2473 
2474         uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2475         ctx->prims_generated += prims;
2476 
2477         if (!ctx->streamout.num_targets)
2478                 return;
2479 
2480         ctx->tf_prims_generated += prims;
2481 }
2482 
2483 static void
2484 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2485 {
2486         for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2487                 unsigned count;
2488 
2489                 count = u_stream_outputs_for_vertices(ctx->active_prim,
2490                                                       ctx->vertex_count);
2491                 pan_so_target(ctx->streamout.targets[i])->offset += count;
2492         }
2493 }
2494 
2495 static inline void
2496 pan_emit_draw_descs(struct panfrost_batch *batch,
2497                 struct MALI_DRAW *d, enum pipe_shader_type st)
2498 {
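        /* Fields shared by the vertex and tiler DRAW sections: the first
         * vertex offset, the (padded) per-instance vertex count, and the
         * per-stage descriptor tables emitted earlier for this draw. */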
2499         d->offset_start = batch->ctx->offset_start;
2500         d->instance_size = batch->ctx->instance_count > 1 ?
2501                            batch->ctx->padded_count : 1;
2502 
2503         d->uniform_buffers = batch->uniform_buffers[st];
2504         d->push_uniforms = batch->push_uniforms[st];
2505         d->textures = batch->textures[st];
2506         d->samplers = batch->samplers[st];
2507 }
2508 
2509 static inline enum mali_index_type
2510 panfrost_translate_index_size(unsigned size)
2511 {
2512         STATIC_ASSERT(MALI_INDEX_TYPE_NONE  == 0);
2513         STATIC_ASSERT(MALI_INDEX_TYPE_UINT8  == 1);
2514         STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2);
2515 
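        /* 1- and 2-byte indices map directly onto the hardware enum (see the
         * asserts above); only the 4-byte case needs translating, and a size
         * of 0 falls through to MALI_INDEX_TYPE_NONE for non-indexed draws. */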
2516         return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size;
2517 }
2518 
2519 static void
2520 panfrost_draw_emit_vertex(struct panfrost_batch *batch,
2521                           const struct pipe_draw_info *info,
2522                           void *invocation_template,
2523                           mali_ptr vs_vary, mali_ptr varyings,
2524                           mali_ptr attribs, mali_ptr attrib_bufs,
2525                           void *job)
2526 {
2527         void *section =
2528                 pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
2529         memcpy(section, invocation_template, pan_size(INVOCATION));
2530 
2531         pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
2532                 cfg.job_task_split = 5;
2533         }
2534 
2535         pan_section_pack(job, COMPUTE_JOB, DRAW, cfg) {
2536                 cfg.draw_descriptor_is_64b = true;
2537                 cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
2538                 cfg.attributes = attribs;
2539                 cfg.attribute_buffers = attrib_bufs;
2540                 cfg.varyings = vs_vary;
2541                 cfg.varying_buffers = vs_vary ? varyings : 0;
2542                 cfg.thread_storage = batch->tls.gpu;
2543                 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
2544         }
2545 }
2546 
2547 static void
2548 panfrost_emit_primitive_size(struct panfrost_context *ctx,
2549                              bool points, mali_ptr size_array,
2550                              void *prim_size)
2551 {
2552         struct panfrost_rasterizer *rast = ctx->rasterizer;
2553 
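        /* Point/line size comes from a per-vertex size array when the shader
         * writes gl_PointSize; otherwise fall back to the constant size from
         * the rasterizer state. */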
2554         pan_pack(prim_size, PRIMITIVE_SIZE, cfg) {
2555                 if (panfrost_writes_point_size(ctx)) {
2556                         cfg.size_array = size_array;
2557                 } else {
2558                         cfg.constant = points ?
2559                                        rast->base.point_size :
2560                                        rast->base.line_width;
2561                 }
2562         }
2563 }
2564 
2565 static bool
2566 panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info)
2567 {
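        /* The hardware restarts implicitly when the restart index is the
         * all-ones value for the index size in use (0xFF, 0xFFFF, ...); any
         * other index must take the explicit restart path. */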
2568         unsigned implicit_index = (1 << (info->index_size * 8)) - 1;
2569         bool implicit = info->restart_index == implicit_index;
2570         return info->primitive_restart && implicit;
2571 }
2572 
2573 static inline void
2574 panfrost_update_state_tex(struct panfrost_batch *batch,
2575                           enum pipe_shader_type st)
2576 {
2577         struct panfrost_context *ctx = batch->ctx;
2578         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
2579 
2580         unsigned dirty_3d = ctx->dirty;
2581         unsigned dirty = ctx->dirty_shader[st];
2582 
2583         if (dirty & PAN_DIRTY_STAGE_TEXTURE) {
2584                 batch->textures[st] =
2585                         panfrost_emit_texture_descriptors(batch, st);
2586         }
2587 
2588         if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2589                 batch->samplers[st] =
2590                         panfrost_emit_sampler_descriptors(batch, st);
2591         }
2592 
2593         if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2594                 batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st,
2595                                 &batch->push_uniforms[st]);
2596         }
2597 }
2598 
2599 static inline void
2600 panfrost_update_state_3d(struct panfrost_batch *batch)
2601 {
2602         unsigned dirty = batch->ctx->dirty;
2603 
2604         if (dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
2605                 batch->viewport = panfrost_emit_viewport(batch);
2606 
2607         if (dirty & PAN_DIRTY_TLS_SIZE)
2608                 panfrost_batch_adjust_stack_size(batch);
2609 }
2610 
2611 static void
2612 panfrost_update_state_vs(struct panfrost_batch *batch)
2613 {
2614         enum pipe_shader_type st = PIPE_SHADER_VERTEX;
2615         unsigned dirty = batch->ctx->dirty_shader[st];
2616 
2617         if (dirty & PAN_DIRTY_STAGE_RENDERER)
2618                 batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2619 
2620         panfrost_update_state_tex(batch, st);
2621 }
2622 
2623 static void
2624 panfrost_update_state_fs(struct panfrost_batch *batch)
2625 {
2626         enum pipe_shader_type st = PIPE_SHADER_FRAGMENT;
2627         unsigned dirty = batch->ctx->dirty_shader[st];
2628 
2629         if (dirty & PAN_DIRTY_STAGE_RENDERER)
2630                 batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2631 
2632         if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2633                 batch->attribs[st] = panfrost_emit_image_attribs(batch,
2634                                 &batch->attrib_bufs[st], st);
2635         }
2636 
2637         panfrost_update_state_tex(batch, st);
2638 }
2639 
2640 #if PAN_ARCH >= 6
2641 static mali_ptr
2642 panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count)
2643 {
2644         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2645 
2646         if (!vertex_count)
2647                 return 0;
2648 
2649         if (batch->tiler_ctx.bifrost)
2650                 return batch->tiler_ctx.bifrost;
2651 
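        /* First draw in the batch: allocate the tiler heap and tiler context
         * descriptors once, and cache the context's GPU address for every
         * subsequent draw in this batch. */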
2652         struct panfrost_ptr t =
2653                 pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);
2654 
2655         GENX(pan_emit_tiler_heap)(dev, t.cpu);
2656 
2657         mali_ptr heap = t.gpu;
2658 
2659         t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
2660         GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height,
2661                                  util_framebuffer_get_num_samples(&batch->key),
2662                                  heap, t.cpu);
2663 
2664         batch->tiler_ctx.bifrost = t.gpu;
2665         return batch->tiler_ctx.bifrost;
2666 }
2667 #endif
2668 
2669 static void
2670 panfrost_draw_emit_tiler(struct panfrost_batch *batch,
2671                          const struct pipe_draw_info *info,
2672                          const struct pipe_draw_start_count_bias *draw,
2673                          void *invocation_template,
2674                          mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
2675                          mali_ptr pos, mali_ptr psiz, void *job)
2676 {
2677         struct panfrost_context *ctx = batch->ctx;
2678         struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2679 
2680         void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
2681         memcpy(section, invocation_template, pan_size(INVOCATION));
2682 
2683         section = pan_section_ptr(job, TILER_JOB, PRIMITIVE);
2684         pan_pack(section, PRIMITIVE, cfg) {
2685                 cfg.draw_mode = pan_draw_mode(info->mode);
2686                 if (panfrost_writes_point_size(ctx))
2687                         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
2688 
2689                 /* For line primitives, PRIMITIVE.first_provoking_vertex must
2690                  * be set to true and the provoking vertex is selected with
2691                  * DRAW.flat_shading_vertex.
2692                  */
2693                 if (info->mode == PIPE_PRIM_LINES ||
2694                     info->mode == PIPE_PRIM_LINE_LOOP ||
2695                     info->mode == PIPE_PRIM_LINE_STRIP)
2696                         cfg.first_provoking_vertex = true;
2697                 else
2698                         cfg.first_provoking_vertex = rast->flatshade_first;
2699 
2700                 if (panfrost_is_implicit_prim_restart(info)) {
2701                         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
2702                 } else if (info->primitive_restart) {
2703                         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
2704                         cfg.primitive_restart_index = info->restart_index;
2705                 }
2706 
2707                 cfg.job_task_split = 6;
2708 
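                /* For indirect draws the real index count is patched in later
                 * by the indirect draw job; 1 is just a placeholder here. */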
2709                 cfg.index_count = ctx->indirect_draw ? 1 : draw->count;
2710                 cfg.index_type = panfrost_translate_index_size(info->index_size);
2711 
2712                 if (cfg.index_type) {
2713                         cfg.indices = indices;
2714                         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;
2715                 }
2716         }
2717 
2718         enum pipe_prim_type prim = u_reduced_prim(info->mode);
2719         bool polygon = (prim == PIPE_PRIM_TRIANGLES);
2720         void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
2721 
2722 #if PAN_ARCH >= 6
2723         pan_section_pack(job, TILER_JOB, TILER, cfg) {
2724                 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
2725         }
2726 
2727         pan_section_pack(job, TILER_JOB, PADDING, cfg);
2728 #endif
2729 
2730         section = pan_section_ptr(job, TILER_JOB, DRAW);
2731         pan_pack(section, DRAW, cfg) {
2732                 cfg.four_components_per_vertex = true;
2733                 cfg.draw_descriptor_is_64b = true;
2734                 cfg.front_face_ccw = rast->front_ccw;
2735 
2736                 /*
2737                  * From the Gallium documentation,
2738                  * pipe_rasterizer_state::cull_face "indicates which faces of
2739                  * polygons to cull". Points and lines are not considered
2740                  * polygons and should be drawn even if all faces are culled.
2741                  * The hardware does not take primitive type into account when
2742                  * culling, so we need to do that check ourselves.
2743                  */
2744                 cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
2745                 cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
2746                 cfg.position = pos;
2747                 cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
2748                 cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
2749                 cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
2750                 cfg.viewport = batch->viewport;
2751                 cfg.varyings = fs_vary;
2752                 cfg.varying_buffers = fs_vary ? varyings : 0;
2753                 cfg.thread_storage = batch->tls.gpu;
2754 
2755                 /* For all primitives but lines DRAW.flat_shading_vertex must
2756                  * be set to 0 and the provoking vertex is selected with the
2757                  * PRIMITIVE.first_provoking_vertex field.
2758                  */
2759                 if (prim == PIPE_PRIM_LINES) {
2760                         /* The logic is inverted across arches. */
2761                         cfg.flat_shading_vertex = rast->flatshade_first
2762                                                 ^ (PAN_ARCH <= 5);
2763                 }
2764 
2765                 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
2766 
2767                 if (ctx->occlusion_query && ctx->active_queries) {
2768                         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
2769                                 cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
2770                         else
2771                                 cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
2772 
2773                         struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
2774                         cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
2775                         panfrost_batch_write_rsrc(ctx->batch, rsrc,
2776                                               PIPE_SHADER_FRAGMENT);
2777                 }
2778         }
2779 
2780         panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size);
2781 }
2782 
2783 static void
2784 panfrost_direct_draw(struct panfrost_batch *batch,
2785                      const struct pipe_draw_info *info,
2786                      unsigned drawid_offset,
2787                      const struct pipe_draw_start_count_bias *draw)
2788 {
2789         if (!draw->count || !info->instance_count)
2790                 return;
2791 
2792         struct panfrost_context *ctx = batch->ctx;
2793 
2794         /* Take into account a negative bias */
2795         ctx->indirect_draw = false;
2796         ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0);
2797         ctx->instance_count = info->instance_count;
2798         ctx->base_vertex = info->index_size ? draw->index_bias : 0;
2799         ctx->base_instance = info->start_instance;
2800         ctx->active_prim = info->mode;
2801         ctx->drawid = drawid_offset;
2802 
2803         struct panfrost_ptr tiler =
2804                 pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
2805         struct panfrost_ptr vertex =
2806                 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
2807 
2808         unsigned vertex_count = ctx->vertex_count;
2809 
2810         unsigned min_index = 0, max_index = 0;
2811         mali_ptr indices = 0;
2812 
2813         if (info->index_size) {
2814                 indices = panfrost_get_index_buffer_bounded(batch, info, draw,
2815                                                             &min_index,
2816                                                             &max_index);
2817 
2818                 /* Use the corresponding values */
2819                 vertex_count = max_index - min_index + 1;
2820                 ctx->offset_start = min_index + draw->index_bias;
2821         } else {
2822                 ctx->offset_start = draw->start;
2823         }
2824 
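        /* Mali addresses per-instance attributes with a padded vertex count
         * so the divisor hardware can use a cheap encoding; padding is only
         * needed when more than one instance is drawn. */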
2825         if (info->instance_count > 1)
2826                 ctx->padded_count = panfrost_padded_vertex_count(vertex_count);
2827         else
2828                 ctx->padded_count = vertex_count;
2829 
2830         panfrost_statistics_record(ctx, info, draw);
2831 
2832         struct mali_invocation_packed invocation;
2833         if (info->instance_count > 1) {
2834                 panfrost_pack_work_groups_compute(&invocation,
2835                                                   1, vertex_count, info->instance_count,
2836                                                   1, 1, 1, true, false);
2837         } else {
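                /* Non-instanced: a flat 1D invocation range over the vertex
                 * count (MALI_POSITIVE stores count - 1), with all the
                 * workgroup split fields left trivial. */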
2838                 pan_pack(&invocation, INVOCATION, cfg) {
2839                         cfg.invocations = MALI_POSITIVE(vertex_count);
2840                         cfg.size_y_shift = 0;
2841                         cfg.size_z_shift = 0;
2842                         cfg.workgroups_x_shift = 0;
2843                         cfg.workgroups_y_shift = 0;
2844                         cfg.workgroups_z_shift = 32;
2845                         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
2846                 }
2847         }
2848 
2849         /* Emit all sorts of descriptors. */
2850         mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
2851 
2852         panfrost_emit_varying_descriptor(batch,
2853                                          ctx->padded_count *
2854                                          ctx->instance_count,
2855                                          &vs_vary, &fs_vary, &varyings,
2856                                          NULL, &pos, &psiz,
2857                                          info->mode == PIPE_PRIM_POINTS);
2858 
2859         mali_ptr attribs, attrib_bufs;
2860         attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
2861 
2862         panfrost_update_state_3d(batch);
2863         panfrost_update_state_vs(batch);
2864         panfrost_update_state_fs(batch);
2865         panfrost_clean_state_3d(ctx);
2866 
2867         /* Fire off the draw itself */
2868         panfrost_draw_emit_vertex(batch, info, &invocation,
2869                                   vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
2870         panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
2871                                  fs_vary, varyings, pos, psiz, tiler.cpu);
2872         panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
2873 
2874         /* Increment transform feedback offsets */
2875         panfrost_update_streamout_offsets(ctx);
2876 }
2877 
2878 static void
2879 panfrost_indirect_draw(struct panfrost_batch *batch,
2880                        const struct pipe_draw_info *info,
2881                        unsigned drawid_offset,
2882                        const struct pipe_draw_indirect_info *indirect,
2883                        const struct pipe_draw_start_count_bias *draw)
2884 {
2885         /* Indirect draw count and multi-draw not supported. */
2886         assert(indirect->draw_count == 1 && !indirect->indirect_draw_count);
2887 
2888         struct panfrost_context *ctx = batch->ctx;
2889         struct panfrost_device *dev = pan_device(ctx->base.screen);
2890 
2891         /* TODO: update statistics (see panfrost_statistics_record()) */
2892         /* TODO: Increment transform feedback offsets */
2893         assert(ctx->streamout.num_targets == 0);
2894 
2895         ctx->active_prim = info->mode;
2896         ctx->drawid = drawid_offset;
2897         ctx->indirect_draw = true;
2898 
2899         struct panfrost_ptr tiler =
2900                 pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
2901         struct panfrost_ptr vertex =
2902                 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
2903 
2904         struct panfrost_shader_state *vs =
2905                 panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2906 
2907         struct panfrost_bo *index_buf = NULL;
2908 
2909         if (info->index_size) {
2910                 assert(!info->has_user_indices);
2911                 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
2912                 index_buf = rsrc->image.data.bo;
2913                 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2914         }
2915 
2916         mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
2917         unsigned varying_buf_count;
2918 
2919         /* We want to create templates, so set all count fields to 0 to
2920          * reflect that.
2921          */
2922         ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0;
2923         ctx->offset_start = 0;
2924 
2925         /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the
2926          * vertex shader uses gl_VertexID or gl_BaseVertex.
2927          */
2928         ctx->first_vertex_sysval_ptr = 0;
2929         ctx->base_vertex_sysval_ptr = 0;
2930         ctx->base_instance_sysval_ptr = 0;
2931 
2932         panfrost_update_state_3d(batch);
2933         panfrost_update_state_vs(batch);
2934         panfrost_update_state_fs(batch);
2935         panfrost_clean_state_3d(ctx);
2936 
2937         bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS);
2938 
2939         panfrost_emit_varying_descriptor(batch, 0,
2940                                          &vs_vary, &fs_vary, &varyings,
2941                                          &varying_buf_count, &pos, &psiz,
2942                                          point_coord_replace);
2943 
2944         mali_ptr attribs, attrib_bufs;
2945         attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
2946 
2947         /* Zeroed invocation; the compute job will update it. */
2948         static struct mali_invocation_packed invocation;
2949 
2950         /* Fire off the draw itself */
2951         panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings,
2952                                   attribs, attrib_bufs, vertex.cpu);
2953         panfrost_draw_emit_tiler(batch, info, draw, &invocation,
2954                                  index_buf ? index_buf->ptr.gpu : 0,
2955                                  fs_vary, varyings, pos, psiz, tiler.cpu);
2956 
2957         /* Add the varying heap BO to the batch if we're allocating varyings. */
2958         if (varyings) {
2959                 panfrost_batch_add_bo(batch,
2960                                       dev->indirect_draw_shaders.varying_heap,
2961                                       PIPE_SHADER_VERTEX);
2962         }
2963 
2964         assert(indirect->buffer);
2965 
2966         struct panfrost_resource *draw_buf = pan_resource(indirect->buffer);
2967 
2968         /* Don't count images: those attributes don't need to be patched. */
2969         unsigned attrib_count =
2970                 vs->info.attribute_count -
2971                 util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]);
2972 
2973         panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX);
2974 
2975         struct pan_indirect_draw_info draw_info = {
2976                 .last_indirect_draw = batch->indirect_draw_job_id,
2977                 .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset,
2978                 .index_buf = index_buf ? index_buf->ptr.gpu : 0,
2979                 .first_vertex_sysval = ctx->first_vertex_sysval_ptr,
2980                 .base_vertex_sysval = ctx->base_vertex_sysval_ptr,
2981                 .base_instance_sysval = ctx->base_instance_sysval_ptr,
2982                 .vertex_job = vertex.gpu,
2983                 .tiler_job = tiler.gpu,
2984                 .attrib_bufs = attrib_bufs,
2985                 .attribs = attribs,
2986                 .attrib_count = attrib_count,
2987                 .varying_bufs = varyings,
2988                 .index_size = info->index_size,
2989         };
2990 
2991         if (panfrost_writes_point_size(ctx))
2992                 draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE;
2993 
2994         if (vs->info.vs.writes_point_size)
2995                 draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ;
2996 
2997 
2998         if (info->primitive_restart) {
2999                 draw_info.restart_index = info->restart_index;
3000                 draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART;
3001         }
3002 
3003         batch->indirect_draw_job_id =
3004                 GENX(panfrost_emit_indirect_draw)(&batch->pool.base,
3005                                                   &batch->scoreboard,
3006                                                   &draw_info,
3007                                                   &batch->indirect_draw_ctx);
3008 
3009         panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3010 }
3011 
3012 static void
3013 panfrost_draw_vbo(struct pipe_context *pipe,
3014                   const struct pipe_draw_info *info,
3015                   unsigned drawid_offset,
3016                   const struct pipe_draw_indirect_info *indirect,
3017                   const struct pipe_draw_start_count_bias *draws,
3018                   unsigned num_draws)
3019 {
3020         struct panfrost_context *ctx = pan_context(pipe);
3021         struct panfrost_device *dev = pan_device(pipe->screen);
3022 
3023         if (!panfrost_render_condition_check(ctx))
3024                 return;
3025 
3026         /* Emulate indirect draws unless we're using the experimental path */
3027         if (!(dev->debug & PAN_DBG_INDIRECT) && indirect && indirect->buffer) {
3028                 assert(num_draws == 1);
3029                 util_draw_indirect(pipe, info, indirect);
3030                 return;
3031         }
3032 
3033         /* Do some common setup */
3034         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3035 
3036         /* Don't add too many jobs to a single batch. Hardware has a hard limit
3037          * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to
3038          * avoid the risk of timeouts. This might not be a good idea. */
3039         if (unlikely(batch->scoreboard.job_index > 10000))
3040                 batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
3041 
3042         unsigned zs_draws = ctx->depth_stencil->draws;
3043         batch->draws |= zs_draws;
3044         batch->resolve |= zs_draws;
3045 
3046         /* Mark everything dirty when debugging */
3047         if (unlikely(dev->debug & PAN_DBG_DIRTY))
3048                 panfrost_dirty_state_all(ctx);
3049 
3050         /* Conservatively assume draw parameters always change */
3051         ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3052 
3053         if (indirect) {
3054                 assert(num_draws == 1);
3055 
3056                 if (indirect->count_from_stream_output) {
3057                         struct pipe_draw_start_count_bias tmp_draw = *draws;
3058                         struct panfrost_streamout_target *so =
3059                                 pan_so_target(indirect->count_from_stream_output);
3060 
3061                         tmp_draw.start = 0;
3062                         tmp_draw.count = so->offset;
3063                         tmp_draw.index_bias = 0;
3064                         panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw);
3065                         return;
3066                 }
3067 
3068                 panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]);
3069                 return;
3070         }
3071 
3072         struct pipe_draw_info tmp_info = *info;
3073         unsigned drawid = drawid_offset;
3074 
3075         for (unsigned i = 0; i < num_draws; i++) {
3076                 panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);
3077 
3078                 if (tmp_info.increment_draw_id) {
3079                         ctx->dirty |= PAN_DIRTY_DRAWID;
3080                         drawid++;
3081                 }
3082         }
3083 
3084 }
3085 
3086 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
3087  * construct the COMPUTE job and some of its payload.
3088  */
3089 
3090 static void
3091 panfrost_launch_grid(struct pipe_context *pipe,
3092                 const struct pipe_grid_info *info)
3093 {
3094         struct panfrost_context *ctx = pan_context(pipe);
3095 
3096         /* XXX - shouldn't be necessary with working memory barriers. Affected
3097          * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
3098         panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
3099 
3100         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3101 
3102         struct panfrost_shader_state *cs =
3103                 &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
3104 
3105         /* Indirect dispatch can't handle workgroup local storage since that
3106          * would require dynamic memory allocation. Bail in this case. */
3107         if (info->indirect && !cs->info.wls_size) {
3108                 struct pipe_transfer *transfer;
3109                 uint32_t *params = pipe_buffer_map_range(pipe, info->indirect,
3110                                 info->indirect_offset,
3111                                 3 * sizeof(uint32_t),
3112                                 PIPE_MAP_READ,
3113                                 &transfer);
3114 
3115                 struct pipe_grid_info direct = *info;
3116                 direct.indirect = NULL;
3117                 direct.grid[0] = params[0];
3118                 direct.grid[1] = params[1];
3119                 direct.grid[2] = params[2];
3120                 pipe_buffer_unmap(pipe, transfer);
3121 
3122                 if (params[0] && params[1] && params[2])
3123                         panfrost_launch_grid(pipe, &direct);
3124 
3125                 return;
3126         }
3127 
3128         ctx->compute_grid = info;
3129 
3130         struct panfrost_ptr t =
3131                 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3132 
3133         /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so
3134          * reuse the graphics path for this by lowering to Gallium */
3135 
3136         struct pipe_constant_buffer ubuf = {
3137                 .buffer = NULL,
3138                 .buffer_offset = 0,
3139                 .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->cbase.req_input_mem,
3140                 .user_buffer = info->input
3141         };
3142 
3143         if (info->input)
3144                 pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf);
3145 
3146         /* Invoke according to the grid info */
3147 
3148         void *invocation =
3149                 pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION);
3150         unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] };
3151 
3152         if (info->indirect)
3153                 num_wg[0] = num_wg[1] = num_wg[2] = 1;
3154 
3155         panfrost_pack_work_groups_compute(invocation,
3156                                           num_wg[0], num_wg[1], num_wg[2],
3157                                           info->block[0], info->block[1],
3158                                           info->block[2],
3159                                           false, info->indirect != NULL);
3160 
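        /* Derive the task split hint from the workgroup dimensions (compare
         * the fixed values of 5 and 6 used for vertex and tiler jobs). */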
3161         pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
3162                 cfg.job_task_split =
3163                         util_logbase2_ceil(info->block[0] + 1) +
3164                         util_logbase2_ceil(info->block[1] + 1) +
3165                         util_logbase2_ceil(info->block[2] + 1);
3166         }
3167 
3168         pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
3169                 cfg.draw_descriptor_is_64b = true;
3170                 cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE);
3171                 cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
3172                 cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
3173                 cfg.uniform_buffers = panfrost_emit_const_buf(batch,
3174                                 PIPE_SHADER_COMPUTE, &cfg.push_uniforms);
3175                 cfg.textures = panfrost_emit_texture_descriptors(batch,
3176                                 PIPE_SHADER_COMPUTE);
3177                 cfg.samplers = panfrost_emit_sampler_descriptors(batch,
3178                                 PIPE_SHADER_COMPUTE);
3179         }
3180 
3181         unsigned indirect_dep = 0;
3182         if (info->indirect) {
3183                 struct pan_indirect_dispatch_info indirect = {
3184                         .job = t.gpu,
3185                         .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
3186                                         info->indirect_offset,
3187                         .num_wg_sysval = {
3188                                 batch->num_wg_sysval[0],
3189                                 batch->num_wg_sysval[1],
3190                                 batch->num_wg_sysval[2],
3191                         },
3192                 };
3193 
3194                 indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base,
3195                                                                 &batch->scoreboard,
3196                                                                 &indirect);
3197         }
3198 
3199         panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3200                          MALI_JOB_TYPE_COMPUTE, true, false,
3201                          indirect_dep, 0, &t, false);
3202         panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
3203 }
3204 
3205 static void *
3206 panfrost_create_rasterizer_state(
3207         struct pipe_context *pctx,
3208         const struct pipe_rasterizer_state *cso)
3209 {
3210         struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
3211 
3212         so->base = *cso;
3213 
3214         /* Guaranteed with the core GL call, so don't expose ARB_polygon_offset */
3215         assert(cso->offset_clamp == 0.0);
3216 
3217         pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
3218                 cfg.multisample_enable = cso->multisample;
3219                 cfg.fixed_function_near_discard = cso->depth_clip_near;
3220                 cfg.fixed_function_far_discard = cso->depth_clip_far;
3221                 cfg.shader_depth_range_fixed = true;
3222         }
3223 
3224         pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
3225                 cfg.depth_range_1 = cso->offset_tri;
3226                 cfg.depth_range_2 = cso->offset_tri;
3227                 cfg.single_sampled_lines = !cso->multisample;
3228         }
3229 
3230         return so;
3231 }
3232 
3233 /* Assigns a vertex buffer for a given (index, divisor) tuple */
3234 
3235 static unsigned
3236 pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers,
3237                          unsigned *nr_bufs,
3238                          unsigned vbi,
3239                          unsigned divisor)
3240 {
3241         /* Look up the buffer */
3242         for (unsigned i = 0; i < (*nr_bufs); ++i) {
3243                 if (buffers[i].vbi == vbi && buffers[i].divisor == divisor)
3244                         return i;
3245         }
3246 
3247         /* Else, create a new buffer */
3248         unsigned idx = (*nr_bufs)++;
3249 
3250         buffers[idx] = (struct pan_vertex_buffer) {
3251                 .vbi = vbi,
3252                 .divisor = divisor
3253         };
3254 
3255         return idx;
3256 }
3257 
3258 static void *
3259 panfrost_create_vertex_elements_state(
3260         struct pipe_context *pctx,
3261         unsigned num_elements,
3262         const struct pipe_vertex_element *elements)
3263 {
3264         struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
3265         struct panfrost_device *dev = pan_device(pctx->screen);
3266 
3267         so->num_elements = num_elements;
3268         memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
3269 
3270         /* Assign attribute buffers corresponding to the vertex buffers, keyed
3271          * for a particular divisor since that's how instancing works on Mali */
3272         for (unsigned i = 0; i < num_elements; ++i) {
3273                 so->element_buffer[i] = pan_assign_vertex_buffer(
3274                                 so->buffers, &so->nr_bufs,
3275                                 elements[i].vertex_buffer_index,
3276                                 elements[i].instance_divisor);
3277         }
3278 
3279         for (int i = 0; i < num_elements; ++i) {
3280                 enum pipe_format fmt = elements[i].src_format;
3281                 const struct util_format_description *desc = util_format_description(fmt);
3282                 so->formats[i] = dev->formats[desc->format].hw;
3283                 assert(so->formats[i]);
3284         }
3285 
3286         /* Let's also prepare vertex builtins */
3287         so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
3288         so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
3289 
3290         return so;
3291 }
3292 
3293 static inline unsigned
3294 pan_pipe_to_stencil_op(enum pipe_stencil_op in)
3295 {
3296         switch (in) {
3297         case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP;
3298         case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO;
3299         case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE;
3300         case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT;
3301         case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT;
3302         case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;
3303         case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;
3304         case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT;
3305         default: unreachable("Invalid stencil op");
3306         }
3307 }
3308 
3309 static inline void
3310 pan_pipe_to_stencil(const struct pipe_stencil_state *in,
3311                     struct mali_stencil_packed *out)
3312 {
3313         pan_pack(out, STENCIL, s) {
3314                 s.mask = in->valuemask;
3315                 s.compare_function = (enum mali_func) in->func;
3316                 s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
3317                 s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
3318                 s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
3319         }
3320 }
3321 
3322 static void *
3323 panfrost_create_depth_stencil_state(struct pipe_context *pipe,
3324                                     const struct pipe_depth_stencil_alpha_state *zsa)
3325 {
3326         struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
3327         so->base = *zsa;
3328 
3329         /* Normalize (there's no separate enable) */
3330         if (!zsa->alpha_enabled)
3331                 so->base.alpha_func = MALI_FUNC_ALWAYS;
3332 
3333         /* Prepack relevant parts of the Renderer State Descriptor. They will
3334          * be ORed in at draw-time */
3335         pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
3336                 cfg.depth_function = zsa->depth_enabled ?
3337                         (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;
3338 
3339                 cfg.depth_write_mask = zsa->depth_writemask;
3340         }
3341 
3342         pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
3343                 cfg.stencil_enable = zsa->stencil[0].enabled;
3344 
3345                 cfg.stencil_mask_front = zsa->stencil[0].writemask;
3346                 cfg.stencil_mask_back = zsa->stencil[1].enabled ?
3347                         zsa->stencil[1].writemask : zsa->stencil[0].writemask;
3348 
3349 #if PAN_ARCH <= 5
3350                 cfg.alpha_test_compare_function =
3351                         (enum mali_func) so->base.alpha_func;
3352 #endif
3353         }
3354 
3355         /* Stencil tests have their own words in the RSD */
3356         pan_pipe_to_stencil(&zsa->stencil[0], &so->stencil_front);
3357 
3358         if (zsa->stencil[1].enabled)
3359                 pan_pipe_to_stencil(&zsa->stencil[1], &so->stencil_back);
3360         else
3361                 so->stencil_back = so->stencil_front;
3362 
3363         so->enabled = zsa->stencil[0].enabled ||
3364                 (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
3365 
3366         /* Write masks need tracking together */
3367         if (zsa->depth_writemask)
3368                 so->draws |= PIPE_CLEAR_DEPTH;
3369 
3370         if (zsa->stencil[0].enabled)
3371                 so->draws |= PIPE_CLEAR_STENCIL;
3372 
3373         /* TODO: Bounds test should be easy */
3374         assert(!zsa->depth_bounds_test);
3375 
3376         return so;
3377 }
3378 
3379 static struct pipe_sampler_view *
3380 panfrost_create_sampler_view(
3381         struct pipe_context *pctx,
3382         struct pipe_resource *texture,
3383         const struct pipe_sampler_view *template)
3384 {
3385         struct panfrost_context *ctx = pan_context(pctx);
3386         struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);
3387 
3388         pan_legalize_afbc_format(ctx, pan_resource(texture), template->format);
3389 
3390         pipe_reference(NULL, &texture->reference);
3391 
3392         so->base = *template;
3393         so->base.texture = texture;
3394         so->base.reference.count = 1;
3395         so->base.context = pctx;
3396 
3397         panfrost_create_sampler_view_bo(so, pctx, texture);
3398 
3399         return (struct pipe_sampler_view *) so;
3400 }
3401 
3402 /* A given Gallium blend state can be encoded to the hardware in numerous,
3403  * dramatically divergent ways due to the interactions of blending with
3404  * framebuffer formats. Conceptually, there are two modes:
3405  *
3406  * - Fixed-function blending (for suitable framebuffer formats, suitable blend
3407  *   state, and suitable blend constant)
3408  *
3409  * - Blend shaders (for everything else)
3410  *
3411  * A given Gallium blend configuration will compile to exactly one
3412  * fixed-function blend state, if it compiles to any, although the constant
3413  * will vary across runs as that is tracked outside of the Gallium CSO.
3414  *
3415  * However, that same blend configuration will compile to many different blend
3416  * shaders, depending on the framebuffer formats active. The rationale is that
3417  * blend shaders override not just fixed-function blending but also
3418  * fixed-function format conversion, so blend shaders are keyed to a particular
3419  * framebuffer format. As an example, the tilebuffer format is identical for
3420  * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
3421  * blend shaders.
3422  *
3423  * All of this state is encapsulated in the panfrost_blend_state struct
3424  * (our subclass of pipe_blend_state).
3425  */
3426 
3427 /* Create a blend CSO. Essentially, try to compile a fixed-function
3428  * expression and initialize blend shaders */
3429 
3430 static void *
3431 panfrost_create_blend_state(struct pipe_context *pipe,
3432                             const struct pipe_blend_state *blend)
3433 {
3434         struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
3435         so->base = *blend;
3436 
3437         so->pan.logicop_enable = blend->logicop_enable;
3438         so->pan.logicop_func = blend->logicop_func;
3439         so->pan.rt_count = blend->max_rt + 1;
3440 
3441         for (unsigned c = 0; c < so->pan.rt_count; ++c) {
3442                 unsigned g = blend->independent_blend_enable ? c : 0;
3443                 const struct pipe_rt_blend_state pipe = blend->rt[g];
3444                 struct pan_blend_equation equation = {0};
3445 
3446                 equation.color_mask = pipe.colormask;
3447                 equation.blend_enable = pipe.blend_enable;
3448 
3449                 if (pipe.blend_enable) {
3450                         equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);
3451                         equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor);
3452                         equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor);
3453                         equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor);
3454                         equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor);
3455                         equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);
3456                         equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor);
3457                         equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor);
3458                         equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor);
3459                         equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor);
3460                 }
3461 
3462                 /* Determine some common properties */
3463                 unsigned constant_mask = pan_blend_constant_mask(equation);
3464                 const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
3465                 so->info[c] = (struct pan_blend_info) {
3466                         .no_colour = (equation.color_mask == 0),
3467                         .opaque = pan_blend_is_opaque(equation),
3468                         .constant_mask = constant_mask,
3469 
3470                         /* TODO: check the dest for the logicop */
3471                         .load_dest = blend->logicop_enable ||
3472                                 pan_blend_reads_dest(equation),
3473 
3474                         /* Could this possibly be fixed-function? */
3475                         .fixed_function = !blend->logicop_enable &&
3476                                 pan_blend_can_fixed_function(equation,
3477                                                              supports_2src) &&
3478                                 (!constant_mask ||
3479                                  pan_blend_supports_constant(PAN_ARCH, c))
3480                 };
3481 
3482                 so->pan.rts[c].equation = equation;
3483 
3484                 /* Bifrost needs to know if any render target loads its
3485                  * destination in the hot draw path, so precompute this */
3486                 if (so->info[c].load_dest)
3487                         so->load_dest_mask |= BITFIELD_BIT(c);
3488 
3489                 /* Converting equations to Mali style is expensive, so do it at
3490                  * CSO create time instead of draw-time */
3491                 if (so->info[c].fixed_function) {
3492                         so->equation[c] = pan_pack_blend(equation);
3493                 }
3494         }
3495 
3496         return so;
3497 }
3498 
3499 static void
3500 prepare_rsd(struct panfrost_shader_state *state,
3501             struct panfrost_pool *pool, bool upload)
3502 {
3503         struct mali_renderer_state_packed *out =
3504                 (struct mali_renderer_state_packed *)&state->partial_rsd;
3505 
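        /* The packed words live in the shader state so they can be combined
         * with other state later; when upload is set, the descriptor is also
         * written to the pool now and a reference kept for reuse. */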
3506         if (upload) {
3507                 struct panfrost_ptr ptr =
3508                         pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
3509 
3510                 state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3511                 out = ptr.cpu;
3512         }
3513 
3514         pan_pack(out, RENDERER_STATE, cfg) {
3515                 pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
3516         }
3517 }
3518 
3519 static void
3520 panfrost_get_sample_position(struct pipe_context *context,
3521                              unsigned sample_count,
3522                              unsigned sample_index,
3523                              float *out_value)
3524 {
3525         panfrost_query_sample_position(
3526                         panfrost_sample_pattern(sample_count),
3527                         sample_index,
3528                         out_value);
3529 }
3530 
3531 static void
3532 screen_destroy(struct pipe_screen *pscreen)
3533 {
3534         struct panfrost_device *dev = pan_device(pscreen);
3535         GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
3536         GENX(pan_indirect_dispatch_cleanup)(dev);
3537         GENX(pan_blitter_cleanup)(dev);
3538 }
3539 
3540 static void
3541 preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
3542 {
3543         GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
3544                              PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
3545 }
3546 
3547 static void
3548 init_batch(struct panfrost_batch *batch)
3549 {
3550         /* Reserve the framebuffer and local storage descriptors */
3551         batch->framebuffer =
3552 #if PAN_ARCH == 4
3553                 pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
3554 #else
3555                 pan_pool_alloc_desc_aggregate(&batch->pool.base,
3556                                               PAN_DESC(FRAMEBUFFER),
3557                                               PAN_DESC(ZS_CRC_EXTENSION),
3558                                               PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
3559 
3560                 batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD;
3561 #endif
3562 
3563 #if PAN_ARCH >= 6
3564         batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
3565 #else
3566         /* On Midgard, the TLS is embedded in the FB descriptor */
3567         batch->tls = batch->framebuffer;
3568 #endif
3569 }
3570 
3571 static void
3572 panfrost_sampler_view_destroy(
3573         struct pipe_context *pctx,
3574         struct pipe_sampler_view *pview)
3575 {
3576         struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview;
3577 
3578         pipe_resource_reference(&pview->texture, NULL);
3579         panfrost_bo_unreference(view->state.bo);
3580         ralloc_free(view);
3581 }
3582 
3583 static void
3584 context_init(struct pipe_context *pipe)
3585 {
3586         pipe->draw_vbo           = panfrost_draw_vbo;
3587         pipe->launch_grid        = panfrost_launch_grid;
3588 
3589         pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
3590         pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
3591         pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
3592         pipe->create_sampler_view = panfrost_create_sampler_view;
3593         pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
3594         pipe->create_sampler_state = panfrost_create_sampler_state;
3595         pipe->create_blend_state = panfrost_create_blend_state;
3596 
3597         pipe->get_sample_position = panfrost_get_sample_position;
3598 }
3599 
#if PAN_ARCH <= 5

/* Returns the polygon list's GPU address if available, or otherwise allocates
 * it. Allocating/freeing the BO directly is perfectly fast, since we'll hit
 * the BO cache and this is one-per-batch anyway. */

static mali_ptr
batch_get_polygon_list(struct panfrost_batch *batch)
{
        struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

        if (!batch->tiler_ctx.midgard.polygon_list) {
                bool has_draws = batch->scoreboard.first_tiler != NULL;
                unsigned size =
                        panfrost_tiler_get_polygon_list_size(dev,
                                                             batch->key.width,
                                                             batch->key.height,
                                                             has_draws);
                size = util_next_power_of_two(size);

                /* Create the BO as invisible if we can. In the non-hierarchical
                 * tiler case, we need to write the polygon list manually
                 * because there's no WRITE_VALUE job in the chain (maybe we
                 * should add one...). */
                bool init_polygon_list = !has_draws && (dev->quirks & MIDGARD_NO_HIER_TILING);
                batch->tiler_ctx.midgard.polygon_list =
                        panfrost_batch_create_bo(batch, size,
                                                 init_polygon_list ? 0 : PAN_BO_INVISIBLE,
                                                 PIPE_SHADER_VERTEX,
                                                 "Polygon list");
                panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list,
                                PIPE_SHADER_FRAGMENT);

                if (init_polygon_list) {
                        assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu);
                        uint32_t *polygon_list_body =
                                batch->tiler_ctx.midgard.polygon_list->ptr.cpu +
                                MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;

                        /* Magic for Mali T720 */
                        polygon_list_body[0] = 0xa0000000;
                }

                batch->tiler_ctx.midgard.disable = !has_draws;
        }

        return batch->tiler_ctx.midgard.polygon_list->ptr.gpu;
}
#endif

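/* Vtbl hook: on Midgard (v4/v5), lazily allocate the polygon list and point
 * the tiler scoreboard at it. On v6+ this is a no-op, since the polygon list
 * is superseded by the Bifrost tiler heap. */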
static void
init_polygon_list(struct panfrost_batch *batch)
{
#if PAN_ARCH <= 5
        mali_ptr polygon_list = batch_get_polygon_list(batch);
        panfrost_scoreboard_initialize_tiler(&batch->pool.base,
                                             &batch->scoreboard,
                                             polygon_list);
#endif
}

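/* Entry point for this file: register the per-generation command-stream
 * callbacks with the screen. This translation unit is compiled once per
 * supported GPU generation, with GENX() expanding to a generation-suffixed
 * symbol (illustratively, something like GENX(foo) -> foo_v7 when targeting
 * v7), so the common driver code can dispatch through screen->vtbl without
 * any generation-specific knowledge. */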
void
GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
{
        struct panfrost_device *dev = &screen->dev;

        screen->vtbl.prepare_rsd = prepare_rsd;
        screen->vtbl.emit_tls    = emit_tls;
        screen->vtbl.emit_fbd    = emit_fbd;
        screen->vtbl.emit_fragment_job = emit_fragment_job;
        screen->vtbl.screen_destroy = screen_destroy;
        screen->vtbl.preload     = preload;
        screen->vtbl.context_init = context_init;
        screen->vtbl.init_batch = init_batch;
        screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
        screen->vtbl.init_polygon_list = init_polygon_list;
        screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
        screen->vtbl.compile_shader = GENX(pan_shader_compile);

        GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base,
                               &screen->blitter.desc_pool.base);
        GENX(pan_indirect_dispatch_init)(dev);
        GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base);
}