/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "util/macros.h"
#include "util/u_prim.h"
#include "util/u_vbuf.h"
#include "util/u_helpers.h"
#include "util/u_draw.h"
#include "util/u_memory.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "gallium/auxiliary/util/u_blend.h"

#include "panfrost-quirks.h"
#include "genxml/gen_macros.h"

#include "pan_pool.h"
#include "pan_bo.h"
#include "pan_blend.h"
#include "pan_context.h"
#include "pan_job.h"
#include "pan_shader.h"
#include "pan_texture.h"
#include "pan_util.h"
#include "pan_indirect_draw.h"
#include "pan_indirect_dispatch.h"
#include "pan_blitter.h"

struct panfrost_rasterizer {
        struct pipe_rasterizer_state base;

        /* Partially packed RSD words */
        struct mali_multisample_misc_packed multisample;
        struct mali_stencil_mask_misc_packed stencil_misc;
};

struct panfrost_zsa_state {
        struct pipe_depth_stencil_alpha_state base;

        /* Is any depth, stencil, or alpha testing enabled? */
        bool enabled;

        /* Mask of PIPE_CLEAR_{DEPTH,STENCIL} written */
        unsigned draws;

        /* Prepacked words from the RSD */
        struct mali_multisample_misc_packed rsd_depth;
        struct mali_stencil_mask_misc_packed rsd_stencil;
        struct mali_stencil_packed stencil_front, stencil_back;
};

struct panfrost_sampler_state {
        struct pipe_sampler_state base;
        struct mali_sampler_packed hw;
};

/* Misnomer: Sampler view corresponds to textures, not samplers */

struct panfrost_sampler_view {
        struct pipe_sampler_view base;
        struct panfrost_pool_ref state;
        struct mali_texture_packed bifrost_descriptor;
        mali_ptr texture_bo;
        uint64_t modifier;
};

/* Statically assert that PIPE_* enums match the hardware enums.
 * (As long as they match, we don't need to translate them.)
 */
UNUSED static void
pan_pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
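        /* Each PIPE_ASSERT argument is an equality between a Gallium enum
         * and a Mali enum, checked at compile time; a failure here means
         * the values diverged and a real translation function is needed */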

        /* Compare functions are natural in both Gallium and Mali */
        PIPE_ASSERT(PIPE_FUNC_NEVER == MALI_FUNC_NEVER);
        PIPE_ASSERT(PIPE_FUNC_LESS == MALI_FUNC_LESS);
        PIPE_ASSERT(PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL);
        PIPE_ASSERT(PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL);
        PIPE_ASSERT(PIPE_FUNC_GREATER == MALI_FUNC_GREATER);
        PIPE_ASSERT(PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL);
        PIPE_ASSERT(PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL);
        PIPE_ASSERT(PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS);
}

static inline enum mali_sample_pattern
panfrost_sample_pattern(unsigned samples)
{
        switch (samples) {
        case 1: return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
        case 4: return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
        case 8: return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
        case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
        default: unreachable("Unsupported sample count");
        }
}

static unsigned
translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
{
        /* Bifrost doesn't support the GL_CLAMP wrap mode, so instead use
         * CLAMP_TO_EDGE and CLAMP_TO_BORDER. On Midgard, CLAMP is broken for
         * nearest filtering, so use CLAMP_TO_EDGE in that case. */

        switch (w) {
        case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
        case PIPE_TEX_WRAP_CLAMP:
                return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :
#if PAN_ARCH <= 5
                       MALI_WRAP_MODE_CLAMP;
#else
                       MALI_WRAP_MODE_CLAMP_TO_BORDER;
#endif
        case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
        case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
        case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
        case PIPE_TEX_WRAP_MIRROR_CLAMP:
                return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :
#if PAN_ARCH <= 5
                       MALI_WRAP_MODE_MIRRORED_CLAMP;
#else
                       MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
#endif
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
        default: unreachable("Invalid wrap");
        }
}

/* The hardware compares in the wrong order, so we have to flip before
 * encoding. Yes, really. */

static enum mali_func
panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
{
        return !cso->compare_mode ? MALI_FUNC_NEVER :
               panfrost_flip_compare_func((enum mali_func) cso->compare_func);
}

static enum mali_mipmap_mode
pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
{
        switch (f) {
        case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
        case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
#if PAN_ARCH >= 6
        case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
#else
        case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST;
#endif
        default: unreachable("Invalid");
        }
}


static void *
panfrost_create_sampler_state(
        struct pipe_context *pctx,
        const struct pipe_sampler_state *cso)
{
        struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
        so->base = *cso;

        bool using_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;

        pan_pack(&so->hw, SAMPLER, cfg) {
                cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
                cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;

                cfg.normalized_coordinates = cso->normalized_coords;
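                /* LOD fields appear to be 8:8 fixed point (cf. the 1/256
                 * epsilon below); the second FIXED_16 argument selects
                 * whether negative values are representable */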
                cfg.lod_bias = FIXED_16(cso->lod_bias, true);
                cfg.minimum_lod = FIXED_16(cso->min_lod, false);
                cfg.maximum_lod = FIXED_16(cso->max_lod, false);

                cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
                cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
                cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);

                cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
                cfg.compare_function = panfrost_sampler_compare_func(cso);
                cfg.seamless_cube_map = cso->seamless_cube_map;

                cfg.border_color_r = cso->border_color.ui[0];
                cfg.border_color_g = cso->border_color.ui[1];
                cfg.border_color_b = cso->border_color.ui[2];
                cfg.border_color_a = cso->border_color.ui[3];

#if PAN_ARCH >= 6
                if (cso->max_anisotropy > 1) {
                        cfg.maximum_anisotropy = cso->max_anisotropy;
                        cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
                }
#else
                /* Emulate disabled mipmapping by clamping the LOD as tight as
                 * possible (from 0 to epsilon = 1/256) */
                if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                        cfg.maximum_lod = cfg.minimum_lod + 1;
#endif
        }

        return so;
}

static bool
panfrost_fs_required(
        struct panfrost_shader_state *fs,
        struct panfrost_blend_state *blend,
        struct pipe_framebuffer_state *state,
        const struct panfrost_zsa_state *zsa)
{
        /* If we generally have side effects. This includes use of discard,
         * which can affect the results of an occlusion query. */
        if (fs->info.fs.sidefx)
                return true;

        /* Using an empty FS requires early-z to be enabled, but alpha test
         * needs it disabled */
        if ((enum mali_func) zsa->base.alpha_func != MALI_FUNC_ALWAYS)
                return true;

        /* If colour is written we need to execute */
        for (unsigned i = 0; i < state->nr_cbufs; ++i) {
                if (state->cbufs[i] && !blend->info[i].no_colour)
                        return true;
        }

        /* If depth is written and not implied we need to execute.
         * TODO: Predicate on Z/S writes being enabled */
        return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);
}

#if PAN_ARCH >= 5
UNUSED static uint16_t
pack_blend_constant(enum pipe_format format, float cons)
{
        const struct util_format_description *format_desc =
                util_format_description(format);

        unsigned chan_size = 0;

        for (unsigned i = 0; i < format_desc->nr_channels; i++)
                chan_size = MAX2(format_desc->channel[i].size, chan_size);

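        /* E.g. an 8-bit UNORM target with cons = 0.5 gives unorm = 127
         * (0x7F), returned as 0x7F00: the constant left-aligned in the
         * 16-bit fixed-point field */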
        uint16_t unorm = (cons * ((1 << chan_size) - 1));
        return unorm << (16 - chan_size);
}

static void
panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)
{
        unsigned rt_count = batch->key.nr_cbufs;
        struct panfrost_context *ctx = batch->ctx;
        const struct panfrost_blend_state *so = ctx->blend;
        bool dithered = so->base.dither;

        /* Always have at least one render target for depth-only passes */
        for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
                struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));

                /* Disable blending for unbacked render targets */
                if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
                        pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
                                cfg.enable = false;
#if PAN_ARCH >= 6
                                cfg.internal.mode = MALI_BLEND_MODE_OFF;
#endif
                        }

                        continue;
                }

                struct pan_blend_info info = so->info[i];
                enum pipe_format format = batch->key.cbufs[i]->format;
                float cons = pan_blend_get_constant(info.constant_mask,
                                                    ctx->blend_color.color);

                /* Word 0: Flags and constant */
                pan_pack(packed, BLEND, cfg) {
                        cfg.srgb = util_format_is_srgb(format);
                        cfg.load_destination = info.load_dest;
                        cfg.round_to_fb_precision = !dithered;
                        cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
#if PAN_ARCH >= 6
                        cfg.constant = pack_blend_constant(format, cons);
#else
                        cfg.blend_shader = (blend_shaders[i] != 0);

                        if (blend_shaders[i])
                                cfg.shader_pc = blend_shaders[i];
                        else
                                cfg.constant = cons;
#endif
                }

                if (!blend_shaders[i]) {
                        /* Word 1: Blend Equation */
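                        /* The equation was prepacked when the blend CSO was
                         * created; on Midgard it lives at word 2 rather than
                         * word 1, hence the index below */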
                        STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
                        packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
                }

#if PAN_ARCH >= 6
                const struct panfrost_device *dev = pan_device(ctx->base.screen);
                struct panfrost_shader_state *fs =
                        panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

                /* Words 2 and 3: Internal blend */
                if (blend_shaders[i]) {
                        /* The blend shader's address needs to have the
                         * same top 32 bits as the fragment shader's.
                         * TODO: Ensure that's always the case.
                         */
                        assert(!fs->bin.bo ||
                               (blend_shaders[i] & (0xffffffffull << 32)) ==
                               (fs->bin.gpu & (0xffffffffull << 32)));

                        unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
                        assert(!(ret_offset & 0x7));

                        pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
                                cfg.mode = MALI_BLEND_MODE_SHADER;
                                cfg.shader.pc = (u32) blend_shaders[i];
                                cfg.shader.return_value = ret_offset ?
                                        fs->bin.gpu + ret_offset : 0;
                        }
                } else {
                        pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
                                cfg.mode = info.opaque ?
                                        MALI_BLEND_MODE_OPAQUE :
                                        MALI_BLEND_MODE_FIXED_FUNCTION;

                                /* If we want the conversion to work properly,
                                 * num_comps must be set to 4
                                 */
                                cfg.fixed_function.num_comps = 4;
                                cfg.fixed_function.conversion.memory_format =
                                        panfrost_format_to_bifrost_blend(dev, format, dithered);
                                cfg.fixed_function.conversion.register_format =
                                        fs->info.bifrost.blend[i].format;
                                cfg.fixed_function.rt = i;
                        }
                }
#endif
        }

        for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
                if (!so->info[i].no_colour && batch->key.cbufs[i]) {
                        batch->draws |= (PIPE_CLEAR_COLOR0 << i);
                        batch->resolve |= (PIPE_CLEAR_COLOR0 << i);
                }
        }
}
#endif

/* Construct a partial RSD corresponding to no executed fragment shader, and
 * merge with the existing partial RSD. */

static void
pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
{
        struct mali_renderer_state_packed empty_rsd;

        pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
#if PAN_ARCH >= 6
                cfg.properties.shader_modifies_coverage = true;
                cfg.properties.allow_forward_pixel_to_kill = true;
                cfg.properties.allow_forward_pixel_to_be_killed = true;
                cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
#else
                cfg.shader.shader = 0x1;
                cfg.properties.work_register_count = 1;
                cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
                cfg.properties.force_early_z = true;
#endif
        }

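        /* pan_merge ORs the packed descriptor words together, so the fields
         * set above combine with whatever the caller already packed */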
        pan_merge((*rsd), empty_rsd, RENDERER_STATE);
}

static void
panfrost_prepare_fs_state(struct panfrost_context *ctx,
                          mali_ptr *blend_shaders,
                          struct mali_renderer_state_packed *rsd)
{
        struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
        struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        struct panfrost_blend_state *so = ctx->blend;
        bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
        bool msaa = rast->multisample;

        unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;

        bool has_blend_shader = false;

        for (unsigned c = 0; c < rt_count; ++c)
                has_blend_shader |= (blend_shaders[c] != 0);

        pan_pack(rsd, RENDERER_STATE, cfg) {
                if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
#if PAN_ARCH >= 6
                        /* Track if any colour buffer is reused across draws, either
                         * from reading it directly, or from failing to write it */
                        unsigned rt_mask = ctx->fb_rt_mask;
                        uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0);
                        bool blend_reads_dest = (so->load_dest_mask & rt_mask);

                        cfg.properties.allow_forward_pixel_to_kill =
                                fs->info.fs.can_fpk &&
                                !(rt_mask & ~rt_written) &&
                                !alpha_to_coverage &&
                                !blend_reads_dest;
#else
                        cfg.properties.force_early_z =
                                fs->info.fs.can_early_z && !alpha_to_coverage &&
                                ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS);

                        /* TODO: Reduce this limit? */
                        if (has_blend_shader)
                                cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8);
                        else
                                cfg.properties.work_register_count = fs->info.work_reg_count;

                        /* Hardware quirks around early-zs forcing without a
                         * depth buffer. Note this breaks occlusion queries. */
                        bool has_oq = ctx->occlusion_query && ctx->active_queries;
                        bool force_ez_with_discard = !zsa->enabled && !has_oq;

                        cfg.properties.shader_reads_tilebuffer =
                                force_ez_with_discard && fs->info.fs.can_discard;
                        cfg.properties.shader_contains_discard =
                                !force_ez_with_discard && fs->info.fs.can_discard;
#endif
                }

#if PAN_ARCH == 4
                if (rt_count > 0) {
                        cfg.multisample_misc.load_destination = so->info[0].load_dest;
                        cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
                        cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour;
                        cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
                        cfg.stencil_mask_misc.dither_disable = !so->base.dither;
                        cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;

                        if (blend_shaders[0]) {
                                cfg.blend_shader = blend_shaders[0];
                        } else {
                                cfg.blend_constant = pan_blend_get_constant(
                                                so->info[0].constant_mask,
                                                ctx->blend_color.color);
                        }
                } else {
                        /* If there is no colour buffer, leaving fields default is
                         * fine, except for blending which is nonnullable */
                        cfg.blend_equation.color_mask = 0xf;
                        cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
                        cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
                        cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
                        cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
                        cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
                        cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
                }
#elif PAN_ARCH == 5
                /* Workaround */
                cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
#endif

                cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;

                cfg.multisample_misc.evaluate_per_sample =
                        msaa && (ctx->min_samples > 1);

#if PAN_ARCH >= 6
                /* MSAA blend shaders need to pass their sample ID to
                 * LD_TILE/ST_TILE, so we must preload it. Additionally, we
                 * need per-sample shading for the blend shader, accomplished
                 * by forcing per-sample shading for the whole program. */

                if (msaa && has_blend_shader) {
                        cfg.multisample_misc.evaluate_per_sample = true;
                        cfg.preload.fragment.sample_mask_id = true;
                }
#endif

                cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
                cfg.depth_units = rast->offset_units * 2.0f;
                cfg.depth_factor = rast->offset_scale;

                bool back_enab = zsa->base.stencil[1].enabled;
                cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
                cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];

#if PAN_ARCH <= 5
                /* v6+ fits register preload here, no alpha testing */
                cfg.alpha_reference = zsa->base.alpha_ref_value;
#endif
        }
}

static void
panfrost_emit_frag_shader(struct panfrost_context *ctx,
                          struct mali_renderer_state_packed *fragmeta,
                          mali_ptr *blend_shaders)
{
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
        const struct panfrost_rasterizer *rast = ctx->rasterizer;
        struct panfrost_shader_state *fs =
                panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

        /* We need to merge several partial renderer state descriptors,
         * so stage to temporary storage rather than reading back write-combine
         * memory, which will trash performance. */
        struct mali_renderer_state_packed rsd;
        panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);

#if PAN_ARCH == 4
        if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
                /* Word 14: SFBD Blend Equation */
                STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
                rsd.opaque[14] = ctx->blend->equation[0];
        }
#endif

        /* Merge with CSO state and upload */
        if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
                struct mali_renderer_state_packed *partial_rsd =
                        (struct mali_renderer_state_packed *)&fs->partial_rsd;
                STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
                pan_merge(rsd, *partial_rsd, RENDERER_STATE);
        } else {
                pan_merge_empty_fs(&rsd);
        }

        /* Word 8, 9 Misc state */
        rsd.opaque[8] |= zsa->rsd_depth.opaque[0]
                       | rast->multisample.opaque[0];

        rsd.opaque[9] |= zsa->rsd_stencil.opaque[0]
                       | rast->stencil_misc.opaque[0];

        /* Word 10, 11 Stencil Front and Back */
        rsd.opaque[10] |= zsa->stencil_front.opaque[0];
        rsd.opaque[11] |= zsa->stencil_back.opaque[0];

        memcpy(fragmeta, &rsd, sizeof(rsd));
}

static mali_ptr
panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
{
        struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);

        panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
        panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);

        return ss->state.gpu;
}

static mali_ptr
panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

        panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);

        struct panfrost_ptr xfer;

#if PAN_ARCH == 4
        xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
#else
        unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);

        xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base,
                                             PAN_DESC(RENDERER_STATE),
                                             PAN_DESC_ARRAY(rt_count, BLEND));
#endif

        mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
        unsigned shader_offset = 0;
        struct panfrost_bo *shader_bo = NULL;

        for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c) {
                if (ctx->pipe_framebuffer.cbufs[c]) {
                        blend_shaders[c] = panfrost_get_blend(batch,
                                        c, &shader_bo, &shader_offset);
                }
        }

        panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders);

#if PAN_ARCH >= 5
        panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders);
#else
        batch->draws |= PIPE_CLEAR_COLOR0;
        batch->resolve |= PIPE_CLEAR_COLOR0;
#endif

        if (ctx->depth_stencil->base.depth_enabled)
                batch->read |= PIPE_CLEAR_DEPTH;

        if (ctx->depth_stencil->base.stencil[0].enabled)
                batch->read |= PIPE_CLEAR_STENCIL;

        return xfer.gpu;
}

static mali_ptr
panfrost_emit_viewport(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
        const struct pipe_scissor_state *ss = &ctx->scissor;
        const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

        /* Derive min/max from translate/scale. Note since |x| >= 0 by
         * definition, we have that -|x| <= |x| hence translate - |scale| <=
         * translate + |scale|, so the ordering is correct here. */
        float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
        float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
        float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
        float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
        float minz = (vp->translate[2] - fabsf(vp->scale[2]));
        float maxz = (vp->translate[2] + fabsf(vp->scale[2]));

        /* Scissor to the intersection of viewport and to the scissor, clamped
         * to the framebuffer */

        unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));
        unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));
        unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));
        unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));

        if (ss && rast->scissor) {
                minx = MAX2(ss->minx, minx);
                miny = MAX2(ss->miny, miny);
                maxx = MIN2(ss->maxx, maxx);
                maxy = MIN2(ss->maxy, maxy);
        }

        /* Set the range to [1, 1) so max values don't wrap round */
        if (maxx == 0 || maxy == 0)
                maxx = maxy = minx = miny = 1;

        struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);

        pan_pack(T.cpu, VIEWPORT, cfg) {
                /* [minx, maxx) and [miny, maxy) are exclusive ranges, but
                 * these are inclusive */
                cfg.scissor_minimum_x = minx;
                cfg.scissor_minimum_y = miny;
                cfg.scissor_maximum_x = maxx - 1;
                cfg.scissor_maximum_y = maxy - 1;

                cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
                cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
        }

        panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
        batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);

        return T.gpu;
}

static mali_ptr
panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
                                 enum pipe_shader_type st,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc) {
                panfrost_batch_read_rsrc(batch, rsrc, st);

                /* Alignment guaranteed by
                 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
                return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;
        } else if (cb->user_buffer) {
                return pan_pool_upload_aligned(&batch->pool.base,
                                               cb->user_buffer +
                                               cb->buffer_offset,
                                               cb->buffer_size, 16);
        } else {
                unreachable("No constant buffer");
        }
}

struct sysval_uniform {
        union {
                float f[4];
                int32_t i[4];
                uint32_t u[4];
                uint64_t du[2];
        };
};

static void
panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
                                      struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->scale[0];
        uniform->f[1] = vp->scale[1];
        uniform->f[2] = vp->scale[2];
}

static void
panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->translate[0];
        uniform->f[1] = vp->translate[1];
        uniform->f[2] = vp->translate[2];
}

static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
                                       enum pipe_shader_type st,
                                       unsigned int sysvalid,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
        bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
        struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;

        assert(dim);

        if (tex->target == PIPE_BUFFER) {
                assert(dim == 1);
                uniform->i[0] =
                        tex->u.buf.size / util_format_get_blocksize(tex->format);
                return;
        }

        uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);

        if (dim > 1)
                uniform->i[1] = u_minify(tex->texture->height0,
                                         tex->u.tex.first_level);

        if (dim > 2)
                uniform->i[2] = u_minify(tex->texture->depth0,
                                         tex->u.tex.first_level);

        if (is_array)
                uniform->i[dim] = tex->texture->array_size;
}

static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
                                              enum pipe_shader_type st,
                                              unsigned int sysvalid,
                                              struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
        unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);

        assert(dim && dim < 4);

        struct pipe_image_view *image = &ctx->images[st][idx];

        if (image->resource->target == PIPE_BUFFER) {
                unsigned blocksize = util_format_get_blocksize(image->format);
                uniform->i[0] = image->resource->width0 / blocksize;
                return;
        }

        uniform->i[0] = u_minify(image->resource->width0,
                                 image->u.tex.level);

        if (dim > 1)
                uniform->i[1] = u_minify(image->resource->height0,
                                         image->u.tex.level);

        if (dim > 2)
                uniform->i[2] = u_minify(image->resource->depth0,
                                         image->u.tex.level);

        if (is_array)
                uniform->i[dim] = image->resource->array_size;
}

static void
panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
                            enum pipe_shader_type st,
                            unsigned ssbo_id,
                            struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
        struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];

        /* Compute address */
        struct panfrost_resource *rsrc = pan_resource(sb.buffer);
        struct panfrost_bo *bo = rsrc->image.data.bo;

        panfrost_batch_write_rsrc(batch, rsrc, st);

        util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
                       sb.buffer_offset, sb.buffer_size);

        /* Upload address and size as sysval */
        uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
        uniform->u[2] = sb.buffer_size;
}

static void
panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
                               enum pipe_shader_type st,
                               unsigned samp_idx,
                               struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;

        uniform->f[0] = sampl->min_lod;
        uniform->f[1] = sampl->max_lod;
        uniform->f[2] = sampl->lod_bias;

        /* Even without any errata, Midgard represents "no mipmapping" as
         * fixing the LOD with the clamps; keep behaviour consistent. c.f.
         * panfrost_create_sampler_state which also explains our choice of
         * epsilon value (again to keep behaviour consistent) */

        if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                uniform->f[1] = uniform->f[0] + (1.0/256.0);
}

static void
panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        uniform->u[0] = ctx->compute_grid->grid[0];
        uniform->u[1] = ctx->compute_grid->grid[1];
        uniform->u[2] = ctx->compute_grid->grid[2];
}

static void
panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
                                        struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        uniform->u[0] = ctx->compute_grid->block[0];
        uniform->u[1] = ctx->compute_grid->block[1];
        uniform->u[2] = ctx->compute_grid->block[2];
}

static void
panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
                                struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        uniform->u[0] = ctx->compute_grid->work_dim;
}

/* Sample positions are pushed in a Bifrost specific format on Bifrost. On
 * Midgard, we emulate the Bifrost path with some extra arithmetic in the
 * shader, to keep the code as unified as possible. */

static void
panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
                                        struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *dev = pan_device(ctx->base.screen);

        unsigned samples = util_framebuffer_get_num_samples(&batch->key);
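        /* Pass the GPU address of the per-pattern sample position table;
         * shaders load positions from it as needed */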
        uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples));
}

static void
panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
                                    struct sysval_uniform *uniform)
{
        unsigned samples = util_framebuffer_get_num_samples(&batch->key);
        uniform->u[0] = samples > 1;
}

#if PAN_ARCH >= 6
static void
panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
                                     unsigned size_and_rt, struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        unsigned rt = size_and_rt & 0xF;
        unsigned size = size_and_rt >> 4;

        if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
                enum pipe_format format = batch->key.cbufs[rt]->format;
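                /* The conversion words occupy the high half of the 64-bit
                 * internal blend descriptor, hence the shift */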
                uniform->u[0] =
                        GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32;
        } else {
                pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
                        cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw;
        }
}
#endif

static void
panfrost_upload_sysvals(struct panfrost_batch *batch,
                        const struct panfrost_ptr *ptr,
                        struct panfrost_shader_state *ss,
                        enum pipe_shader_type st)
{
        struct sysval_uniform *uniforms = ptr->cpu;

        for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {
                int sysval = ss->info.sysvals.sysvals[i];

                switch (PAN_SYSVAL_TYPE(sysval)) {
                case PAN_SYSVAL_VIEWPORT_SCALE:
                        panfrost_upload_viewport_scale_sysval(batch,
                                                              &uniforms[i]);
                        break;
                case PAN_SYSVAL_VIEWPORT_OFFSET:
                        panfrost_upload_viewport_offset_sysval(batch,
                                                               &uniforms[i]);
                        break;
                case PAN_SYSVAL_TEXTURE_SIZE:
                        panfrost_upload_txs_sysval(batch, st,
                                                   PAN_SYSVAL_ID(sysval),
                                                   &uniforms[i]);
                        break;
                case PAN_SYSVAL_SSBO:
                        panfrost_upload_ssbo_sysval(batch, st,
                                                    PAN_SYSVAL_ID(sysval),
                                                    &uniforms[i]);
                        break;
                case PAN_SYSVAL_NUM_WORK_GROUPS:
                        for (unsigned j = 0; j < 3; j++) {
                                batch->num_wg_sysval[j] =
                                        ptr->gpu + (i * sizeof(*uniforms)) + (j * 4);
                        }
                        panfrost_upload_num_work_groups_sysval(batch,
                                                               &uniforms[i]);
                        break;
                case PAN_SYSVAL_LOCAL_GROUP_SIZE:
                        panfrost_upload_local_group_size_sysval(batch,
                                                                &uniforms[i]);
                        break;
                case PAN_SYSVAL_WORK_DIM:
                        panfrost_upload_work_dim_sysval(batch,
                                                        &uniforms[i]);
                        break;
                case PAN_SYSVAL_SAMPLER:
                        panfrost_upload_sampler_sysval(batch, st,
                                                       PAN_SYSVAL_ID(sysval),
                                                       &uniforms[i]);
                        break;
                case PAN_SYSVAL_IMAGE_SIZE:
                        panfrost_upload_image_size_sysval(batch, st,
                                                          PAN_SYSVAL_ID(sysval),
                                                          &uniforms[i]);
                        break;
                case PAN_SYSVAL_SAMPLE_POSITIONS:
                        panfrost_upload_sample_positions_sysval(batch,
                                                                &uniforms[i]);
                        break;
                case PAN_SYSVAL_MULTISAMPLED:
                        panfrost_upload_multisampled_sysval(batch,
                                                            &uniforms[i]);
                        break;
#if PAN_ARCH >= 6
                case PAN_SYSVAL_RT_CONVERSION:
                        panfrost_upload_rt_conversion_sysval(batch,
                                                             PAN_SYSVAL_ID(sysval), &uniforms[i]);
                        break;
#endif
                case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
                        batch->ctx->first_vertex_sysval_ptr =
                                ptr->gpu + (i * sizeof(*uniforms));
                        batch->ctx->base_vertex_sysval_ptr =
                                batch->ctx->first_vertex_sysval_ptr + 4;
                        batch->ctx->base_instance_sysval_ptr =
                                batch->ctx->first_vertex_sysval_ptr + 8;

                        uniforms[i].u[0] = batch->ctx->offset_start;
                        uniforms[i].u[1] = batch->ctx->base_vertex;
                        uniforms[i].u[2] = batch->ctx->base_instance;
                        break;
                case PAN_SYSVAL_DRAWID:
                        uniforms[i].u[0] = batch->ctx->drawid;
                        break;
                default:
                        assert(0);
                }
        }
}

static const void *
panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc) {
                panfrost_bo_mmap(rsrc->image.data.bo);
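                /* CPU reads must observe any prior GPU writes, so flush the
                 * writer and stall before mapping */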
                panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
                panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);

                return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;
        } else if (cb->user_buffer) {
                return cb->user_buffer + cb->buffer_offset;
        } else
                unreachable("No constant buffer");
}

static mali_ptr
panfrost_emit_const_buf(struct panfrost_batch *batch,
                        enum pipe_shader_type stage,
                        mali_ptr *push_constants)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_variants *all = ctx->shader[stage];

        if (!all)
                return 0;

        struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];

        /* Allocate room for the sysval and the uniforms */
        size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;
        struct panfrost_ptr transfer =
                pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);

        /* Upload sysvals requested by the shader */
        panfrost_upload_sysvals(batch, &transfer, ss, stage);

        /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
        unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
        unsigned sysval_ubo = sys_size ? ubo_count : ~0;
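        /* If present, the sysval UBO is appended after the API UBOs at
         * index ubo_count; ~0 means there is none */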

        struct panfrost_ptr ubos =
                pan_pool_alloc_desc_array(&batch->pool.base,
                                          ubo_count + 1,
                                          UNIFORM_BUFFER);

        uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;

        /* Upload sysval as a final UBO */

        if (sys_size) {
                pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) {
                        cfg.entries = DIV_ROUND_UP(sys_size, 16);
                        cfg.pointer = transfer.gpu;
                }
        }

        /* The rest are honest-to-goodness UBOs */

        u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
                size_t usz = buf->cb[ubo].buffer_size;

                if (usz == 0) {
                        ubo_ptr[ubo] = 0;
                        continue;
                }

                /* Issue (57) for the ARB_uniform_buffer_object spec says that
                 * the buffer can be larger than the uniform data inside it,
                 * so clamp ubo size to what hardware supports. */

                pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
                        cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12);
                        cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
                                        stage, buf, ubo);
                }
        }

        if (ss->info.push.count == 0)
                return ubos.gpu;

        /* Copy push constants required by the shader */
        struct panfrost_ptr push_transfer =
                pan_pool_alloc_aligned(&batch->pool.base,
                                       ss->info.push.count * 4, 16);

        uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;
        *push_constants = push_transfer.gpu;

        for (unsigned i = 0; i < ss->info.push.count; ++i) {
                struct panfrost_ubo_word src = ss->info.push.words[i];

                if (src.ubo == sysval_ubo) {
                        unsigned sysval_idx = src.offset / 16;
                        unsigned sysval_comp = (src.offset % 16) / 4;
                        unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]);
                        mali_ptr ptr = push_transfer.gpu + (4 * i);
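                        /* Remember where certain sysvals landed so the
                         * indirect draw/dispatch paths can patch them on the
                         * GPU later */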

                        switch (sysval_type) {
                        case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
                                switch (sysval_comp) {
                                case 0:
                                        batch->ctx->first_vertex_sysval_ptr = ptr;
                                        break;
                                case 1:
                                        batch->ctx->base_vertex_sysval_ptr = ptr;
                                        break;
                                case 2:
                                        batch->ctx->base_instance_sysval_ptr = ptr;
                                        break;
                                case 3:
                                        /* Spurious (Midgard doesn't pack) */
                                        break;
                                default:
                                        unreachable("Invalid vertex/instance offset component\n");
                                }
                                break;

                        case PAN_SYSVAL_NUM_WORK_GROUPS:
                                batch->num_wg_sysval[sysval_comp] = ptr;
                                break;

                        default:
                                break;
                        }
                }
                /* Map the UBO, this should be cheap. However this is reading
                 * from write-combine memory which is _very_ slow. It might pay
                 * off to upload sysvals to a staging buffer on the CPU on the
                 * assumption sysvals will get pushed (TODO) */

                const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :
                        panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);

                /* TODO: Is there any benefit to combining ranges */
                memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);
        }

        return ubos.gpu;
}

static mali_ptr
panfrost_emit_shared_memory(struct panfrost_batch *batch,
                            const struct pipe_grid_info *info)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];
        struct panfrost_ptr t =
                pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);

        pan_pack(t.cpu, LOCAL_STORAGE, ls) {
                unsigned wls_single_size =
                        util_next_power_of_two(MAX2(ss->info.wls_size, 128));

                if (ss->info.wls_size) {
                        ls.wls_instances =
                                util_next_power_of_two(info->grid[0]) *
                                util_next_power_of_two(info->grid[1]) *
                                util_next_power_of_two(info->grid[2]);

                        ls.wls_size_scale = util_logbase2(wls_single_size) + 1;

                        unsigned wls_size = wls_single_size * ls.wls_instances * dev->core_count;
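                        /* All cores can run workgroups concurrently, so each
                         * core gets its own set of instances */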

                        ls.wls_base_pointer =
                                (panfrost_batch_get_shared_memory(batch,
                                                                  wls_size,
                                                                  1))->ptr.gpu;
                } else {
                        ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
                }

                if (ss->info.tls_size) {
                        unsigned shift =
                                panfrost_get_stack_shift(ss->info.tls_size);
                        struct panfrost_bo *bo =
                                panfrost_batch_get_scratchpad(batch,
                                                              ss->info.tls_size,
                                                              dev->thread_tls_alloc,
                                                              dev->core_count);

                        ls.tls_size = shift;
                        ls.tls_base_pointer = bo->ptr.gpu;
                }
        };

        return t.gpu;
}

#if PAN_ARCH <= 5
static mali_ptr
panfrost_get_tex_desc(struct panfrost_batch *batch,
                      enum pipe_shader_type st,
                      struct panfrost_sampler_view *view)
{
        if (!view)
                return (mali_ptr) 0;

        struct pipe_sampler_view *pview = &view->base;
        struct panfrost_resource *rsrc = pan_resource(pview->texture);

        panfrost_batch_read_rsrc(batch, rsrc, st);
        panfrost_batch_add_bo(batch, view->state.bo, st);

        return view->state.gpu;
}
#endif

static void
panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
                                struct pipe_context *pctx,
                                struct pipe_resource *texture)
{
        struct panfrost_device *device = pan_device(pctx->screen);
        struct panfrost_context *ctx = pan_context(pctx);
        struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
        enum pipe_format format = so->base.format;
        assert(prsrc->image.data.bo);

        /* Format to access the stencil/depth portion of a Z32_S8 texture */
        if (format == PIPE_FORMAT_X32_S8X24_UINT) {
                assert(prsrc->separate_stencil);
                texture = &prsrc->separate_stencil->base;
                prsrc = (struct panfrost_resource *)texture;
                format = texture->format;
        } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
                format = PIPE_FORMAT_Z32_FLOAT;
        }

        const struct util_format_description *desc = util_format_description(format);

        bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM);

        if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) {
                if (desc->is_snorm)
                        format = PIPE_FORMAT_R8G8B8A8_SNORM;
                else
                        format = PIPE_FORMAT_R8G8B8A8_UNORM;
                desc = util_format_description(format);
        }

        so->texture_bo = prsrc->image.data.bo->ptr.gpu;
        so->modifier = prsrc->image.layout.modifier;

        /* MSAA only supported for 2D textures */

        assert(texture->nr_samples <= 1 ||
               so->base.target == PIPE_TEXTURE_2D ||
               so->base.target == PIPE_TEXTURE_2D_ARRAY);

        enum mali_texture_dimension type =
                panfrost_translate_texture_dimension(so->base.target);

        bool is_buffer = (so->base.target == PIPE_BUFFER);

        unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
        unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
        unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
        unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
        unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
        unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) /
                            util_format_get_blocksize(format);

        if (so->base.target == PIPE_TEXTURE_3D) {
                first_layer /= prsrc->image.layout.depth;
                last_layer /= prsrc->image.layout.depth;
                assert(!first_layer && !last_layer);
        }

        struct pan_image_view iview = {
                .format = format,
                .dim = type,
                .first_level = first_level,
                .last_level = last_level,
                .first_layer = first_layer,
                .last_layer = last_layer,
                .swizzle = {
                        so->base.swizzle_r,
                        so->base.swizzle_g,
                        so->base.swizzle_b,
                        so->base.swizzle_a,
                },
                .image = &prsrc->image,

                .buf.offset = buf_offset,
                .buf.size = buf_size,
        };

        unsigned size =
                (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
                GENX(panfrost_estimate_texture_payload_size)(&iview);

        struct panfrost_ptr payload = pan_pool_alloc_aligned(&ctx->descs.base, size, 64);
        so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);

        void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;

        if (PAN_ARCH <= 5) {
                payload.cpu += pan_size(TEXTURE);
                payload.gpu += pan_size(TEXTURE);
        }

        GENX(panfrost_new_texture)(device, &iview, tex, &payload);
}

static void
panfrost_update_sampler_view(struct panfrost_sampler_view *view,
                             struct pipe_context *pctx)
{
        struct panfrost_resource *rsrc = pan_resource(view->base.texture);
        if (view->texture_bo != rsrc->image.data.bo->ptr.gpu ||
            view->modifier != rsrc->image.layout.modifier) {
                panfrost_bo_unreference(view->state.bo);
                panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
        }
}

static mali_ptr
panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
                                  enum pipe_shader_type stage)
{
        struct panfrost_context *ctx = batch->ctx;

        if (!ctx->sampler_view_count[stage])
                return 0;

#if PAN_ARCH >= 6
        struct panfrost_ptr T =
                pan_pool_alloc_desc_array(&batch->pool.base,
                                          ctx->sampler_view_count[stage],
                                          TEXTURE);
        struct mali_texture_packed *out =
                (struct mali_texture_packed *) T.cpu;

        for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
                struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
                struct pipe_sampler_view *pview = &view->base;
                struct panfrost_resource *rsrc = pan_resource(pview->texture);

                panfrost_update_sampler_view(view, &ctx->base);
                out[i] = view->bifrost_descriptor;

                panfrost_batch_read_rsrc(batch, rsrc, stage);
                panfrost_batch_add_bo(batch, view->state.bo, stage);
        }

        return T.gpu;
#else
        uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];

        for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
                struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];

                panfrost_update_sampler_view(view, &ctx->base);

                trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
        }

        return pan_pool_upload_aligned(&batch->pool.base, trampolines,
                                       sizeof(uint64_t) *
                                       ctx->sampler_view_count[stage],
                                       sizeof(uint64_t));
#endif
}

static mali_ptr
panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
                                  enum pipe_shader_type stage)
{
        struct panfrost_context *ctx = batch->ctx;

        if (!ctx->sampler_count[stage])
                return 0;

        struct panfrost_ptr T =
                pan_pool_alloc_desc_array(&batch->pool.base,
                                          ctx->sampler_count[stage],
                                          SAMPLER);
        struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu;

        for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
                out[i] = ctx->samplers[stage][i]->hw;

        return T.gpu;
}

/* Packs all image attribute descs and attribute buffer descs.
 * `first_image_buf_index` must be the index of the first image attribute buffer descriptor.
 */
static void
emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
                   struct mali_attribute_packed *attribs, unsigned first_buf)
{
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        unsigned last_bit = util_last_bit(ctx->image_mask[shader]);

        for (unsigned i = 0; i < last_bit; ++i) {
                enum pipe_format format = ctx->images[shader][i].format;

                pan_pack(attribs + i, ATTRIBUTE, cfg) {
                        /* Continuation record means 2 buffers per image */
                        cfg.buffer_index = first_buf + (i * 2);
                        cfg.offset_enable = (PAN_ARCH <= 5);
                        cfg.format = dev->formats[format].hw;
                }
        }
}

static enum mali_attribute_type
pan_modifier_to_attr_type(uint64_t modifier)
{
        switch (modifier) {
        case DRM_FORMAT_MOD_LINEAR:
                return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
        case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
                return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
        default:
                unreachable("Invalid modifier for attribute record");
        }
}

static void
emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
                struct mali_attribute_buffer_packed *bufs,
                unsigned first_image_buf_index)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned last_bit = util_last_bit(ctx->image_mask[shader]);

        for (unsigned i = 0; i < last_bit; ++i) {
                struct pipe_image_view *image = &ctx->images[shader][i];

                if (!(ctx->image_mask[shader] & (1 << i)) ||
                    !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
                        /* Unused image bindings */
                        pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg);
                        pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg);
                        continue;
                }

                struct panfrost_resource *rsrc = pan_resource(image->resource);

                /* TODO: MSAA */
                assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported");

                bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
                bool is_buffer = rsrc->base.target == PIPE_BUFFER;

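                /* For 3D images the "layer" selects a depth slice, which the
                 * layout code treats as a surface rather than an array layer,
                 * hence the swapped arguments below (assumed argument order
                 * of panfrost_texture_offset) */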
1482 unsigned offset = is_buffer ? image->u.buf.offset :
1483 panfrost_texture_offset(&rsrc->image.layout,
1484 image->u.tex.level,
1485 is_3d ? 0 : image->u.tex.first_layer,
1486 is_3d ? image->u.tex.first_layer : 0);
1487
1488 if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) {
1489 panfrost_batch_write_rsrc(batch, rsrc, shader);
1490
1491 unsigned level = is_buffer ? 0 : image->u.tex.level;
1492 BITSET_SET(rsrc->valid.data, level);
1493
1494 if (is_buffer) {
1495 util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1496 0, rsrc->base.width0);
1497 }
1498 } else {
1499 panfrost_batch_read_rsrc(batch, rsrc, shader);
1500 }
1501
1502 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1503 cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1504 cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset;
1505 cfg.stride = util_format_get_blocksize(image->format);
1506 cfg.size = rsrc->image.data.bo->size - offset;
1507 }
1508
1509 if (is_buffer) {
1510 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1511 cfg.s_dimension = rsrc->base.width0 /
1512 util_format_get_blocksize(image->format);
1513 cfg.t_dimension = cfg.r_dimension = 1;
1514 }
1515
1516 continue;
1517 }
1518
1519 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1520 unsigned level = image->u.tex.level;
1521
1522 cfg.s_dimension = u_minify(rsrc->base.width0, level);
1523 cfg.t_dimension = u_minify(rsrc->base.height0, level);
1524 cfg.r_dimension = is_3d ?
1525 u_minify(rsrc->base.depth0, level) :
1526 image->u.tex.last_layer - image->u.tex.first_layer + 1;
1527
1528 cfg.row_stride =
1529 rsrc->image.layout.slices[level].row_stride;
1530
1531 if (rsrc->base.target != PIPE_TEXTURE_2D) {
1532 cfg.slice_stride =
1533 panfrost_get_layer_stride(&rsrc->image.layout,
1534 level);
1535 }
1536 }
1537 }
1538 }
1539
1540 static mali_ptr
panfrost_emit_image_attribs(struct panfrost_batch * batch,mali_ptr * buffers,enum pipe_shader_type type)1541 panfrost_emit_image_attribs(struct panfrost_batch *batch,
1542 mali_ptr *buffers,
1543 enum pipe_shader_type type)
1544 {
1545 struct panfrost_context *ctx = batch->ctx;
1546 struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);
1547
1548 if (!shader->info.attribute_count) {
1549 *buffers = 0;
1550 return 0;
1551 }
1552
1553 /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1554 unsigned attr_count = shader->info.attribute_count;
1555 unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
1556
1557 struct panfrost_ptr bufs =
1558 pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1559
1560 struct panfrost_ptr attribs =
1561 pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1562
1563 emit_image_attribs(ctx, type, attribs.cpu, 0);
1564 emit_image_bufs(batch, type, bufs.cpu, 0);
1565
1566 /* We need an empty attrib buf to stop the prefetching on Bifrost */
1567 #if PAN_ARCH >= 6
1568 pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
1569 ATTRIBUTE_BUFFER, cfg);
1570 #endif
1571
1572 *buffers = bufs.gpu;
1573 return attribs.gpu;
1574 }
1575
1576 static mali_ptr
panfrost_emit_vertex_data(struct panfrost_batch * batch,mali_ptr * buffers)1577 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1578 mali_ptr *buffers)
1579 {
1580 struct panfrost_context *ctx = batch->ctx;
1581 struct panfrost_vertex_state *so = ctx->vertex;
1582 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1583 bool instanced = ctx->indirect_draw || ctx->instance_count > 1;
1584 uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1585 unsigned nr_images = util_last_bit(image_mask);
1586
1587 /* Worst case: everything is NPOT, which is only possible if instancing
1588 * is enabled. Otherwise a single record is guaranteed.
1589 * Also, we allocate more memory than needed here if either instancing
1590 * is enabled or images are present; this could be improved. */
1591 unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
1592 unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
1593 (PAN_ARCH >= 6 ? 1 : 0);
1594
1595 #if PAN_ARCH <= 5
1596 /* Midgard needs vertexid/instanceid handled specially */
1597 bool special_vbufs = vs->info.attribute_count >= PAN_VERTEX_ID;
1598
1599 if (special_vbufs)
1600 nr_bufs += 2;
1601 #endif
1602
1603 if (!nr_bufs) {
1604 *buffers = 0;
1605 return 0;
1606 }
1607
1608 struct panfrost_ptr S =
1609 pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
1610 ATTRIBUTE_BUFFER);
1611 struct panfrost_ptr T =
1612 pan_pool_alloc_desc_array(&batch->pool.base,
1613 vs->info.attribute_count,
1614 ATTRIBUTE);
1615
1616 struct mali_attribute_buffer_packed *bufs =
1617 (struct mali_attribute_buffer_packed *) S.cpu;
1618
1619 struct mali_attribute_packed *out =
1620 (struct mali_attribute_packed *) T.cpu;
1621
1622 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1623 unsigned k = 0;
1624
1625 for (unsigned i = 0; i < so->nr_bufs; ++i) {
1626 unsigned vbi = so->buffers[i].vbi;
1627 unsigned divisor = so->buffers[i].divisor;
1628 attrib_to_buffer[i] = k;
1629
1630 if (!(ctx->vb_mask & (1 << vbi)))
1631 continue;
1632
1633 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1634 struct panfrost_resource *rsrc;
1635
1636 rsrc = pan_resource(buf->buffer.resource);
1637 if (!rsrc)
1638 continue;
1639
1640 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1641
1642 /* Mask off lower bits, see offset fixup below */
1643 mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset;
1644 mali_ptr addr = raw_addr & ~63;
1645
1646 /* Since we rewound the base pointer to align it, grow the record
1647 * size by the alignment slack we subtracted from the address */
1648 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1649 - buf->buffer_offset;
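/* Worked example: a BO mapped at 0x1000 with buffer_offset 0x44 gives
 * raw_addr = 0x1044 and addr = 0x1040, so the record grows by 4 bytes
 * of alignment slack while shrinking by the 0x44 byte offset. */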
1650
1651 /* When there is a divisor, the hardware-level divisor is
1652 * the product of the instance divisor and the padded count */
1653 unsigned stride = buf->stride;
1654
1655 if (ctx->indirect_draw) {
1656 /* We allocated 2 records for each attribute buffer */
1657 assert((k & 1) == 0);
1658
1659 /* With indirect draws we can't guess the vertex_count.
1660 * Pre-set the address, stride and size fields; the
1661 * compute shader will do the rest.
1662 */
1663 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1664 cfg.type = MALI_ATTRIBUTE_TYPE_1D;
1665 cfg.pointer = addr;
1666 cfg.stride = stride;
1667 cfg.size = size;
1668 }
1669
1670 /* We store the unmodified divisor in the continuation
1671 * slot so the compute shader can retrieve it.
1672 */
1673 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1674 cfg.divisor = divisor;
1675 }
1676
1677 k += 2;
1678 continue;
1679 }
1680
1681 unsigned hw_divisor = ctx->padded_count * divisor;
1682
1683 if (ctx->instance_count <= 1) {
1684 /* With a single instance, per-instance attributes are constant, so use a zero stride */
1685 if (divisor)
1686 stride = 0;
1687
1688 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1689 cfg.pointer = addr;
1690 cfg.stride = stride;
1691 cfg.size = size;
1692 }
1693 } else if (!divisor) {
1694 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1695 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1696 cfg.pointer = addr;
1697 cfg.stride = stride;
1698 cfg.size = size;
1699 cfg.divisor = ctx->padded_count;
1700 }
1701 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1702 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1703 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1704 cfg.pointer = addr;
1705 cfg.stride = stride;
1706 cfg.size = size;
1707 cfg.divisor_r = __builtin_ctz(hw_divisor);
1708 }
1709
1710 } else {
1711 unsigned shift = 0, extra_flags = 0;
1712
1713 unsigned magic_divisor =
1714 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
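/* This is the classic reciprocal-division trick: instead of a
 * per-vertex divide, the hardware multiplies by a precomputed magic
 * number and shifts, roughly quotient = (index * magic) >> (32 + shift),
 * with extra_flags selecting a rounding correction. The exact encoding
 * is hardware-defined; we only forward what
 * panfrost_compute_magic_divisor() computes. */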
1715
1716 /* Records with continuations must be aligned */
1717 k = ALIGN_POT(k, 2);
1718 attrib_to_buffer[i] = k;
1719
1720 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1721 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1722 cfg.pointer = addr;
1723 cfg.stride = stride;
1724 cfg.size = size;
1725
1726 cfg.divisor_r = shift;
1727 cfg.divisor_e = extra_flags;
1728 }
1729
1730 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1731 cfg.divisor_numerator = magic_divisor;
1732 cfg.divisor = divisor;
1733 }
1734
1735 ++k;
1736 }
1737
1738 ++k;
1739 }
1740
1741 #if PAN_ARCH <= 5
1742 /* Add special gl_VertexID/gl_InstanceID buffers */
1743 if (special_vbufs) {
1744 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1745
1746 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1747 cfg.buffer_index = k++;
1748 cfg.format = so->formats[PAN_VERTEX_ID];
1749 }
1750
1751 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1752
1753 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1754 cfg.buffer_index = k++;
1755 cfg.format = so->formats[PAN_INSTANCE_ID];
1756 }
1757 }
1758 #endif
1759
1760 k = ALIGN_POT(k, 2);
1761 emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
1762 emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
1763 k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
1764
1765 #if PAN_ARCH >= 6
1766 /* We need an empty attrib buf to stop the prefetching on Bifrost */
1767 pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);
1768 #endif
1769
1770 /* Attribute addresses require 64-byte alignment, so let:
1771 *
1772 * base' = base & ~63 = base - (base & 63)
1773 * offset' = offset + (base & 63)
1774 *
1775 * Since base' + offset' = base + offset, these are equivalent
1776 * addressing modes and now base is 64 aligned.
1777 */
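/* For example, base = 0x1007 yields base' = 0x1000 and offset' =
 * offset + 7: the buffer pointer stays 64-byte aligned and the
 * ATTRIBUTE offset absorbs the remainder. */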
1778
1779 for (unsigned i = 0; i < so->num_elements; ++i) {
1780 unsigned vbi = so->pipe[i].vertex_buffer_index;
1781 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1782
1783 /* BOs are aligned; just fixup for buffer_offset */
1784 signed src_offset = so->pipe[i].src_offset;
1785 src_offset += (buf->buffer_offset & 63);
1786
1787 /* Base instance offset */
1788 if (ctx->base_instance && so->pipe[i].instance_divisor) {
1789 src_offset += (ctx->base_instance * buf->stride) /
1790 so->pipe[i].instance_divisor;
1791 }
1792
1793 /* Also, somewhat obscurely, per-instance data needs to be
1794 * offset in response to a delayed start in an indexed draw */
1795
1796 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1797 src_offset -= buf->stride * ctx->offset_start;
1798
1799 pan_pack(out + i, ATTRIBUTE, cfg) {
1800 cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
1801 cfg.format = so->formats[i];
1802 cfg.offset = src_offset;
1803 }
1804 }
1805
1806 *buffers = S.gpu;
1807 return T.gpu;
1808 }
1809
1810 static mali_ptr
1811 panfrost_emit_varyings(struct panfrost_batch *batch,
1812 struct mali_attribute_buffer_packed *slot,
1813 unsigned stride, unsigned count)
1814 {
1815 unsigned size = stride * count;
1816 mali_ptr ptr =
1817 batch->ctx->indirect_draw ? 0 :
1818 pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
1819
1820 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1821 cfg.stride = stride;
1822 cfg.size = size;
1823 cfg.pointer = ptr;
1824 }
1825
1826 return ptr;
1827 }
1828
1829 static unsigned
1830 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1831 {
1832 return target->buffer_offset + (pan_so_target(target)->offset * stride);
1833 }
1834
1835 static void
1836 panfrost_emit_streamout(struct panfrost_batch *batch,
1837 struct mali_attribute_buffer_packed *slot,
1838 unsigned stride, unsigned count,
1839 struct pipe_stream_output_target *target)
1840 {
1841 unsigned max_size = target->buffer_size;
1842 unsigned expected_size = stride * count;
1843
1844 /* Grab the BO and bind it to the batch */
1845 struct panfrost_resource *rsrc = pan_resource(target->buffer);
1846 struct panfrost_bo *bo = rsrc->image.data.bo;
1847
1848 panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1849 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT);
1850
1851 unsigned offset = panfrost_xfb_offset(stride, target);
1852
1853 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1854 cfg.pointer = bo->ptr.gpu + (offset & ~63);
1855 cfg.stride = stride;
1856 cfg.size = MIN2(max_size, expected_size) + (offset & 63);
1857
1858 util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1859 offset, cfg.size);
1860 }
1861 }
1862
1863 /* Helpers for manipulating stream out information so we can pack varyings
1864 * accordingly. Compute the src_offset for a given captured varying */
1865
1866 static struct pipe_stream_output *
1867 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1868 {
1869 for (unsigned i = 0; i < info->num_outputs; ++i) {
1870 if (info->output[i].register_index == loc)
1871 return &info->output[i];
1872 }
1873
1874 unreachable("Varying not captured");
1875 }
1876
1877 /* Given a varying, figure out which index it corresponds to */
1878
1879 static inline unsigned
1880 pan_varying_index(unsigned present, enum pan_special_varying v)
1881 {
1882 return util_bitcount(present & BITFIELD_MASK(v));
1883 }
1884
1885 /* Get the base offset for XFB buffers, which by convention come after
1886 * everything else. Wrapper function for semantic reasons; by construction this
1887 * is just popcount. */
1888
1889 static inline unsigned
1890 pan_xfb_base(unsigned present)
1891 {
1892 return util_bitcount(present);
1893 }
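/* Example: present = GENERAL | POSITION | PSIZ. Then
 * pan_varying_index(present, PAN_VARY_PSIZ) = 2 (two lower bits set)
 * and pan_xfb_base(present) = 3, so XFB buffers start at slot 3. */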
1894
1895 /* Determines which varying buffers are required */
1896
1897 static inline unsigned
1898 pan_varying_present(const struct panfrost_device *dev,
1899 struct pan_shader_info *producer,
1900 struct pan_shader_info *consumer,
1901 uint16_t point_coord_mask)
1902 {
1903 /* At the moment we always emit general and position buffers. Not
1904 * strictly necessary but usually harmless */
1905
1906 unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
1907
1908 /* Enable special buffers by the shader info */
1909
1910 if (producer->vs.writes_point_size)
1911 present |= BITFIELD_BIT(PAN_VARY_PSIZ);
1912
1913 #if PAN_ARCH <= 5
1914 /* On Midgard, these exist as real varyings. Later architectures use
1915 * LD_VAR_SPECIAL reads instead. */
1916
1917 if (consumer->fs.reads_point_coord)
1918 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
1919
1920 if (consumer->fs.reads_face)
1921 present |= BITFIELD_BIT(PAN_VARY_FACE);
1922
1923 if (consumer->fs.reads_frag_coord)
1924 present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
1925
1926 /* Also, if we have a point sprite, we need a point coord buffer */
1927
1928 for (unsigned i = 0; i < consumer->varyings.input_count; i++) {
1929 gl_varying_slot loc = consumer->varyings.input[i].location;
1930
1931 if (util_varying_is_point_coord(loc, point_coord_mask))
1932 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
1933 }
1934 #endif
1935
1936 return present;
1937 }
1938
1939 /* Emitters for varying records */
1940
1941 static void
1942 pan_emit_vary(const struct panfrost_device *dev,
1943 struct mali_attribute_packed *out,
1944 unsigned buffer_index,
1945 mali_pixel_format format, unsigned offset)
1946 {
1947 pan_pack(out, ATTRIBUTE, cfg) {
1948 cfg.buffer_index = buffer_index;
1949 cfg.offset_enable = (PAN_ARCH <= 5);
1950 cfg.format = format;
1951 cfg.offset = offset;
1952 }
1953 }
1954
1955 /* Special records */
1956
1957 static const struct {
1958 unsigned components;
1959 enum mali_format format;
1960 } pan_varying_formats[PAN_VARY_MAX] = {
1961 [PAN_VARY_POSITION] = { 4, MALI_SNAP_4 },
1962 [PAN_VARY_PSIZ] = { 1, MALI_R16F },
1963 [PAN_VARY_PNTCOORD] = { 1, MALI_R16F },
1964 [PAN_VARY_FACE] = { 1, MALI_R32I },
1965 [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F },
1966 };
1967
1968 static mali_pixel_format
1969 pan_special_format(const struct panfrost_device *dev,
1970 enum pan_special_varying buf)
1971 {
1972 assert(buf < PAN_VARY_MAX);
1973 mali_pixel_format format = (pan_varying_formats[buf].format << 12);
1974
1975 #if PAN_ARCH <= 6
1976 unsigned nr = pan_varying_formats[buf].components;
1977 format |= panfrost_get_default_swizzle(nr);
1978 #endif
1979
1980 return format;
1981 }
1982
1983 static void
1984 pan_emit_vary_special(const struct panfrost_device *dev,
1985 struct mali_attribute_packed *out,
1986 unsigned present, enum pan_special_varying buf)
1987 {
1988 pan_emit_vary(dev, out, pan_varying_index(present, buf),
1989 pan_special_format(dev, buf), 0);
1990 }
1991
1992 /* Negative indicates a varying is not found */
1993
1994 static signed
1995 pan_find_vary(const struct pan_shader_varying *vary,
1996 unsigned vary_count, unsigned loc)
1997 {
1998 for (unsigned i = 0; i < vary_count; ++i) {
1999 if (vary[i].location == loc)
2000 return i;
2001 }
2002
2003 return -1;
2004 }
2005
2006 /* Assign varying locations for the general buffer. Returns the calculated
2007 * per-vertex stride, and outputs offsets into the passed array. Negative
2008 * offset indicates a varying is not used. */
2009
2010 static unsigned
2011 pan_assign_varyings(const struct panfrost_device *dev,
2012 struct pan_shader_info *producer,
2013 struct pan_shader_info *consumer,
2014 signed *offsets)
2015 {
2016 unsigned producer_count = producer->varyings.output_count;
2017 unsigned consumer_count = consumer->varyings.input_count;
2018
2019 const struct pan_shader_varying *producer_vars = producer->varyings.output;
2020 const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2021
2022 unsigned stride = 0;
2023
2024 for (unsigned i = 0; i < producer_count; ++i) {
2025 signed loc = pan_find_vary(consumer_vars, consumer_count,
2026 producer_vars[i].location);
2027
2028 if (loc >= 0) {
2029 offsets[i] = stride;
2030
2031 enum pipe_format format = consumer_vars[loc].format;
2032 stride += util_format_get_blocksize(format);
2033 } else {
2034 offsets[i] = -1;
2035 }
2036 }
2037
2038 return stride;
2039 }
2040
2041 /* Emitter for a single varying (attribute) descriptor */
2042
2043 static void
2044 panfrost_emit_varying(const struct panfrost_device *dev,
2045 struct mali_attribute_packed *out,
2046 const struct pan_shader_varying varying,
2047 enum pipe_format pipe_format,
2048 unsigned present,
2049 uint16_t point_sprite_mask,
2050 struct pipe_stream_output_info *xfb,
2051 uint64_t xfb_loc_mask,
2052 unsigned max_xfb,
2053 unsigned *xfb_offsets,
2054 signed offset,
2055 enum pan_special_varying pos_varying)
2056 {
2057 /* Note: varying.format != pipe_format in some obscure cases due to a
2058 * limitation of the NIR linker. This should be fixed in the future to
2059 * eliminate the additional lookups. See:
2060 * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2061 */
2062 gl_varying_slot loc = varying.location;
2063 mali_pixel_format format = dev->formats[pipe_format].hw;
2064
2065 struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?
2066 pan_get_so(xfb, loc) : NULL;
2067
2068 if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2069 pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2070 } else if (o && o->output_buffer < max_xfb) {
2071 unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63;
2072
2073 pan_emit_vary(dev, out,
2074 pan_xfb_base(present) + o->output_buffer,
2075 format, (o->dst_offset * 4) + fixup_offset);
2076 } else if (loc == VARYING_SLOT_POS) {
2077 pan_emit_vary_special(dev, out, present, pos_varying);
2078 } else if (loc == VARYING_SLOT_PSIZ) {
2079 pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2080 } else if (loc == VARYING_SLOT_FACE) {
2081 pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2082 } else if (offset < 0) {
2083 pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2084 } else {
2085 STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2086 pan_emit_vary(dev, out, 0, format, offset);
2087 }
2088 }
2089
2090 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2091 * rather than draw time (under good conditions). */
2092
2093 static void
2094 panfrost_emit_varying_descs(
2095 struct panfrost_pool *pool,
2096 struct panfrost_shader_state *producer,
2097 struct panfrost_shader_state *consumer,
2098 struct panfrost_streamout *xfb,
2099 uint16_t point_coord_mask,
2100 struct pan_linkage *out)
2101 {
2102 struct panfrost_device *dev = pool->base.dev;
2103 struct pipe_stream_output_info *xfb_info = &producer->stream_output;
2104 unsigned producer_count = producer->info.varyings.output_count;
2105 unsigned consumer_count = consumer->info.varyings.input_count;
2106
2107 /* Offsets within the general varying buffer, indexed by location */
2108 signed offsets[PAN_MAX_VARYINGS];
2109 assert(producer_count <= ARRAY_SIZE(offsets));
2110 assert(consumer_count <= ARRAY_SIZE(offsets));
2111
2112 /* Allocate enough descriptors for both shader stages */
2113 struct panfrost_ptr T =
2114 pan_pool_alloc_desc_array(&pool->base,
2115 producer_count + consumer_count,
2116 ATTRIBUTE);
2117
2118 /* Take a reference if we're being put on the CSO */
2119 if (!pool->owned) {
2120 out->bo = pool->transient_bo;
2121 panfrost_bo_reference(out->bo);
2122 }
2123
2124 struct mali_attribute_packed *descs = T.cpu;
2125 out->producer = producer_count ? T.gpu : 0;
2126 out->consumer = consumer_count ? T.gpu +
2127 (pan_size(ATTRIBUTE) * producer_count) : 0;
2128
2129 /* Lay out the varyings. Must use producer to lay out, in order to
2130 * respect transform feedback precisions. */
2131 out->present = pan_varying_present(dev, &producer->info,
2132 &consumer->info, point_coord_mask);
2133
2134 out->stride = pan_assign_varyings(dev, &producer->info,
2135 &consumer->info, offsets);
2136
2137 unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS];
2138
2139 for (unsigned i = 0; i < xfb->num_targets; ++i) {
2140 xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4,
2141 xfb->targets[i]);
2142 }
2143
2144 for (unsigned i = 0; i < producer_count; ++i) {
2145 signed j = pan_find_vary(consumer->info.varyings.input,
2146 consumer->info.varyings.input_count,
2147 producer->info.varyings.output[i].location);
2148
2149 enum pipe_format format = (j >= 0) ?
2150 consumer->info.varyings.input[j].format :
2151 producer->info.varyings.output[i].format;
2152
2153 panfrost_emit_varying(dev, descs + i,
2154 producer->info.varyings.output[i], format,
2155 out->present, 0, &producer->stream_output,
2156 producer->so_mask, xfb->num_targets,
2157 xfb_offsets, offsets[i], PAN_VARY_POSITION);
2158 }
2159
2160 for (unsigned i = 0; i < consumer_count; ++i) {
2161 signed j = pan_find_vary(producer->info.varyings.output,
2162 producer->info.varyings.output_count,
2163 consumer->info.varyings.input[i].location);
2164
2165 signed offset = (j >= 0) ? offsets[j] : -1;
2166
2167 panfrost_emit_varying(dev, descs + producer_count + i,
2168 consumer->info.varyings.input[i],
2169 consumer->info.varyings.input[i].format,
2170 out->present, point_coord_mask,
2171 &producer->stream_output, producer->so_mask,
2172 xfb->num_targets, xfb_offsets, offset,
2173 PAN_VARY_FRAGCOORD);
2174 }
2175 }
2176
2177 #if PAN_ARCH <= 5
2178 static void
2179 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2180 unsigned present,
2181 enum pan_special_varying v,
2182 unsigned special)
2183 {
2184 if (present & BITFIELD_BIT(v)) {
2185 unsigned idx = pan_varying_index(present, v);
2186
2187 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2188 cfg.special = special;
2189 cfg.type = 0;
2190 }
2191 }
2192 }
2193 #endif
2194
2195 static void
2196 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2197 unsigned vertex_count,
2198 mali_ptr *vs_attribs,
2199 mali_ptr *fs_attribs,
2200 mali_ptr *buffers,
2201 unsigned *buffer_count,
2202 mali_ptr *position,
2203 mali_ptr *psiz,
2204 bool point_coord_replace)
2205 {
2206 /* Load the shaders */
2207 struct panfrost_context *ctx = batch->ctx;
2208 struct panfrost_shader_state *vs, *fs;
2209
2210 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2211 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2212
2213 uint16_t point_coord_mask = 0;
2214
2215 #if PAN_ARCH <= 5
2216 /* Point sprites are lowered on Bifrost and newer */
2217 if (point_coord_replace)
2218 point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2219 #endif
2220
2221 /* In good conditions, we only need to link varyings once */
2222 bool prelink =
2223 (point_coord_mask == 0) &&
2224 (ctx->streamout.num_targets == 0) &&
2225 !vs->info.separable &&
2226 !fs->info.separable;
2227
2228 /* Try to reduce copies */
2229 struct pan_linkage _linkage;
2230 struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2231
2232 /* Emit ATTRIBUTE descriptors if needed */
2233 if (!prelink || vs->linkage.bo == NULL) {
2234 struct panfrost_pool *pool =
2235 prelink ? &ctx->descs : &batch->pool;
2236
2237 panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);
2238 }
2239
2240 struct pipe_stream_output_info *so = &vs->stream_output;
2241 unsigned present = linkage->present, stride = linkage->stride;
2242 unsigned xfb_base = pan_xfb_base(present);
2243 struct panfrost_ptr T =
2244 pan_pool_alloc_desc_array(&batch->pool.base,
2245 xfb_base +
2246 ctx->streamout.num_targets + 1,
2247 ATTRIBUTE_BUFFER);
2248 struct mali_attribute_buffer_packed *varyings =
2249 (struct mali_attribute_buffer_packed *) T.cpu;
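/* The buffer table is laid out as one slot per bit set in `present`
 * (general, position, ...), then one slot per XFB target, then one
 * trailing empty slot used as a prefetch terminator on Bifrost,
 * matching the xfb_base + num_targets + 1 slots allocated above. */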
2250
2251 if (buffer_count)
2252 *buffer_count = xfb_base + ctx->streamout.num_targets;
2253
2254 #if PAN_ARCH >= 6
2255 /* Suppress prefetch on Bifrost */
2256 memset(varyings + xfb_base + ctx->streamout.num_targets, 0, sizeof(*varyings));
2257 #endif
2258
2259 /* Emit the stream out buffers. We need enough room for all the
2260 * vertices we emit across all instances */
2261
2262 unsigned out_count = ctx->instance_count *
2263 u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2264
2265 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2266 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2267 so->stride[i] * 4,
2268 out_count,
2269 ctx->streamout.targets[i]);
2270 }
2271
2272 if (stride) {
2273 panfrost_emit_varyings(batch,
2274 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2275 stride, vertex_count);
2276 }
2277
2278 /* fp32 vec4 gl_Position */
2279 *position = panfrost_emit_varyings(batch,
2280 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2281 sizeof(float) * 4, vertex_count);
2282
2283 if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2284 *psiz = panfrost_emit_varyings(batch,
2285 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2286 2, vertex_count);
2287 }
2288
2289 #if PAN_ARCH <= 5
2290 pan_emit_special_input(varyings, present,
2291 PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2292 pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2293 MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2294 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2295 MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2296 #endif
2297
2298 *buffers = T.gpu;
2299 *vs_attribs = linkage->producer;
2300 *fs_attribs = linkage->consumer;
2301 }
2302
2303 static void
2304 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2305 const struct panfrost_ptr *vertex_job,
2306 const struct panfrost_ptr *tiler_job)
2307 {
2308 struct panfrost_context *ctx = batch->ctx;
2309
2310 /* If rasterizer discard is enabled, only submit the vertex job. XXX - set
2311 * job_barrier in case buffers get ping-ponged and we need to enforce
2312 * ordering, this has a perf hit! See
2313 * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */
2314
2315 unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2316 MALI_JOB_TYPE_VERTEX, true, false,
2317 ctx->indirect_draw ?
2318 batch->indirect_draw_job_id : 0,
2319 0, vertex_job, false);
2320
2321 if (ctx->rasterizer->base.rasterizer_discard || batch->scissor_culls_everything)
2322 return;
2323
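/* The tiler job consumes varyings written by the vertex job, so it
 * takes the vertex job's scoreboard index as a dependency. */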
2324 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2325 MALI_JOB_TYPE_TILER, false, false,
2326 vertex, 0, tiler_job, false);
2327 }
2328
2329 static void
2330 emit_tls(struct panfrost_batch *batch)
2331 {
2332 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2333
2334 /* Emitted with the FB descriptor on Midgard. */
2335 if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2336 return;
2337
2338 struct panfrost_bo *tls_bo =
2339 batch->stack_size ?
2340 panfrost_batch_get_scratchpad(batch,
2341 batch->stack_size,
2342 dev->thread_tls_alloc,
2343 dev->core_count):
2344 NULL;
2345 struct pan_tls_info tls = {
2346 .tls = {
2347 .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2348 .size = batch->stack_size,
2349 },
2350 };
2351
2352 assert(batch->tls.cpu);
2353 GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2354 }
2355
2356 static void
2357 emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
2358 {
2359 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2360 struct panfrost_bo *tls_bo =
2361 batch->stack_size ?
2362 panfrost_batch_get_scratchpad(batch,
2363 batch->stack_size,
2364 dev->thread_tls_alloc,
2365 dev->core_count):
2366 NULL;
2367 struct pan_tls_info tls = {
2368 .tls = {
2369 .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2370 .size = batch->stack_size,
2371 },
2372 };
2373
2374 batch->framebuffer.gpu |=
2375 GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx,
2376 batch->framebuffer.cpu);
2377 }
2378
2379 /* Mark a surface as written */
2380
2381 static void
2382 panfrost_initialize_surface(struct panfrost_batch *batch,
2383 struct pipe_surface *surf)
2384 {
2385 if (surf) {
2386 struct panfrost_resource *rsrc = pan_resource(surf->texture);
2387 BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2388 }
2389 }
2390
2391 /* Generate a fragment job. This should be called once per frame. (According to
2392 * presentations, this is supposed to correspond to eglSwapBuffers) */
2393
2394 static mali_ptr
2395 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2396 {
2397 /* Mark the affected buffers as initialized, since we're writing to
2398 * them. Also, attach the surfaces being written to the batch */
2399
2400 struct pipe_framebuffer_state *fb = &batch->key;
2401
2402 for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2403 panfrost_initialize_surface(batch, fb->cbufs[i]);
2404
2405 panfrost_initialize_surface(batch, fb->zsbuf);
2406
2407 /* The passed tile coords can be out of range in some cases, so we need
2408 * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2409 * Theoretically we also need to clamp the coordinates positive, but we
2410 * avoid that edge case as all four values are unsigned. Also,
2411 * theoretically we could clamp the minima, but if that has to happen
2412 * the asserts would fail anyway (since the maxima would get clamped
2413 * and then be smaller than the minima). An edge case of sorts occurs
2414 * when no draw has set the scissor bounds, so by default min=~0 and max=0.
2415 * But that can't happen if any actual drawing occurs (beyond a
2416 * wallpaper reload), so this is again irrelevant in practice. */
2417
2418 batch->maxx = MIN2(batch->maxx, fb->width);
2419 batch->maxy = MIN2(batch->maxy, fb->height);
2420
2421 /* Rendering region must be at least 1x1; otherwise, there is nothing
2422 * to do and the whole job chain should have been discarded. */
2423
2424 assert(batch->maxx > batch->minx);
2425 assert(batch->maxy > batch->miny);
2426
2427 struct panfrost_ptr transfer =
2428 pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);
2429
2430 GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu,
2431 transfer.cpu);
2432
2433 return transfer.gpu;
2434 }
2435
2436 #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c;
2437
2438 static uint8_t
2439 pan_draw_mode(enum pipe_prim_type mode)
2440 {
2441 switch (mode) {
2442 DEFINE_CASE(POINTS);
2443 DEFINE_CASE(LINES);
2444 DEFINE_CASE(LINE_LOOP);
2445 DEFINE_CASE(LINE_STRIP);
2446 DEFINE_CASE(TRIANGLES);
2447 DEFINE_CASE(TRIANGLE_STRIP);
2448 DEFINE_CASE(TRIANGLE_FAN);
2449 DEFINE_CASE(QUADS);
2450 DEFINE_CASE(POLYGON);
2451 #if PAN_ARCH <= 6
2452 DEFINE_CASE(QUAD_STRIP);
2453 #endif
2454
2455 default:
2456 unreachable("Invalid draw mode");
2457 }
2458 }
2459
2460 #undef DEFINE_CASE
2461
2462 /* Count generated primitives (when there are no geom/tess shaders) for
2463 * transform feedback */
2464
2465 static void
2466 panfrost_statistics_record(
2467 struct panfrost_context *ctx,
2468 const struct pipe_draw_info *info,
2469 const struct pipe_draw_start_count_bias *draw)
2470 {
2471 if (!ctx->active_queries)
2472 return;
2473
2474 uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2475 ctx->prims_generated += prims;
2476
2477 if (!ctx->streamout.num_targets)
2478 return;
2479
2480 ctx->tf_prims_generated += prims;
2481 }
2482
2483 static void
2484 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2485 {
2486 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2487 unsigned count;
2488
2489 count = u_stream_outputs_for_vertices(ctx->active_prim,
2490 ctx->vertex_count);
2491 pan_so_target(ctx->streamout.targets[i])->offset += count;
2492 }
2493 }
2494
2495 static inline void
2496 pan_emit_draw_descs(struct panfrost_batch *batch,
2497 struct MALI_DRAW *d, enum pipe_shader_type st)
2498 {
2499 d->offset_start = batch->ctx->offset_start;
2500 d->instance_size = batch->ctx->instance_count > 1 ?
2501 batch->ctx->padded_count : 1;
2502
2503 d->uniform_buffers = batch->uniform_buffers[st];
2504 d->push_uniforms = batch->push_uniforms[st];
2505 d->textures = batch->textures[st];
2506 d->samplers = batch->samplers[st];
2507 }
2508
2509 static inline enum mali_index_type
2510 panfrost_translate_index_size(unsigned size)
2511 {
2512 STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0);
2513 STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1);
2514 STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2);
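/* Per the asserts above, 1- and 2-byte index sizes map onto the
 * hardware enum unchanged; only the 4-byte case needs translating. */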
2515
2516 return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size;
2517 }
2518
2519 static void
2520 panfrost_draw_emit_vertex(struct panfrost_batch *batch,
2521 const struct pipe_draw_info *info,
2522 void *invocation_template,
2523 mali_ptr vs_vary, mali_ptr varyings,
2524 mali_ptr attribs, mali_ptr attrib_bufs,
2525 void *job)
2526 {
2527 void *section =
2528 pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
2529 memcpy(section, invocation_template, pan_size(INVOCATION));
2530
2531 pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
2532 cfg.job_task_split = 5;
2533 }
2534
2535 pan_section_pack(job, COMPUTE_JOB, DRAW, cfg) {
2536 cfg.draw_descriptor_is_64b = true;
2537 cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
2538 cfg.attributes = attribs;
2539 cfg.attribute_buffers = attrib_bufs;
2540 cfg.varyings = vs_vary;
2541 cfg.varying_buffers = vs_vary ? varyings : 0;
2542 cfg.thread_storage = batch->tls.gpu;
2543 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
2544 }
2545 }
2546
2547 static void
2548 panfrost_emit_primitive_size(struct panfrost_context *ctx,
2549 bool points, mali_ptr size_array,
2550 void *prim_size)
2551 {
2552 struct panfrost_rasterizer *rast = ctx->rasterizer;
2553
2554 pan_pack(prim_size, PRIMITIVE_SIZE, cfg) {
2555 if (panfrost_writes_point_size(ctx)) {
2556 cfg.size_array = size_array;
2557 } else {
2558 cfg.constant = points ?
2559 rast->base.point_size :
2560 rast->base.line_width;
2561 }
2562 }
2563 }
2564
2565 static bool
2566 panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info)
2567 {
2568 unsigned implicit_index = (1ull << (info->index_size * 8)) - 1; /* 1ull avoids UB for 32-bit indices */
2569 bool implicit = info->restart_index == implicit_index;
2570 return info->primitive_restart && implicit;
2571 }
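/* e.g. with 16-bit indices, a restart index of 0xFFFF is implicit and
 * can use the hardware's implicit mode, whereas 0xFFFE would require
 * the explicit mode with primitive_restart_index set. */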
2572
2573 static inline void
2574 panfrost_update_state_tex(struct panfrost_batch *batch,
2575 enum pipe_shader_type st)
2576 {
2577 struct panfrost_context *ctx = batch->ctx;
2578 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
2579
2580 unsigned dirty_3d = ctx->dirty;
2581 unsigned dirty = ctx->dirty_shader[st];
2582
2583 if (dirty & PAN_DIRTY_STAGE_TEXTURE) {
2584 batch->textures[st] =
2585 panfrost_emit_texture_descriptors(batch, st);
2586 }
2587
2588 if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2589 batch->samplers[st] =
2590 panfrost_emit_sampler_descriptors(batch, st);
2591 }
2592
2593 if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2594 batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st,
2595 &batch->push_uniforms[st]);
2596 }
2597 }
2598
2599 static inline void
2600 panfrost_update_state_3d(struct panfrost_batch *batch)
2601 {
2602 unsigned dirty = batch->ctx->dirty;
2603
2604 if (dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
2605 batch->viewport = panfrost_emit_viewport(batch);
2606
2607 if (dirty & PAN_DIRTY_TLS_SIZE)
2608 panfrost_batch_adjust_stack_size(batch);
2609 }
2610
2611 static void
2612 panfrost_update_state_vs(struct panfrost_batch *batch)
2613 {
2614 enum pipe_shader_type st = PIPE_SHADER_VERTEX;
2615 unsigned dirty = batch->ctx->dirty_shader[st];
2616
2617 if (dirty & PAN_DIRTY_STAGE_RENDERER)
2618 batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2619
2620 panfrost_update_state_tex(batch, st);
2621 }
2622
2623 static void
2624 panfrost_update_state_fs(struct panfrost_batch *batch)
2625 {
2626 enum pipe_shader_type st = PIPE_SHADER_FRAGMENT;
2627 unsigned dirty = batch->ctx->dirty_shader[st];
2628
2629 if (dirty & PAN_DIRTY_STAGE_RENDERER)
2630 batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2631
2632 if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2633 batch->attribs[st] = panfrost_emit_image_attribs(batch,
2634 &batch->attrib_bufs[st], st);
2635 }
2636
2637 panfrost_update_state_tex(batch, st);
2638 }
2639
2640 #if PAN_ARCH >= 6
2641 static mali_ptr
2642 panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count)
2643 {
2644 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2645
2646 if (!vertex_count)
2647 return 0;
2648
2649 if (batch->tiler_ctx.bifrost)
2650 return batch->tiler_ctx.bifrost;
2651
2652 struct panfrost_ptr t =
2653 pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);
2654
2655 GENX(pan_emit_tiler_heap)(dev, t.cpu);
2656
2657 mali_ptr heap = t.gpu;
2658
2659 t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
2660 GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height,
2661 util_framebuffer_get_num_samples(&batch->key),
2662 heap, t.cpu);
2663
2664 batch->tiler_ctx.bifrost = t.gpu;
2665 return batch->tiler_ctx.bifrost;
2666 }
2667 #endif
2668
2669 static void
2670 panfrost_draw_emit_tiler(struct panfrost_batch *batch,
2671 const struct pipe_draw_info *info,
2672 const struct pipe_draw_start_count_bias *draw,
2673 void *invocation_template,
2674 mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
2675 mali_ptr pos, mali_ptr psiz, void *job)
2676 {
2677 struct panfrost_context *ctx = batch->ctx;
2678 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2679
2680 void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
2681 memcpy(section, invocation_template, pan_size(INVOCATION));
2682
2683 section = pan_section_ptr(job, TILER_JOB, PRIMITIVE);
2684 pan_pack(section, PRIMITIVE, cfg) {
2685 cfg.draw_mode = pan_draw_mode(info->mode);
2686 if (panfrost_writes_point_size(ctx))
2687 cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
2688
2689 /* For line primitives, PRIMITIVE.first_provoking_vertex must
2690 * be set to true and the provoking vertex is selected with
2691 * DRAW.flat_shading_vertex.
2692 */
2693 if (info->mode == PIPE_PRIM_LINES ||
2694 info->mode == PIPE_PRIM_LINE_LOOP ||
2695 info->mode == PIPE_PRIM_LINE_STRIP)
2696 cfg.first_provoking_vertex = true;
2697 else
2698 cfg.first_provoking_vertex = rast->flatshade_first;
2699
2700 if (panfrost_is_implicit_prim_restart(info)) {
2701 cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
2702 } else if (info->primitive_restart) {
2703 cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
2704 cfg.primitive_restart_index = info->restart_index;
2705 }
2706
2707 cfg.job_task_split = 6;
2708
2709 cfg.index_count = ctx->indirect_draw ? 1 : draw->count;
2710 cfg.index_type = panfrost_translate_index_size(info->index_size);
2711
2712 if (cfg.index_type) {
2713 cfg.indices = indices;
2714 cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;
2715 }
2716 }
2717
2718 enum pipe_prim_type prim = u_reduced_prim(info->mode);
2719 bool polygon = (prim == PIPE_PRIM_TRIANGLES);
2720 void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
2721
2722 #if PAN_ARCH >= 6
2723 pan_section_pack(job, TILER_JOB, TILER, cfg) {
2724 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
2725 }
2726
2727 pan_section_pack(job, TILER_JOB, PADDING, cfg);
2728 #endif
2729
2730 section = pan_section_ptr(job, TILER_JOB, DRAW);
2731 pan_pack(section, DRAW, cfg) {
2732 cfg.four_components_per_vertex = true;
2733 cfg.draw_descriptor_is_64b = true;
2734 cfg.front_face_ccw = rast->front_ccw;
2735
2736 /*
2737 * From the Gallium documentation,
2738 * pipe_rasterizer_state::cull_face "indicates which faces of
2739 * polygons to cull". Points and lines are not considered
2740 * polygons and should be drawn even if all faces are culled.
2741 * The hardware does not take primitive type into account when
2742 * culling, so we need to do that check ourselves.
2743 */
2744 cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
2745 cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
2746 cfg.position = pos;
2747 cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
2748 cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
2749 cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
2750 cfg.viewport = batch->viewport;
2751 cfg.varyings = fs_vary;
2752 cfg.varying_buffers = fs_vary ? varyings : 0;
2753 cfg.thread_storage = batch->tls.gpu;
2754
2755 /* For all primitives but lines, DRAW.flat_shading_vertex must
2756 * be set to 0 and the provoking vertex is selected with the
2757 * PRIMITIVE.first_provoking_vertex field.
2758 */
2759 if (prim == PIPE_PRIM_LINES) {
2760 /* The logic is inverted across arches. */
2761 cfg.flat_shading_vertex = rast->flatshade_first
2762 ^ (PAN_ARCH <= 5);
2763 }
2764
2765 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
2766
2767 if (ctx->occlusion_query && ctx->active_queries) {
2768 if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
2769 cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
2770 else
2771 cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
2772
2773 struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
2774 cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
2775 panfrost_batch_write_rsrc(ctx->batch, rsrc,
2776 PIPE_SHADER_FRAGMENT);
2777 }
2778 }
2779
2780 panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size);
2781 }
2782
2783 static void
2784 panfrost_direct_draw(struct panfrost_batch *batch,
2785 const struct pipe_draw_info *info,
2786 unsigned drawid_offset,
2787 const struct pipe_draw_start_count_bias *draw)
2788 {
2789 if (!draw->count || !info->instance_count)
2790 return;
2791
2792 struct panfrost_context *ctx = batch->ctx;
2793
2794 /* Take into account a negative bias */
2795 ctx->indirect_draw = false;
2796 ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0);
2797 ctx->instance_count = info->instance_count;
2798 ctx->base_vertex = info->index_size ? draw->index_bias : 0;
2799 ctx->base_instance = info->start_instance;
2800 ctx->active_prim = info->mode;
2801 ctx->drawid = drawid_offset;
2802
2803 struct panfrost_ptr tiler =
2804 pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
2805 struct panfrost_ptr vertex =
2806 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
2807
2808 unsigned vertex_count = ctx->vertex_count;
2809
2810 unsigned min_index = 0, max_index = 0;
2811 mali_ptr indices = 0;
2812
2813 if (info->index_size) {
2814 indices = panfrost_get_index_buffer_bounded(batch, info, draw,
2815 &min_index,
2816 &max_index);
2817
2818 /* Use the corresponding values */
2819 vertex_count = max_index - min_index + 1;
2820 ctx->offset_start = min_index + draw->index_bias;
2821 } else {
2822 ctx->offset_start = draw->start;
2823 }
2824
2825 if (info->instance_count > 1)
2826 ctx->padded_count = panfrost_padded_vertex_count(vertex_count);
2827 else
2828 ctx->padded_count = vertex_count;
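/* Rough intuition: panfrost_padded_vertex_count() rounds the
 * per-instance vertex count up to a hardware-friendly value so the
 * instancing divisor paths in panfrost_emit_vertex_data() can use the
 * cheap POT or magic-divisor encodings; see that helper for the exact
 * rounding. */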
2829
2830 panfrost_statistics_record(ctx, info, draw);
2831
2832 struct mali_invocation_packed invocation;
2833 if (info->instance_count > 1) {
2834 panfrost_pack_work_groups_compute(&invocation,
2835 1, vertex_count, info->instance_count,
2836 1, 1, 1, true, false);
2837 } else {
2838 pan_pack(&invocation, INVOCATION, cfg) {
2839 cfg.invocations = MALI_POSITIVE(vertex_count);
2840 cfg.size_y_shift = 0;
2841 cfg.size_z_shift = 0;
2842 cfg.workgroups_x_shift = 0;
2843 cfg.workgroups_y_shift = 0;
2844 cfg.workgroups_z_shift = 32;
2845 cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
2846 }
2847 }
2848
2849 /* Emit all sorts of descriptors. */
2850 mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
2851
2852 panfrost_emit_varying_descriptor(batch,
2853 ctx->padded_count *
2854 ctx->instance_count,
2855 &vs_vary, &fs_vary, &varyings,
2856 NULL, &pos, &psiz,
2857 info->mode == PIPE_PRIM_POINTS);
2858
2859 mali_ptr attribs, attrib_bufs;
2860 attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
2861
2862 panfrost_update_state_3d(batch);
2863 panfrost_update_state_vs(batch);
2864 panfrost_update_state_fs(batch);
2865 panfrost_clean_state_3d(ctx);
2866
2867 /* Fire off the draw itself */
2868 panfrost_draw_emit_vertex(batch, info, &invocation,
2869 vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
2870 panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
2871 fs_vary, varyings, pos, psiz, tiler.cpu);
2872 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
2873
2874 /* Increment transform feedback offsets */
2875 panfrost_update_streamout_offsets(ctx);
2876 }
2877
2878 static void
2879 panfrost_indirect_draw(struct panfrost_batch *batch,
2880 const struct pipe_draw_info *info,
2881 unsigned drawid_offset,
2882 const struct pipe_draw_indirect_info *indirect,
2883 const struct pipe_draw_start_count_bias *draw)
2884 {
2885 /* Indirect draw count and multi-draw not supported. */
2886 assert(indirect->draw_count == 1 && !indirect->indirect_draw_count);
2887
2888 struct panfrost_context *ctx = batch->ctx;
2889 struct panfrost_device *dev = pan_device(ctx->base.screen);
2890
2891 /* TODO: update statistics (see panfrost_statistics_record()) */
2892 /* TODO: Increment transform feedback offsets */
2893 assert(ctx->streamout.num_targets == 0);
2894
2895 ctx->active_prim = info->mode;
2896 ctx->drawid = drawid_offset;
2897 ctx->indirect_draw = true;
2898
2899 struct panfrost_ptr tiler =
2900 pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
2901 struct panfrost_ptr vertex =
2902 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
2903
2904 struct panfrost_shader_state *vs =
2905 panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2906
2907 struct panfrost_bo *index_buf = NULL;
2908
2909 if (info->index_size) {
2910 assert(!info->has_user_indices);
2911 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
2912 index_buf = rsrc->image.data.bo;
2913 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2914 }
2915
2916 mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
2917 unsigned varying_buf_count;
2918
2919 /* We want to create templates, so set all count fields to 0 to reflect
2920 * that.
2921 */
2922 ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0;
2923 ctx->offset_start = 0;
2924
2925 /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the
2926 * vertex shader uses gl_VertexID or gl_BaseVertex.
2927 */
2928 ctx->first_vertex_sysval_ptr = 0;
2929 ctx->base_vertex_sysval_ptr = 0;
2930 ctx->base_instance_sysval_ptr = 0;
2931
2932 panfrost_update_state_3d(batch);
2933 panfrost_update_state_vs(batch);
2934 panfrost_update_state_fs(batch);
2935 panfrost_clean_state_3d(ctx);
2936
2937 bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS);
2938
2939 panfrost_emit_varying_descriptor(batch, 0,
2940 &vs_vary, &fs_vary, &varyings,
2941 &varying_buf_count, &pos, &psiz,
2942 point_coord_replace);
2943
2944 mali_ptr attribs, attrib_bufs;
2945 attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
2946
2947 /* Zeroed invocation; the compute job will update it. */
2948 static struct mali_invocation_packed invocation;
2949
2950 /* Fire off the draw itself */
2951 panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings,
2952 attribs, attrib_bufs, vertex.cpu);
2953 panfrost_draw_emit_tiler(batch, info, draw, &invocation,
2954 index_buf ? index_buf->ptr.gpu : 0,
2955 fs_vary, varyings, pos, psiz, tiler.cpu);
2956
2957 /* Add the varying heap BO to the batch if we're allocating varyings. */
2958 if (varyings) {
2959 panfrost_batch_add_bo(batch,
2960 dev->indirect_draw_shaders.varying_heap,
2961 PIPE_SHADER_VERTEX);
2962 }
2963
2964 assert(indirect->buffer);
2965
2966 struct panfrost_resource *draw_buf = pan_resource(indirect->buffer);
2967
2968 /* Don't count images: those attributes don't need to be patched. */
2969 unsigned attrib_count =
2970 vs->info.attribute_count -
2971 util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]);
2972
2973 panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX);
2974
2975 struct pan_indirect_draw_info draw_info = {
2976 .last_indirect_draw = batch->indirect_draw_job_id,
2977 .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset,
2978 .index_buf = index_buf ? index_buf->ptr.gpu : 0,
2979 .first_vertex_sysval = ctx->first_vertex_sysval_ptr,
2980 .base_vertex_sysval = ctx->base_vertex_sysval_ptr,
2981 .base_instance_sysval = ctx->base_instance_sysval_ptr,
2982 .vertex_job = vertex.gpu,
2983 .tiler_job = tiler.gpu,
2984 .attrib_bufs = attrib_bufs,
2985 .attribs = attribs,
2986 .attrib_count = attrib_count,
2987 .varying_bufs = varyings,
2988 .index_size = info->index_size,
2989 };
2990
2991 if (panfrost_writes_point_size(ctx))
2992 draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE;
2993
2994 if (vs->info.vs.writes_point_size)
2995 draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ;
2996
2997
2998 if (info->primitive_restart) {
2999 draw_info.restart_index = info->restart_index;
3000 draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART;
3001 }
3002
3003 batch->indirect_draw_job_id =
3004 GENX(panfrost_emit_indirect_draw)(&batch->pool.base,
3005 &batch->scoreboard,
3006 &draw_info,
3007 &batch->indirect_draw_ctx);
3008
3009 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3010 }
3011
3012 static void
3013 panfrost_draw_vbo(struct pipe_context *pipe,
3014 const struct pipe_draw_info *info,
3015 unsigned drawid_offset,
3016 const struct pipe_draw_indirect_info *indirect,
3017 const struct pipe_draw_start_count_bias *draws,
3018 unsigned num_draws)
3019 {
3020 struct panfrost_context *ctx = pan_context(pipe);
3021 struct panfrost_device *dev = pan_device(pipe->screen);
3022
3023 if (!panfrost_render_condition_check(ctx))
3024 return;
3025
3026 /* Emulate indirect draws unless we're using the experimental path */
3027 if (!(dev->debug & PAN_DBG_INDIRECT) && indirect && indirect->buffer) {
3028 assert(num_draws == 1);
3029 util_draw_indirect(pipe, info, indirect);
3030 return;
3031 }
3032
3033 /* Do some common setup */
3034 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3035
3036 /* Don't add too many jobs to a single batch. Hardware has a hard limit
3037 * of 65536 jobs, but we choose an arbitrary smaller soft limit to
3038 * reduce the risk of timeouts. This might not be a good idea. */
3039 if (unlikely(batch->scoreboard.job_index > 10000))
3040 batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
3041
3042 unsigned zs_draws = ctx->depth_stencil->draws;
3043 batch->draws |= zs_draws;
3044 batch->resolve |= zs_draws;
3045
3046 /* Mark everything dirty when debugging */
3047 if (unlikely(dev->debug & PAN_DBG_DIRTY))
3048 panfrost_dirty_state_all(ctx);
3049
3050 /* Conservatively assume draw parameters always change */
3051 ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3052
3053 if (indirect) {
3054 assert(num_draws == 1);
3055
3056 if (indirect->count_from_stream_output) {
3057 struct pipe_draw_start_count_bias tmp_draw = *draws;
3058 struct panfrost_streamout_target *so =
3059 pan_so_target(indirect->count_from_stream_output);
3060
3061 tmp_draw.start = 0;
3062 tmp_draw.count = so->offset;
3063 tmp_draw.index_bias = 0;
3064 panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw);
3065 return;
3066 }
3067
3068 panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]);
3069 return;
3070 }
3071
3072 struct pipe_draw_info tmp_info = *info;
3073 unsigned drawid = drawid_offset;
3074
3075 for (unsigned i = 0; i < num_draws; i++) {
3076 panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);
3077
3078 if (tmp_info.increment_draw_id) {
3079 ctx->dirty |= PAN_DIRTY_DRAWID;
3080 drawid++;
3081 }
3082 }
3083
3084 }
3085
3086 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
3087 * construct the COMPUTE job and some of its payload.
3088 */
3089
3090 static void
3091 panfrost_launch_grid(struct pipe_context *pipe,
3092 const struct pipe_grid_info *info)
3093 {
3094 struct panfrost_context *ctx = pan_context(pipe);
3095
3096 /* XXX - shouldn't be necessary with working memory barriers. Affected
3097 * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
3098 panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
3099
3100 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3101
3102 struct panfrost_shader_state *cs =
3103 &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
3104
3105 /* Indirect dispatch can't handle workgroup local storage since that
3106 * would require dynamic memory allocation. Bail in this case. */
3107 if (info->indirect && cs->info.wls_size) {
3108 struct pipe_transfer *transfer;
3109 uint32_t *params = pipe_buffer_map_range(pipe, info->indirect,
3110 info->indirect_offset,
3111 3 * sizeof(uint32_t),
3112 PIPE_MAP_READ,
3113 &transfer);
3114
3115 struct pipe_grid_info direct = *info;
3116 direct.indirect = NULL;
3117 direct.grid[0] = params[0];
3118 direct.grid[1] = params[1];
3119 direct.grid[2] = params[2];
3120 pipe_buffer_unmap(pipe, transfer);
3121
3122 if (params[0] && params[1] && params[2])
3123 panfrost_launch_grid(pipe, &direct);
3124
3125 return;
3126 }
3127
3128 ctx->compute_grid = info;
3129
3130 struct panfrost_ptr t =
3131 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3132
3133 /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so
3134 * reuse the graphics path for this by lowering to Gallium */
3135
3136 struct pipe_constant_buffer ubuf = {
3137 .buffer = NULL,
3138 .buffer_offset = 0,
3139 .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->cbase.req_input_mem,
3140 .user_buffer = info->input
3141 };
3142
3143 if (info->input)
3144 pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf);
3145
3146 /* Invoke according to the grid info */
3147
3148 void *invocation =
3149 pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION);
3150 unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] };
3151
3152 if (info->indirect)
3153 num_wg[0] = num_wg[1] = num_wg[2] = 1;
3154
3155 panfrost_pack_work_groups_compute(invocation,
3156 num_wg[0], num_wg[1], num_wg[2],
3157 info->block[0], info->block[1],
3158 info->block[2],
3159 false, info->indirect != NULL);
3160
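/* job_task_split controls how the job is split into tasks. Summing
 * util_logbase2_ceil(block dim + 1) across the three dimensions sizes
 * the split to the workgroup dimensions; the precise semantics of the
 * field are hardware-defined, so this only mirrors the packing above. */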
        pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split =
                        util_logbase2_ceil(info->block[0] + 1) +
                        util_logbase2_ceil(info->block[1] + 1) +
                        util_logbase2_ceil(info->block[2] + 1);
        }

        pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE);
                cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
                cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
                cfg.uniform_buffers = panfrost_emit_const_buf(batch,
                                PIPE_SHADER_COMPUTE, &cfg.push_uniforms);
                cfg.textures = panfrost_emit_texture_descriptors(batch,
                                PIPE_SHADER_COMPUTE);
                cfg.samplers = panfrost_emit_sampler_descriptors(batch,
                                PIPE_SHADER_COMPUTE);
        }

        unsigned indirect_dep = 0;
        if (info->indirect) {
                struct pan_indirect_dispatch_info indirect = {
                        .job = t.gpu,
                        .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
                                        info->indirect_offset,
                        .num_wg_sysval = {
                                batch->num_wg_sysval[0],
                                batch->num_wg_sysval[1],
                                batch->num_wg_sysval[2],
                        },
                };

                indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base,
                                                                &batch->scoreboard,
                                                                &indirect);
        }

        panfrost_add_job(&batch->pool.base, &batch->scoreboard,
                         MALI_JOB_TYPE_COMPUTE, true, false,
                         indirect_dep, 0, &t, false);
        panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
}

static void *
panfrost_create_rasterizer_state(
        struct pipe_context *pctx,
        const struct pipe_rasterizer_state *cso)
{
        struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);

        so->base = *cso;

        /* Guaranteed with the core GL call, so don't expose ARB_polygon_offset */
        assert(cso->offset_clamp == 0.0);

        pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
                cfg.multisample_enable = cso->multisample;
                cfg.fixed_function_near_discard = cso->depth_clip_near;
                cfg.fixed_function_far_discard = cso->depth_clip_far;
                cfg.shader_depth_range_fixed = true;
        }

        pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
                cfg.depth_range_1 = cso->offset_tri;
                cfg.depth_range_2 = cso->offset_tri;
                cfg.single_sampled_lines = !cso->multisample;
        }

        return so;
}

/* Assigns a vertex buffer for a given (index, divisor) tuple */

static unsigned
pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers,
                         unsigned *nr_bufs,
                         unsigned vbi,
                         unsigned divisor)
{
        /* Look up the buffer */
        for (unsigned i = 0; i < (*nr_bufs); ++i) {
                if (buffers[i].vbi == vbi && buffers[i].divisor == divisor)
                        return i;
        }

        /* Else, create a new buffer */
        unsigned idx = (*nr_bufs)++;

        buffers[idx] = (struct pan_vertex_buffer) {
                .vbi = vbi,
                .divisor = divisor
        };

        return idx;
}

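/* Hedged usage sketch (this helper is hypothetical and never called by the
 * driver): two elements reading vertex buffer 0 with divisor 0 deduplicate
 * to a single slot, while a per-instance element (divisor 1) on the same
 * vertex buffer gets a slot of its own. */
UNUSED static void
pan_assign_vertex_buffer_example(void)
{
        struct pan_vertex_buffer buffers[3] = { 0 };
        unsigned nr_bufs = 0;

        unsigned a = pan_assign_vertex_buffer(buffers, &nr_bufs, 0, 0);
        unsigned b = pan_assign_vertex_buffer(buffers, &nr_bufs, 0, 0);
        unsigned c = pan_assign_vertex_buffer(buffers, &nr_bufs, 0, 1);

        assert(a == b);
        assert(c != a);
        assert(nr_bufs == 2);
}
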
static void *
panfrost_create_vertex_elements_state(
        struct pipe_context *pctx,
        unsigned num_elements,
        const struct pipe_vertex_element *elements)
{
        struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
        struct panfrost_device *dev = pan_device(pctx->screen);

        so->num_elements = num_elements;
        memcpy(so->pipe, elements, sizeof(*elements) * num_elements);

        /* Assign attribute buffers corresponding to the vertex buffers, keyed
         * for a particular divisor since that's how instancing works on Mali */
        for (unsigned i = 0; i < num_elements; ++i) {
                so->element_buffer[i] = pan_assign_vertex_buffer(
                                so->buffers, &so->nr_bufs,
                                elements[i].vertex_buffer_index,
                                elements[i].instance_divisor);
        }

        for (unsigned i = 0; i < num_elements; ++i) {
                enum pipe_format fmt = elements[i].src_format;
                const struct util_format_description *desc = util_format_description(fmt);
                so->formats[i] = dev->formats[desc->format].hw;
                assert(so->formats[i]);
        }

        /* Let's also prepare vertex builtins */
        so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
        so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;

        return so;
}

static inline unsigned
pan_pipe_to_stencil_op(enum pipe_stencil_op in)
{
        switch (in) {
        case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP;
        case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO;
        case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE;
        case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT;
        case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT;
        case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;
        case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;
        case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT;
        default: unreachable("Invalid stencil op");
        }
}

static inline void
pan_pipe_to_stencil(const struct pipe_stencil_state *in,
                    struct mali_stencil_packed *out)
{
        pan_pack(out, STENCIL, s) {
                s.mask = in->valuemask;
                s.compare_function = (enum mali_func) in->func;
                s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
                s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
                s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
        }
}

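/* Hedged sketch (this helper is hypothetical and unused by the driver):
 * packing a typical "replace on depth pass" stencil configuration, as a GL
 * application might set up for stencil fills. */
UNUSED static void
pan_pipe_to_stencil_example(void)
{
        struct pipe_stencil_state in = {
                .enabled = 1,
                .func = PIPE_FUNC_ALWAYS,
                .fail_op = PIPE_STENCIL_OP_KEEP,
                .zfail_op = PIPE_STENCIL_OP_KEEP,
                .zpass_op = PIPE_STENCIL_OP_REPLACE,
                .valuemask = 0xFF,
                .writemask = 0xFF,
        };
        struct mali_stencil_packed out;

        pan_pipe_to_stencil(&in, &out);
}
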
static void *
panfrost_create_depth_stencil_state(struct pipe_context *pipe,
                                    const struct pipe_depth_stencil_alpha_state *zsa)
{
        struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
        so->base = *zsa;

        /* Normalize (there's no separate alpha-test enable) */
        if (!zsa->alpha_enabled)
                so->base.alpha_func = MALI_FUNC_ALWAYS;

        /* Prepack relevant parts of the Renderer State Descriptor. They will
         * be ORed in at draw-time */
        pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
                cfg.depth_function = zsa->depth_enabled ?
                        (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;

                cfg.depth_write_mask = zsa->depth_writemask;
        }

        pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
                cfg.stencil_enable = zsa->stencil[0].enabled;

                cfg.stencil_mask_front = zsa->stencil[0].writemask;
                cfg.stencil_mask_back = zsa->stencil[1].enabled ?
                        zsa->stencil[1].writemask : zsa->stencil[0].writemask;

#if PAN_ARCH <= 5
                cfg.alpha_test_compare_function =
                        (enum mali_func) so->base.alpha_func;
#endif
        }

        /* Stencil tests have their own words in the RSD */
        pan_pipe_to_stencil(&zsa->stencil[0], &so->stencil_front);

        if (zsa->stencil[1].enabled)
                pan_pipe_to_stencil(&zsa->stencil[1], &so->stencil_back);
        else
                so->stencil_back = so->stencil_front;

        so->enabled = zsa->stencil[0].enabled ||
                (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);

        /* Write masks need tracking together */
        if (zsa->depth_writemask)
                so->draws |= PIPE_CLEAR_DEPTH;

        if (zsa->stencil[0].enabled)
                so->draws |= PIPE_CLEAR_STENCIL;

        /* TODO: Bounds test should be easy */
        assert(!zsa->depth_bounds_test);

        return so;
}

static struct pipe_sampler_view *
panfrost_create_sampler_view(
        struct pipe_context *pctx,
        struct pipe_resource *texture,
        const struct pipe_sampler_view *template)
{
        struct panfrost_context *ctx = pan_context(pctx);
        struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);

        pan_legalize_afbc_format(ctx, pan_resource(texture), template->format);

        pipe_reference(NULL, &texture->reference);

        so->base = *template;
        so->base.texture = texture;
        so->base.reference.count = 1;
        so->base.context = pctx;

        panfrost_create_sampler_view_bo(so, pctx, texture);

        return (struct pipe_sampler_view *) so;
}

/* A given Gallium blend state can be encoded to the hardware in numerous,
 * dramatically divergent ways due to the interactions of blending with
 * framebuffer formats. Conceptually, there are two modes:
 *
 * - Fixed-function blending (for suitable framebuffer formats, suitable blend
 *   state, and suitable blend constant)
 *
 * - Blend shaders (for everything else)
 *
 * A given Gallium blend configuration will compile to exactly one
 * fixed-function blend state, if it compiles to any, although the constant
 * will vary across runs as that is tracked outside of the Gallium CSO.
 *
 * However, that same blend configuration will compile to many different blend
 * shaders, depending on the framebuffer formats active. The rationale is that
 * blend shaders override not just fixed-function blending but also
 * fixed-function format conversion, so blend shaders are keyed to a particular
 * framebuffer format. As an example, the tilebuffer format is identical for
 * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
 * blend shaders.
 *
 * All of this state is encapsulated in the panfrost_blend_state struct
 * (our subclass of pipe_blend_state).
 */

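/* Hedged illustration (this helper is hypothetical and unused by the driver):
 * classic source-over alpha blending as Gallium would hand it to
 * panfrost_create_blend_state below. On a blendable format such as RGBA8 this
 * sort of equation is a fixed-function candidate; on a raw format the same
 * equation needs a blend shader keyed to the framebuffer format. */
UNUSED static struct pipe_blend_state
pan_example_alpha_blend(void)
{
        struct pipe_blend_state blend = { 0 };

        blend.rt[0].blend_enable = 1;
        blend.rt[0].colormask = PIPE_MASK_RGBA;
        blend.rt[0].rgb_func = PIPE_BLEND_ADD;
        blend.rt[0].rgb_src_factor = PIPE_BLENDFACTOR_SRC_ALPHA;
        blend.rt[0].rgb_dst_factor = PIPE_BLENDFACTOR_INV_SRC_ALPHA;
        blend.rt[0].alpha_func = PIPE_BLEND_ADD;
        blend.rt[0].alpha_src_factor = PIPE_BLENDFACTOR_ONE;
        blend.rt[0].alpha_dst_factor = PIPE_BLENDFACTOR_INV_SRC_ALPHA;

        return blend;
}
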
/* Create a blend CSO. Essentially, try to compile a fixed-function
 * expression and initialize blend shaders */

static void *
panfrost_create_blend_state(struct pipe_context *pipe,
                            const struct pipe_blend_state *blend)
{
        struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
        so->base = *blend;

        so->pan.logicop_enable = blend->logicop_enable;
        so->pan.logicop_func = blend->logicop_func;
        so->pan.rt_count = blend->max_rt + 1;

        for (unsigned c = 0; c < so->pan.rt_count; ++c) {
                unsigned g = blend->independent_blend_enable ? c : 0;
                const struct pipe_rt_blend_state pipe = blend->rt[g];
                struct pan_blend_equation equation = {0};

                equation.color_mask = pipe.colormask;
                equation.blend_enable = pipe.blend_enable;

                if (pipe.blend_enable) {
                        equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);
                        equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor);
                        equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor);
                        equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor);
                        equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor);
                        equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);
                        equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor);
                        equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor);
                        equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor);
                        equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor);
                }

                /* Determine some common properties */
                unsigned constant_mask = pan_blend_constant_mask(equation);
                const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
                so->info[c] = (struct pan_blend_info) {
                        .no_colour = (equation.color_mask == 0),
                        .opaque = pan_blend_is_opaque(equation),
                        .constant_mask = constant_mask,

                        /* TODO: check the dest for the logicop */
                        .load_dest = blend->logicop_enable ||
                                pan_blend_reads_dest(equation),

                        /* Could this possibly be fixed-function? */
                        .fixed_function = !blend->logicop_enable &&
                                pan_blend_can_fixed_function(equation,
                                                             supports_2src) &&
                                (!constant_mask ||
                                 pan_blend_supports_constant(PAN_ARCH, c))
                };

                so->pan.rts[c].equation = equation;

                /* Bifrost needs to know if any render target loads its
                 * destination in the hot draw path, so precompute this */
                if (so->info[c].load_dest)
                        so->load_dest_mask |= BITFIELD_BIT(c);

                /* Converting equations to Mali style is expensive, do it at
                 * CSO create time instead of draw-time */
                if (so->info[c].fixed_function) {
                        so->equation[c] = pan_pack_blend(equation);
                }
        }

        return so;
}

static void
prepare_rsd(struct panfrost_shader_state *state,
            struct panfrost_pool *pool, bool upload)
{
        struct mali_renderer_state_packed *out =
                (struct mali_renderer_state_packed *)&state->partial_rsd;

        if (upload) {
                struct panfrost_ptr ptr =
                        pan_pool_alloc_desc(&pool->base, RENDERER_STATE);

                state->state = panfrost_pool_take_ref(pool, ptr.gpu);
                out = ptr.cpu;
        }

        pan_pack(out, RENDERER_STATE, cfg) {
                pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
        }
}

static void
panfrost_get_sample_position(struct pipe_context *context,
                             unsigned sample_count,
                             unsigned sample_index,
                             float *out_value)
{
        panfrost_query_sample_position(
                panfrost_sample_pattern(sample_count),
                sample_index,
                out_value);
}

static void
screen_destroy(struct pipe_screen *pscreen)
{
        struct panfrost_device *dev = pan_device(pscreen);
        GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
        GENX(pan_indirect_dispatch_cleanup)(dev);
        GENX(pan_blitter_cleanup)(dev);
}

static void
preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
        GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
                             PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
}

static void
init_batch(struct panfrost_batch *batch)
{
        /* Reserve the framebuffer and local storage descriptors */
        batch->framebuffer =
#if PAN_ARCH == 4
                pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
                pan_pool_alloc_desc_aggregate(&batch->pool.base,
                                              PAN_DESC(FRAMEBUFFER),
                                              PAN_DESC(ZS_CRC_EXTENSION),
                                              PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));

        batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD;
#endif

#if PAN_ARCH >= 6
        batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
        /* On Midgard, the TLS is embedded in the FB descriptor */
        batch->tls = batch->framebuffer;
#endif
}

static void
panfrost_sampler_view_destroy(
        struct pipe_context *pctx,
        struct pipe_sampler_view *pview)
{
        struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview;

        pipe_resource_reference(&pview->texture, NULL);
        panfrost_bo_unreference(view->state.bo);
        ralloc_free(view);
}

static void
context_init(struct pipe_context *pipe)
{
        pipe->draw_vbo = panfrost_draw_vbo;
        pipe->launch_grid = panfrost_launch_grid;

        pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
        pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
        pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
        pipe->create_sampler_view = panfrost_create_sampler_view;
        pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
        pipe->create_sampler_state = panfrost_create_sampler_state;
        pipe->create_blend_state = panfrost_create_blend_state;

        pipe->get_sample_position = panfrost_get_sample_position;
}

#if PAN_ARCH <= 5

/* Returns the polygon list's GPU address if available, or otherwise allocates
 * the polygon list. It's perfectly fast to allocate and free BOs directly,
 * since we'll hit the BO cache and this is one-per-batch anyway. */

static mali_ptr
batch_get_polygon_list(struct panfrost_batch *batch)
{
        struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

        if (!batch->tiler_ctx.midgard.polygon_list) {
                bool has_draws = batch->scoreboard.first_tiler != NULL;
                unsigned size =
                        panfrost_tiler_get_polygon_list_size(dev,
                                                             batch->key.width,
                                                             batch->key.height,
                                                             has_draws);
                size = util_next_power_of_two(size);

                /* Create the BO as invisible if we can. In the non-hierarchical
                 * tiler case, we need to write the polygon list manually because
                 * there's no WRITE_VALUE job in the chain (maybe we should add
                 * one...). */
                bool init_polygon_list = !has_draws && (dev->quirks & MIDGARD_NO_HIER_TILING);
                batch->tiler_ctx.midgard.polygon_list =
                        panfrost_batch_create_bo(batch, size,
                                                 init_polygon_list ? 0 : PAN_BO_INVISIBLE,
                                                 PIPE_SHADER_VERTEX,
                                                 "Polygon list");
                panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list,
                                      PIPE_SHADER_FRAGMENT);

                if (init_polygon_list) {
                        assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu);
                        uint32_t *polygon_list_body =
                                batch->tiler_ctx.midgard.polygon_list->ptr.cpu +
                                MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;

                        /* Magic for Mali T720 */
                        polygon_list_body[0] = 0xa0000000;
                }

                batch->tiler_ctx.midgard.disable = !has_draws;
        }

        return batch->tiler_ctx.midgard.polygon_list->ptr.gpu;
}
#endif

static void
init_polygon_list(struct panfrost_batch *batch)
{
#if PAN_ARCH <= 5
        mali_ptr polygon_list = batch_get_polygon_list(batch);
        panfrost_scoreboard_initialize_tiler(&batch->pool.base,
                                             &batch->scoreboard,
                                             polygon_list);
#endif
}

void
GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
{
        struct panfrost_device *dev = &screen->dev;

        screen->vtbl.prepare_rsd = prepare_rsd;
        screen->vtbl.emit_tls = emit_tls;
        screen->vtbl.emit_fbd = emit_fbd;
        screen->vtbl.emit_fragment_job = emit_fragment_job;
        screen->vtbl.screen_destroy = screen_destroy;
        screen->vtbl.preload = preload;
        screen->vtbl.context_init = context_init;
        screen->vtbl.init_batch = init_batch;
        screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
        screen->vtbl.init_polygon_list = init_polygon_list;
        screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
        screen->vtbl.compile_shader = GENX(pan_shader_compile);

        GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base,
                               &screen->blitter.desc_pool.base);
        GENX(pan_indirect_dispatch_init)(dev);
        GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base);
}