/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"
#include "nir_serialize.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"
/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where pipe_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So unless debug is enabled (ie.
 * debug.debug_message!=NULL), shaderdb stats are requested, or serialized
 * compiles are forced, we can compile the initial shader variant
 * asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}
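
/* For reference (assuming the usual lowercase FD_MESA_DEBUG spellings of
 * the flags checked above), synchronous compiles can be forced from the
 * environment:
 *
 *    FD_MESA_DEBUG=shaderdb   # sync compiles plus per-shader stats
 *    FD_MESA_DEBUG=serialc    # sync compiles without the stats
 */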

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct pipe_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   pipe_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
}
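
/* For illustration, a (made up) line in the format emitted above:
 *
 *    FRAG shader: 52 inst, 4 nops, 48 non-nops, 6 mov, 2 cov, 224 dwords,
 *    12 last-baryf, 8 half, 10 full, 14 constlen, ...
 *
 * which is the sort of line that shader-db tooling scrapes when comparing
 * compiler changes.
 */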

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct shader_info *info = &v->shader->nir->info;
   struct ir3_compiler *compiler = v->shader->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), info->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, v->info.size);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct pipe_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

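   /* ir3 keeps its own mirror of pipe_stream_output_info (the ir3 compiler
    * is shared with turnip, which can't pull in gallium types), hence the
    * field-by-field copy rather than a memcpy().
    */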
   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
      i->stride[n] = p->stride[n];

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct pipe_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };
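   /* e.g. a shader with clip_distance_array_size == 2 gets
    * MASK(2) == 0x3, ie. a guess that user clip planes 0 and 1 will be
    * enabled at draw time.
    */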

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that we shader-db something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

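   /* If the variant needs more consts than can be guaranteed to be
    * available when other stages are also loaded, draw time may demand a
    * reduced-constlen variant, so pre-compile that one too.
    */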
   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct pipe_debug_callback debug = {};

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct pipe_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
      const nir_shader_compiler_options *options =
         ir3_get_compiler_options(compiler);
      const struct pipe_binary_program_header *hdr = cso->prog;
      struct blob_reader reader;

      blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
      nir = nir_deserialize(NULL, options, &reader);

      ir3_finalize_nir(compiler, nir);
   } else {
      debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   struct ir3_shader *shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                             .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          }, NULL);
   shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */
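   /* (e.g. a req_input_mem of 10 bytes aligns to 12, ie. 3 dwords) */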
   shader->cs.req_local_mem = cso->req_local_mem;

   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants in
    * our shaders that doing so almost eliminates draw-time recompiles.
    * (This is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                             .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          },
                          &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   struct ir3_shader *shader = hwcso->shader;
   perf_time(1000, "waited for %s:%s:%s variants",
             _mesa_shader_stage_to_abbrev(shader->type),
             shader->nir->info.name,
             shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
}

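/* Used by the frontend to answer KHR_parallel_shader_compile-style
 * completion-status queries without blocking on the fence.
 */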
static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id,
                                          &(struct ir3_compiler_options) {});

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores?  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
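   /* Leave one CPU for the rest of the driver/app; note that the count the
    * queue is created with is also the ceiling that
    * ir3_set_max_shader_compiler_threads() can later raise it back up to.
    */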
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   struct ir3_stream_output_info *info = &v->shader->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   if (v->shader->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
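   /* Worked example (illustrative numbers): with stride[i] == 4 dwords
    * (16 bytes per vtx) and a 1024 byte buffer, max = 1024 / 16 = 64, so
    * maxvtxcnt is clamped to 64 for that target.
    */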
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}