1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "util/disk_cache.h"
29 #include "util/mesa-sha1.h"
30 #include "util/u_atomic.h"
31 #include "radv_debug.h"
32 #include "radv_private.h"
33 #include "radv_cs.h"
34 #include "radv_shader.h"
35 #include "nir/nir.h"
36 #include "nir/nir_builder.h"
37 #include "nir/nir_xfb_info.h"
38 #include "spirv/nir_spirv.h"
39 #include "vk_util.h"
40 
41 #include "sid.h"
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_nir_to_llvm.h"
45 #include "vk_format.h"
46 #include "util/debug.h"
47 #include "ac_exp_param.h"
48 #include "ac_shader_util.h"
49 
50 struct radv_blend_state {
51 	uint32_t blend_enable_4bit;
52 	uint32_t need_src_alpha;
53 
54 	uint32_t cb_color_control;
55 	uint32_t cb_target_mask;
56 	uint32_t cb_target_enabled_4bit;
57 	uint32_t sx_mrt_blend_opt[8];
58 	uint32_t cb_blend_control[8];
59 
60 	uint32_t spi_shader_col_format;
61 	uint32_t col_format_is_int8;
62 	uint32_t col_format_is_int10;
63 	uint32_t cb_shader_mask;
64 	uint32_t db_alpha_to_mask;
65 
66 	uint32_t commutative_4bit;
67 
68 	bool single_cb_enable;
69 	bool mrt0_is_dual_src;
70 };
71 
72 struct radv_dsa_order_invariance {
73 	/* Whether the final result in Z/S buffers is guaranteed to be
74 	 * invariant under changes to the order in which fragments arrive.
75 	 */
76 	bool zs;
77 
78 	/* Whether the set of fragments that pass the combined Z/S test is
79 	 * guaranteed to be invariant under changes to the order in which
80 	 * fragments arrive.
81 	 */
82 	bool pass_set;
83 };
84 
85 static const VkPipelineMultisampleStateCreateInfo *
radv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo * pCreateInfo)86 radv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
87 {
88 	if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable)
89 		return pCreateInfo->pMultisampleState;
90 	return NULL;
91 }
92 
93 static const VkPipelineTessellationStateCreateInfo *
radv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo * pCreateInfo)94 radv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
95 {
96 	for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
97 		if (pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT ||
98 		    pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) {
99 			return pCreateInfo->pTessellationState;
100 		}
101 	}
102 	return NULL;
103 }
104 
105 static const VkPipelineDepthStencilStateCreateInfo *
radv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo * pCreateInfo)106 radv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
107 {
108 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
109 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
110 
111 	if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
112 	    subpass->depth_stencil_attachment)
113 		return pCreateInfo->pDepthStencilState;
114 	return NULL;
115 }
116 
117 static const VkPipelineColorBlendStateCreateInfo *
radv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo * pCreateInfo)118 radv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
119 {
120 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
121 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
122 
123 	if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
124 	    subpass->has_color_att)
125 		return pCreateInfo->pColorBlendState;
126 	return NULL;
127 }
128 
radv_pipeline_has_ngg(const struct radv_pipeline * pipeline)129 bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
130 {
131 	struct radv_shader_variant *variant = NULL;
132 	if (pipeline->shaders[MESA_SHADER_GEOMETRY])
133 		variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
134 	else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
135 		variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
136 	else if (pipeline->shaders[MESA_SHADER_VERTEX])
137 		variant = pipeline->shaders[MESA_SHADER_VERTEX];
138 	else
139 		return false;
140 	return variant->info.is_ngg;
141 }
142 
radv_pipeline_has_ngg_passthrough(const struct radv_pipeline * pipeline)143 bool radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline)
144 {
145 	assert(radv_pipeline_has_ngg(pipeline));
146 
147 	struct radv_shader_variant *variant = NULL;
148 	if (pipeline->shaders[MESA_SHADER_GEOMETRY])
149 		variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
150 	else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
151 		variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
152 	else if (pipeline->shaders[MESA_SHADER_VERTEX])
153 		variant = pipeline->shaders[MESA_SHADER_VERTEX];
154 	else
155 		return false;
156 	return variant->info.is_ngg_passthrough;
157 }
158 
radv_pipeline_has_gs_copy_shader(const struct radv_pipeline * pipeline)159 bool radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
160 {
161 	if (!radv_pipeline_has_gs(pipeline))
162 		return false;
163 
164 	/* The GS copy shader is required if the pipeline has GS on GFX6-GFX9.
165 	 * On GFX10, it might be required in rare cases if it's not possible to
166 	 * enable NGG.
167 	 */
168 	if (radv_pipeline_has_ngg(pipeline))
169 		return false;
170 
171 	assert(pipeline->gs_copy_shader);
172 	return true;
173 }
174 
175 static void
radv_pipeline_destroy(struct radv_device * device,struct radv_pipeline * pipeline,const VkAllocationCallbacks * allocator)176 radv_pipeline_destroy(struct radv_device *device,
177                       struct radv_pipeline *pipeline,
178                       const VkAllocationCallbacks* allocator)
179 {
180 	for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
181 		if (pipeline->shaders[i])
182 			radv_shader_variant_destroy(device, pipeline->shaders[i]);
183 
184 	if (pipeline->gs_copy_shader)
185 		radv_shader_variant_destroy(device, pipeline->gs_copy_shader);
186 
187 	if(pipeline->cs.buf)
188 		free(pipeline->cs.buf);
189 
190 	vk_object_base_finish(&pipeline->base);
191 	vk_free2(&device->vk.alloc, allocator, pipeline);
192 }
193 
radv_DestroyPipeline(VkDevice _device,VkPipeline _pipeline,const VkAllocationCallbacks * pAllocator)194 void radv_DestroyPipeline(
195 	VkDevice                                    _device,
196 	VkPipeline                                  _pipeline,
197 	const VkAllocationCallbacks*                pAllocator)
198 {
199 	RADV_FROM_HANDLE(radv_device, device, _device);
200 	RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
201 
202 	if (!_pipeline)
203 		return;
204 
205 	radv_pipeline_destroy(device, pipeline, pAllocator);
206 }
207 
get_hash_flags(struct radv_device * device)208 static uint32_t get_hash_flags(struct radv_device *device)
209 {
210 	uint32_t hash_flags = 0;
211 
212 	if (device->instance->debug_flags & RADV_DEBUG_NO_NGG)
213 		hash_flags |= RADV_HASH_SHADER_NO_NGG;
214 	if (device->physical_device->cs_wave_size == 32)
215 		hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
216 	if (device->physical_device->ps_wave_size == 32)
217 		hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
218 	if (device->physical_device->ge_wave_size == 32)
219 		hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
220 	if (device->physical_device->use_llvm)
221 		hash_flags |= RADV_HASH_SHADER_LLVM;
222 	if (device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
223 		hash_flags |= RADV_HASH_SHADER_DISCARD_TO_DEMOTE;
224 	if (device->instance->enable_mrt_output_nan_fixup)
225 		hash_flags |= RADV_HASH_SHADER_MRT_NAN_FIXUP;
226 	return hash_flags;
227 }
228 
229 static void
radv_pipeline_init_scratch(struct radv_device * device,struct radv_pipeline * pipeline)230 radv_pipeline_init_scratch(struct radv_device *device,
231                            struct radv_pipeline *pipeline)
232 {
233 	unsigned scratch_bytes_per_wave = 0;
234 	unsigned max_waves = 0;
235 	unsigned min_waves = 1;
236 
237 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
238 		if (pipeline->shaders[i] &&
239 		    pipeline->shaders[i]->config.scratch_bytes_per_wave) {
240 			unsigned max_stage_waves = device->scratch_waves;
241 
242 			scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave,
243 			                              pipeline->shaders[i]->config.scratch_bytes_per_wave);
244 
245 			max_stage_waves = MIN2(max_stage_waves,
246 			          4 * device->physical_device->rad_info.num_good_compute_units *
247 			          (256 / pipeline->shaders[i]->config.num_vgprs));
248 			max_waves = MAX2(max_waves, max_stage_waves);
249 		}
250 	}
251 
252 	if (pipeline->shaders[MESA_SHADER_COMPUTE]) {
253 		unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] *
254 		                      pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] *
255 		                      pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2];
256 		min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
257 	}
258 
259 	pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
260 	pipeline->max_waves = max_waves;
261 }
262 
si_translate_blend_logic_op(VkLogicOp op)263 static uint32_t si_translate_blend_logic_op(VkLogicOp op)
264 {
265 	switch (op) {
266 	case VK_LOGIC_OP_CLEAR:
267 		return V_028808_ROP3_CLEAR;
268 	case VK_LOGIC_OP_AND:
269 		return V_028808_ROP3_AND;
270 	case VK_LOGIC_OP_AND_REVERSE:
271 		return V_028808_ROP3_AND_REVERSE;
272 	case VK_LOGIC_OP_COPY:
273 		return V_028808_ROP3_COPY;
274 	case VK_LOGIC_OP_AND_INVERTED:
275 		return V_028808_ROP3_AND_INVERTED;
276 	case VK_LOGIC_OP_NO_OP:
277 		return V_028808_ROP3_NO_OP;
278 	case VK_LOGIC_OP_XOR:
279 		return V_028808_ROP3_XOR;
280 	case VK_LOGIC_OP_OR:
281 		return V_028808_ROP3_OR;
282 	case VK_LOGIC_OP_NOR:
283 		return V_028808_ROP3_NOR;
284 	case VK_LOGIC_OP_EQUIVALENT:
285 		return V_028808_ROP3_EQUIVALENT;
286 	case VK_LOGIC_OP_INVERT:
287 		return V_028808_ROP3_INVERT;
288 	case VK_LOGIC_OP_OR_REVERSE:
289 		return V_028808_ROP3_OR_REVERSE;
290 	case VK_LOGIC_OP_COPY_INVERTED:
291 		return V_028808_ROP3_COPY_INVERTED;
292 	case VK_LOGIC_OP_OR_INVERTED:
293 		return V_028808_ROP3_OR_INVERTED;
294 	case VK_LOGIC_OP_NAND:
295 		return V_028808_ROP3_NAND;
296 	case VK_LOGIC_OP_SET:
297 		return V_028808_ROP3_SET;
298 	default:
299 		unreachable("Unhandled logic op");
300 	}
301 }
302 
303 
si_translate_blend_function(VkBlendOp op)304 static uint32_t si_translate_blend_function(VkBlendOp op)
305 {
306 	switch (op) {
307 	case VK_BLEND_OP_ADD:
308 		return V_028780_COMB_DST_PLUS_SRC;
309 	case VK_BLEND_OP_SUBTRACT:
310 		return V_028780_COMB_SRC_MINUS_DST;
311 	case VK_BLEND_OP_REVERSE_SUBTRACT:
312 		return V_028780_COMB_DST_MINUS_SRC;
313 	case VK_BLEND_OP_MIN:
314 		return V_028780_COMB_MIN_DST_SRC;
315 	case VK_BLEND_OP_MAX:
316 		return V_028780_COMB_MAX_DST_SRC;
317 	default:
318 		return 0;
319 	}
320 }
321 
si_translate_blend_factor(VkBlendFactor factor)322 static uint32_t si_translate_blend_factor(VkBlendFactor factor)
323 {
324 	switch (factor) {
325 	case VK_BLEND_FACTOR_ZERO:
326 		return V_028780_BLEND_ZERO;
327 	case VK_BLEND_FACTOR_ONE:
328 		return V_028780_BLEND_ONE;
329 	case VK_BLEND_FACTOR_SRC_COLOR:
330 		return V_028780_BLEND_SRC_COLOR;
331 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
332 		return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
333 	case VK_BLEND_FACTOR_DST_COLOR:
334 		return V_028780_BLEND_DST_COLOR;
335 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
336 		return V_028780_BLEND_ONE_MINUS_DST_COLOR;
337 	case VK_BLEND_FACTOR_SRC_ALPHA:
338 		return V_028780_BLEND_SRC_ALPHA;
339 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
340 		return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
341 	case VK_BLEND_FACTOR_DST_ALPHA:
342 		return V_028780_BLEND_DST_ALPHA;
343 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
344 		return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
345 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
346 		return V_028780_BLEND_CONSTANT_COLOR;
347 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
348 		return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
349 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
350 		return V_028780_BLEND_CONSTANT_ALPHA;
351 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
352 		return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
353 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
354 		return V_028780_BLEND_SRC_ALPHA_SATURATE;
355 	case VK_BLEND_FACTOR_SRC1_COLOR:
356 		return V_028780_BLEND_SRC1_COLOR;
357 	case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
358 		return V_028780_BLEND_INV_SRC1_COLOR;
359 	case VK_BLEND_FACTOR_SRC1_ALPHA:
360 		return V_028780_BLEND_SRC1_ALPHA;
361 	case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
362 		return V_028780_BLEND_INV_SRC1_ALPHA;
363 	default:
364 		return 0;
365 	}
366 }
367 
si_translate_blend_opt_function(VkBlendOp op)368 static uint32_t si_translate_blend_opt_function(VkBlendOp op)
369 {
370 	switch (op) {
371 	case VK_BLEND_OP_ADD:
372 		return V_028760_OPT_COMB_ADD;
373 	case VK_BLEND_OP_SUBTRACT:
374 		return V_028760_OPT_COMB_SUBTRACT;
375 	case VK_BLEND_OP_REVERSE_SUBTRACT:
376 		return V_028760_OPT_COMB_REVSUBTRACT;
377 	case VK_BLEND_OP_MIN:
378 		return V_028760_OPT_COMB_MIN;
379 	case VK_BLEND_OP_MAX:
380 		return V_028760_OPT_COMB_MAX;
381 	default:
382 		return V_028760_OPT_COMB_BLEND_DISABLED;
383 	}
384 }
385 
si_translate_blend_opt_factor(VkBlendFactor factor,bool is_alpha)386 static uint32_t si_translate_blend_opt_factor(VkBlendFactor factor, bool is_alpha)
387 {
388 	switch (factor) {
389 	case VK_BLEND_FACTOR_ZERO:
390 		return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
391 	case VK_BLEND_FACTOR_ONE:
392 		return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
393 	case VK_BLEND_FACTOR_SRC_COLOR:
394 		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
395 				: V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
396 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
397 		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
398 				: V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
399 	case VK_BLEND_FACTOR_SRC_ALPHA:
400 		return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
401 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
402 		return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
403 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
404 		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
405 				: V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
406 	default:
407 		return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
408 	}
409 }
410 
411 /**
412  * Get rid of DST in the blend factors by commuting the operands:
413  *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
414  */
si_blend_remove_dst(unsigned * func,unsigned * src_factor,unsigned * dst_factor,unsigned expected_dst,unsigned replacement_src)415 static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
416 				unsigned *dst_factor, unsigned expected_dst,
417 				unsigned replacement_src)
418 {
419 	if (*src_factor == expected_dst &&
420 	    *dst_factor == VK_BLEND_FACTOR_ZERO) {
421 		*src_factor = VK_BLEND_FACTOR_ZERO;
422 		*dst_factor = replacement_src;
423 
424 		/* Commuting the operands requires reversing subtractions. */
425 		if (*func == VK_BLEND_OP_SUBTRACT)
426 			*func = VK_BLEND_OP_REVERSE_SUBTRACT;
427 		else if (*func == VK_BLEND_OP_REVERSE_SUBTRACT)
428 			*func = VK_BLEND_OP_SUBTRACT;
429 	}
430 }
431 
si_blend_factor_uses_dst(unsigned factor)432 static bool si_blend_factor_uses_dst(unsigned factor)
433 {
434 	return factor == VK_BLEND_FACTOR_DST_COLOR ||
435 		factor == VK_BLEND_FACTOR_DST_ALPHA ||
436 		factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
437 		factor == VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA ||
438 		factor == VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
439 }
440 
is_dual_src(VkBlendFactor factor)441 static bool is_dual_src(VkBlendFactor factor)
442 {
443 	switch (factor) {
444 	case VK_BLEND_FACTOR_SRC1_COLOR:
445 	case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
446 	case VK_BLEND_FACTOR_SRC1_ALPHA:
447 	case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
448 		return true;
449 	default:
450 		return false;
451 	}
452 }
453 
radv_choose_spi_color_format(VkFormat vk_format,bool blend_enable,bool blend_need_alpha)454 static unsigned radv_choose_spi_color_format(VkFormat vk_format,
455 					     bool blend_enable,
456 					     bool blend_need_alpha)
457 {
458 	const struct vk_format_description *desc = vk_format_description(vk_format);
459 	struct ac_spi_color_formats formats = {};
460 	unsigned format, ntype, swap;
461 
462 	format = radv_translate_colorformat(vk_format);
463 	ntype = radv_translate_color_numformat(vk_format, desc,
464 					       vk_format_get_first_non_void_channel(vk_format));
465 	swap = radv_translate_colorswap(vk_format, false);
466 
467 	ac_choose_spi_color_formats(format, swap, ntype, false, &formats);
468 
469 	if (blend_enable && blend_need_alpha)
470 		return formats.blend_alpha;
471 	else if(blend_need_alpha)
472 		return formats.alpha;
473 	else if(blend_enable)
474 		return formats.blend;
475 	else
476 		return formats.normal;
477 }
478 
479 static bool
format_is_int8(VkFormat format)480 format_is_int8(VkFormat format)
481 {
482 	const struct vk_format_description *desc = vk_format_description(format);
483 	int channel =  vk_format_get_first_non_void_channel(format);
484 
485 	return channel >= 0 && desc->channel[channel].pure_integer &&
486 	       desc->channel[channel].size == 8;
487 }
488 
489 static bool
format_is_int10(VkFormat format)490 format_is_int10(VkFormat format)
491 {
492 	const struct vk_format_description *desc = vk_format_description(format);
493 
494 	if (desc->nr_channels != 4)
495 		return false;
496 	for (unsigned i = 0; i < 4; i++) {
497 		if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
498 			return true;
499 	}
500 	return false;
501 }
502 
503 static void
radv_pipeline_compute_spi_color_formats(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,struct radv_blend_state * blend)504 radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
505 					const VkGraphicsPipelineCreateInfo *pCreateInfo,
506 					struct radv_blend_state *blend)
507 {
508 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
509 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
510 	unsigned col_format = 0, is_int8 = 0, is_int10 = 0;
511 	unsigned num_targets;
512 
513 	for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
514 		unsigned cf;
515 
516 		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED ||
517 		    !(blend->cb_target_mask & (0xfu << (i * 4)))) {
518 			cf = V_028714_SPI_SHADER_ZERO;
519 		} else {
520 			struct radv_render_pass_attachment *attachment = pass->attachments + subpass->color_attachments[i].attachment;
521 			bool blend_enable =
522 				blend->blend_enable_4bit & (0xfu << (i * 4));
523 
524 			cf = radv_choose_spi_color_format(attachment->format,
525 			                                  blend_enable,
526 							  blend->need_src_alpha & (1 << i));
527 
528 			if (format_is_int8(attachment->format))
529 				is_int8 |= 1 << i;
530 			if (format_is_int10(attachment->format))
531 				is_int10 |= 1 << i;
532 		}
533 
534 		col_format |= cf << (4 * i);
535 	}
536 
537 	if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
538 		/* When a subpass doesn't have any color attachments, write the
539 		 * alpha channel of MRT0 when alpha coverage is enabled because
540 		 * the depth attachment needs it.
541 		 */
542 		col_format |= V_028714_SPI_SHADER_32_AR;
543 	}
544 
545 	/* If the i-th target format is set, all previous target formats must
546 	 * be non-zero to avoid hangs.
547 	 */
548 	num_targets = (util_last_bit(col_format) + 3) / 4;
549 	for (unsigned i = 0; i < num_targets; i++) {
550 		if (!(col_format & (0xfu << (i * 4)))) {
551 			col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
552 		}
553 	}
554 
555 	/* The output for dual source blending should have the same format as
556 	 * the first output.
557 	 */
558 	if (blend->mrt0_is_dual_src)
559 		col_format |= (col_format & 0xf) << 4;
560 
561 	blend->spi_shader_col_format = col_format;
562 	blend->col_format_is_int8 = is_int8;
563 	blend->col_format_is_int10 = is_int10;
564 }
565 
566 /*
567  * Ordered so that for each i,
568  * radv_format_meta_fs_key(radv_fs_key_format_exemplars[i]) == i.
569  */
570 const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
571 	VK_FORMAT_R32_SFLOAT,
572 	VK_FORMAT_R32G32_SFLOAT,
573 	VK_FORMAT_R8G8B8A8_UNORM,
574 	VK_FORMAT_R16G16B16A16_UNORM,
575 	VK_FORMAT_R16G16B16A16_SNORM,
576 	VK_FORMAT_R16G16B16A16_UINT,
577 	VK_FORMAT_R16G16B16A16_SINT,
578 	VK_FORMAT_R32G32B32A32_SFLOAT,
579 	VK_FORMAT_R8G8B8A8_UINT,
580 	VK_FORMAT_R8G8B8A8_SINT,
581 	VK_FORMAT_A2R10G10B10_UINT_PACK32,
582 	VK_FORMAT_A2R10G10B10_SINT_PACK32,
583 };
584 
radv_format_meta_fs_key(VkFormat format)585 unsigned radv_format_meta_fs_key(VkFormat format)
586 {
587 	unsigned col_format = radv_choose_spi_color_format(format, false, false);
588 
589 	assert(col_format != V_028714_SPI_SHADER_32_AR);
590 	if (col_format >= V_028714_SPI_SHADER_32_AR)
591 		--col_format; /* Skip V_028714_SPI_SHADER_32_AR  since there is no such VkFormat */
592 
593 	--col_format; /* Skip V_028714_SPI_SHADER_ZERO */
594 	bool is_int8 = format_is_int8(format);
595 	bool is_int10 = format_is_int10(format);
596 
597 	return col_format + (is_int8 ? 3 : is_int10 ? 5 : 0);
598 }
599 
600 static void
radv_blend_check_commutativity(struct radv_blend_state * blend,VkBlendOp op,VkBlendFactor src,VkBlendFactor dst,unsigned chanmask)601 radv_blend_check_commutativity(struct radv_blend_state *blend,
602 			       VkBlendOp op, VkBlendFactor src,
603 			       VkBlendFactor dst, unsigned chanmask)
604 {
605 	/* Src factor is allowed when it does not depend on Dst. */
606 	static const uint32_t src_allowed =
607 		(1u << VK_BLEND_FACTOR_ONE) |
608 		(1u << VK_BLEND_FACTOR_SRC_COLOR) |
609 		(1u << VK_BLEND_FACTOR_SRC_ALPHA) |
610 		(1u << VK_BLEND_FACTOR_SRC_ALPHA_SATURATE) |
611 		(1u << VK_BLEND_FACTOR_CONSTANT_COLOR) |
612 		(1u << VK_BLEND_FACTOR_CONSTANT_ALPHA) |
613 		(1u << VK_BLEND_FACTOR_SRC1_COLOR) |
614 		(1u << VK_BLEND_FACTOR_SRC1_ALPHA) |
615 		(1u << VK_BLEND_FACTOR_ZERO) |
616 		(1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR) |
617 		(1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) |
618 		(1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR) |
619 		(1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA) |
620 		(1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR) |
621 		(1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA);
622 
623 	if (dst == VK_BLEND_FACTOR_ONE &&
624 	    (src_allowed & (1u << src))) {
625 		/* Addition is commutative, but floating point addition isn't
626 		 * associative: subtle changes can be introduced via different
627 		 * rounding. Be conservative, only enable for min and max.
628 		 */
629 		if (op == VK_BLEND_OP_MAX || op == VK_BLEND_OP_MIN)
630 			blend->commutative_4bit |= chanmask;
631 	}
632 }
633 
634 static struct radv_blend_state
radv_pipeline_init_blend_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra)635 radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
636 			       const VkGraphicsPipelineCreateInfo *pCreateInfo,
637 			       const struct radv_graphics_pipeline_create_info *extra)
638 {
639 	const VkPipelineColorBlendStateCreateInfo *vkblend = radv_pipeline_get_color_blend_state(pCreateInfo);
640 	const VkPipelineMultisampleStateCreateInfo *vkms = radv_pipeline_get_multisample_state(pCreateInfo);
641 	struct radv_blend_state blend = {0};
642 	unsigned mode = V_028808_CB_NORMAL;
643 	int i;
644 
645 	if (extra && extra->custom_blend_mode) {
646 		blend.single_cb_enable = true;
647 		mode = extra->custom_blend_mode;
648 	}
649 
650 	blend.cb_color_control = 0;
651 	if (vkblend) {
652 		if (vkblend->logicOpEnable)
653 			blend.cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp));
654 		else
655 			blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
656 	}
657 
658 	blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
659 		S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
660 		S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
661 		S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
662 		S_028B70_OFFSET_ROUND(1);
663 
664 	if (vkms && vkms->alphaToCoverageEnable) {
665 		blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
666 		blend.need_src_alpha |= 0x1;
667 	}
668 
669 	blend.cb_target_mask = 0;
670 	if (vkblend) {
671 		for (i = 0; i < vkblend->attachmentCount; i++) {
672 			const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
673 			unsigned blend_cntl = 0;
674 			unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
675 			VkBlendOp eqRGB = att->colorBlendOp;
676 			VkBlendFactor srcRGB = att->srcColorBlendFactor;
677 			VkBlendFactor dstRGB = att->dstColorBlendFactor;
678 			VkBlendOp eqA = att->alphaBlendOp;
679 			VkBlendFactor srcA = att->srcAlphaBlendFactor;
680 			VkBlendFactor dstA = att->dstAlphaBlendFactor;
681 
682 			blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
683 
684 			if (!att->colorWriteMask)
685 				continue;
686 
687 			blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i);
688 			blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
689 			if (!att->blendEnable) {
690 				blend.cb_blend_control[i] = blend_cntl;
691 				continue;
692 			}
693 
694 			if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA))
695 				if (i == 0)
696 					blend.mrt0_is_dual_src = true;
697 
698 			if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) {
699 				srcRGB = VK_BLEND_FACTOR_ONE;
700 				dstRGB = VK_BLEND_FACTOR_ONE;
701 			}
702 			if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) {
703 				srcA = VK_BLEND_FACTOR_ONE;
704 				dstA = VK_BLEND_FACTOR_ONE;
705 			}
706 
707 			radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB,
708 						       0x7u << (4 * i));
709 			radv_blend_check_commutativity(&blend, eqA, srcA, dstA,
710 						       0x8u << (4 * i));
711 
712 			/* Blending optimizations for RB+.
713 			 * These transformations don't change the behavior.
714 			 *
715 			 * First, get rid of DST in the blend factors:
716 			 *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
717 			 */
718 			si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
719 					    VK_BLEND_FACTOR_DST_COLOR,
720 					    VK_BLEND_FACTOR_SRC_COLOR);
721 
722 			si_blend_remove_dst(&eqA, &srcA, &dstA,
723 					    VK_BLEND_FACTOR_DST_COLOR,
724 					    VK_BLEND_FACTOR_SRC_COLOR);
725 
726 			si_blend_remove_dst(&eqA, &srcA, &dstA,
727 					    VK_BLEND_FACTOR_DST_ALPHA,
728 					    VK_BLEND_FACTOR_SRC_ALPHA);
729 
730 			/* Look up the ideal settings from tables. */
731 			srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
732 			dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
733 			srcA_opt = si_translate_blend_opt_factor(srcA, true);
734 			dstA_opt = si_translate_blend_opt_factor(dstA, true);
735 
736 			/* Handle interdependencies. */
737 			if (si_blend_factor_uses_dst(srcRGB))
738 				dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
739 			if (si_blend_factor_uses_dst(srcA))
740 				dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
741 
742 			if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
743 			    (dstRGB == VK_BLEND_FACTOR_ZERO ||
744 			     dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
745 			     dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
746 				dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
747 
748 			/* Set the final value. */
749 			blend.sx_mrt_blend_opt[i] =
750 				S_028760_COLOR_SRC_OPT(srcRGB_opt) |
751 				S_028760_COLOR_DST_OPT(dstRGB_opt) |
752 				S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
753 				S_028760_ALPHA_SRC_OPT(srcA_opt) |
754 				S_028760_ALPHA_DST_OPT(dstA_opt) |
755 				S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
756 			blend_cntl |= S_028780_ENABLE(1);
757 
758 			blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
759 			blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
760 			blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
761 			if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
762 				blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
763 				blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
764 				blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
765 				blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
766 			}
767 			blend.cb_blend_control[i] = blend_cntl;
768 
769 			blend.blend_enable_4bit |= 0xfu << (i * 4);
770 
771 			if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
772 			    dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
773 			    srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
774 			    dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
775 			    srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA ||
776 			    dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
777 				blend.need_src_alpha |= 1 << i;
778 		}
779 		for (i = vkblend->attachmentCount; i < 8; i++) {
780 			blend.cb_blend_control[i] = 0;
781 			blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
782 		}
783 	}
784 
785 	if (pipeline->device->physical_device->rad_info.has_rbplus) {
786 		/* Disable RB+ blend optimizations for dual source blending. */
787 		if (blend.mrt0_is_dual_src) {
788 			for (i = 0; i < 8; i++) {
789 				blend.sx_mrt_blend_opt[i] =
790 					S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
791 					S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
792 			}
793 		}
794 
795 		/* RB+ doesn't work with dual source blending, logic op and
796 		 * RESOLVE.
797 		 */
798 		if (blend.mrt0_is_dual_src ||
799 		    (vkblend && vkblend->logicOpEnable) ||
800 		    mode == V_028808_CB_RESOLVE)
801 			blend.cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
802 	}
803 
804 	if (blend.cb_target_mask)
805 		blend.cb_color_control |= S_028808_MODE(mode);
806 	else
807 		blend.cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
808 
809 	radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend);
810 	return blend;
811 }
812 
si_translate_fill(VkPolygonMode func)813 static uint32_t si_translate_fill(VkPolygonMode func)
814 {
815 	switch(func) {
816 	case VK_POLYGON_MODE_FILL:
817 		return V_028814_X_DRAW_TRIANGLES;
818 	case VK_POLYGON_MODE_LINE:
819 		return V_028814_X_DRAW_LINES;
820 	case VK_POLYGON_MODE_POINT:
821 		return V_028814_X_DRAW_POINTS;
822 	default:
823 		assert(0);
824 		return V_028814_X_DRAW_POINTS;
825 	}
826 }
827 
radv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo * pCreateInfo)828 static uint8_t radv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo *pCreateInfo)
829 {
830 	const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState;
831 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
832 	struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
833 	uint32_t ps_iter_samples = 1;
834 	uint32_t num_samples;
835 
836 	/* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
837 	 *
838 	 * "If the VK_AMD_mixed_attachment_samples extension is enabled and the
839 	 *  subpass uses color attachments, totalSamples is the number of
840 	 *  samples of the color attachments. Otherwise, totalSamples is the
841 	 *  value of VkPipelineMultisampleStateCreateInfo::rasterizationSamples
842 	 *  specified at pipeline creation time."
843 	 */
844 	if (subpass->has_color_att) {
845 		num_samples = subpass->color_sample_count;
846 	} else {
847 		num_samples = vkms->rasterizationSamples;
848 	}
849 
850 	if (vkms->sampleShadingEnable) {
851 		ps_iter_samples = ceilf(vkms->minSampleShading * num_samples);
852 		ps_iter_samples = util_next_power_of_two(ps_iter_samples);
853 	}
854 	return ps_iter_samples;
855 }
856 
857 static bool
radv_is_depth_write_enabled(const VkPipelineDepthStencilStateCreateInfo * pCreateInfo)858 radv_is_depth_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
859 {
860 	return pCreateInfo->depthTestEnable &&
861 	       pCreateInfo->depthWriteEnable &&
862 	       pCreateInfo->depthCompareOp != VK_COMPARE_OP_NEVER;
863 }
864 
865 static bool
radv_writes_stencil(const VkStencilOpState * state)866 radv_writes_stencil(const VkStencilOpState *state)
867 {
868 	return state->writeMask &&
869 	       (state->failOp != VK_STENCIL_OP_KEEP ||
870 		state->passOp != VK_STENCIL_OP_KEEP ||
871 		state->depthFailOp != VK_STENCIL_OP_KEEP);
872 }
873 
874 static bool
radv_is_stencil_write_enabled(const VkPipelineDepthStencilStateCreateInfo * pCreateInfo)875 radv_is_stencil_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
876 {
877 	return pCreateInfo->stencilTestEnable &&
878 	       (radv_writes_stencil(&pCreateInfo->front) ||
879 		radv_writes_stencil(&pCreateInfo->back));
880 }
881 
882 static bool
radv_is_ds_write_enabled(const VkPipelineDepthStencilStateCreateInfo * pCreateInfo)883 radv_is_ds_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
884 {
885 	return radv_is_depth_write_enabled(pCreateInfo) ||
886 	       radv_is_stencil_write_enabled(pCreateInfo);
887 }
888 
889 static bool
radv_order_invariant_stencil_op(VkStencilOp op)890 radv_order_invariant_stencil_op(VkStencilOp op)
891 {
892 	/* REPLACE is normally order invariant, except when the stencil
893 	 * reference value is written by the fragment shader. Tracking this
894 	 * interaction does not seem worth the effort, so be conservative.
895 	 */
896 	return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP &&
897 	       op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
898 	       op != VK_STENCIL_OP_REPLACE;
899 }
900 
901 static bool
radv_order_invariant_stencil_state(const VkStencilOpState * state)902 radv_order_invariant_stencil_state(const VkStencilOpState *state)
903 {
904 	/* Compute whether, assuming Z writes are disabled, this stencil state
905 	 * is order invariant in the sense that the set of passing fragments as
906 	 * well as the final stencil buffer result does not depend on the order
907 	 * of fragments.
908 	 */
909 	return !state->writeMask ||
910 	       /* The following assumes that Z writes are disabled. */
911 	       (state->compareOp == VK_COMPARE_OP_ALWAYS &&
912 		radv_order_invariant_stencil_op(state->passOp) &&
913 		radv_order_invariant_stencil_op(state->depthFailOp)) ||
914 	       (state->compareOp == VK_COMPARE_OP_NEVER &&
915 		radv_order_invariant_stencil_op(state->failOp));
916 }
917 
918 static bool
radv_pipeline_has_dynamic_ds_states(const VkGraphicsPipelineCreateInfo * pCreateInfo)919 radv_pipeline_has_dynamic_ds_states(const VkGraphicsPipelineCreateInfo *pCreateInfo)
920 {
921 	VkDynamicState ds_states[] = {
922 		VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT,
923 		VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT,
924 		VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT,
925 		VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT,
926 		VK_DYNAMIC_STATE_STENCIL_OP_EXT,
927 	};
928 
929 	if (pCreateInfo->pDynamicState) {
930 		uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
931 		for (uint32_t i = 0; i < count; i++) {
932 			for (uint32_t j = 0; j < ARRAY_SIZE(ds_states); j++) {
933 				if (pCreateInfo->pDynamicState->pDynamicStates[i] == ds_states[j])
934 					return true;
935 			}
936 		}
937 	}
938 
939 	return false;
940 }
941 
942 static bool
radv_pipeline_out_of_order_rast(struct radv_pipeline * pipeline,struct radv_blend_state * blend,const VkGraphicsPipelineCreateInfo * pCreateInfo)943 radv_pipeline_out_of_order_rast(struct radv_pipeline *pipeline,
944 				struct radv_blend_state *blend,
945 				const VkGraphicsPipelineCreateInfo *pCreateInfo)
946 {
947 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
948 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
949 	const VkPipelineDepthStencilStateCreateInfo *vkds = radv_pipeline_get_depth_stencil_state(pCreateInfo);
950 	const VkPipelineColorBlendStateCreateInfo *vkblend = radv_pipeline_get_color_blend_state(pCreateInfo);
951 	unsigned colormask = blend->cb_target_enabled_4bit;
952 
953 	if (!pipeline->device->physical_device->out_of_order_rast_allowed)
954 		return false;
955 
956 	/* Be conservative if a logic operation is enabled with color buffers. */
957 	if (colormask && vkblend && vkblend->logicOpEnable)
958 		return false;
959 
960 	/* Be conservative if an extended dynamic depth/stencil state is
961 	 * enabled because the driver can't update out-of-order rasterization
962 	 * dynamically.
963 	 */
964 	if (radv_pipeline_has_dynamic_ds_states(pCreateInfo))
965 		return false;
966 
967 	/* Default depth/stencil invariance when no attachment is bound. */
968 	struct radv_dsa_order_invariance dsa_order_invariant = {
969 		.zs = true, .pass_set = true
970 	};
971 
972 	if (vkds) {
973 		struct radv_render_pass_attachment *attachment =
974 			pass->attachments + subpass->depth_stencil_attachment->attachment;
975 		bool has_stencil = vk_format_is_stencil(attachment->format);
976 		struct radv_dsa_order_invariance order_invariance[2];
977 		struct radv_shader_variant *ps =
978 			pipeline->shaders[MESA_SHADER_FRAGMENT];
979 
980 		/* Compute depth/stencil order invariance in order to know if
981 		 * it's safe to enable out-of-order.
982 		 */
983 		bool zfunc_is_ordered =
984 			vkds->depthCompareOp == VK_COMPARE_OP_NEVER ||
985 			vkds->depthCompareOp == VK_COMPARE_OP_LESS ||
986 			vkds->depthCompareOp == VK_COMPARE_OP_LESS_OR_EQUAL ||
987 			vkds->depthCompareOp == VK_COMPARE_OP_GREATER ||
988 			vkds->depthCompareOp == VK_COMPARE_OP_GREATER_OR_EQUAL;
989 
990 		bool nozwrite_and_order_invariant_stencil =
991 			!radv_is_ds_write_enabled(vkds) ||
992 			(!radv_is_depth_write_enabled(vkds) &&
993 			 radv_order_invariant_stencil_state(&vkds->front) &&
994 			 radv_order_invariant_stencil_state(&vkds->back));
995 
996 		order_invariance[1].zs =
997 			nozwrite_and_order_invariant_stencil ||
998 			(!radv_is_stencil_write_enabled(vkds) &&
999 			 zfunc_is_ordered);
1000 		order_invariance[0].zs =
1001 			!radv_is_depth_write_enabled(vkds) || zfunc_is_ordered;
1002 
1003 		order_invariance[1].pass_set =
1004 			nozwrite_and_order_invariant_stencil ||
1005 			(!radv_is_stencil_write_enabled(vkds) &&
1006 			 (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
1007 			  vkds->depthCompareOp == VK_COMPARE_OP_NEVER));
1008 		order_invariance[0].pass_set =
1009 			!radv_is_depth_write_enabled(vkds) ||
1010 			(vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
1011 			 vkds->depthCompareOp == VK_COMPARE_OP_NEVER);
1012 
1013 		dsa_order_invariant = order_invariance[has_stencil];
1014 		if (!dsa_order_invariant.zs)
1015 			return false;
1016 
1017 		/* The set of PS invocations is always order invariant,
1018 		 * except when early Z/S tests are requested.
1019 		 */
1020 		if (ps &&
1021 		    ps->info.ps.writes_memory &&
1022 		    ps->info.ps.early_fragment_test &&
1023 		    !dsa_order_invariant.pass_set)
1024 			return false;
1025 
1026 		/* Determine if out-of-order rasterization should be disabled
1027 		 * when occlusion queries are used.
1028 		 */
1029 		pipeline->graphics.disable_out_of_order_rast_for_occlusion =
1030 			!dsa_order_invariant.pass_set;
1031 	}
1032 
1033 	/* No color buffers are enabled for writing. */
1034 	if (!colormask)
1035 		return true;
1036 
1037 	unsigned blendmask = colormask & blend->blend_enable_4bit;
1038 
1039 	if (blendmask) {
1040 		/* Only commutative blending. */
1041 		if (blendmask & ~blend->commutative_4bit)
1042 			return false;
1043 
1044 		if (!dsa_order_invariant.pass_set)
1045 			return false;
1046 	}
1047 
1048 	if (colormask & ~blendmask)
1049 		return false;
1050 
1051 	return true;
1052 }
1053 
1054 static const VkConservativeRasterizationModeEXT
radv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo * pCreateInfo)1055 radv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo *pCreateInfo)
1056 {
1057 	const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
1058 		vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
1059 
1060 	if (!conservative_raster)
1061 		return VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
1062 	return conservative_raster->conservativeRasterizationMode;
1063 }
1064 
1065 static void
radv_pipeline_init_multisample_state(struct radv_pipeline * pipeline,struct radv_blend_state * blend,const VkGraphicsPipelineCreateInfo * pCreateInfo)1066 radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
1067 				     struct radv_blend_state *blend,
1068 				     const VkGraphicsPipelineCreateInfo *pCreateInfo)
1069 {
1070 	const VkPipelineMultisampleStateCreateInfo *vkms = radv_pipeline_get_multisample_state(pCreateInfo);
1071 	struct radv_multisample_state *ms = &pipeline->graphics.ms;
1072 	unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes;
1073 	const VkConservativeRasterizationModeEXT mode =
1074 		radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState);
1075 	bool out_of_order_rast = false;
1076 	int ps_iter_samples = 1;
1077 	uint32_t mask = 0xffff;
1078 
1079 	if (vkms) {
1080 		ms->num_samples = vkms->rasterizationSamples;
1081 
1082 		/* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
1083 		 *
1084 		 * "Sample shading is enabled for a graphics pipeline:
1085 		 *
1086 		 * - If the interface of the fragment shader entry point of the
1087 		 *   graphics pipeline includes an input variable decorated
1088 		 *   with SampleId or SamplePosition. In this case
1089 		 *   minSampleShadingFactor takes the value 1.0.
1090 		 * - Else if the sampleShadingEnable member of the
1091 		 *   VkPipelineMultisampleStateCreateInfo structure specified
1092 		 *   when creating the graphics pipeline is set to VK_TRUE. In
1093 		 *   this case minSampleShadingFactor takes the value of
1094 		 *   VkPipelineMultisampleStateCreateInfo::minSampleShading.
1095 		 *
1096 		 * Otherwise, sample shading is considered disabled."
1097 		 */
1098 		if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
1099 			ps_iter_samples = ms->num_samples;
1100 		} else {
1101 			ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
1102 		}
1103 	} else {
1104 		ms->num_samples = 1;
1105 	}
1106 
1107 	const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
1108 		vk_find_struct_const(pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
1109 	if (raster_order && raster_order->rasterizationOrder == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
1110 		/* Out-of-order rasterization is explicitly enabled by the
1111 		 * application.
1112 		 */
1113 		out_of_order_rast = true;
1114 	} else {
1115 		/* Determine if the driver can enable out-of-order
1116 		 * rasterization internally.
1117 		 */
1118 		out_of_order_rast =
1119 			radv_pipeline_out_of_order_rast(pipeline, blend, pCreateInfo);
1120 	}
1121 
1122 	ms->pa_sc_aa_config = 0;
1123 	ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
1124 		      S_028804_INCOHERENT_EQAA_READS(1) |
1125 		      S_028804_INTERPOLATE_COMP_Z(1) |
1126 		      S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
1127 
1128 	/* Adjust MSAA state if conservative rasterization is enabled. */
1129 	if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
1130 		ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
1131 
1132 		ms->db_eqaa |= S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) |
1133 			       S_028804_OVERRASTERIZATION_AMOUNT(4);
1134 	}
1135 
1136 	ms->pa_sc_mode_cntl_1 =
1137 		S_028A4C_WALK_FENCE_ENABLE(1) | //TODO linear dst fixes
1138 		S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
1139 		S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
1140 		S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
1141 		/* always 1: */
1142 		S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
1143 		S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
1144 		S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
1145 		S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
1146 		S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
1147 		S_028A4C_FORCE_EOV_REZ_ENABLE(1);
1148 	ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(pipeline->device->physical_device->rad_info.chip_class >= GFX9) |
1149 	                        S_028A48_VPORT_SCISSOR_ENABLE(1);
1150 
1151 	const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line =
1152 		vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
1153 				     PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1154 	if (rast_line) {
1155 		ms->pa_sc_mode_cntl_0 |= S_028A48_LINE_STIPPLE_ENABLE(rast_line->stippledLineEnable);
1156 		if (rast_line->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
1157 			/* From the Vulkan spec 1.1.129:
1158 			 *
1159 			 * "When VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT lines
1160 			 *  are being rasterized, sample locations may all be
1161 			 *  treated as being at the pixel center (this may
1162 			 *  affect attribute and depth interpolation)."
1163 			 */
1164 			ms->num_samples = 1;
1165 		}
1166 	}
1167 
1168 	if (ms->num_samples > 1) {
1169 		RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1170 		struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1171 		uint32_t z_samples = subpass->depth_stencil_attachment ? subpass->depth_sample_count : ms->num_samples;
1172 		unsigned log_samples = util_logbase2(ms->num_samples);
1173 		unsigned log_z_samples = util_logbase2(z_samples);
1174 		unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
1175 		ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
1176 		ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
1177 			S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
1178 			S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
1179 			S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
1180 		ms->pa_sc_aa_config |= S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
1181 			S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
1182 			S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
1183 			S_028BE0_COVERED_CENTROID_IS_CENTER_GFX103(pipeline->device->physical_device->rad_info.chip_class >= GFX10_3);
1184 		ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
1185 		if (ps_iter_samples > 1)
1186 			pipeline->graphics.spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
1187 	}
1188 
1189 	if (vkms && vkms->pSampleMask) {
1190 		mask = vkms->pSampleMask[0] & 0xffff;
1191 	}
1192 
1193 	ms->pa_sc_aa_mask[0] = mask | (mask << 16);
1194 	ms->pa_sc_aa_mask[1] = mask | (mask << 16);
1195 }
1196 
1197 static bool
radv_prim_can_use_guardband(enum VkPrimitiveTopology topology)1198 radv_prim_can_use_guardband(enum VkPrimitiveTopology topology)
1199 {
1200 	switch (topology) {
1201 	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1202 	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1203 	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1204 	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1205 	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1206 		return false;
1207 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1208 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1209 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1210 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1211 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1212 	case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1213 		return true;
1214 	default:
1215 		unreachable("unhandled primitive type");
1216 	}
1217 }
1218 
1219 static uint32_t
si_conv_gl_prim_to_gs_out(unsigned gl_prim)1220 si_conv_gl_prim_to_gs_out(unsigned gl_prim)
1221 {
1222 	switch (gl_prim) {
1223 	case 0: /* GL_POINTS */
1224 		return V_028A6C_OUTPRIM_TYPE_POINTLIST;
1225 	case 1: /* GL_LINES */
1226 	case 3: /* GL_LINE_STRIP */
1227 	case 0xA: /* GL_LINE_STRIP_ADJACENCY_ARB */
1228 	case 0x8E7A: /* GL_ISOLINES */
1229 		return V_028A6C_OUTPRIM_TYPE_LINESTRIP;
1230 
1231 	case 4: /* GL_TRIANGLES */
1232 	case 0xc: /* GL_TRIANGLES_ADJACENCY_ARB */
1233 	case 5: /* GL_TRIANGLE_STRIP */
1234 	case 7: /* GL_QUADS */
1235 		return V_028A6C_OUTPRIM_TYPE_TRISTRIP;
1236 	default:
1237 		assert(0);
1238 		return 0;
1239 	}
1240 }
1241 
1242 static uint32_t
si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)1243 si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)
1244 {
1245 	switch (topology) {
1246 	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1247 	case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1248 		return V_028A6C_OUTPRIM_TYPE_POINTLIST;
1249 	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1250 	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1251 	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1252 	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1253 		return V_028A6C_OUTPRIM_TYPE_LINESTRIP;
1254 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1255 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1256 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1257 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1258 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1259 		return V_028A6C_OUTPRIM_TYPE_TRISTRIP;
1260 	default:
1261 		assert(0);
1262 		return 0;
1263 	}
1264 }
1265 
radv_dynamic_state_mask(VkDynamicState state)1266 static unsigned radv_dynamic_state_mask(VkDynamicState state)
1267 {
1268 	switch(state) {
1269 	case VK_DYNAMIC_STATE_VIEWPORT:
1270 	case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
1271 		return RADV_DYNAMIC_VIEWPORT;
1272 	case VK_DYNAMIC_STATE_SCISSOR:
1273 	case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
1274 		return RADV_DYNAMIC_SCISSOR;
1275 	case VK_DYNAMIC_STATE_LINE_WIDTH:
1276 		return RADV_DYNAMIC_LINE_WIDTH;
1277 	case VK_DYNAMIC_STATE_DEPTH_BIAS:
1278 		return RADV_DYNAMIC_DEPTH_BIAS;
1279 	case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
1280 		return RADV_DYNAMIC_BLEND_CONSTANTS;
1281 	case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
1282 		return RADV_DYNAMIC_DEPTH_BOUNDS;
1283 	case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
1284 		return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
1285 	case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
1286 		return RADV_DYNAMIC_STENCIL_WRITE_MASK;
1287 	case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
1288 		return RADV_DYNAMIC_STENCIL_REFERENCE;
1289 	case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
1290 		return RADV_DYNAMIC_DISCARD_RECTANGLE;
1291 	case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
1292 		return RADV_DYNAMIC_SAMPLE_LOCATIONS;
1293 	case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
1294 		return RADV_DYNAMIC_LINE_STIPPLE;
1295 	case VK_DYNAMIC_STATE_CULL_MODE_EXT:
1296 		return RADV_DYNAMIC_CULL_MODE;
1297 	case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
1298 		return RADV_DYNAMIC_FRONT_FACE;
1299 	case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
1300 		return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
1301 	case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
1302 		return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
1303 	case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
1304 		return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
1305 	case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
1306 		return RADV_DYNAMIC_DEPTH_COMPARE_OP;
1307 	case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
1308 		return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
1309 	case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
1310 		return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
1311 	case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
1312 		return RADV_DYNAMIC_STENCIL_OP;
1313 	case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
1314 		return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
1315 	default:
1316 		unreachable("Unhandled dynamic state");
1317 	}
1318 }
1319 
radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo * pCreateInfo)1320 static uint32_t radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
1321 {
1322 	uint32_t states = RADV_DYNAMIC_ALL;
1323 
1324 	/* If rasterization is disabled we do not care about any of the
1325 	 * dynamic states, since they are all rasterization related only,
1326 	 * except primitive topology and vertex binding stride.
1327 	 */
1328 	if (pCreateInfo->pRasterizationState->rasterizerDiscardEnable)
1329 		return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY |
1330 		       RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
1331 
1332 	if (!pCreateInfo->pRasterizationState->depthBiasEnable)
1333 		states &= ~RADV_DYNAMIC_DEPTH_BIAS;
1334 
1335 	if (!pCreateInfo->pDepthStencilState ||
1336 	    !pCreateInfo->pDepthStencilState->depthBoundsTestEnable)
1337 		states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;
1338 
1339 	if (!pCreateInfo->pDepthStencilState ||
1340 	    !pCreateInfo->pDepthStencilState->stencilTestEnable)
1341 		states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK |
1342 		            RADV_DYNAMIC_STENCIL_WRITE_MASK |
1343 		            RADV_DYNAMIC_STENCIL_REFERENCE);
1344 
1345 	if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT))
1346 		states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;
1347 
1348 	if (!pCreateInfo->pMultisampleState ||
1349 	    !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
1350 				  PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT))
1351 		states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;
1352 
1353 	if (!pCreateInfo->pRasterizationState ||
1354 	    !vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
1355 				  PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT))
1356 		states &= ~RADV_DYNAMIC_LINE_STIPPLE;
1357 
1358 	/* TODO: blend constants & line width. */
1359 
1360 	return states;
1361 }
1362 
1363 static struct radv_ia_multi_vgt_param_helpers
radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline * pipeline)1364 radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline)
1365 {
1366 	struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
1367 	const struct radv_device *device = pipeline->device;
1368 
1369 	if (radv_pipeline_has_tess(pipeline))
1370 		ia_multi_vgt_param.primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
1371 	else if (radv_pipeline_has_gs(pipeline))
1372 		ia_multi_vgt_param.primgroup_size = 64;
1373 	else
1374 		ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */
1375 
1376 	/* GS requirement. */
1377 	ia_multi_vgt_param.partial_es_wave = false;
1378 	if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= GFX8)
1379 		if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
1380 			ia_multi_vgt_param.partial_es_wave = true;
1381 
1382 	ia_multi_vgt_param.ia_switch_on_eoi = false;
1383 	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
1384 		ia_multi_vgt_param.ia_switch_on_eoi = true;
1385 	if (radv_pipeline_has_gs(pipeline) &&
1386 	    pipeline->shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
1387 		ia_multi_vgt_param.ia_switch_on_eoi = true;
1388 	if (radv_pipeline_has_tess(pipeline)) {
1389 		/* SWITCH_ON_EOI must be set if PrimID is used. */
1390 		if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
1391 		    radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
1392 			ia_multi_vgt_param.ia_switch_on_eoi = true;
1393 	}
1394 
1395 	ia_multi_vgt_param.partial_vs_wave = false;
1396 	if (radv_pipeline_has_tess(pipeline)) {
1397 		/* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
1398 		if ((device->physical_device->rad_info.family == CHIP_TAHITI ||
1399 		     device->physical_device->rad_info.family == CHIP_PITCAIRN ||
1400 		     device->physical_device->rad_info.family == CHIP_BONAIRE) &&
1401 		    radv_pipeline_has_gs(pipeline))
1402 			ia_multi_vgt_param.partial_vs_wave = true;
1403 		/* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
1404 		if (device->physical_device->rad_info.has_distributed_tess) {
1405 			if (radv_pipeline_has_gs(pipeline)) {
1406 				if (device->physical_device->rad_info.chip_class <= GFX8)
1407 					ia_multi_vgt_param.partial_es_wave = true;
1408 			} else {
1409 				ia_multi_vgt_param.partial_vs_wave = true;
1410 			}
1411 		}
1412 	}
1413 
1414 	if (radv_pipeline_has_gs(pipeline)) {
1415 		/* On these chips there is the possibility of a hang if the
1416 		 * pipeline uses a GS and partial_vs_wave is not set.
1417 		 *
1418 		 * This mostly does not hit 4-SE chips, as those typically set
1419 		 * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
1420 		 * with GS due to another workaround.
1421 		 *
1422 		 * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
1423 		 */
1424 		if (device->physical_device->rad_info.family == CHIP_TONGA ||
1425 		    device->physical_device->rad_info.family == CHIP_FIJI ||
1426 		    device->physical_device->rad_info.family == CHIP_POLARIS10 ||
1427 		    device->physical_device->rad_info.family == CHIP_POLARIS11 ||
1428 		    device->physical_device->rad_info.family == CHIP_POLARIS12 ||
1429 	            device->physical_device->rad_info.family == CHIP_VEGAM) {
1430 			ia_multi_vgt_param.partial_vs_wave = true;
1431 		}
1432 	}
1433 
1434 	ia_multi_vgt_param.base =
1435 		S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
1436 		/* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
1437 		S_028AA8_MAX_PRIMGRP_IN_WAVE(device->physical_device->rad_info.chip_class == GFX8 ? 2 : 0) |
1438 		S_030960_EN_INST_OPT_BASIC(device->physical_device->rad_info.chip_class >= GFX9) |
1439 		S_030960_EN_INST_OPT_ADV(device->physical_device->rad_info.chip_class >= GFX9);
1440 
1441 	return ia_multi_vgt_param;
1442 }
1443 
1444 static void
radv_pipeline_init_input_assembly_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra)1445 radv_pipeline_init_input_assembly_state(struct radv_pipeline *pipeline,
1446 					const VkGraphicsPipelineCreateInfo *pCreateInfo,
1447 					const struct radv_graphics_pipeline_create_info *extra)
1448 {
1449 	const VkPipelineInputAssemblyStateCreateInfo *ia_state = pCreateInfo->pInputAssemblyState;
1450 	struct radv_shader_variant *tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
1451 	struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
1452 
1453 	pipeline->graphics.prim_restart_enable = !!ia_state->primitiveRestartEnable;
1454 	pipeline->graphics.can_use_guardband = radv_prim_can_use_guardband(ia_state->topology);
1455 
1456 	if (radv_pipeline_has_gs(pipeline)) {
1457 		if (si_conv_gl_prim_to_gs_out(gs->info.gs.output_prim) == V_028A6C_OUTPRIM_TYPE_TRISTRIP)
1458 			pipeline->graphics.can_use_guardband = true;
1459 	} else if (radv_pipeline_has_tess(pipeline)) {
1460 		if (!tes->info.tes.point_mode &&
1461 		    si_conv_gl_prim_to_gs_out(tes->info.tes.primitive_mode) == V_028A6C_OUTPRIM_TYPE_TRISTRIP)
1462 			pipeline->graphics.can_use_guardband = true;
1463 	}
1464 
1465 	if (extra && extra->use_rectlist) {
1466 		pipeline->graphics.can_use_guardband = true;
1467 	}
1468 
1469 	pipeline->graphics.ia_multi_vgt_param =
1470 		radv_compute_ia_multi_vgt_param_helpers(pipeline);
1471 }
1472 
1473 static void
radv_pipeline_init_dynamic_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra)1474 radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
1475 				 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1476 				 const struct radv_graphics_pipeline_create_info *extra)
1477 {
1478 	uint32_t needed_states = radv_pipeline_needed_dynamic_state(pCreateInfo);
1479 	uint32_t states = needed_states;
1480 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1481 	struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1482 
1483 	pipeline->dynamic_state = default_dynamic_state;
1484 	pipeline->graphics.needed_dynamic_state = needed_states;
1485 
1486 	if (pCreateInfo->pDynamicState) {
1487 		/* Remove all of the states that are marked as dynamic */
1488 		uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
1489 		for (uint32_t s = 0; s < count; s++)
1490 			states &= ~radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
1491 	}
1492 
1493 	struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
1494 
1495 	if (needed_states & RADV_DYNAMIC_VIEWPORT) {
1496 		assert(pCreateInfo->pViewportState);
1497 
1498 		dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
1499 		if (states & RADV_DYNAMIC_VIEWPORT) {
1500 			typed_memcpy(dynamic->viewport.viewports,
1501 			             pCreateInfo->pViewportState->pViewports,
1502 			             pCreateInfo->pViewportState->viewportCount);
1503 		}
1504 	}
1505 
1506 	if (needed_states & RADV_DYNAMIC_SCISSOR) {
1507 		dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
1508 		if (states & RADV_DYNAMIC_SCISSOR) {
1509 			typed_memcpy(dynamic->scissor.scissors,
1510 			             pCreateInfo->pViewportState->pScissors,
1511 			             pCreateInfo->pViewportState->scissorCount);
1512 		}
1513 	}
1514 
1515 	if (states & RADV_DYNAMIC_LINE_WIDTH) {
1516 		assert(pCreateInfo->pRasterizationState);
1517 		dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
1518 	}
1519 
1520 	if (states & RADV_DYNAMIC_DEPTH_BIAS) {
1521 		assert(pCreateInfo->pRasterizationState);
1522 		dynamic->depth_bias.bias =
1523 			pCreateInfo->pRasterizationState->depthBiasConstantFactor;
1524 		dynamic->depth_bias.clamp =
1525 			pCreateInfo->pRasterizationState->depthBiasClamp;
1526 		dynamic->depth_bias.slope =
1527 			pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
1528 	}
1529 
1530 	/* Section 9.2 of the Vulkan 1.0.15 spec says:
1531 	 *
1532 	 *    pColorBlendState is [...] NULL if the pipeline has rasterization
1533 	 *    disabled or if the subpass of the render pass the pipeline is
1534 	 *    created against does not use any color attachments.
1535 	 */
1536 	if (subpass->has_color_att && states & RADV_DYNAMIC_BLEND_CONSTANTS) {
1537 		assert(pCreateInfo->pColorBlendState);
1538 		typed_memcpy(dynamic->blend_constants,
1539 			     pCreateInfo->pColorBlendState->blendConstants, 4);
1540 	}
1541 
1542 	if (states & RADV_DYNAMIC_CULL_MODE) {
1543 		dynamic->cull_mode =
1544 			pCreateInfo->pRasterizationState->cullMode;
1545 	}
1546 
1547 	if (states & RADV_DYNAMIC_FRONT_FACE) {
1548 		dynamic->front_face =
1549 			pCreateInfo->pRasterizationState->frontFace;
1550 	}
1551 
1552 	if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
1553 		dynamic->primitive_topology =
1554 			si_translate_prim(pCreateInfo->pInputAssemblyState->topology);
1555 		if (extra && extra->use_rectlist) {
1556 			dynamic->primitive_topology = V_008958_DI_PT_RECTLIST;
1557 		}
1558 	}
1559 
1560 	/* If there is no depthstencil attachment, then don't read
1561 	 * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
1562 	 * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
1563 	 * no need to override the depthstencil defaults in
1564 	 * radv_pipeline::dynamic_state when there is no depthstencil attachment.
1565 	 *
1566 	 * Section 9.2 of the Vulkan 1.0.15 spec says:
1567 	 *
1568 	 *    pDepthStencilState is [...] NULL if the pipeline has rasterization
1569 	 *    disabled or if the subpass of the render pass the pipeline is created
1570 	 *    against does not use a depth/stencil attachment.
1571 	 */
1572 	if (needed_states && subpass->depth_stencil_attachment) {
1573 		assert(pCreateInfo->pDepthStencilState);
1574 
1575 		if (states & RADV_DYNAMIC_DEPTH_BOUNDS) {
1576 			dynamic->depth_bounds.min =
1577 				pCreateInfo->pDepthStencilState->minDepthBounds;
1578 			dynamic->depth_bounds.max =
1579 				pCreateInfo->pDepthStencilState->maxDepthBounds;
1580 		}
1581 
1582 		if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
1583 			dynamic->stencil_compare_mask.front =
1584 				pCreateInfo->pDepthStencilState->front.compareMask;
1585 			dynamic->stencil_compare_mask.back =
1586 				pCreateInfo->pDepthStencilState->back.compareMask;
1587 		}
1588 
1589 		if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
1590 			dynamic->stencil_write_mask.front =
1591 				pCreateInfo->pDepthStencilState->front.writeMask;
1592 			dynamic->stencil_write_mask.back =
1593 				pCreateInfo->pDepthStencilState->back.writeMask;
1594 		}
1595 
1596 		if (states & RADV_DYNAMIC_STENCIL_REFERENCE) {
1597 			dynamic->stencil_reference.front =
1598 				pCreateInfo->pDepthStencilState->front.reference;
1599 			dynamic->stencil_reference.back =
1600 				pCreateInfo->pDepthStencilState->back.reference;
1601 		}
1602 
1603 		if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
1604 			dynamic->depth_test_enable =
1605 				pCreateInfo->pDepthStencilState->depthTestEnable;
1606 		}
1607 
1608 		if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
1609 			dynamic->depth_write_enable =
1610 				pCreateInfo->pDepthStencilState->depthWriteEnable;
1611 		}
1612 
1613 		if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
1614 			dynamic->depth_compare_op =
1615 				pCreateInfo->pDepthStencilState->depthCompareOp;
1616 		}
1617 
1618 		if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
1619 			dynamic->depth_bounds_test_enable =
1620 				pCreateInfo->pDepthStencilState->depthBoundsTestEnable;
1621 		}
1622 
1623 		if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
1624 			dynamic->stencil_test_enable =
1625 				pCreateInfo->pDepthStencilState->stencilTestEnable;
1626 		}
1627 
1628 		if (states & RADV_DYNAMIC_STENCIL_OP) {
1629 			dynamic->stencil_op.front.compare_op =
1630 				pCreateInfo->pDepthStencilState->front.compareOp;
1631 			dynamic->stencil_op.front.fail_op =
1632 				pCreateInfo->pDepthStencilState->front.failOp;
1633 			dynamic->stencil_op.front.pass_op =
1634 				pCreateInfo->pDepthStencilState->front.passOp;
1635 			dynamic->stencil_op.front.depth_fail_op =
1636 				pCreateInfo->pDepthStencilState->front.depthFailOp;
1637 
1638 			dynamic->stencil_op.back.compare_op =
1639 				pCreateInfo->pDepthStencilState->back.compareOp;
1640 			dynamic->stencil_op.back.fail_op =
1641 				pCreateInfo->pDepthStencilState->back.failOp;
1642 			dynamic->stencil_op.back.pass_op =
1643 				pCreateInfo->pDepthStencilState->back.passOp;
1644 			dynamic->stencil_op.back.depth_fail_op =
1645 				pCreateInfo->pDepthStencilState->back.depthFailOp;
1646 		}
1647 	}
1648 
1649 	const  VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
1650 			vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
1651 	if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
1652 		dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount;
1653 		if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
1654 			typed_memcpy(dynamic->discard_rectangle.rectangles,
1655 			             discard_rectangle_info->pDiscardRectangles,
1656 			             discard_rectangle_info->discardRectangleCount);
1657 		}
1658 	}
1659 
1660 	if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
1661 		const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
1662 			vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
1663 					     PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
1664 		/* If sampleLocationsEnable is VK_FALSE, the default sample
1665 		 * locations are used and the values specified in
1666 		 * sampleLocationsInfo are ignored.
1667 		 */
1668 		if (sample_location_info->sampleLocationsEnable) {
1669 			const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
1670 				&sample_location_info->sampleLocationsInfo;
1671 
1672 			assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
1673 
1674 			dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
1675 			dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
1676 			dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
1677 			typed_memcpy(&dynamic->sample_location.locations[0],
1678 				     pSampleLocationsInfo->pSampleLocations,
1679 				     pSampleLocationsInfo->sampleLocationsCount);
1680 		}
1681 	}
1682 
1683 	const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info =
1684 		vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
1685 				     PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1686 	if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) {
1687 		dynamic->line_stipple.factor = rast_line_info->lineStippleFactor;
1688 		dynamic->line_stipple.pattern = rast_line_info->lineStipplePattern;
1689 	}
1690 
1691 	if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE))
1692 		pipeline->graphics.uses_dynamic_stride = true;
1693 
1694 	pipeline->dynamic_state.mask = states;
1695 }
1696 
1697 static void
radv_pipeline_init_raster_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)1698 radv_pipeline_init_raster_state(struct radv_pipeline *pipeline,
1699 				const VkGraphicsPipelineCreateInfo *pCreateInfo)
1700 {
1701 	const VkPipelineRasterizationStateCreateInfo *raster_info =
1702 		pCreateInfo->pRasterizationState;
1703 
1704 	pipeline->graphics.pa_su_sc_mode_cntl =
1705 		S_028814_FACE(raster_info->frontFace) |
1706 		S_028814_CULL_FRONT(!!(raster_info->cullMode & VK_CULL_MODE_FRONT_BIT)) |
1707 		S_028814_CULL_BACK(!!(raster_info->cullMode & VK_CULL_MODE_BACK_BIT)) |
1708 		S_028814_POLY_MODE(raster_info->polygonMode != VK_POLYGON_MODE_FILL) |
1709 		S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(raster_info->polygonMode)) |
1710 		S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(raster_info->polygonMode)) |
1711 		S_028814_POLY_OFFSET_FRONT_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1712 		S_028814_POLY_OFFSET_BACK_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1713 		S_028814_POLY_OFFSET_PARA_ENABLE(raster_info->depthBiasEnable ? 1 : 0);
1714 }
1715 
1716 static void
radv_pipeline_init_depth_stencil_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)1717 radv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline,
1718 				       const VkGraphicsPipelineCreateInfo *pCreateInfo)
1719 {
1720 	const VkPipelineDepthStencilStateCreateInfo *ds_info
1721 		= radv_pipeline_get_depth_stencil_state(pCreateInfo);
1722 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1723 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
1724 	struct radv_render_pass_attachment *attachment = NULL;
1725 	uint32_t db_depth_control = 0;
1726 
1727 	if (subpass->depth_stencil_attachment)
1728 		attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
1729 
1730 	bool has_depth_attachment = attachment && vk_format_is_depth(attachment->format);
1731 	bool has_stencil_attachment = attachment && vk_format_is_stencil(attachment->format);
1732 
1733 	if (ds_info) {
1734 		if (has_depth_attachment) {
1735 			db_depth_control = S_028800_Z_ENABLE(ds_info->depthTestEnable ? 1 : 0) |
1736 			                   S_028800_Z_WRITE_ENABLE(ds_info->depthWriteEnable ? 1 : 0) |
1737 			                   S_028800_ZFUNC(ds_info->depthCompareOp) |
1738 			                   S_028800_DEPTH_BOUNDS_ENABLE(ds_info->depthBoundsTestEnable ? 1 : 0);
1739 		}
1740 
1741 		if (has_stencil_attachment && ds_info->stencilTestEnable) {
1742 			db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
1743 			db_depth_control |= S_028800_STENCILFUNC(ds_info->front.compareOp);
1744 			db_depth_control |= S_028800_STENCILFUNC_BF(ds_info->back.compareOp);
1745 		}
1746 	}
1747 
1748 	pipeline->graphics.db_depth_control = db_depth_control;
1749 }
1750 
1751 static void
gfx9_get_gs_info(const struct radv_pipeline_key * key,const struct radv_pipeline * pipeline,nir_shader ** nir,struct radv_shader_info * infos,struct gfx9_gs_info * out)1752 gfx9_get_gs_info(const struct radv_pipeline_key *key,
1753                  const struct radv_pipeline *pipeline,
1754 		 nir_shader **nir,
1755 		 struct radv_shader_info *infos,
1756 		 struct gfx9_gs_info *out)
1757 {
1758 	struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
1759 	struct radv_es_output_info *es_info;
1760 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
1761 		es_info = nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info;
1762 	else
1763 		es_info = nir[MESA_SHADER_TESS_CTRL] ?
1764                        &infos[MESA_SHADER_TESS_EVAL].tes.es_info :
1765                        &infos[MESA_SHADER_VERTEX].vs.es_info;
1766 
1767 	unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
1768 	bool uses_adjacency;
1769 	switch(key->topology) {
1770 	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1771 	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1772 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1773 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1774 		uses_adjacency = true;
1775 		break;
1776 	default:
1777 		uses_adjacency = false;
1778 		break;
1779 	}
1780 
1781 	/* All these are in dwords: */
1782 	/* We can't allow using the whole LDS, because GS waves compete with
1783 	 * other shader stages for LDS space. */
1784 	const unsigned max_lds_size = 8 * 1024;
1785 	const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
1786 	unsigned esgs_lds_size;
1787 
1788 	/* All these are per subgroup: */
1789 	const unsigned max_out_prims = 32 * 1024;
1790 	const unsigned max_es_verts = 255;
1791 	const unsigned ideal_gs_prims = 64;
1792 	unsigned max_gs_prims, gs_prims;
1793 	unsigned min_es_verts, es_verts, worst_case_es_verts;
1794 
1795 	if (uses_adjacency || gs_num_invocations > 1)
1796 		max_gs_prims = 127 / gs_num_invocations;
1797 	else
1798 		max_gs_prims = 255;
1799 
1800 	/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
1801 	 * Make sure we don't go over the maximum value.
1802 	 */
1803 	if (gs_info->gs.vertices_out > 0) {
1804 		max_gs_prims = MIN2(max_gs_prims,
1805 				    max_out_prims /
1806 				    (gs_info->gs.vertices_out * gs_num_invocations));
1807 	}
1808 	assert(max_gs_prims > 0);
1809 
1810 	/* If the primitive has adjacency, halve the number of vertices
1811 	 * that will be reused in multiple primitives.
1812 	 */
1813 	min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
1814 
1815 	gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
1816 	worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
1817 
1818 	/* Compute ESGS LDS size based on the worst case number of ES vertices
1819 	 * needed to create the target number of GS prims per subgroup.
1820 	 */
1821 	esgs_lds_size = esgs_itemsize * worst_case_es_verts;
1822 
1823 	/* If total LDS usage is too big, refactor partitions based on ratio
1824 	 * of ESGS item sizes.
1825 	 */
1826 	if (esgs_lds_size > max_lds_size) {
1827 		/* Our target GS Prims Per Subgroup was too large. Calculate
1828 		 * the maximum number of GS Prims Per Subgroup that will fit
1829 		 * into LDS, capped by the maximum that the hardware can support.
1830 		 */
1831 		gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
1832 				max_gs_prims);
1833 		assert(gs_prims > 0);
1834 		worst_case_es_verts = MIN2(min_es_verts * gs_prims,
1835 					   max_es_verts);
1836 
1837 		esgs_lds_size = esgs_itemsize * worst_case_es_verts;
1838 		assert(esgs_lds_size <= max_lds_size);
1839 	}
1840 
1841 	/* Now calculate remaining ESGS information. */
1842 	if (esgs_lds_size)
1843 		es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
1844 	else
1845 		es_verts = max_es_verts;
1846 
1847 	/* Vertices for adjacency primitives are not always reused, so restore
1848 	 * it for ES_VERTS_PER_SUBGRP.
1849 	 */
1850 	min_es_verts = gs_info->gs.vertices_in;
1851 
1852 	/* For normal primitives, the VGT only checks if they are past the ES
1853 	 * verts per subgroup after allocating a full GS primitive and if they
1854 	 * are, kick off a new subgroup.  But if those additional ES verts are
1855 	 * unique (e.g. not reused) we need to make sure there is enough LDS
1856 	 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
1857 	 */
1858 	es_verts -= min_es_verts - 1;
1859 
1860 	uint32_t es_verts_per_subgroup = es_verts;
1861 	uint32_t gs_prims_per_subgroup = gs_prims;
1862 	uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
1863 	uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
1864 	out->lds_size = align(esgs_lds_size, 128) / 128;
1865 	out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
1866 	                        S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
1867 	                        S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
1868 	out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
1869 	out->vgt_esgs_ring_itemsize  = esgs_itemsize;
1870 	assert(max_prims_per_subgroup <= max_out_prims);
1871 }
1872 
clamp_gsprims_to_esverts(unsigned * max_gsprims,unsigned max_esverts,unsigned min_verts_per_prim,bool use_adjacency)1873 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
1874 				     unsigned min_verts_per_prim, bool use_adjacency)
1875 {
1876 	unsigned max_reuse = max_esverts - min_verts_per_prim;
1877 	if (use_adjacency)
1878 		max_reuse /= 2;
1879 	*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
1880 }
1881 
1882 static unsigned
radv_get_num_input_vertices(nir_shader ** nir)1883 radv_get_num_input_vertices(nir_shader **nir)
1884 {
1885 	if (nir[MESA_SHADER_GEOMETRY]) {
1886 		nir_shader *gs = nir[MESA_SHADER_GEOMETRY];
1887 
1888 		return gs->info.gs.vertices_in;
1889 	}
1890 
1891 	if (nir[MESA_SHADER_TESS_CTRL]) {
1892 		nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
1893 
1894 		if (tes->info.tess.point_mode)
1895 			return 1;
1896 		if (tes->info.tess.primitive_mode == GL_ISOLINES)
1897 			return 2;
1898 		return 3;
1899 	}
1900 
1901 	return 3;
1902 }
1903 
1904 static void
gfx10_get_ngg_info(const struct radv_pipeline_key * key,struct radv_pipeline * pipeline,nir_shader ** nir,struct radv_shader_info * infos,struct gfx10_ngg_info * ngg)1905 gfx10_get_ngg_info(const struct radv_pipeline_key *key,
1906 		   struct radv_pipeline *pipeline,
1907 		   nir_shader **nir,
1908 		   struct radv_shader_info *infos,
1909 		   struct gfx10_ngg_info *ngg)
1910 {
1911 	struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
1912 	struct radv_es_output_info *es_info =
1913 		nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info;
1914 	unsigned gs_type = nir[MESA_SHADER_GEOMETRY] ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX;
1915 	unsigned max_verts_per_prim = radv_get_num_input_vertices(nir);
1916 	unsigned min_verts_per_prim =
1917 		gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
1918 	unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? MAX2(gs_info->gs.invocations, 1) : 1;
1919 	bool uses_adjacency;
1920 	switch(key->topology) {
1921 	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1922 	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1923 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1924 	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1925 		uses_adjacency = true;
1926 		break;
1927 	default:
1928 		uses_adjacency = false;
1929 		break;
1930 	}
1931 
1932 	/* All these are in dwords: */
1933 	/* We can't allow using the whole LDS, because GS waves compete with
1934 	 * other shader stages for LDS space.
1935 	 *
1936 	 * TODO: We should really take the shader's internal LDS use into
1937 	 *       account. The linker will fail if the size is greater than
1938 	 *       8K dwords.
1939 	 */
1940 	const unsigned max_lds_size = 8 * 1024 - 768;
1941 	const unsigned target_lds_size = max_lds_size;
1942 	unsigned esvert_lds_size = 0;
1943 	unsigned gsprim_lds_size = 0;
1944 
1945 	/* All these are per subgroup: */
1946 	const unsigned min_esverts = pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 29 : 24;
1947 	bool max_vert_out_per_gs_instance = false;
1948 	unsigned max_esverts_base = 256;
1949 	unsigned max_gsprims_base = 128; /* default prim group size clamp */
1950 
1951 	/* Hardware has the following non-natural restrictions on the value
1952 	 * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
1953 	 * the draw:
1954 	 *  - at most 252 for any line input primitive type
1955 	 *  - at most 251 for any quad input primitive type
1956 	 *  - at most 251 for triangle strips with adjacency (this happens to
1957 	 *    be the natural limit for triangle *lists* with adjacency)
1958 	 */
1959 	max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
1960 
1961 	if (gs_type == MESA_SHADER_GEOMETRY) {
1962 		unsigned max_out_verts_per_gsprim =
1963 			gs_info->gs.vertices_out * gs_num_invocations;
1964 
1965 		if (max_out_verts_per_gsprim <= 256) {
1966 			if (max_out_verts_per_gsprim) {
1967 				max_gsprims_base = MIN2(max_gsprims_base,
1968 							256 / max_out_verts_per_gsprim);
1969 			}
1970 		} else {
1971 			/* Use special multi-cycling mode in which each GS
1972 			 * instance gets its own subgroup. Does not work with
1973 			 * tessellation. */
1974 			max_vert_out_per_gs_instance = true;
1975 			max_gsprims_base = 1;
1976 			max_out_verts_per_gsprim = gs_info->gs.vertices_out;
1977 		}
1978 
1979 		esvert_lds_size = es_info->esgs_itemsize / 4;
1980 		gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
1981 	} else {
1982 		/* VS and TES. */
1983 		/* LDS size for passing data from GS to ES. */
1984 		struct radv_streamout_info *so_info = nir[MESA_SHADER_TESS_CTRL]
1985 			? &infos[MESA_SHADER_TESS_EVAL].so
1986 			: &infos[MESA_SHADER_VERTEX].so;
1987 
1988 		if (so_info->num_outputs)
1989 			esvert_lds_size = 4 * so_info->num_outputs + 1;
1990 
1991 		/* GS stores Primitive IDs (one DWORD) into LDS at the address
1992 		 * corresponding to the ES thread of the provoking vertex. All
1993 		 * ES threads load and export PrimitiveID for their thread.
1994 		 */
1995 		if (!nir[MESA_SHADER_TESS_CTRL] &&
1996 		    infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id)
1997 			esvert_lds_size = MAX2(esvert_lds_size, 1);
1998 	}
1999 
2000 	unsigned max_gsprims = max_gsprims_base;
2001 	unsigned max_esverts = max_esverts_base;
2002 
2003 	if (esvert_lds_size)
2004 		max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2005 	if (gsprim_lds_size)
2006 		max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2007 
2008 	max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2009 	clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2010 	assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2011 
2012 	if (esvert_lds_size || gsprim_lds_size) {
2013 		/* Now that we have a rough proportionality between esverts
2014 		 * and gsprims based on the primitive type, scale both of them
2015 		 * down simultaneously based on required LDS space.
2016 		 *
2017 		 * We could be smarter about this if we knew how much vertex
2018 		 * reuse to expect.
2019 		 */
2020 		unsigned lds_total = max_esverts * esvert_lds_size +
2021 				     max_gsprims * gsprim_lds_size;
2022 		if (lds_total > target_lds_size) {
2023 			max_esverts = max_esverts * target_lds_size / lds_total;
2024 			max_gsprims = max_gsprims * target_lds_size / lds_total;
2025 
2026 			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2027 			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
2028 						 min_verts_per_prim, uses_adjacency);
2029 			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2030 		}
2031 	}
2032 
2033 	/* Round up towards full wave sizes for better ALU utilization. */
2034 	if (!max_vert_out_per_gs_instance) {
2035 		unsigned orig_max_esverts;
2036 		unsigned orig_max_gsprims;
2037 		unsigned wavesize;
2038 
2039 		if (gs_type == MESA_SHADER_GEOMETRY) {
2040 			wavesize = gs_info->wave_size;
2041 		} else {
2042 			wavesize = nir[MESA_SHADER_TESS_CTRL]
2043 				? infos[MESA_SHADER_TESS_EVAL].wave_size
2044 				: infos[MESA_SHADER_VERTEX].wave_size;
2045 		}
2046 
2047 		do {
2048 			orig_max_esverts = max_esverts;
2049 			orig_max_gsprims = max_gsprims;
2050 
2051 			max_esverts = align(max_esverts, wavesize);
2052 			max_esverts = MIN2(max_esverts, max_esverts_base);
2053 			if (esvert_lds_size)
2054 				max_esverts = MIN2(max_esverts,
2055 						   (max_lds_size - max_gsprims * gsprim_lds_size) /
2056 						   esvert_lds_size);
2057 			max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2058 
2059 			max_gsprims = align(max_gsprims, wavesize);
2060 			max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2061 			if (gsprim_lds_size)
2062 				max_gsprims = MIN2(max_gsprims,
2063 						   (max_lds_size - max_esverts * esvert_lds_size) /
2064 						   gsprim_lds_size);
2065 			clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
2066 						 min_verts_per_prim, uses_adjacency);
2067 			assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2068 		} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2069 	}
2070 
2071 	/* Hardware restriction: minimum value of max_esverts */
2072 	max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2073 
2074 	unsigned max_out_vertices =
2075 		max_vert_out_per_gs_instance ? gs_info->gs.vertices_out :
2076 		gs_type == MESA_SHADER_GEOMETRY ?
2077 		max_gsprims * gs_num_invocations * gs_info->gs.vertices_out :
2078 		max_esverts;
2079 	assert(max_out_vertices <= 256);
2080 
2081 	unsigned prim_amp_factor = 1;
2082 	if (gs_type == MESA_SHADER_GEOMETRY) {
2083 		/* Number of output primitives per GS input primitive after
2084 		 * GS instancing. */
2085 		prim_amp_factor = gs_info->gs.vertices_out;
2086 	}
2087 
2088 	/* The GE only checks against the maximum number of ES verts after
2089 	 * allocating a full GS primitive. So we need to ensure that whenever
2090 	 * this check passes, there is enough space for a full primitive without
2091 	 * vertex reuse.
2092 	 */
2093 	ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2094 	ngg->max_gsprims = max_gsprims;
2095 	ngg->max_out_verts = max_out_vertices;
2096 	ngg->prim_amp_factor = prim_amp_factor;
2097 	ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2098 	ngg->ngg_emit_size = max_gsprims * gsprim_lds_size;
2099 	ngg->esgs_ring_size = 4 * max_esverts * esvert_lds_size;
2100 
2101 	if (gs_type == MESA_SHADER_GEOMETRY) {
2102 		ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
2103 	} else {
2104 		ngg->vgt_esgs_ring_itemsize = 1;
2105 	}
2106 
2107 	pipeline->graphics.esgs_ring_size = ngg->esgs_ring_size;
2108 
2109 	assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */
2110 }
2111 
2112 static void
radv_pipeline_init_gs_ring_state(struct radv_pipeline * pipeline,const struct gfx9_gs_info * gs)2113 radv_pipeline_init_gs_ring_state(struct radv_pipeline *pipeline,
2114 				 const struct gfx9_gs_info *gs)
2115 {
2116 	struct radv_device *device = pipeline->device;
2117 	unsigned num_se = device->physical_device->rad_info.max_se;
2118 	unsigned wave_size = 64;
2119 	unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
2120 	/* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
2121 	 * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
2122 	 */
2123 	unsigned gs_vertex_reuse =
2124 		(device->physical_device->rad_info.chip_class >= GFX8 ? 32 : 16) * num_se;
2125 	unsigned alignment = 256 * num_se;
2126 	/* The maximum size is 63.999 MB per SE. */
2127 	unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
2128 	struct radv_shader_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
2129 
2130 	/* Calculate the minimum size. */
2131 	unsigned min_esgs_ring_size = align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse *
2132 					    wave_size, alignment);
2133 	/* These are recommended sizes, not minimum sizes. */
2134 	unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
2135 		gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in;
2136 	unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
2137 		gs_info->gs.max_gsvs_emit_size;
2138 
2139 	min_esgs_ring_size = align(min_esgs_ring_size, alignment);
2140 	esgs_ring_size = align(esgs_ring_size, alignment);
2141 	gsvs_ring_size = align(gsvs_ring_size, alignment);
2142 
2143 	if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
2144 		pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
2145 
2146 	pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
2147 }
2148 
2149 struct radv_shader_variant *
radv_get_shader(const struct radv_pipeline * pipeline,gl_shader_stage stage)2150 radv_get_shader(const struct radv_pipeline *pipeline,
2151 		gl_shader_stage stage)
2152 {
2153 	if (stage == MESA_SHADER_VERTEX) {
2154 		if (pipeline->shaders[MESA_SHADER_VERTEX])
2155 			return pipeline->shaders[MESA_SHADER_VERTEX];
2156 		if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
2157 			return pipeline->shaders[MESA_SHADER_TESS_CTRL];
2158 		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2159 			return pipeline->shaders[MESA_SHADER_GEOMETRY];
2160 	} else if (stage == MESA_SHADER_TESS_EVAL) {
2161 		if (!radv_pipeline_has_tess(pipeline))
2162 			return NULL;
2163 		if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
2164 			return pipeline->shaders[MESA_SHADER_TESS_EVAL];
2165 		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2166 			return pipeline->shaders[MESA_SHADER_GEOMETRY];
2167 	}
2168 	return pipeline->shaders[stage];
2169 }
2170 
get_vs_output_info(const struct radv_pipeline * pipeline)2171 static const struct radv_vs_output_info *get_vs_output_info(const struct radv_pipeline *pipeline)
2172 {
2173 	if (radv_pipeline_has_gs(pipeline))
2174 		if (radv_pipeline_has_ngg(pipeline))
2175 			return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo;
2176 		else
2177 			return &pipeline->gs_copy_shader->info.vs.outinfo;
2178 	else if (radv_pipeline_has_tess(pipeline))
2179 		return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo;
2180 	else
2181 		return &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outinfo;
2182 }
2183 
2184 static void
radv_link_shaders(struct radv_pipeline * pipeline,nir_shader ** shaders)2185 radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders)
2186 {
2187 	nir_shader* ordered_shaders[MESA_SHADER_STAGES];
2188 	int shader_count = 0;
2189 
2190 	if(shaders[MESA_SHADER_FRAGMENT]) {
2191 		ordered_shaders[shader_count++] = shaders[MESA_SHADER_FRAGMENT];
2192 	}
2193 	if(shaders[MESA_SHADER_GEOMETRY]) {
2194 		ordered_shaders[shader_count++] = shaders[MESA_SHADER_GEOMETRY];
2195 	}
2196 	if(shaders[MESA_SHADER_TESS_EVAL]) {
2197 		ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_EVAL];
2198 	}
2199 	if(shaders[MESA_SHADER_TESS_CTRL]) {
2200 		ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_CTRL];
2201 	}
2202 	if(shaders[MESA_SHADER_VERTEX]) {
2203 		ordered_shaders[shader_count++] = shaders[MESA_SHADER_VERTEX];
2204 	}
2205 
2206 	if (shader_count > 1) {
2207 		unsigned first = ordered_shaders[shader_count - 1]->info.stage;
2208 		unsigned last = ordered_shaders[0]->info.stage;
2209 
2210 		if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT &&
2211 		    ordered_shaders[1]->info.has_transform_feedback_varyings)
2212 			nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]);
2213 
2214 		for (int i = 0; i < shader_count; ++i)  {
2215 			nir_variable_mode mask = 0;
2216 
2217 			if (ordered_shaders[i]->info.stage != first)
2218 				mask = mask | nir_var_shader_in;
2219 
2220 			if (ordered_shaders[i]->info.stage != last)
2221 				mask = mask | nir_var_shader_out;
2222 
2223 			nir_lower_io_to_scalar_early(ordered_shaders[i], mask);
2224 			radv_optimize_nir(ordered_shaders[i], false, false);
2225 		}
2226 	}
2227 
2228 	for (int i = 1; i < shader_count; ++i)  {
2229 		nir_lower_io_arrays_to_elements(ordered_shaders[i],
2230 						ordered_shaders[i - 1]);
2231 
2232 		if (nir_link_opt_varyings(ordered_shaders[i],
2233 					  ordered_shaders[i - 1]))
2234 			radv_optimize_nir(ordered_shaders[i - 1], false, false);
2235 
2236 		nir_remove_dead_variables(ordered_shaders[i],
2237 					  nir_var_shader_out, NULL);
2238 		nir_remove_dead_variables(ordered_shaders[i - 1],
2239 					  nir_var_shader_in, NULL);
2240 
2241 		bool progress = nir_remove_unused_varyings(ordered_shaders[i],
2242 							   ordered_shaders[i - 1]);
2243 
2244 		nir_compact_varyings(ordered_shaders[i],
2245 				     ordered_shaders[i - 1], true);
2246 
2247 		if (progress) {
2248 			if (nir_lower_global_vars_to_local(ordered_shaders[i])) {
2249 				ac_lower_indirect_derefs(ordered_shaders[i],
2250 				                         pipeline->device->physical_device->rad_info.chip_class);
2251 			}
2252 			radv_optimize_nir(ordered_shaders[i], false, false);
2253 
2254 			if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) {
2255 				ac_lower_indirect_derefs(ordered_shaders[i - 1],
2256 				                         pipeline->device->physical_device->rad_info.chip_class);
2257 			}
2258 			radv_optimize_nir(ordered_shaders[i - 1], false, false);
2259 		}
2260 	}
2261 }
2262 
2263 static void
radv_set_linked_driver_locations(struct radv_pipeline * pipeline,nir_shader ** shaders,struct radv_shader_info infos[MESA_SHADER_STAGES])2264 radv_set_linked_driver_locations(struct radv_pipeline *pipeline, nir_shader **shaders,
2265                                  struct radv_shader_info infos[MESA_SHADER_STAGES])
2266 {
2267 	bool has_tess = shaders[MESA_SHADER_TESS_CTRL];
2268 	bool has_gs = shaders[MESA_SHADER_GEOMETRY];
2269 
2270 	if (!has_tess && !has_gs)
2271 		return;
2272 
2273 	unsigned vs_info_idx = MESA_SHADER_VERTEX;
2274 	unsigned tes_info_idx = MESA_SHADER_TESS_EVAL;
2275 
2276 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
2277 		/* These are merged into the next stage */
2278 		vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY;
2279 		tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL;
2280 	}
2281 
2282 	if (has_tess) {
2283 		nir_linked_io_var_info vs2tcs =
2284 			nir_assign_linked_io_var_locations(shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_TESS_CTRL]);
2285 		nir_linked_io_var_info tcs2tes =
2286 			nir_assign_linked_io_var_locations(shaders[MESA_SHADER_TESS_CTRL], shaders[MESA_SHADER_TESS_EVAL]);
2287 
2288 		infos[vs_info_idx].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
2289 		infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs = vs2tcs.num_linked_io_vars;
2290 		infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs = tcs2tes.num_linked_io_vars;
2291 		infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars;
2292 		infos[tes_info_idx].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
2293 		infos[tes_info_idx].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
2294 
2295 		if (has_gs) {
2296 			nir_linked_io_var_info tes2gs =
2297 				nir_assign_linked_io_var_locations(shaders[MESA_SHADER_TESS_EVAL], shaders[MESA_SHADER_GEOMETRY]);
2298 
2299 			infos[tes_info_idx].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
2300 			infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = tes2gs.num_linked_io_vars;
2301 		}
2302 	} else if (has_gs) {
2303 		nir_linked_io_var_info vs2gs =
2304 			nir_assign_linked_io_var_locations(shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_GEOMETRY]);
2305 
2306 		infos[vs_info_idx].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
2307 		infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = vs2gs.num_linked_io_vars;
2308 	}
2309 }
2310 
2311 static uint32_t
radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo * input_state,uint32_t attrib_binding)2312 radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state,
2313 		       uint32_t attrib_binding)
2314 {
2315 	for (uint32_t i = 0; i < input_state->vertexBindingDescriptionCount; i++) {
2316 		const VkVertexInputBindingDescription *input_binding =
2317 			&input_state->pVertexBindingDescriptions[i];
2318 
2319 		if (input_binding->binding == attrib_binding)
2320 			return input_binding->stride;
2321 	}
2322 
2323 	return 0;
2324 }
2325 
2326 static struct radv_pipeline_key
radv_generate_graphics_pipeline_key(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_blend_state * blend)2327 radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
2328                                     const VkGraphicsPipelineCreateInfo *pCreateInfo,
2329                                     const struct radv_blend_state *blend)
2330 {
2331 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
2332 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
2333 	const VkPipelineVertexInputStateCreateInfo *input_state =
2334 	                                         pCreateInfo->pVertexInputState;
2335 	const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state =
2336 		vk_find_struct_const(input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
2337 
2338 	struct radv_pipeline_key key;
2339 	memset(&key, 0, sizeof(key));
2340 
2341 	if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
2342 		key.optimisations_disabled = 1;
2343 
2344 	key.has_multiview_view_index = !!subpass->view_mask;
2345 
2346 	uint32_t binding_input_rate = 0;
2347 	uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
2348 	for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) {
2349 		if (input_state->pVertexBindingDescriptions[i].inputRate) {
2350 			unsigned binding = input_state->pVertexBindingDescriptions[i].binding;
2351 			binding_input_rate |= 1u << binding;
2352 			instance_rate_divisors[binding] = 1;
2353 		}
2354 	}
2355 	if (divisor_state) {
2356 		for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) {
2357 			instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] =
2358 				divisor_state->pVertexBindingDivisors[i].divisor;
2359 		}
2360 	}
2361 
2362 	for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
2363 		const VkVertexInputAttributeDescription *desc =
2364 			&input_state->pVertexAttributeDescriptions[i];
2365 		const struct vk_format_description *format_desc;
2366 		unsigned location = desc->location;
2367 		unsigned binding = desc->binding;
2368 		unsigned num_format, data_format;
2369 		int first_non_void;
2370 
2371 		if (binding_input_rate & (1u << binding)) {
2372 			key.instance_rate_inputs |= 1u << location;
2373 			key.instance_rate_divisors[location] = instance_rate_divisors[binding];
2374 		}
2375 
2376 		format_desc = vk_format_description(desc->format);
2377 		first_non_void = vk_format_get_first_non_void_channel(desc->format);
2378 
2379 		num_format = radv_translate_buffer_numformat(format_desc, first_non_void);
2380 		data_format = radv_translate_buffer_dataformat(format_desc, first_non_void);
2381 
2382 		key.vertex_attribute_formats[location] = data_format | (num_format << 4);
2383 		key.vertex_attribute_bindings[location] = desc->binding;
2384 		key.vertex_attribute_offsets[location] = desc->offset;
2385 		key.vertex_attribute_strides[location] = radv_get_attrib_stride(input_state, desc->binding);
2386 
2387 		if (pipeline->device->physical_device->rad_info.chip_class <= GFX8 &&
2388 		    pipeline->device->physical_device->rad_info.family != CHIP_STONEY) {
2389 			VkFormat format = input_state->pVertexAttributeDescriptions[i].format;
2390 			uint64_t adjust;
2391 			switch(format) {
2392 			case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
2393 			case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
2394 				adjust = RADV_ALPHA_ADJUST_SNORM;
2395 				break;
2396 			case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
2397 			case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
2398 				adjust = RADV_ALPHA_ADJUST_SSCALED;
2399 				break;
2400 			case VK_FORMAT_A2R10G10B10_SINT_PACK32:
2401 			case VK_FORMAT_A2B10G10R10_SINT_PACK32:
2402 				adjust = RADV_ALPHA_ADJUST_SINT;
2403 				break;
2404 			default:
2405 				adjust = 0;
2406 				break;
2407 			}
2408 			key.vertex_alpha_adjust |= adjust << (2 * location);
2409 		}
2410 
2411 		switch (desc->format) {
2412 		case VK_FORMAT_B8G8R8A8_UNORM:
2413 		case VK_FORMAT_B8G8R8A8_SNORM:
2414 		case VK_FORMAT_B8G8R8A8_USCALED:
2415 		case VK_FORMAT_B8G8R8A8_SSCALED:
2416 		case VK_FORMAT_B8G8R8A8_UINT:
2417 		case VK_FORMAT_B8G8R8A8_SINT:
2418 		case VK_FORMAT_B8G8R8A8_SRGB:
2419 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2420 		case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
2421 		case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
2422 		case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
2423 		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2424 		case VK_FORMAT_A2R10G10B10_SINT_PACK32:
2425 			key.vertex_post_shuffle |= 1 << location;
2426 			break;
2427 		default:
2428 			break;
2429 		}
2430 	}
2431 
2432 	const VkPipelineTessellationStateCreateInfo *tess =
2433 		radv_pipeline_get_tessellation_state(pCreateInfo);
2434 	if (tess)
2435 		key.tess_input_vertices = tess->patchControlPoints;
2436 
2437 	const VkPipelineMultisampleStateCreateInfo *vkms =
2438 		radv_pipeline_get_multisample_state(pCreateInfo);
2439 	if (vkms && vkms->rasterizationSamples > 1) {
2440 		uint32_t num_samples = vkms->rasterizationSamples;
2441 		uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
2442 		key.num_samples = num_samples;
2443 		key.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
2444 	}
2445 
2446 	key.col_format = blend->spi_shader_col_format;
2447 	key.is_dual_src = blend->mrt0_is_dual_src;
2448 	if (pipeline->device->physical_device->rad_info.chip_class < GFX8) {
2449 		key.is_int8 = blend->col_format_is_int8;
2450 		key.is_int10 = blend->col_format_is_int10;
2451 	}
2452 
2453 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10)
2454 		key.topology = pCreateInfo->pInputAssemblyState->topology;
2455 
2456 	return key;
2457 }
2458 
2459 static bool
radv_nir_stage_uses_xfb(const nir_shader * nir)2460 radv_nir_stage_uses_xfb(const nir_shader *nir)
2461 {
2462 	nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL);
2463 	bool uses_xfb = !!xfb;
2464 
2465 	ralloc_free(xfb);
2466 	return uses_xfb;
2467 }
2468 
2469 static void
radv_fill_shader_keys(struct radv_device * device,struct radv_shader_variant_key * keys,const struct radv_pipeline_key * key,nir_shader ** nir)2470 radv_fill_shader_keys(struct radv_device *device,
2471 		      struct radv_shader_variant_key *keys,
2472                       const struct radv_pipeline_key *key,
2473                       nir_shader **nir)
2474 {
2475 	keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs;
2476 	keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust;
2477 	keys[MESA_SHADER_VERTEX].vs.post_shuffle = key->vertex_post_shuffle;
2478 	for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i) {
2479 		keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i];
2480 		keys[MESA_SHADER_VERTEX].vs.vertex_attribute_formats[i] = key->vertex_attribute_formats[i];
2481 		keys[MESA_SHADER_VERTEX].vs.vertex_attribute_bindings[i] = key->vertex_attribute_bindings[i];
2482 		keys[MESA_SHADER_VERTEX].vs.vertex_attribute_offsets[i] = key->vertex_attribute_offsets[i];
2483 		keys[MESA_SHADER_VERTEX].vs.vertex_attribute_strides[i] = key->vertex_attribute_strides[i];
2484 	}
2485 	keys[MESA_SHADER_VERTEX].vs.outprim = si_conv_prim_to_gs_out(key->topology);
2486 
2487 	if (nir[MESA_SHADER_TESS_CTRL]) {
2488 		keys[MESA_SHADER_VERTEX].vs_common_out.as_ls = true;
2489 		keys[MESA_SHADER_TESS_CTRL].tcs.num_inputs = 0;
2490 		keys[MESA_SHADER_TESS_CTRL].tcs.input_vertices = key->tess_input_vertices;
2491 		keys[MESA_SHADER_TESS_CTRL].tcs.primitive_mode = nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode;
2492 
2493 		keys[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors = !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read & (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
2494 	}
2495 
2496 	if (nir[MESA_SHADER_GEOMETRY]) {
2497 		if (nir[MESA_SHADER_TESS_CTRL])
2498 			keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_es = true;
2499 		else
2500 			keys[MESA_SHADER_VERTEX].vs_common_out.as_es = true;
2501 	}
2502 
2503 	if (device->physical_device->use_ngg) {
2504 		if (nir[MESA_SHADER_TESS_CTRL]) {
2505 			keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = true;
2506 		} else {
2507 			keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = true;
2508 		}
2509 
2510 		if (nir[MESA_SHADER_TESS_CTRL] &&
2511 		    nir[MESA_SHADER_GEOMETRY] &&
2512 		    nir[MESA_SHADER_GEOMETRY]->info.gs.invocations *
2513 		    nir[MESA_SHADER_GEOMETRY]->info.gs.vertices_out > 256) {
2514 			/* Fallback to the legacy path if tessellation is
2515 			 * enabled with extreme geometry because
2516 			 * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it
2517 			 * might hang.
2518 			 */
2519 			keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
2520 		}
2521 
2522 		if (!device->physical_device->use_ngg_gs) {
2523 			if (nir[MESA_SHADER_GEOMETRY]) {
2524 				if (nir[MESA_SHADER_TESS_CTRL])
2525 					keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
2526 				else
2527 					keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false;
2528 			}
2529 		}
2530 
2531 		gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
2532 
2533 		for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
2534 			if (nir[i])
2535 				last_xfb_stage = i;
2536 		}
2537 
2538 		bool uses_xfb = nir[last_xfb_stage] &&
2539 				radv_nir_stage_uses_xfb(nir[last_xfb_stage]);
2540 
2541 		if (!device->physical_device->use_ngg_streamout && uses_xfb) {
2542 			if (nir[MESA_SHADER_TESS_CTRL])
2543 				keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
2544 			else
2545 				keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false;
2546 		}
2547 
2548 		/* Determine if the pipeline is eligible for the NGG passthrough
2549 		 * mode. It can't be enabled for geometry shaders, for NGG
2550 		 * streamout or for vertex shaders that export the primitive ID
2551 		 * (this is checked later because we don't have the info here.)
2552 		 */
2553 		if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) {
2554 			if (nir[MESA_SHADER_TESS_CTRL] &&
2555 			    keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg) {
2556 				keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg_passthrough = true;
2557 			} else if (nir[MESA_SHADER_VERTEX] &&
2558 				   keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) {
2559 				keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = true;
2560 			}
2561 		}
2562 	}
2563 
2564 	for(int i = 0; i < MESA_SHADER_STAGES; ++i)
2565 		keys[i].has_multiview_view_index = key->has_multiview_view_index;
2566 
2567 	keys[MESA_SHADER_FRAGMENT].fs.col_format = key->col_format;
2568 	keys[MESA_SHADER_FRAGMENT].fs.is_int8 = key->is_int8;
2569 	keys[MESA_SHADER_FRAGMENT].fs.is_int10 = key->is_int10;
2570 	keys[MESA_SHADER_FRAGMENT].fs.log2_ps_iter_samples = key->log2_ps_iter_samples;
2571 	keys[MESA_SHADER_FRAGMENT].fs.num_samples = key->num_samples;
2572 	keys[MESA_SHADER_FRAGMENT].fs.is_dual_src = key->is_dual_src;
2573 
2574 	if (nir[MESA_SHADER_COMPUTE]) {
2575 		keys[MESA_SHADER_COMPUTE].cs.subgroup_size = key->compute_subgroup_size;
2576 	}
2577 }
2578 
2579 static uint8_t
radv_get_wave_size(struct radv_device * device,const VkPipelineShaderStageCreateInfo * pStage,gl_shader_stage stage,const struct radv_shader_variant_key * key)2580 radv_get_wave_size(struct radv_device *device,
2581 		   const VkPipelineShaderStageCreateInfo *pStage,
2582 		   gl_shader_stage stage,
2583 		   const struct radv_shader_variant_key *key)
2584 {
2585 	if (stage == MESA_SHADER_GEOMETRY && !key->vs_common_out.as_ngg)
2586 		return 64;
2587 	else if (stage == MESA_SHADER_COMPUTE) {
2588 		if (key->cs.subgroup_size) {
2589 			/* Return the required subgroup size if specified. */
2590 			return key->cs.subgroup_size;
2591 		}
2592 		return device->physical_device->cs_wave_size;
2593 	}
2594 	else if (stage == MESA_SHADER_FRAGMENT)
2595 		return device->physical_device->ps_wave_size;
2596 	else
2597 		return device->physical_device->ge_wave_size;
2598 }
2599 
2600 static uint8_t
radv_get_ballot_bit_size(struct radv_device * device,const VkPipelineShaderStageCreateInfo * pStage,gl_shader_stage stage,const struct radv_shader_variant_key * key)2601 radv_get_ballot_bit_size(struct radv_device *device,
2602 			 const VkPipelineShaderStageCreateInfo *pStage,
2603 			 gl_shader_stage stage,
2604 			 const struct radv_shader_variant_key *key)
2605 {
2606 	if (stage == MESA_SHADER_COMPUTE && key->cs.subgroup_size)
2607 		return key->cs.subgroup_size;
2608 	return 64;
2609 }
2610 
2611 static void
radv_fill_shader_info(struct radv_pipeline * pipeline,const VkPipelineShaderStageCreateInfo ** pStages,struct radv_shader_variant_key * keys,struct radv_shader_info * infos,nir_shader ** nir)2612 radv_fill_shader_info(struct radv_pipeline *pipeline,
2613 		      const VkPipelineShaderStageCreateInfo **pStages,
2614 		      struct radv_shader_variant_key *keys,
2615                       struct radv_shader_info *infos,
2616                       nir_shader **nir)
2617 {
2618 	unsigned active_stages = 0;
2619 	unsigned filled_stages = 0;
2620 
2621 	for (int i = 0; i < MESA_SHADER_STAGES; i++) {
2622 		if (nir[i])
2623 			active_stages |= (1 << i);
2624 	}
2625 
2626 	if (nir[MESA_SHADER_FRAGMENT]) {
2627 		radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]);
2628 		radv_nir_shader_info_pass(nir[MESA_SHADER_FRAGMENT],
2629 					  pipeline->layout,
2630 					  &keys[MESA_SHADER_FRAGMENT],
2631 					  &infos[MESA_SHADER_FRAGMENT],
2632 					  pipeline->device->physical_device->use_llvm);
2633 
2634 		/* TODO: These are no longer used as keys we should refactor this */
2635 		keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id =
2636 		        infos[MESA_SHADER_FRAGMENT].ps.prim_id_input;
2637 		keys[MESA_SHADER_VERTEX].vs_common_out.export_layer_id =
2638 		        infos[MESA_SHADER_FRAGMENT].ps.layer_input;
2639 		keys[MESA_SHADER_VERTEX].vs_common_out.export_clip_dists =
2640 		        !!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls;
2641 		keys[MESA_SHADER_VERTEX].vs_common_out.export_viewport_index =
2642 		        infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input;
2643 		keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_prim_id =
2644 		        infos[MESA_SHADER_FRAGMENT].ps.prim_id_input;
2645 		keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_layer_id =
2646 		        infos[MESA_SHADER_FRAGMENT].ps.layer_input;
2647 		keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_clip_dists =
2648 		        !!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls;
2649 		keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_viewport_index =
2650 		        infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input;
2651 
2652 		/* NGG passthrough mode can't be enabled for vertex shaders
2653 		 * that export the primitive ID.
2654 		 *
2655 		 * TODO: I should really refactor the keys logic.
2656 		 */
2657 		if (nir[MESA_SHADER_VERTEX] &&
2658 		    keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id) {
2659 			keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = false;
2660 		}
2661 
2662 		filled_stages |= (1 << MESA_SHADER_FRAGMENT);
2663 	}
2664 
2665 	if (nir[MESA_SHADER_TESS_CTRL]) {
2666 		infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read =
2667 			nir[MESA_SHADER_TESS_EVAL]->info.inputs_read;
2668 		infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read =
2669 			nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read;
2670 	}
2671 
2672 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
2673 	    nir[MESA_SHADER_TESS_CTRL]) {
2674 		struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
2675 		struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL];
2676 		key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs;
2677 
2678 		radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]);
2679 
2680 		for (int i = 0; i < 2; i++) {
2681 			radv_nir_shader_info_pass(combined_nir[i],
2682 						  pipeline->layout, &key,
2683 						  &infos[MESA_SHADER_TESS_CTRL],
2684 						  pipeline->device->physical_device->use_llvm);
2685 		}
2686 
2687 		keys[MESA_SHADER_TESS_EVAL].tes.num_patches =
2688 			infos[MESA_SHADER_TESS_CTRL].tcs.num_patches;
2689 		keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs =
2690 			util_last_bit64(infos[MESA_SHADER_TESS_CTRL].tcs.outputs_written);
2691 
2692 		filled_stages |= (1 << MESA_SHADER_VERTEX);
2693 		filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
2694 	}
2695 
2696 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
2697 	    nir[MESA_SHADER_GEOMETRY]) {
2698 		gl_shader_stage pre_stage = nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2699 		struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
2700 
2701 		radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]);
2702 
2703 		for (int i = 0; i < 2; i++) {
2704 			radv_nir_shader_info_pass(combined_nir[i],
2705 						  pipeline->layout,
2706 						  &keys[pre_stage],
2707 						  &infos[MESA_SHADER_GEOMETRY],
2708 						  pipeline->device->physical_device->use_llvm);
2709 		}
2710 
2711 		filled_stages |= (1 << pre_stage);
2712 		filled_stages |= (1 << MESA_SHADER_GEOMETRY);
2713 	}
2714 
2715 	active_stages ^= filled_stages;
2716 	while (active_stages) {
2717 		int i = u_bit_scan(&active_stages);
2718 
2719 		if (i == MESA_SHADER_TESS_CTRL) {
2720 			keys[MESA_SHADER_TESS_CTRL].tcs.num_inputs =
2721 				util_last_bit64(infos[MESA_SHADER_VERTEX].vs.ls_outputs_written);
2722 		}
2723 
2724 		if (i == MESA_SHADER_TESS_EVAL) {
2725 			keys[MESA_SHADER_TESS_EVAL].tes.num_patches =
2726 				infos[MESA_SHADER_TESS_CTRL].tcs.num_patches;
2727 			keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs =
2728 				util_last_bit64(infos[MESA_SHADER_TESS_CTRL].tcs.outputs_written);
2729 		}
2730 
2731 		radv_nir_shader_info_init(&infos[i]);
2732 		radv_nir_shader_info_pass(nir[i], pipeline->layout,
2733 					  &keys[i], &infos[i], pipeline->device->physical_device->use_llvm);
2734 	}
2735 
2736 	for (int i = 0; i < MESA_SHADER_STAGES; i++) {
2737 		if (nir[i]) {
2738 			infos[i].wave_size =
2739 				radv_get_wave_size(pipeline->device, pStages[i],
2740 						   i, &keys[i]);
2741 			infos[i].ballot_bit_size =
2742 				radv_get_ballot_bit_size(pipeline->device,
2743 							 pStages[i], i,
2744 							 &keys[i]);
2745 		}
2746 	}
2747 }
2748 
2749 static void
merge_tess_info(struct shader_info * tes_info,const struct shader_info * tcs_info)2750 merge_tess_info(struct shader_info *tes_info,
2751                 const struct shader_info *tcs_info)
2752 {
2753 	/* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
2754 	 *
2755 	 *    "PointMode. Controls generation of points rather than triangles
2756 	 *     or lines. This functionality defaults to disabled, and is
2757 	 *     enabled if either shader stage includes the execution mode.
2758 	 *
2759 	 * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
2760 	 * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
2761 	 * and OutputVertices, it says:
2762 	 *
2763 	 *    "One mode must be set in at least one of the tessellation
2764 	 *     shader stages."
2765 	 *
2766 	 * So, the fields can be set in either the TCS or TES, but they must
2767 	 * agree if set in both.  Our backend looks at TES, so bitwise-or in
2768 	 * the values from the TCS.
2769 	 */
2770 	assert(tcs_info->tess.tcs_vertices_out == 0 ||
2771 	       tes_info->tess.tcs_vertices_out == 0 ||
2772 	       tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
2773 	tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
2774 
2775 	assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
2776 	       tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
2777 	       tcs_info->tess.spacing == tes_info->tess.spacing);
2778 	tes_info->tess.spacing |= tcs_info->tess.spacing;
2779 
2780 	assert(tcs_info->tess.primitive_mode == 0 ||
2781 	       tes_info->tess.primitive_mode == 0 ||
2782 	       tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
2783 	tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
2784 	tes_info->tess.ccw |= tcs_info->tess.ccw;
2785 	tes_info->tess.point_mode |= tcs_info->tess.point_mode;
2786 }
2787 
2788 static
radv_init_feedback(const VkPipelineCreationFeedbackCreateInfoEXT * ext)2789 void radv_init_feedback(const VkPipelineCreationFeedbackCreateInfoEXT *ext)
2790 {
2791 	if (!ext)
2792 		return;
2793 
2794 	if (ext->pPipelineCreationFeedback) {
2795 		ext->pPipelineCreationFeedback->flags = 0;
2796 		ext->pPipelineCreationFeedback->duration = 0;
2797 	}
2798 
2799 	for (unsigned i = 0; i < ext->pipelineStageCreationFeedbackCount; ++i) {
2800 		ext->pPipelineStageCreationFeedbacks[i].flags = 0;
2801 		ext->pPipelineStageCreationFeedbacks[i].duration = 0;
2802 	}
2803 }
2804 
2805 static
radv_start_feedback(VkPipelineCreationFeedbackEXT * feedback)2806 void radv_start_feedback(VkPipelineCreationFeedbackEXT *feedback)
2807 {
2808 	if (!feedback)
2809 		return;
2810 
2811 	feedback->duration -= radv_get_current_time();
2812 	feedback ->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
2813 }
2814 
2815 static
radv_stop_feedback(VkPipelineCreationFeedbackEXT * feedback,bool cache_hit)2816 void radv_stop_feedback(VkPipelineCreationFeedbackEXT *feedback, bool cache_hit)
2817 {
2818 	if (!feedback)
2819 		return;
2820 
2821 	feedback->duration += radv_get_current_time();
2822 	feedback ->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT |
2823 	                   (cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0);
2824 }
2825 
radv_create_shaders(struct radv_pipeline * pipeline,struct radv_device * device,struct radv_pipeline_cache * cache,const struct radv_pipeline_key * key,const VkPipelineShaderStageCreateInfo ** pStages,const VkPipelineCreateFlags flags,VkPipelineCreationFeedbackEXT * pipeline_feedback,VkPipelineCreationFeedbackEXT ** stage_feedbacks)2826 VkResult radv_create_shaders(struct radv_pipeline *pipeline,
2827                              struct radv_device *device,
2828                              struct radv_pipeline_cache *cache,
2829                              const struct radv_pipeline_key *key,
2830                              const VkPipelineShaderStageCreateInfo **pStages,
2831                              const VkPipelineCreateFlags flags,
2832                              VkPipelineCreationFeedbackEXT *pipeline_feedback,
2833                              VkPipelineCreationFeedbackEXT **stage_feedbacks)
2834 {
2835 	struct radv_shader_module fs_m = {0};
2836 	struct radv_shader_module *modules[MESA_SHADER_STAGES] = { 0, };
2837 	nir_shader *nir[MESA_SHADER_STAGES] = {0};
2838 	struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL};
2839 	struct radv_shader_variant_key keys[MESA_SHADER_STAGES] = {{{{{0}}}}};
2840 	struct radv_shader_info infos[MESA_SHADER_STAGES] = {0};
2841 	unsigned char hash[20], gs_copy_hash[20];
2842 	bool keep_executable_info = (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) || device->keep_shader_info;
2843 	bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
2844 	                           (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
2845 	                           device->keep_shader_info;
2846 
2847 	radv_start_feedback(pipeline_feedback);
2848 
2849 	for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
2850 		if (pStages[i]) {
2851 			modules[i] = radv_shader_module_from_handle(pStages[i]->module);
2852 			if (modules[i]->nir)
2853 				_mesa_sha1_compute(modules[i]->nir->info.name,
2854 				                   strlen(modules[i]->nir->info.name),
2855 				                   modules[i]->sha1);
2856 
2857 			pipeline->active_stages |= mesa_to_vk_shader_stage(i);
2858 		}
2859 	}
2860 
2861 	radv_hash_shaders(hash, pStages, pipeline->layout, key, get_hash_flags(device));
2862 	memcpy(gs_copy_hash, hash, 20);
2863 	gs_copy_hash[0] ^= 1;
2864 
2865 	bool found_in_application_cache = true;
2866 	if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info && !keep_statistic_info) {
2867 		struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0};
2868 		radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants,
2869 		                                                &found_in_application_cache);
2870 		pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY];
2871 	}
2872 
2873 	if (!keep_executable_info && !keep_statistic_info &&
2874 	    radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders,
2875 	                                                    &found_in_application_cache) &&
2876 	    (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader)) {
2877 		radv_stop_feedback(pipeline_feedback, found_in_application_cache);
2878 		return VK_SUCCESS;
2879 	}
2880 
2881 	if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) {
2882 		radv_stop_feedback(pipeline_feedback, found_in_application_cache);
2883 		return VK_PIPELINE_COMPILE_REQUIRED_EXT;
2884 	}
2885 
2886 	if (!modules[MESA_SHADER_FRAGMENT] && !modules[MESA_SHADER_COMPUTE]) {
2887 		nir_builder fs_b;
2888 		nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
2889 		fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "noop_fs");
2890 		fs_m.nir = fs_b.shader;
2891 		modules[MESA_SHADER_FRAGMENT] = &fs_m;
2892 	}
2893 
2894 	for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
2895 		const VkPipelineShaderStageCreateInfo *stage = pStages[i];
2896 		unsigned subgroup_size = 64, ballot_bit_size = 64;
2897 
2898 		if (!modules[i])
2899 			continue;
2900 
2901 		radv_start_feedback(stage_feedbacks[i]);
2902 
2903 		if (key->compute_subgroup_size) {
2904 			/* Only compute shaders currently support requiring a
2905 			 * specific subgroup size.
2906                          */
2907 			assert(i == MESA_SHADER_COMPUTE);
2908 			subgroup_size = key->compute_subgroup_size;
2909 			ballot_bit_size = key->compute_subgroup_size;
2910 		}
2911 
2912 		nir[i] = radv_shader_compile_to_nir(device, modules[i],
2913 						    stage ? stage->pName : "main", i,
2914 						    stage ? stage->pSpecializationInfo : NULL,
2915 						    flags, pipeline->layout,
2916 						    subgroup_size, ballot_bit_size);
2917 
2918 		/* We don't want to alter meta shaders IR directly so clone it
2919 		 * first.
2920 		 */
2921 		if (nir[i]->info.name) {
2922 			nir[i] = nir_shader_clone(NULL, nir[i]);
2923 		}
2924 
2925 		radv_stop_feedback(stage_feedbacks[i], false);
2926 	}
2927 
2928 	if (nir[MESA_SHADER_TESS_CTRL]) {
2929 		nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
2930 		merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
2931 	}
2932 
2933 	if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
2934 		radv_link_shaders(pipeline, nir);
2935 
2936 	radv_set_linked_driver_locations(pipeline, nir, infos);
2937 
2938 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
2939 		if (nir[i]) {
2940 			/* do this again since information such as outputs_read can be out-of-date */
2941 			nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
2942 
2943 			if (device->physical_device->use_llvm) {
2944 				NIR_PASS_V(nir[i], nir_lower_bool_to_int32);
2945 			} else {
2946 				NIR_PASS_V(nir[i], nir_lower_non_uniform_access,
2947 				           nir_lower_non_uniform_ubo_access |
2948 				           nir_lower_non_uniform_ssbo_access |
2949 				           nir_lower_non_uniform_texture_access |
2950 				           nir_lower_non_uniform_image_access);
2951 			}
2952 			NIR_PASS_V(nir[i], nir_lower_memory_model);
2953 		}
2954 	}
2955 
2956 	if (nir[MESA_SHADER_FRAGMENT])
2957 		radv_lower_fs_io(nir[MESA_SHADER_FRAGMENT]);
2958 
2959 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
2960 		if (radv_can_dump_shader(device, modules[i], false))
2961 			nir_print_shader(nir[i], stderr);
2962 	}
2963 
2964 	radv_fill_shader_keys(device, keys, key, nir);
2965 
2966 	radv_fill_shader_info(pipeline, pStages, keys, infos, nir);
2967 
2968 	if ((nir[MESA_SHADER_VERTEX] &&
2969 	     keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) ||
2970 	    (nir[MESA_SHADER_TESS_EVAL] &&
2971 	     keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg)) {
2972 		struct gfx10_ngg_info *ngg_info;
2973 
2974 		if (nir[MESA_SHADER_GEOMETRY])
2975 			ngg_info = &infos[MESA_SHADER_GEOMETRY].ngg_info;
2976 		else if (nir[MESA_SHADER_TESS_CTRL])
2977 			ngg_info = &infos[MESA_SHADER_TESS_EVAL].ngg_info;
2978 		else
2979 			ngg_info = &infos[MESA_SHADER_VERTEX].ngg_info;
2980 
2981 		gfx10_get_ngg_info(key, pipeline, nir, infos, ngg_info);
2982 	} else if (nir[MESA_SHADER_GEOMETRY]) {
2983 		struct gfx9_gs_info *gs_info =
2984 			&infos[MESA_SHADER_GEOMETRY].gs_ring_info;
2985 
2986 		gfx9_get_gs_info(key, pipeline, nir, infos, gs_info);
2987 	}
2988 
2989 	if(modules[MESA_SHADER_GEOMETRY]) {
2990 		struct radv_shader_binary *gs_copy_binary = NULL;
2991 		if (!pipeline->gs_copy_shader &&
2992 		    !radv_pipeline_has_ngg(pipeline)) {
2993 			struct radv_shader_info info = {};
2994 			struct radv_shader_variant_key key = {};
2995 
2996 			key.has_multiview_view_index =
2997 				keys[MESA_SHADER_GEOMETRY].has_multiview_view_index;
2998 
2999 			radv_nir_shader_info_pass(nir[MESA_SHADER_GEOMETRY],
3000 						  pipeline->layout, &key,
3001 						  &info, pipeline->device->physical_device->use_llvm);
3002 			info.wave_size = 64; /* Wave32 not supported. */
3003 			info.ballot_bit_size = 64;
3004 
3005 			pipeline->gs_copy_shader = radv_create_gs_copy_shader(
3006 					device, nir[MESA_SHADER_GEOMETRY], &info,
3007 					&gs_copy_binary, keep_executable_info, keep_statistic_info,
3008 					keys[MESA_SHADER_GEOMETRY].has_multiview_view_index);
3009 		}
3010 
3011 		if (!keep_executable_info && !keep_statistic_info && pipeline->gs_copy_shader) {
3012 			struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL};
3013 			struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0};
3014 
3015 			binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary;
3016 			variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader;
3017 
3018 			radv_pipeline_cache_insert_shaders(device, cache,
3019 							   gs_copy_hash,
3020 							   variants,
3021 							   binaries);
3022 		}
3023 		free(gs_copy_binary);
3024 	}
3025 
3026 	if (nir[MESA_SHADER_FRAGMENT]) {
3027 		if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
3028 			radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]);
3029 
3030 			pipeline->shaders[MESA_SHADER_FRAGMENT] =
3031 			       radv_shader_variant_compile(device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1,
3032 			                                  pipeline->layout, keys + MESA_SHADER_FRAGMENT,
3033 							  infos + MESA_SHADER_FRAGMENT,
3034 			                                  keep_executable_info, keep_statistic_info,
3035 			                                  &binaries[MESA_SHADER_FRAGMENT]);
3036 
3037 			radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false);
3038 		}
3039 	}
3040 
3041 	if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_TESS_CTRL]) {
3042 		if (!pipeline->shaders[MESA_SHADER_TESS_CTRL]) {
3043 			struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
3044 			struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL];
3045 			key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs;
3046 
3047 			radv_start_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL]);
3048 
3049 			pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile(device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2,
3050 			                                                                      pipeline->layout,
3051 			                                                                      &key, &infos[MESA_SHADER_TESS_CTRL], keep_executable_info,
3052 			                                                                      keep_statistic_info, &binaries[MESA_SHADER_TESS_CTRL]);
3053 
3054 			radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false);
3055 		}
3056 		modules[MESA_SHADER_VERTEX] = NULL;
3057 		keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
3058 		keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.outputs_written);
3059 	}
3060 
3061 	if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) {
3062 		gl_shader_stage pre_stage = modules[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3063 		if (!pipeline->shaders[MESA_SHADER_GEOMETRY]) {
3064 			struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
3065 
3066 			radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]);
3067 
3068 			pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile(device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2,
3069 			                                                                     pipeline->layout,
3070 			                                                                     &keys[pre_stage], &infos[MESA_SHADER_GEOMETRY], keep_executable_info,
3071 			                                                                     keep_statistic_info, &binaries[MESA_SHADER_GEOMETRY]);
3072 
3073 			radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false);
3074 		}
3075 		modules[pre_stage] = NULL;
3076 	}
3077 
3078 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3079 		if(modules[i] && !pipeline->shaders[i]) {
3080 			if (i == MESA_SHADER_TESS_CTRL) {
3081 				keys[MESA_SHADER_TESS_CTRL].tcs.num_inputs = util_last_bit64(pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.ls_outputs_written);
3082 			}
3083 			if (i == MESA_SHADER_TESS_EVAL) {
3084 				keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
3085 				keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.outputs_written);
3086 			}
3087 
3088 			radv_start_feedback(stage_feedbacks[i]);
3089 
3090 			pipeline->shaders[i] = radv_shader_variant_compile(device, modules[i], &nir[i], 1,
3091 									  pipeline->layout,
3092 									  keys + i, infos + i, keep_executable_info,
3093 									  keep_statistic_info, &binaries[i]);
3094 
3095 			radv_stop_feedback(stage_feedbacks[i], false);
3096 		}
3097 	}
3098 
3099 	if (!keep_executable_info && !keep_statistic_info) {
3100 		radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders,
3101 						   binaries);
3102 	}
3103 
3104 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3105 		free(binaries[i]);
3106 		if (nir[i]) {
3107 			ralloc_free(nir[i]);
3108 
3109 			if (radv_can_dump_shader_stats(device, modules[i]))
3110 				radv_shader_dump_stats(device,
3111 						       pipeline->shaders[i],
3112 						       i, stderr);
3113 		}
3114 	}
3115 
3116 	if (fs_m.nir)
3117 		ralloc_free(fs_m.nir);
3118 
3119 	radv_stop_feedback(pipeline_feedback, false);
3120 	return VK_SUCCESS;
3121 }
3122 
3123 static uint32_t
radv_pipeline_stage_to_user_data_0(struct radv_pipeline * pipeline,gl_shader_stage stage,enum chip_class chip_class)3124 radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
3125 				   gl_shader_stage stage, enum chip_class chip_class)
3126 {
3127 	bool has_gs = radv_pipeline_has_gs(pipeline);
3128 	bool has_tess = radv_pipeline_has_tess(pipeline);
3129 	bool has_ngg = radv_pipeline_has_ngg(pipeline);
3130 
3131 	switch (stage) {
3132 	case MESA_SHADER_FRAGMENT:
3133 		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
3134 	case MESA_SHADER_VERTEX:
3135 		if (has_tess) {
3136 			if (chip_class >= GFX10) {
3137 				return R_00B430_SPI_SHADER_USER_DATA_HS_0;
3138 			} else if (chip_class == GFX9) {
3139 				return R_00B430_SPI_SHADER_USER_DATA_LS_0;
3140 			} else {
3141 				return R_00B530_SPI_SHADER_USER_DATA_LS_0;
3142 			}
3143 
3144 		}
3145 
3146 		if (has_gs) {
3147 			if (chip_class >= GFX10) {
3148 				return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3149 			} else {
3150 				return R_00B330_SPI_SHADER_USER_DATA_ES_0;
3151 			}
3152 		}
3153 
3154 		if (has_ngg)
3155 			return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3156 
3157 		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
3158 	case MESA_SHADER_GEOMETRY:
3159 		return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
3160 		                            R_00B230_SPI_SHADER_USER_DATA_GS_0;
3161 	case MESA_SHADER_COMPUTE:
3162 		return R_00B900_COMPUTE_USER_DATA_0;
3163 	case MESA_SHADER_TESS_CTRL:
3164 		return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 :
3165 		                            R_00B430_SPI_SHADER_USER_DATA_HS_0;
3166 	case MESA_SHADER_TESS_EVAL:
3167 		if (has_gs) {
3168 			return chip_class >= GFX10 ? R_00B230_SPI_SHADER_USER_DATA_GS_0 :
3169 						     R_00B330_SPI_SHADER_USER_DATA_ES_0;
3170 		} else if (has_ngg) {
3171 			return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3172 		} else {
3173 			return R_00B130_SPI_SHADER_USER_DATA_VS_0;
3174 		}
3175 	default:
3176 		unreachable("unknown shader");
3177 	}
3178 }
3179 
3180 struct radv_bin_size_entry {
3181 	unsigned bpp;
3182 	VkExtent2D extent;
3183 };
3184 
3185 static VkExtent2D
radv_gfx9_compute_bin_size(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)3186 radv_gfx9_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo)
3187 {
3188 	static const struct radv_bin_size_entry color_size_table[][3][9] = {
3189 		{
3190 			/* One RB / SE */
3191 			{
3192 				/* One shader engine */
3193 				{        0, {128,  128}},
3194 				{        1, { 64,  128}},
3195 				{        2, { 32,  128}},
3196 				{        3, { 16,  128}},
3197 				{       17, {  0,    0}},
3198 				{ UINT_MAX, {  0,    0}},
3199 			},
3200 			{
3201 				/* Two shader engines */
3202 				{        0, {128,  128}},
3203 				{        2, { 64,  128}},
3204 				{        3, { 32,  128}},
3205 				{        5, { 16,  128}},
3206 				{       17, {  0,    0}},
3207 				{ UINT_MAX, {  0,    0}},
3208 			},
3209 			{
3210 				/* Four shader engines */
3211 				{        0, {128,  128}},
3212 				{        3, { 64,  128}},
3213 				{        5, { 16,  128}},
3214 				{       17, {  0,    0}},
3215 				{ UINT_MAX, {  0,    0}},
3216 			},
3217 		},
3218 		{
3219 			/* Two RB / SE */
3220 			{
3221 				/* One shader engine */
3222 				{        0, {128,  128}},
3223 				{        2, { 64,  128}},
3224 				{        3, { 32,  128}},
3225 				{        5, { 16,  128}},
3226 				{       33, {  0,    0}},
3227 				{ UINT_MAX, {  0,    0}},
3228 			},
3229 			{
3230 				/* Two shader engines */
3231 				{        0, {128,  128}},
3232 				{        3, { 64,  128}},
3233 				{        5, { 32,  128}},
3234 				{        9, { 16,  128}},
3235 				{       33, {  0,    0}},
3236 				{ UINT_MAX, {  0,    0}},
3237 			},
3238 			{
3239 				/* Four shader engines */
3240 				{        0, {256,  256}},
3241 				{        2, {128,  256}},
3242 				{        3, {128,  128}},
3243 				{        5, { 64,  128}},
3244 				{        9, { 16,  128}},
3245 				{       33, {  0,    0}},
3246 				{ UINT_MAX, {  0,    0}},
3247 			},
3248 		},
3249 		{
3250 			/* Four RB / SE */
3251 			{
3252 				/* One shader engine */
3253 				{        0, {128,  256}},
3254 				{        2, {128,  128}},
3255 				{        3, { 64,  128}},
3256 				{        5, { 32,  128}},
3257 				{        9, { 16,  128}},
3258 				{       33, {  0,    0}},
3259 				{ UINT_MAX, {  0,    0}},
3260 			},
3261 			{
3262 				/* Two shader engines */
3263 				{        0, {256,  256}},
3264 				{        2, {128,  256}},
3265 				{        3, {128,  128}},
3266 				{        5, { 64,  128}},
3267 				{        9, { 32,  128}},
3268 				{       17, { 16,  128}},
3269 				{       33, {  0,    0}},
3270 				{ UINT_MAX, {  0,    0}},
3271 			},
3272 			{
3273 				/* Four shader engines */
3274 				{        0, {256,  512}},
3275 				{        2, {256,  256}},
3276 				{        3, {128,  256}},
3277 				{        5, {128,  128}},
3278 				{        9, { 64,  128}},
3279 				{       17, { 16,  128}},
3280 				{       33, {  0,    0}},
3281 				{ UINT_MAX, {  0,    0}},
3282 			},
3283 		},
3284 	};
3285 	static const struct radv_bin_size_entry ds_size_table[][3][9] = {
3286 		{
3287 			// One RB / SE
3288 			{
3289 				// One shader engine
3290 				{        0, {128,  256}},
3291 				{        2, {128,  128}},
3292 				{        4, { 64,  128}},
3293 				{        7, { 32,  128}},
3294 				{       13, { 16,  128}},
3295 				{       49, {  0,    0}},
3296 				{ UINT_MAX, {  0,    0}},
3297 			},
3298 			{
3299 				// Two shader engines
3300 				{        0, {256,  256}},
3301 				{        2, {128,  256}},
3302 				{        4, {128,  128}},
3303 				{        7, { 64,  128}},
3304 				{       13, { 32,  128}},
3305 				{       25, { 16,  128}},
3306 				{       49, {  0,    0}},
3307 				{ UINT_MAX, {  0,    0}},
3308 			},
3309 			{
3310 				// Four shader engines
3311 				{        0, {256,  512}},
3312 				{        2, {256,  256}},
3313 				{        4, {128,  256}},
3314 				{        7, {128,  128}},
3315 				{       13, { 64,  128}},
3316 				{       25, { 16,  128}},
3317 				{       49, {  0,    0}},
3318 				{ UINT_MAX, {  0,    0}},
3319 			},
3320 		},
3321 		{
3322 			// Two RB / SE
3323 			{
3324 				// One shader engine
3325 				{        0, {256,  256}},
3326 				{        2, {128,  256}},
3327 				{        4, {128,  128}},
3328 				{        7, { 64,  128}},
3329 				{       13, { 32,  128}},
3330 				{       25, { 16,  128}},
3331 				{       97, {  0,    0}},
3332 				{ UINT_MAX, {  0,    0}},
3333 			},
3334 			{
3335 				// Two shader engines
3336 				{        0, {256,  512}},
3337 				{        2, {256,  256}},
3338 				{        4, {128,  256}},
3339 				{        7, {128,  128}},
3340 				{       13, { 64,  128}},
3341 				{       25, { 32,  128}},
3342 				{       49, { 16,  128}},
3343 				{       97, {  0,    0}},
3344 				{ UINT_MAX, {  0,    0}},
3345 			},
3346 			{
3347 				// Four shader engines
3348 				{        0, {512,  512}},
3349 				{        2, {256,  512}},
3350 				{        4, {256,  256}},
3351 				{        7, {128,  256}},
3352 				{       13, {128,  128}},
3353 				{       25, { 64,  128}},
3354 				{       49, { 16,  128}},
3355 				{       97, {  0,    0}},
3356 				{ UINT_MAX, {  0,    0}},
3357 			},
3358 		},
3359 		{
3360 			// Four RB / SE
3361 			{
3362 				// One shader engine
3363 				{        0, {256,  512}},
3364 				{        2, {256,  256}},
3365 				{        4, {128,  256}},
3366 				{        7, {128,  128}},
3367 				{       13, { 64,  128}},
3368 				{       25, { 32,  128}},
3369 				{       49, { 16,  128}},
3370 				{ UINT_MAX, {  0,    0}},
3371 			},
3372 			{
3373 				// Two shader engines
3374 				{        0, {512,  512}},
3375 				{        2, {256,  512}},
3376 				{        4, {256,  256}},
3377 				{        7, {128,  256}},
3378 				{       13, {128,  128}},
3379 				{       25, { 64,  128}},
3380 				{       49, { 32,  128}},
3381 				{       97, { 16,  128}},
3382 				{ UINT_MAX, {  0,    0}},
3383 			},
3384 			{
3385 				// Four shader engines
3386 				{        0, {512,  512}},
3387 				{        4, {256,  512}},
3388 				{        7, {256,  256}},
3389 				{       13, {128,  256}},
3390 				{       25, {128,  128}},
3391 				{       49, { 64,  128}},
3392 				{       97, { 16,  128}},
3393 				{ UINT_MAX, {  0,    0}},
3394 			},
3395 		},
3396 	};
3397 
3398 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3399 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3400 	VkExtent2D extent = {512, 512};
3401 
3402 	unsigned log_num_rb_per_se =
3403 	    util_logbase2_ceil(pipeline->device->physical_device->rad_info.num_render_backends /
3404 	                       pipeline->device->physical_device->rad_info.max_se);
3405 	unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);
3406 
3407 	unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
3408 	unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
3409 	unsigned effective_samples = total_samples;
3410 	unsigned color_bytes_per_pixel = 0;
3411 
3412 	const VkPipelineColorBlendStateCreateInfo *vkblend =
3413 		radv_pipeline_get_color_blend_state(pCreateInfo);
3414 	if (vkblend) {
3415 		for (unsigned i = 0; i < subpass->color_count; i++) {
3416 			if (!vkblend->pAttachments[i].colorWriteMask)
3417 				continue;
3418 
3419 			if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
3420 				continue;
3421 
3422 			VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
3423 			color_bytes_per_pixel += vk_format_get_blocksize(format);
3424 		}
3425 
3426 		/* MSAA images typically don't use all samples all the time. */
3427 		if (effective_samples >= 2 && ps_iter_samples <= 1)
3428 			effective_samples = 2;
3429 		color_bytes_per_pixel *= effective_samples;
3430 	}
3431 
3432 	const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
3433 	while(color_entry[1].bpp <= color_bytes_per_pixel)
3434 		++color_entry;
3435 
3436 	extent = color_entry->extent;
3437 
3438 	if (subpass->depth_stencil_attachment) {
3439 		struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
3440 
3441 		/* Coefficients taken from AMDVLK */
3442 		unsigned depth_coeff = vk_format_is_depth(attachment->format) ? 5 : 0;
3443 		unsigned stencil_coeff = vk_format_is_stencil(attachment->format) ? 1 : 0;
3444 		unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
3445 
3446 		const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
3447 		while(ds_entry[1].bpp <= ds_bytes_per_pixel)
3448 			++ds_entry;
3449 
3450 		if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
3451 			extent = ds_entry->extent;
3452 	}
3453 
3454 	return extent;
3455 }
3456 
3457 static VkExtent2D
radv_gfx10_compute_bin_size(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)3458 radv_gfx10_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo)
3459 {
3460 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3461 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3462 	VkExtent2D extent = {512, 512};
3463 
3464 	const unsigned db_tag_size = 64;
3465 	const unsigned db_tag_count = 312;
3466 	const unsigned color_tag_size = 1024;
3467 	const unsigned color_tag_count = 31;
3468 	const unsigned fmask_tag_size = 256;
3469 	const unsigned fmask_tag_count = 44;
3470 
3471 	const unsigned rb_count = pipeline->device->physical_device->rad_info.num_render_backends;
3472 	const unsigned pipe_count = MAX2(rb_count, pipeline->device->physical_device->rad_info.num_sdp_interfaces);
3473 
3474 	const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
3475 	const unsigned color_tag_part = (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
3476 	const unsigned fmask_tag_part = (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
3477 
3478 	const unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
3479 	const unsigned samples_log = util_logbase2_ceil(total_samples);
3480 
3481 	unsigned color_bytes_per_pixel = 0;
3482 	unsigned fmask_bytes_per_pixel = 0;
3483 
3484 	const VkPipelineColorBlendStateCreateInfo *vkblend =
3485 		radv_pipeline_get_color_blend_state(pCreateInfo);
3486 	if (vkblend) {
3487 		for (unsigned i = 0; i < subpass->color_count; i++) {
3488 			if (!vkblend->pAttachments[i].colorWriteMask)
3489 				continue;
3490 
3491 			if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
3492 				continue;
3493 
3494 			VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
3495 			color_bytes_per_pixel += vk_format_get_blocksize(format);
3496 
3497 			if (total_samples > 1) {
3498 				assert(samples_log <= 3);
3499 				const unsigned fmask_array[] = {0, 1, 1, 4};
3500 				fmask_bytes_per_pixel += fmask_array[samples_log];
3501 			}
3502 		}
3503 
3504 		color_bytes_per_pixel *= total_samples;
3505 	}
3506 	color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
3507 
3508 	const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
3509 	extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
3510 	extent.height = 1ull << (color_pixel_count_log / 2);
3511 
3512 	if (fmask_bytes_per_pixel) {
3513 		const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
3514 
3515 		const VkExtent2D fmask_extent = (VkExtent2D){
3516 			.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
3517 			.height = 1ull << (color_pixel_count_log / 2)
3518 		};
3519 
3520 		if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
3521 		    extent = fmask_extent;
3522 	}
3523 
3524 	if (subpass->depth_stencil_attachment) {
3525 		struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
3526 
3527 		/* Coefficients taken from AMDVLK */
3528 		unsigned depth_coeff = vk_format_is_depth(attachment->format) ? 5 : 0;
3529 		unsigned stencil_coeff = vk_format_is_stencil(attachment->format) ? 1 : 0;
3530 		unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
3531 
3532 		const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
3533 
3534 		const VkExtent2D db_extent = (VkExtent2D){
3535 			.width = 1ull << ((db_pixel_count_log + 1) / 2),
3536 			.height = 1ull << (color_pixel_count_log / 2)
3537 		};
3538 
3539 		if (db_extent.width * db_extent.height < extent.width * extent.height)
3540 		    extent = db_extent;
3541 	}
3542 
3543 	extent.width = MAX2(extent.width, 128);
3544 	extent.height = MAX2(extent.width, 64);
3545 
3546 	return extent;
3547 }
3548 
3549 static void
radv_pipeline_init_disabled_binning_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)3550 radv_pipeline_init_disabled_binning_state(struct radv_pipeline *pipeline,
3551 					  const VkGraphicsPipelineCreateInfo *pCreateInfo)
3552 {
3553 	uint32_t pa_sc_binner_cntl_0 =
3554 	                S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
3555 	                S_028C44_DISABLE_START_OF_PRIM(1);
3556 	uint32_t db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF);
3557 
3558 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
3559 		RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3560 		struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3561 		const VkPipelineColorBlendStateCreateInfo *vkblend =
3562 			radv_pipeline_get_color_blend_state(pCreateInfo);
3563 		unsigned min_bytes_per_pixel = 0;
3564 
3565 		if (vkblend) {
3566 			for (unsigned i = 0; i < subpass->color_count; i++) {
3567 				if (!vkblend->pAttachments[i].colorWriteMask)
3568 					continue;
3569 
3570 				if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
3571 					continue;
3572 
3573 				VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
3574 				unsigned bytes = vk_format_get_blocksize(format);
3575 				if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
3576 					min_bytes_per_pixel = bytes;
3577 			}
3578 		}
3579 
3580 		pa_sc_binner_cntl_0 =
3581 			S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) |
3582 			S_028C44_BIN_SIZE_X(0) |
3583 			S_028C44_BIN_SIZE_Y(0) |
3584 			S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */
3585 			S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
3586 			S_028C44_DISABLE_START_OF_PRIM(1);
3587 	}
3588 
3589 	pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
3590 	pipeline->graphics.binning.db_dfsm_control = db_dfsm_control;
3591 }
3592 
3593 struct radv_binning_settings
radv_get_binning_settings(const struct radv_physical_device * pdev)3594 radv_get_binning_settings(const struct radv_physical_device *pdev)
3595 {
3596 	struct radv_binning_settings settings;
3597 	if (pdev->rad_info.has_dedicated_vram) {
3598 		if (pdev->rad_info.num_render_backends > 4) {
3599 			settings.context_states_per_bin = 1;
3600 			settings.persistent_states_per_bin = 1;
3601 		} else {
3602 			settings.context_states_per_bin = 3;
3603 			settings.persistent_states_per_bin = 8;
3604 		}
3605 		settings.fpovs_per_batch = 63;
3606 	} else {
3607 		/* The context states are affected by the scissor bug. */
3608 		settings.context_states_per_bin = 6;
3609 		/* 32 causes hangs for RAVEN. */
3610 		settings.persistent_states_per_bin = 16;
3611 		settings.fpovs_per_batch = 63;
3612 	}
3613 
3614 	if (pdev->rad_info.has_gfx9_scissor_bug)
3615 		settings.context_states_per_bin = 1;
3616 
3617 	return settings;
3618 }
3619 
3620 static void
radv_pipeline_init_binning_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_blend_state * blend)3621 radv_pipeline_init_binning_state(struct radv_pipeline *pipeline,
3622 				 const VkGraphicsPipelineCreateInfo *pCreateInfo,
3623 				 const struct radv_blend_state *blend)
3624 {
3625 	if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
3626 		return;
3627 
3628 	VkExtent2D bin_size;
3629 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
3630 		bin_size = radv_gfx10_compute_bin_size(pipeline, pCreateInfo);
3631 	} else if (pipeline->device->physical_device->rad_info.chip_class == GFX9) {
3632 		bin_size = radv_gfx9_compute_bin_size(pipeline, pCreateInfo);
3633 	} else
3634 		unreachable("Unhandled generation for binning bin size calculation");
3635 
3636 	if (pipeline->device->pbb_allowed && bin_size.width && bin_size.height) {
3637 		struct radv_binning_settings settings =
3638 			radv_get_binning_settings(pipeline->device->physical_device);
3639 
3640 		bool disable_start_of_prim = true;
3641 		uint32_t db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF);
3642 
3643 		const struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
3644 
3645 		if (pipeline->device->dfsm_allowed && ps &&
3646 		    !ps->info.ps.can_discard &&
3647 		    !ps->info.ps.writes_memory &&
3648 		    blend->cb_target_enabled_4bit) {
3649 			db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_AUTO);
3650 			disable_start_of_prim = (blend->blend_enable_4bit & blend->cb_target_enabled_4bit) != 0;
3651 		}
3652 
3653 		const uint32_t pa_sc_binner_cntl_0 =
3654 	                S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
3655 	                S_028C44_BIN_SIZE_X(bin_size.width == 16) |
3656 	                S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
3657 	                S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
3658 	                S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
3659 	                S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) |
3660 	                S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) |
3661 	                S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
3662 	                S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) |
3663 	                S_028C44_OPTIMAL_BIN_SELECTION(1);
3664 
3665 		pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
3666 		pipeline->graphics.binning.db_dfsm_control = db_dfsm_control;
3667 	} else
3668 		radv_pipeline_init_disabled_binning_state(pipeline, pCreateInfo);
3669 }
3670 
3671 
3672 static void
radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra)3673 radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
3674                                            const struct radv_pipeline *pipeline,
3675                                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
3676                                            const struct radv_graphics_pipeline_create_info *extra)
3677 {
3678 	const VkPipelineDepthStencilStateCreateInfo *vkds = radv_pipeline_get_depth_stencil_state(pCreateInfo);
3679 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3680 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3681 	struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
3682 	struct radv_render_pass_attachment *attachment = NULL;
3683 	uint32_t db_render_control = 0, db_render_override2 = 0;
3684 	uint32_t db_render_override = 0;
3685 
3686 	if (subpass->depth_stencil_attachment)
3687 		attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
3688 
3689 	bool has_depth_attachment = attachment && vk_format_is_depth(attachment->format);
3690 
3691 	if (vkds && has_depth_attachment) {
3692 		/* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */
3693 		db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(attachment->samples > 2);
3694 
3695 		if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
3696 			db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE_GFX103(2);
3697 	}
3698 
3699 	if (attachment && extra) {
3700 		db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear);
3701 		db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear);
3702 
3703 		db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable);
3704 		db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable);
3705 		db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable);
3706 		db_render_override2 |= S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(extra->db_depth_disable_expclear);
3707 		db_render_override2 |= S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(extra->db_stencil_disable_expclear);
3708 	}
3709 
3710 	db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
3711 			      S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
3712 
3713 	if (!pCreateInfo->pRasterizationState->depthClampEnable &&
3714 	    ps->info.ps.writes_z) {
3715 		/* From VK_EXT_depth_range_unrestricted spec:
3716 		 *
3717 		 * "The behavior described in Primitive Clipping still applies.
3718 		 *  If depth clamping is disabled the depth values are still
3719 		 *  clipped to 0 ≤ zc ≤ wc before the viewport transform. If
3720 		 *  depth clamping is enabled the above equation is ignored and
3721 		 *  the depth values are instead clamped to the VkViewport
3722 		 *  minDepth and maxDepth values, which in the case of this
3723 		 *  extension can be outside of the 0.0 to 1.0 range."
3724 		 */
3725 		db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
3726 	}
3727 
3728 	radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, db_render_control);
3729 	radeon_set_context_reg(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, db_render_override);
3730 	radeon_set_context_reg(ctx_cs, R_028010_DB_RENDER_OVERRIDE2, db_render_override2);
3731 }
3732 
3733 static void
radv_pipeline_generate_blend_state(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline,const struct radv_blend_state * blend)3734 radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs,
3735                                    const struct radv_pipeline *pipeline,
3736                                    const struct radv_blend_state *blend)
3737 {
3738 	radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
3739 	radeon_emit_array(ctx_cs, blend->cb_blend_control,
3740 			  8);
3741 	radeon_set_context_reg(ctx_cs, R_028808_CB_COLOR_CONTROL, blend->cb_color_control);
3742 	radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
3743 
3744 	if (pipeline->device->physical_device->rad_info.has_rbplus) {
3745 
3746 		radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
3747 		radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
3748 	}
3749 
3750 	radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
3751 
3752 	radeon_set_context_reg(ctx_cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask);
3753 	radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
3754 }
3755 
3756 static void
radv_pipeline_generate_raster_state(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)3757 radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs,
3758 				    const struct radv_pipeline *pipeline,
3759                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
3760 {
3761 	const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState;
3762 	const VkConservativeRasterizationModeEXT mode =
3763 		radv_get_conservative_raster_mode(vkraster);
3764 	uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
3765 	bool depth_clip_disable = vkraster->depthClampEnable;
3766 
3767 	const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
3768 		vk_find_struct_const(vkraster->pNext, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
3769 	if (depth_clip_state) {
3770 		depth_clip_disable = !depth_clip_state->depthClipEnable;
3771 	}
3772 
3773 	radeon_set_context_reg(ctx_cs, R_028810_PA_CL_CLIP_CNTL,
3774 	                       S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions.
3775 	                       S_028810_ZCLIP_NEAR_DISABLE(depth_clip_disable ? 1 : 0) |
3776 	                       S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) |
3777 	                       S_028810_DX_RASTERIZATION_KILL(vkraster->rasterizerDiscardEnable ? 1 : 0) |
3778 	                       S_028810_DX_LINEAR_ATTR_CLIP_ENA(1));
3779 
3780 	radeon_set_context_reg(ctx_cs, R_028BDC_PA_SC_LINE_CNTL,
3781 			       S_028BDC_DX10_DIAMOND_TEST_ENA(1));
3782 
3783 	/* Conservative rasterization. */
3784 	if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
3785 		pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) |
3786 					  S_028C4C_POSTZ_AA_MASK_ENABLE(1) |
3787 					  S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
3788 
3789 		if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) {
3790 			pa_sc_conservative_rast |=
3791 				S_028C4C_OVER_RAST_ENABLE(1) |
3792 				S_028C4C_OVER_RAST_SAMPLE_SELECT(0) |
3793 				S_028C4C_UNDER_RAST_ENABLE(0) |
3794 				S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
3795 				S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
3796 		} else {
3797 			assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT);
3798 			pa_sc_conservative_rast |=
3799 				S_028C4C_OVER_RAST_ENABLE(0) |
3800 				S_028C4C_OVER_RAST_SAMPLE_SELECT(1) |
3801 				S_028C4C_UNDER_RAST_ENABLE(1) |
3802 				S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) |
3803 				S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0);
3804 		}
3805 	}
3806 
3807 	radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
3808 				   pa_sc_conservative_rast);
3809 }
3810 
3811 
3812 static void
radv_pipeline_generate_multisample_state(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline)3813 radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs,
3814                                          const struct radv_pipeline *pipeline)
3815 {
3816 	const struct radv_multisample_state *ms = &pipeline->graphics.ms;
3817 
3818 	radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
3819 	radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
3820 	radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
3821 
3822 	radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
3823 	radeon_set_context_reg(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);
3824 	radeon_set_context_reg(ctx_cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
3825 	radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config);
3826 
3827 	/* The exclusion bits can be set to improve rasterization efficiency
3828 	 * if no sample lies on the pixel boundary (-8 sample offset). It's
3829 	 * currently always TRUE because the driver doesn't support 16 samples.
3830 	 */
3831 	bool exclusion = pipeline->device->physical_device->rad_info.chip_class >= GFX7;
3832 	radeon_set_context_reg(ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
3833 			       S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) |
3834 			       S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
3835 
3836 	/* GFX9: Flush DFSM when the AA mode changes. */
3837 	if (pipeline->device->dfsm_allowed) {
3838 		radeon_emit(ctx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
3839 		radeon_emit(ctx_cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
3840 	}
3841 }
3842 
3843 static void
radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline)3844 radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
3845                                    const struct radv_pipeline *pipeline)
3846 {
3847 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
3848 	const struct radv_shader_variant *vs =
3849 		pipeline->shaders[MESA_SHADER_TESS_EVAL] ?
3850 		pipeline->shaders[MESA_SHADER_TESS_EVAL] :
3851 		pipeline->shaders[MESA_SHADER_VERTEX];
3852 	unsigned vgt_primitiveid_en = 0;
3853 	uint32_t vgt_gs_mode = 0;
3854 
3855 	if (radv_pipeline_has_ngg(pipeline))
3856 		return;
3857 
3858 	if (radv_pipeline_has_gs(pipeline)) {
3859 		const struct radv_shader_variant *gs =
3860 			pipeline->shaders[MESA_SHADER_GEOMETRY];
3861 
3862 		vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
3863 		                             pipeline->device->physical_device->rad_info.chip_class);
3864 	} else if (outinfo->export_prim_id || vs->info.uses_prim_id) {
3865 		vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
3866 		vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
3867 	}
3868 
3869 	radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
3870 	radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
3871 }
3872 
3873 static void
radv_pipeline_generate_hw_vs(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline,const struct radv_shader_variant * shader)3874 radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs,
3875 			     struct radeon_cmdbuf *cs,
3876 			     const struct radv_pipeline *pipeline,
3877 			     const struct radv_shader_variant *shader)
3878 {
3879 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
3880 
3881 	radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
3882 	radeon_emit(cs, va >> 8);
3883 	radeon_emit(cs, S_00B124_MEM_BASE(va >> 40));
3884 	radeon_emit(cs, shader->config.rsrc1);
3885 	radeon_emit(cs, shader->config.rsrc2);
3886 
3887 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
3888 	unsigned clip_dist_mask, cull_dist_mask, total_mask;
3889 	clip_dist_mask = outinfo->clip_dist_mask;
3890 	cull_dist_mask = outinfo->cull_dist_mask;
3891 	total_mask = clip_dist_mask | cull_dist_mask;
3892 	bool misc_vec_ena = outinfo->writes_pointsize ||
3893 		outinfo->writes_layer ||
3894 		outinfo->writes_viewport_index;
3895 	unsigned spi_vs_out_config, nparams;
3896 
3897 	/* VS is required to export at least one param. */
3898 	nparams = MAX2(outinfo->param_exports, 1);
3899 	spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
3900 
3901 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
3902 		spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0);
3903 	}
3904 
3905 	radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config);
3906 
3907 	radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
3908 	                       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
3909 	                       S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
3910 	                                                   V_02870C_SPI_SHADER_4COMP :
3911 	                                                   V_02870C_SPI_SHADER_NONE) |
3912 	                       S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
3913 	                                                   V_02870C_SPI_SHADER_4COMP :
3914 	                                                   V_02870C_SPI_SHADER_NONE) |
3915 	                       S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
3916 	                                                   V_02870C_SPI_SHADER_4COMP :
3917 	                                                   V_02870C_SPI_SHADER_NONE));
3918 
3919 	radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
3920 	                       S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
3921 	                       S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
3922 	                       S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
3923 	                       S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
3924 	                       S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
3925 	                       S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
3926 	                       S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
3927 			       S_02881C_BYPASS_PRIM_RATE_COMBINER_GFX103(pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) |
3928 			       S_02881C_BYPASS_VTX_RATE_COMBINER_GFX103(pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) |
3929 	                       cull_dist_mask << 8 |
3930 	                       clip_dist_mask);
3931 
3932 	if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
3933 		radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
3934 		                       outinfo->writes_viewport_index);
3935 }
3936 
3937 static void
radv_pipeline_generate_hw_es(struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline,const struct radv_shader_variant * shader)3938 radv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs,
3939 			     const struct radv_pipeline *pipeline,
3940 			     const struct radv_shader_variant *shader)
3941 {
3942 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
3943 
3944 	radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
3945 	radeon_emit(cs, va >> 8);
3946 	radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
3947 	radeon_emit(cs, shader->config.rsrc1);
3948 	radeon_emit(cs, shader->config.rsrc2);
3949 }
3950 
3951 static void
radv_pipeline_generate_hw_ls(struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline,const struct radv_shader_variant * shader)3952 radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
3953 			     const struct radv_pipeline *pipeline,
3954 			     const struct radv_shader_variant *shader)
3955 {
3956 	unsigned num_lds_blocks = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks;
3957 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
3958 	uint32_t rsrc2 = shader->config.rsrc2;
3959 
3960 	radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
3961 	radeon_emit(cs, va >> 8);
3962 	radeon_emit(cs, S_00B524_MEM_BASE(va >> 40));
3963 
3964 	rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks);
3965 	if (pipeline->device->physical_device->rad_info.chip_class == GFX7 &&
3966 	    pipeline->device->physical_device->rad_info.family != CHIP_HAWAII)
3967 		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
3968 
3969 	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
3970 	radeon_emit(cs, shader->config.rsrc1);
3971 	radeon_emit(cs, rsrc2);
3972 }
3973 
3974 static void
radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline,const struct radv_shader_variant * shader)3975 radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs,
3976 			      struct radeon_cmdbuf *cs,
3977 			      const struct radv_pipeline *pipeline,
3978 			      const struct radv_shader_variant *shader)
3979 {
3980 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
3981 	gl_shader_stage es_type =
3982 		radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3983 	struct radv_shader_variant *es =
3984 		es_type == MESA_SHADER_TESS_EVAL ? pipeline->shaders[MESA_SHADER_TESS_EVAL] : pipeline->shaders[MESA_SHADER_VERTEX];
3985 	const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
3986 
3987 	radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
3988 	radeon_emit(cs, va >> 8);
3989 	radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
3990 	radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
3991 	radeon_emit(cs, shader->config.rsrc1);
3992 	radeon_emit(cs, shader->config.rsrc2);
3993 
3994 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
3995 	unsigned clip_dist_mask, cull_dist_mask, total_mask;
3996 	clip_dist_mask = outinfo->clip_dist_mask;
3997 	cull_dist_mask = outinfo->cull_dist_mask;
3998 	total_mask = clip_dist_mask | cull_dist_mask;
3999 	bool misc_vec_ena = outinfo->writes_pointsize ||
4000 		outinfo->writes_layer ||
4001 		outinfo->writes_viewport_index;
4002 	bool es_enable_prim_id = outinfo->export_prim_id ||
4003 				 (es && es->info.uses_prim_id);
4004 	bool break_wave_at_eoi = false;
4005 	unsigned ge_cntl;
4006 	unsigned nparams;
4007 
4008 	if (es_type == MESA_SHADER_TESS_EVAL) {
4009 		struct radv_shader_variant *gs =
4010 			pipeline->shaders[MESA_SHADER_GEOMETRY];
4011 
4012 		if (es_enable_prim_id || (gs && gs->info.uses_prim_id))
4013 			break_wave_at_eoi = true;
4014 	}
4015 
4016 	nparams = MAX2(outinfo->param_exports, 1);
4017 	radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
4018 	                       S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
4019 			       S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0));
4020 
4021 	radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
4022 			       S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
4023 	radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
4024 	                       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
4025 	                       S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
4026 	                                                   V_02870C_SPI_SHADER_4COMP :
4027 	                                                   V_02870C_SPI_SHADER_NONE) |
4028 	                       S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
4029 	                                                   V_02870C_SPI_SHADER_4COMP :
4030 	                                                   V_02870C_SPI_SHADER_NONE) |
4031 	                       S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
4032 	                                                   V_02870C_SPI_SHADER_4COMP :
4033 	                                                   V_02870C_SPI_SHADER_NONE));
4034 
4035 	radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
4036 	                       S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
4037 	                       S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
4038 	                       S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
4039 	                       S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
4040 	                       S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
4041 	                       S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
4042 	                       S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
4043 			       S_02881C_BYPASS_PRIM_RATE_COMBINER_GFX103(pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) |
4044 			       S_02881C_BYPASS_VTX_RATE_COMBINER_GFX103(pipeline->device->physical_device->rad_info.chip_class >= GFX10_3) |
4045 	                       cull_dist_mask << 8 |
4046 	                       clip_dist_mask);
4047 
4048 	radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN,
4049 			       S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
4050 			       S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id));
4051 
4052 	radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
4053 			       ngg_state->vgt_esgs_ring_itemsize);
4054 
4055 	/* NGG specific registers. */
4056 	struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4057 	uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
4058 
4059 	radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
4060 			       S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
4061 			       S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
4062 			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
4063 	radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
4064 			       S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
4065 	radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
4066 			       S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
4067 			       S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
4068 	radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
4069 			       S_028B90_CNT(gs_num_invocations) |
4070 			       S_028B90_ENABLE(gs_num_invocations > 1) |
4071 			       S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
4072 
4073 	/* User edge flags are set by the pos exports. If user edge flags are
4074 	 * not used, we must use hw-generated edge flags and pass them via
4075 	 * the prim export to prevent drawing lines on internal edges of
4076 	 * decomposed primitives (such as quads) with polygon mode = lines.
4077 	 *
4078 	 * TODO: We should combine hw-generated edge flags with user edge
4079 	 *       flags in the shader.
4080 	 */
4081 	radeon_set_context_reg(ctx_cs, R_028838_PA_CL_NGG_CNTL,
4082 			       S_028838_INDEX_BUF_EDGE_FLAG_ENA(!radv_pipeline_has_tess(pipeline) &&
4083 			                                        !radv_pipeline_has_gs(pipeline)) |
4084 			       /* Reuse for NGG. */
4085 			       S_028838_VERTEX_REUSE_DEPTH_GFX103(pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 30 : 0));
4086 
4087 	ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
4088 		  S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
4089 		  S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
4090 
4091 	/* Bug workaround for a possible hang with non-tessellation cases.
4092 	 * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
4093 	 *
4094 	 * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
4095 	 */
4096 	if (pipeline->device->physical_device->rad_info.chip_class == GFX10 &&
4097 	    !radv_pipeline_has_tess(pipeline) &&
4098 	    ngg_state->hw_max_esverts != 256) {
4099 		ge_cntl &= C_03096C_VERT_GRP_SIZE;
4100 
4101 		if (ngg_state->hw_max_esverts > 5) {
4102 			ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
4103 		}
4104 	}
4105 
4106 	radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
4107 }
4108 
4109 static void
radv_pipeline_generate_hw_hs(struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline,const struct radv_shader_variant * shader)4110 radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
4111 			     const struct radv_pipeline *pipeline,
4112 			     const struct radv_shader_variant *shader)
4113 {
4114 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
4115 
4116 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4117 		if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4118 			radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
4119 			radeon_emit(cs, va >> 8);
4120 			radeon_emit(cs, S_00B524_MEM_BASE(va >> 40));
4121 		} else {
4122 			radeon_set_sh_reg_seq(cs, R_00B410_SPI_SHADER_PGM_LO_LS, 2);
4123 			radeon_emit(cs, va >> 8);
4124 			radeon_emit(cs, S_00B414_MEM_BASE(va >> 40));
4125 		}
4126 
4127 		radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
4128 		radeon_emit(cs, shader->config.rsrc1);
4129 		radeon_emit(cs, shader->config.rsrc2);
4130 	} else {
4131 		radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
4132 		radeon_emit(cs, va >> 8);
4133 		radeon_emit(cs, S_00B424_MEM_BASE(va >> 40));
4134 		radeon_emit(cs, shader->config.rsrc1);
4135 		radeon_emit(cs, shader->config.rsrc2);
4136 	}
4137 }
4138 
4139 static void
radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline)4140 radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
4141 				     struct radeon_cmdbuf *cs,
4142 				     const struct radv_pipeline *pipeline)
4143 {
4144 	struct radv_shader_variant *vs;
4145 
4146 	/* Skip shaders merged into HS/GS */
4147 	vs = pipeline->shaders[MESA_SHADER_VERTEX];
4148 	if (!vs)
4149 		return;
4150 
4151 	if (vs->info.vs.as_ls)
4152 		radv_pipeline_generate_hw_ls(cs, pipeline, vs);
4153 	else if (vs->info.vs.as_es)
4154 		radv_pipeline_generate_hw_es(cs, pipeline, vs);
4155 	else if (vs->info.is_ngg)
4156 		radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs);
4157 	else
4158 		radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
4159 }
4160 
4161 static void
radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline)4162 radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs,
4163 				    struct radeon_cmdbuf *cs,
4164 				    const struct radv_pipeline *pipeline)
4165 {
4166 	struct radv_shader_variant *tes, *tcs;
4167 
4168 	tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL];
4169 	tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
4170 
4171 	if (tes) {
4172 		if (tes->info.is_ngg) {
4173 			radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes);
4174 		} else if (tes->info.tes.as_es)
4175 			radv_pipeline_generate_hw_es(cs, pipeline, tes);
4176 		else
4177 			radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, tes);
4178 	}
4179 
4180 	radv_pipeline_generate_hw_hs(cs, pipeline, tcs);
4181 
4182 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
4183 	    !radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
4184 		radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
4185 		                       S_028A44_ES_VERTS_PER_SUBGRP(250) |
4186 		                       S_028A44_GS_PRIMS_PER_SUBGRP(126) |
4187 		                       S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
4188 	}
4189 }
4190 
4191 static void
radv_pipeline_generate_tess_state(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)4192 radv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs,
4193 				  const struct radv_pipeline *pipeline,
4194 				  const VkGraphicsPipelineCreateInfo *pCreateInfo)
4195 {
4196 	struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL);
4197 	unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
4198 	unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches;
4199 	unsigned ls_hs_config;
4200 
4201 	num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints;
4202 	num_tcs_output_cp = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; //TCS VERTICES OUT
4203 	num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
4204 
4205 	ls_hs_config = S_028B58_NUM_PATCHES(num_patches) |
4206 		       S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
4207 		       S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
4208 
4209 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
4210 		radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG,
4211 					   2, ls_hs_config);
4212 	} else {
4213 		radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG,
4214 				       ls_hs_config);
4215 	}
4216 
4217 	switch (tes->info.tes.primitive_mode) {
4218 	case GL_TRIANGLES:
4219 		type = V_028B6C_TESS_TRIANGLE;
4220 		break;
4221 	case GL_QUADS:
4222 		type = V_028B6C_TESS_QUAD;
4223 		break;
4224 	case GL_ISOLINES:
4225 		type = V_028B6C_TESS_ISOLINE;
4226 		break;
4227 	}
4228 
4229 	switch (tes->info.tes.spacing) {
4230 	case TESS_SPACING_EQUAL:
4231 		partitioning = V_028B6C_PART_INTEGER;
4232 		break;
4233 	case TESS_SPACING_FRACTIONAL_ODD:
4234 		partitioning = V_028B6C_PART_FRAC_ODD;
4235 		break;
4236 	case TESS_SPACING_FRACTIONAL_EVEN:
4237 		partitioning = V_028B6C_PART_FRAC_EVEN;
4238 		break;
4239 	default:
4240 		break;
4241 	}
4242 
4243 	bool ccw = tes->info.tes.ccw;
4244 	const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
4245 	              vk_find_struct_const(pCreateInfo->pTessellationState,
4246 	                                   PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
4247 
4248 	if (domain_origin_state && domain_origin_state->domainOrigin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
4249 		ccw = !ccw;
4250 
4251 	if (tes->info.tes.point_mode)
4252 		topology = V_028B6C_OUTPUT_POINT;
4253 	else if (tes->info.tes.primitive_mode == GL_ISOLINES)
4254 		topology = V_028B6C_OUTPUT_LINE;
4255 	else if (ccw)
4256 		topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
4257 	else
4258 		topology = V_028B6C_OUTPUT_TRIANGLE_CW;
4259 
4260 	if (pipeline->device->physical_device->rad_info.has_distributed_tess) {
4261 		if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI ||
4262 		    pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10)
4263 			distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
4264 		else
4265 			distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS;
4266 	} else
4267 		distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
4268 
4269 	radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
4270 			       S_028B6C_TYPE(type) |
4271 			       S_028B6C_PARTITIONING(partitioning) |
4272 			       S_028B6C_TOPOLOGY(topology) |
4273 			       S_028B6C_DISTRIBUTION_MODE(distribution_mode));
4274 }
4275 
4276 static void
radv_pipeline_generate_hw_gs(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline,const struct radv_shader_variant * gs)4277 radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs,
4278 			     struct radeon_cmdbuf *cs,
4279 			     const struct radv_pipeline *pipeline,
4280 			     const struct radv_shader_variant *gs)
4281 {
4282 	const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info;
4283 	unsigned gs_max_out_vertices;
4284 	const uint8_t *num_components;
4285 	uint8_t max_stream;
4286 	unsigned offset;
4287 	uint64_t va;
4288 
4289 	gs_max_out_vertices = gs->info.gs.vertices_out;
4290 	max_stream = gs->info.gs.max_stream;
4291 	num_components = gs->info.gs.num_stream_output_components;
4292 
4293 	offset = num_components[0] * gs_max_out_vertices;
4294 
4295 	radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
4296 	radeon_emit(ctx_cs, offset);
4297 	if (max_stream >= 1)
4298 		offset += num_components[1] * gs_max_out_vertices;
4299 	radeon_emit(ctx_cs, offset);
4300 	if (max_stream >= 2)
4301 		offset += num_components[2] * gs_max_out_vertices;
4302 	radeon_emit(ctx_cs, offset);
4303 	if (max_stream >= 3)
4304 		offset += num_components[3] * gs_max_out_vertices;
4305 	radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
4306 
4307 	radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
4308 	radeon_emit(ctx_cs, num_components[0]);
4309 	radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
4310 	radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
4311 	radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
4312 
4313 	uint32_t gs_num_invocations = gs->info.gs.invocations;
4314 	radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
4315 			       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
4316 			       S_028B90_ENABLE(gs_num_invocations > 0));
4317 
4318 	radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
4319 			       gs_state->vgt_esgs_ring_itemsize);
4320 
4321 	va = radv_buffer_get_va(gs->bo) + gs->bo_offset;
4322 
4323 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4324 		if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4325 			radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
4326 			radeon_emit(cs, va >> 8);
4327 			radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
4328 		} else {
4329 			radeon_set_sh_reg_seq(cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2);
4330 			radeon_emit(cs, va >> 8);
4331 			radeon_emit(cs, S_00B214_MEM_BASE(va >> 40));
4332 		}
4333 
4334 		radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
4335 		radeon_emit(cs, gs->config.rsrc1);
4336 		radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
4337 
4338 		radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
4339 		radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, gs_state->vgt_gs_max_prims_per_subgroup);
4340 	} else {
4341 		radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
4342 		radeon_emit(cs, va >> 8);
4343 		radeon_emit(cs, S_00B224_MEM_BASE(va >> 40));
4344 		radeon_emit(cs, gs->config.rsrc1);
4345 		radeon_emit(cs, gs->config.rsrc2);
4346 	}
4347 
4348 	radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
4349 }
4350 
4351 static void
radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline)4352 radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs,
4353 				       struct radeon_cmdbuf *cs,
4354 				       const struct radv_pipeline *pipeline)
4355 {
4356 	struct radv_shader_variant *gs;
4357 
4358 	gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4359 	if (!gs)
4360 		return;
4361 
4362 	if (gs->info.is_ngg)
4363 		radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs);
4364 	else
4365 		radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs);
4366 
4367 	radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT,
4368 			      gs->info.gs.vertices_out);
4369 }
4370 
offset_to_ps_input(uint32_t offset,bool flat_shade,bool explicit,bool float16)4371 static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade,
4372 				   bool explicit, bool float16)
4373 {
4374 	uint32_t ps_input_cntl;
4375 	if (offset <= AC_EXP_PARAM_OFFSET_31) {
4376 		ps_input_cntl = S_028644_OFFSET(offset);
4377 		if (flat_shade || explicit)
4378 			ps_input_cntl |= S_028644_FLAT_SHADE(1);
4379 		if (explicit) {
4380 			/* Force parameter cache to be read in passthrough
4381 			 * mode.
4382 			 */
4383 			ps_input_cntl |= S_028644_OFFSET(1 << 5);
4384 		}
4385 		if (float16) {
4386 			ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
4387 			                 S_028644_ATTR0_VALID(1);
4388 		}
4389 	} else {
4390 		/* The input is a DEFAULT_VAL constant. */
4391 		assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
4392 		       offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
4393 		offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
4394 		ps_input_cntl = S_028644_OFFSET(0x20) |
4395 			S_028644_DEFAULT_VAL(offset);
4396 	}
4397 	return ps_input_cntl;
4398 }
4399 
4400 static void
radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline)4401 radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
4402 				 const struct radv_pipeline *pipeline)
4403 {
4404 	struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4405 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4406 	uint32_t ps_input_cntl[32];
4407 
4408 	unsigned ps_offset = 0;
4409 
4410 	if (ps->info.ps.prim_id_input) {
4411 		unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
4412 		if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
4413 			ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4414 			++ps_offset;
4415 		}
4416 	}
4417 
4418 	if (ps->info.ps.layer_input ||
4419 	    ps->info.needs_multiview_view_index) {
4420 		unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
4421 		if (vs_offset != AC_EXP_PARAM_UNDEFINED)
4422 			ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4423 		else
4424 			ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
4425 		++ps_offset;
4426 	}
4427 
4428 	if (ps->info.ps.viewport_index_input) {
4429 		unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT];
4430 		if (vs_offset != AC_EXP_PARAM_UNDEFINED)
4431 			ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4432 		else
4433 			ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
4434 		++ps_offset;
4435 	}
4436 
4437 	if (ps->info.ps.has_pcoord) {
4438 		unsigned val;
4439 		val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
4440 		ps_input_cntl[ps_offset] = val;
4441 		ps_offset++;
4442 	}
4443 
4444 	if (ps->info.ps.num_input_clips_culls) {
4445 		unsigned vs_offset;
4446 
4447 		vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0];
4448 		if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
4449 			ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
4450 			++ps_offset;
4451 		}
4452 
4453 		vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1];
4454 		if (vs_offset != AC_EXP_PARAM_UNDEFINED &&
4455 		    ps->info.ps.num_input_clips_culls > 4) {
4456 			ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
4457 			++ps_offset;
4458 		}
4459 	}
4460 
4461 	for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.ps.input_mask; ++i) {
4462 		unsigned vs_offset;
4463 		bool flat_shade;
4464 		bool explicit;
4465 		bool float16;
4466 		if (!(ps->info.ps.input_mask & (1u << i)))
4467 			continue;
4468 
4469 		vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
4470 		if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
4471 			ps_input_cntl[ps_offset] = S_028644_OFFSET(0x20);
4472 			++ps_offset;
4473 			continue;
4474 		}
4475 
4476 		flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << ps_offset));
4477 		explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << ps_offset));
4478 		float16 = !!(ps->info.ps.float16_shaded_mask & (1u << ps_offset));
4479 
4480 		ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16);
4481 		++ps_offset;
4482 	}
4483 
4484 	if (ps_offset) {
4485 		radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
4486 		for (unsigned i = 0; i < ps_offset; i++) {
4487 			radeon_emit(ctx_cs, ps_input_cntl[i]);
4488 		}
4489 	}
4490 }
4491 
4492 static uint32_t
radv_compute_db_shader_control(const struct radv_device * device,const struct radv_pipeline * pipeline,const struct radv_shader_variant * ps)4493 radv_compute_db_shader_control(const struct radv_device *device,
4494 			       const struct radv_pipeline *pipeline,
4495                                const struct radv_shader_variant *ps)
4496 {
4497 	unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
4498 	unsigned z_order;
4499 	if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory)
4500 		z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
4501 	else
4502 		z_order = V_02880C_LATE_Z;
4503 
4504 	if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
4505 		conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
4506 	else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
4507 		conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
4508 
4509 	bool disable_rbplus = device->physical_device->rad_info.has_rbplus &&
4510 	                      !device->physical_device->rad_info.rbplus_allowed;
4511 
4512 	/* It shouldn't be needed to export gl_SampleMask when MSAA is disabled
4513 	 * but this appears to break Project Cars (DXVK). See
4514 	 * https://bugs.freedesktop.org/show_bug.cgi?id=109401
4515 	 */
4516 	bool mask_export_enable = ps->info.ps.writes_sample_mask;
4517 
4518 	return  S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) |
4519 		S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) |
4520 		S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) |
4521 		S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
4522 		S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) |
4523 		S_02880C_Z_ORDER(z_order) |
4524 		S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) |
4525 		S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) |
4526 		S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) |
4527 		S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) |
4528 		S_02880C_DUAL_QUAD_DISABLE(disable_rbplus);
4529 }
4530 
4531 static void
radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf * ctx_cs,struct radeon_cmdbuf * cs,struct radv_pipeline * pipeline)4532 radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs,
4533 				       struct radeon_cmdbuf *cs,
4534 				       struct radv_pipeline *pipeline)
4535 {
4536 	struct radv_shader_variant *ps;
4537 	uint64_t va;
4538 	assert (pipeline->shaders[MESA_SHADER_FRAGMENT]);
4539 
4540 	ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4541 	va = radv_buffer_get_va(ps->bo) + ps->bo_offset;
4542 
4543 	radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
4544 	radeon_emit(cs, va >> 8);
4545 	radeon_emit(cs, S_00B024_MEM_BASE(va >> 40));
4546 	radeon_emit(cs, ps->config.rsrc1);
4547 	radeon_emit(cs, ps->config.rsrc2);
4548 
4549 	radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
4550 	                       radv_compute_db_shader_control(pipeline->device,
4551 							      pipeline, ps));
4552 
4553 	radeon_set_context_reg(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA,
4554 			       ps->config.spi_ps_input_ena);
4555 
4556 	radeon_set_context_reg(ctx_cs, R_0286D0_SPI_PS_INPUT_ADDR,
4557 			       ps->config.spi_ps_input_addr);
4558 
4559 	radeon_set_context_reg(ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
4560 			       S_0286D8_NUM_INTERP(ps->info.ps.num_interp) |
4561 			       S_0286D8_PS_W32_EN(ps->info.wave_size == 32));
4562 
4563 	radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
4564 
4565 	radeon_set_context_reg(ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
4566 	                       ac_get_spi_shader_z_format(ps->info.ps.writes_z,
4567 	                                                  ps->info.ps.writes_stencil,
4568 	                                                  ps->info.ps.writes_sample_mask));
4569 
4570 	if (pipeline->device->dfsm_allowed) {
4571 		/* optimise this? */
4572 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
4573 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
4574 	}
4575 }
4576 
4577 static void
radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline)4578 radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
4579 					const struct radv_pipeline *pipeline)
4580 {
4581 	if (pipeline->device->physical_device->rad_info.family < CHIP_POLARIS10 ||
4582 	    pipeline->device->physical_device->rad_info.chip_class >= GFX10)
4583 		return;
4584 
4585 	unsigned vtx_reuse_depth = 30;
4586 	if (radv_pipeline_has_tess(pipeline) &&
4587 	    radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) {
4588 		vtx_reuse_depth = 14;
4589 	}
4590 	radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
4591 	                       S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
4592 }
4593 
4594 static void
radv_pipeline_generate_vgt_shader_config(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline)4595 radv_pipeline_generate_vgt_shader_config(struct radeon_cmdbuf *ctx_cs,
4596 					 const struct radv_pipeline *pipeline)
4597 {
4598 	uint32_t stages = 0;
4599 	if (radv_pipeline_has_tess(pipeline)) {
4600 		stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
4601 			S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
4602 
4603 		if (radv_pipeline_has_gs(pipeline))
4604 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
4605 				  S_028B54_GS_EN(1);
4606 		else if (radv_pipeline_has_ngg(pipeline))
4607 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
4608 		else
4609 			stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
4610 	} else if (radv_pipeline_has_gs(pipeline)) {
4611 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
4612 			S_028B54_GS_EN(1);
4613 	} else if (radv_pipeline_has_ngg(pipeline)) {
4614 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
4615 	}
4616 
4617 	if (radv_pipeline_has_ngg(pipeline)) {
4618 		stages |= S_028B54_PRIMGEN_EN(1);
4619 		if (pipeline->streamout_shader)
4620 			stages |= S_028B54_NGG_WAVE_ID_EN(1);
4621 		if (radv_pipeline_has_ngg_passthrough(pipeline))
4622 			stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1);
4623 	} else if (radv_pipeline_has_gs(pipeline)) {
4624 		stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
4625 	}
4626 
4627 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
4628 		stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
4629 
4630 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4631 		uint8_t hs_size = 64, gs_size = 64, vs_size = 64;
4632 
4633 		if (radv_pipeline_has_tess(pipeline))
4634 			hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.wave_size;
4635 
4636 		if (pipeline->shaders[MESA_SHADER_GEOMETRY]) {
4637 			vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.wave_size;
4638 			if (pipeline->gs_copy_shader)
4639 				vs_size = pipeline->gs_copy_shader->info.wave_size;
4640 		} else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
4641 			vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.wave_size;
4642 		else if (pipeline->shaders[MESA_SHADER_VERTEX])
4643 			vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.wave_size;
4644 
4645 		if (radv_pipeline_has_ngg(pipeline))
4646 			gs_size = vs_size;
4647 
4648 		/* legacy GS only supports Wave64 */
4649 		stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) |
4650 			  S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) |
4651 			  S_028B54_VS_W32_EN(vs_size == 32 ? 1 : 0);
4652 	}
4653 
4654 	radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages);
4655 }
4656 
4657 static void
radv_pipeline_generate_cliprect_rule(struct radeon_cmdbuf * ctx_cs,const VkGraphicsPipelineCreateInfo * pCreateInfo)4658 radv_pipeline_generate_cliprect_rule(struct radeon_cmdbuf *ctx_cs,
4659 				     const VkGraphicsPipelineCreateInfo *pCreateInfo)
4660 {
4661 	const  VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
4662 			vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
4663 	uint32_t cliprect_rule = 0;
4664 
4665 	if (!discard_rectangle_info) {
4666 		cliprect_rule = 0xffff;
4667 	} else {
4668 		for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
4669 			/* Interpret i as a bitmask, and then set the bit in
4670 			 * the mask if that combination of rectangles in which
4671 			 * the pixel is contained should pass the cliprect
4672 			 * test.
4673 			 */
4674 			unsigned relevant_subset = i & ((1u << discard_rectangle_info->discardRectangleCount) - 1);
4675 
4676 			if (discard_rectangle_info->discardRectangleMode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT &&
4677 			    !relevant_subset)
4678 				continue;
4679 
4680 			if (discard_rectangle_info->discardRectangleMode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT &&
4681 			    relevant_subset)
4682 				continue;
4683 
4684 			cliprect_rule |= 1u << i;
4685 		}
4686 	}
4687 
4688 	radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
4689 }
4690 
4691 static void
gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf * ctx_cs,struct radv_pipeline * pipeline)4692 gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs,
4693 				struct radv_pipeline *pipeline)
4694 {
4695 	bool break_wave_at_eoi = false;
4696 	unsigned primgroup_size;
4697 	unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
4698 
4699 	if (radv_pipeline_has_tess(pipeline)) {
4700 		primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches;
4701 	} else if (radv_pipeline_has_gs(pipeline)) {
4702 		const struct gfx9_gs_info *gs_state =
4703 			&pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
4704 		unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl;
4705 		primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
4706 	} else {
4707 		primgroup_size = 128; /* recommended without a GS and tess */
4708 	}
4709 
4710 	if (radv_pipeline_has_tess(pipeline)) {
4711 		if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
4712 		    radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
4713 			break_wave_at_eoi = true;
4714 	}
4715 
4716 	radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL,
4717 			       S_03096C_PRIM_GRP_SIZE(primgroup_size) |
4718 			       S_03096C_VERT_GRP_SIZE(vertgroup_size) |
4719 			       S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ |
4720 			       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
4721 }
4722 
4723 static void
radv_pipeline_generate_vgt_gs_out(struct radeon_cmdbuf * ctx_cs,const struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra)4724 radv_pipeline_generate_vgt_gs_out(struct radeon_cmdbuf *ctx_cs,
4725 				  const struct radv_pipeline *pipeline,
4726 				  const VkGraphicsPipelineCreateInfo *pCreateInfo,
4727 				  const struct radv_graphics_pipeline_create_info *extra)
4728 {
4729 	uint32_t gs_out;
4730 
4731 	if (radv_pipeline_has_gs(pipeline)) {
4732 		gs_out = si_conv_gl_prim_to_gs_out(pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim);
4733 	} else if (radv_pipeline_has_tess(pipeline)) {
4734 		if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) {
4735 			gs_out = V_028A6C_OUTPRIM_TYPE_POINTLIST;
4736 		} else {
4737 			gs_out = si_conv_gl_prim_to_gs_out(pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.primitive_mode);
4738 		}
4739 	} else {
4740 		gs_out = si_conv_prim_to_gs_out(pCreateInfo->pInputAssemblyState->topology);
4741 	}
4742 
4743 	if (extra && extra->use_rectlist) {
4744 		gs_out = V_028A6C_OUTPRIM_TYPE_TRISTRIP;
4745 		if (radv_pipeline_has_ngg(pipeline))
4746 			gs_out = V_028A6C_VGT_OUT_RECT_V0;
4747 	}
4748 
4749 	radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
4750 }
4751 
4752 static void
radv_pipeline_generate_pm4(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra,const struct radv_blend_state * blend)4753 radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
4754                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
4755                            const struct radv_graphics_pipeline_create_info *extra,
4756                            const struct radv_blend_state *blend)
4757 {
4758 	struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
4759 	struct radeon_cmdbuf *cs = &pipeline->cs;
4760 
4761 	cs->max_dw = 64;
4762 	ctx_cs->max_dw = 256;
4763 	cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
4764 	ctx_cs->buf = cs->buf + cs->max_dw;
4765 
4766 	radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, pCreateInfo, extra);
4767 	radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend);
4768 	radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
4769 	radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
4770 	radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
4771 	radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline);
4772 
4773 	if (radv_pipeline_has_tess(pipeline)) {
4774 		radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline);
4775 		radv_pipeline_generate_tess_state(ctx_cs, pipeline, pCreateInfo);
4776 	}
4777 
4778 	radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline);
4779 	radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
4780 	radv_pipeline_generate_ps_inputs(ctx_cs, pipeline);
4781 	radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
4782 	radv_pipeline_generate_vgt_shader_config(ctx_cs, pipeline);
4783 	radv_pipeline_generate_cliprect_rule(ctx_cs, pCreateInfo);
4784 	radv_pipeline_generate_vgt_gs_out(ctx_cs, pipeline, pCreateInfo, extra);
4785 
4786 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
4787 		gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline);
4788 
4789 	pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
4790 
4791 	assert(ctx_cs->cdw <= ctx_cs->max_dw);
4792 	assert(cs->cdw <= cs->max_dw);
4793 }
4794 
4795 static void
radv_pipeline_init_vertex_input_state(struct radv_pipeline * pipeline,const VkGraphicsPipelineCreateInfo * pCreateInfo)4796 radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
4797 				      const VkGraphicsPipelineCreateInfo *pCreateInfo)
4798 {
4799 	const VkPipelineVertexInputStateCreateInfo *vi_info =
4800 		pCreateInfo->pVertexInputState;
4801 
4802 	for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
4803 		const VkVertexInputBindingDescription *desc =
4804 			&vi_info->pVertexBindingDescriptions[i];
4805 
4806 		pipeline->binding_stride[desc->binding] = desc->stride;
4807 		pipeline->num_vertex_bindings =
4808 			MAX2(pipeline->num_vertex_bindings, desc->binding + 1);
4809 	}
4810 }
4811 
4812 static struct radv_shader_variant *
radv_pipeline_get_streamout_shader(struct radv_pipeline * pipeline)4813 radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline)
4814 {
4815 	int i;
4816 
4817 	for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
4818 		struct radv_shader_variant *shader =
4819 			radv_get_shader(pipeline, i);
4820 
4821 		if (shader && shader->info.so.num_outputs > 0)
4822 			return shader;
4823 	}
4824 
4825 	return NULL;
4826 }
4827 
4828 static void
radv_pipeline_init_shader_stages_state(struct radv_pipeline * pipeline)4829 radv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline)
4830 {
4831 	struct radv_device *device = pipeline->device;
4832 
4833 	for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
4834 		pipeline->user_data_0[i] =
4835 			radv_pipeline_stage_to_user_data_0(pipeline, i,
4836 							   device->physical_device->rad_info.chip_class);
4837 
4838 		if (pipeline->shaders[i]) {
4839 			pipeline->need_indirect_descriptor_sets |= pipeline->shaders[i]->info.need_indirect_descriptor_sets;
4840 		}
4841 	}
4842 
4843 	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX,
4844 							     AC_UD_VS_BASE_VERTEX_START_INSTANCE);
4845 	if (loc->sgpr_idx != -1) {
4846 		pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX];
4847 		pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4;
4848 		if (radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id)
4849 			pipeline->graphics.vtx_emit_num = 3;
4850 		else
4851 			pipeline->graphics.vtx_emit_num = 2;
4852 	}
4853 }
4854 
4855 static VkResult
radv_pipeline_init(struct radv_pipeline * pipeline,struct radv_device * device,struct radv_pipeline_cache * cache,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra)4856 radv_pipeline_init(struct radv_pipeline *pipeline,
4857 		   struct radv_device *device,
4858 		   struct radv_pipeline_cache *cache,
4859 		   const VkGraphicsPipelineCreateInfo *pCreateInfo,
4860 		   const struct radv_graphics_pipeline_create_info *extra)
4861 {
4862 	VkResult result;
4863 
4864 	pipeline->device = device;
4865 	pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
4866 	assert(pipeline->layout);
4867 
4868 	struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra);
4869 
4870 	const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
4871 		vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
4872 	radv_init_feedback(creation_feedback);
4873 
4874 	VkPipelineCreationFeedbackEXT *pipeline_feedback = creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
4875 
4876 	const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, };
4877 	VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
4878 	for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
4879 		gl_shader_stage stage = ffs(pCreateInfo->pStages[i].stage) - 1;
4880 		pStages[stage] = &pCreateInfo->pStages[i];
4881 		if(creation_feedback)
4882 			stage_feedbacks[stage] = &creation_feedback->pPipelineStageCreationFeedbacks[i];
4883 	}
4884 
4885 	struct radv_pipeline_key key = radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend);
4886 
4887 	result = radv_create_shaders(pipeline, device, cache, &key, pStages,
4888 		                     pCreateInfo->flags, pipeline_feedback,
4889 				     stage_feedbacks);
4890 	if (result != VK_SUCCESS)
4891 		return result;
4892 
4893 	pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
4894 	radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo);
4895 	radv_pipeline_init_input_assembly_state(pipeline, pCreateInfo, extra);
4896 	radv_pipeline_init_dynamic_state(pipeline, pCreateInfo, extra);
4897 	radv_pipeline_init_raster_state(pipeline, pCreateInfo);
4898 	radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo);
4899 
4900 	/* Ensure that some export memory is always allocated, for two reasons:
4901 	 *
4902 	 * 1) Correctness: The hardware ignores the EXEC mask if no export
4903 	 *    memory is allocated, so KILL and alpha test do not work correctly
4904 	 *    without this.
4905 	 * 2) Performance: Every shader needs at least a NULL export, even when
4906 	 *    it writes no color/depth output. The NULL export instruction
4907 	 *    stalls without this setting.
4908 	 *
4909 	 * Don't add this to CB_SHADER_MASK.
4910 	 *
4911 	 * GFX10 supports pixel shaders without exports by setting both the
4912 	 * color and Z formats to SPI_SHADER_ZERO. The hw will skip export
4913 	 * instructions if any are present.
4914 	 */
4915 	struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4916 	if ((pipeline->device->physical_device->rad_info.chip_class <= GFX9 ||
4917 	     ps->info.ps.can_discard) &&
4918 	    !blend.spi_shader_col_format) {
4919 		if (!ps->info.ps.writes_z &&
4920 		    !ps->info.ps.writes_stencil &&
4921 		    !ps->info.ps.writes_sample_mask)
4922 			blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
4923 	}
4924 
4925 	blend.cb_shader_mask = ps->info.ps.cb_shader_mask;
4926 
4927 	if (extra &&
4928 	    (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR ||
4929 	     extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS ||
4930 	     extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS ||
4931 	     extra->custom_blend_mode == V_028808_CB_RESOLVE)) {
4932 		/* According to the CB spec states, CB_SHADER_MASK should be
4933 		 * set to enable writes to all four channels of MRT0.
4934 		 */
4935 		blend.cb_shader_mask = 0xf;
4936 	}
4937 
4938 	pipeline->graphics.col_format = blend.spi_shader_col_format;
4939 	pipeline->graphics.cb_target_mask = blend.cb_target_mask;
4940 
4941 	if (radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
4942 		struct radv_shader_variant *gs =
4943 			pipeline->shaders[MESA_SHADER_GEOMETRY];
4944 
4945 		radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info);
4946 	}
4947 
4948 	if (radv_pipeline_has_tess(pipeline)) {
4949 		pipeline->graphics.tess_patch_control_points =
4950 			pCreateInfo->pTessellationState->patchControlPoints;
4951 	}
4952 
4953 	radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo);
4954 	radv_pipeline_init_binning_state(pipeline, pCreateInfo, &blend);
4955 	radv_pipeline_init_shader_stages_state(pipeline);
4956 	radv_pipeline_init_scratch(device, pipeline);
4957 
4958 	/* Find the last vertex shader stage that eventually uses streamout. */
4959 	pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
4960 
4961 	radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend);
4962 
4963 	return result;
4964 }
4965 
4966 VkResult
radv_graphics_pipeline_create(VkDevice _device,VkPipelineCache _cache,const VkGraphicsPipelineCreateInfo * pCreateInfo,const struct radv_graphics_pipeline_create_info * extra,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipeline)4967 radv_graphics_pipeline_create(
4968 	VkDevice _device,
4969 	VkPipelineCache _cache,
4970 	const VkGraphicsPipelineCreateInfo *pCreateInfo,
4971 	const struct radv_graphics_pipeline_create_info *extra,
4972 	const VkAllocationCallbacks *pAllocator,
4973 	VkPipeline *pPipeline)
4974 {
4975 	RADV_FROM_HANDLE(radv_device, device, _device);
4976 	RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
4977 	struct radv_pipeline *pipeline;
4978 	VkResult result;
4979 
4980 	pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
4981 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4982 	if (pipeline == NULL)
4983 		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
4984 
4985 	vk_object_base_init(&device->vk, &pipeline->base,
4986 			    VK_OBJECT_TYPE_PIPELINE);
4987 
4988 	result = radv_pipeline_init(pipeline, device, cache,
4989 				    pCreateInfo, extra);
4990 	if (result != VK_SUCCESS) {
4991 		radv_pipeline_destroy(device, pipeline, pAllocator);
4992 		return result;
4993 	}
4994 
4995 	*pPipeline = radv_pipeline_to_handle(pipeline);
4996 
4997 	return VK_SUCCESS;
4998 }
4999 
radv_CreateGraphicsPipelines(VkDevice _device,VkPipelineCache pipelineCache,uint32_t count,const VkGraphicsPipelineCreateInfo * pCreateInfos,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipelines)5000 VkResult radv_CreateGraphicsPipelines(
5001 	VkDevice                                    _device,
5002 	VkPipelineCache                             pipelineCache,
5003 	uint32_t                                    count,
5004 	const VkGraphicsPipelineCreateInfo*         pCreateInfos,
5005 	const VkAllocationCallbacks*                pAllocator,
5006 	VkPipeline*                                 pPipelines)
5007 {
5008 	VkResult result = VK_SUCCESS;
5009 	unsigned i = 0;
5010 
5011 	for (; i < count; i++) {
5012 		VkResult r;
5013 		r = radv_graphics_pipeline_create(_device,
5014 						  pipelineCache,
5015 						  &pCreateInfos[i],
5016 						  NULL, pAllocator, &pPipelines[i]);
5017 		if (r != VK_SUCCESS) {
5018 			result = r;
5019 			pPipelines[i] = VK_NULL_HANDLE;
5020 
5021 			if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
5022 				break;
5023 		}
5024 	}
5025 
5026 	for (; i < count; ++i)
5027 		pPipelines[i] = VK_NULL_HANDLE;
5028 
5029 	return result;
5030 }
5031 
5032 static void
radv_pipeline_generate_hw_cs(struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline)5033 radv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs,
5034 			     const struct radv_pipeline *pipeline)
5035 {
5036 	struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5037 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
5038 	struct radv_device *device = pipeline->device;
5039 
5040 	radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
5041 	radeon_emit(cs, va >> 8);
5042 	radeon_emit(cs, S_00B834_DATA(va >> 40));
5043 
5044 	radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
5045 	radeon_emit(cs, shader->config.rsrc1);
5046 	radeon_emit(cs, shader->config.rsrc2);
5047 	if (device->physical_device->rad_info.chip_class >= GFX10) {
5048 		radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
5049 	}
5050 }
5051 
5052 static void
radv_pipeline_generate_compute_state(struct radeon_cmdbuf * cs,const struct radv_pipeline * pipeline)5053 radv_pipeline_generate_compute_state(struct radeon_cmdbuf *cs,
5054 				     const struct radv_pipeline *pipeline)
5055 {
5056 	struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5057 	struct radv_device *device = pipeline->device;
5058 	unsigned threads_per_threadgroup;
5059 	unsigned threadgroups_per_cu = 1;
5060 	unsigned waves_per_threadgroup;
5061 	unsigned max_waves_per_sh = 0;
5062 
5063 	/* Calculate best compute resource limits. */
5064 	threads_per_threadgroup = shader->info.cs.block_size[0] *
5065 				  shader->info.cs.block_size[1] *
5066 				  shader->info.cs.block_size[2];
5067 	waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup,
5068 					     shader->info.wave_size);
5069 
5070 	if (device->physical_device->rad_info.chip_class >= GFX10 &&
5071 	    waves_per_threadgroup == 1)
5072 		threadgroups_per_cu = 2;
5073 
5074 	radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
5075 			  ac_get_compute_resource_limits(&device->physical_device->rad_info,
5076 							 waves_per_threadgroup,
5077 							 max_waves_per_sh,
5078 							 threadgroups_per_cu));
5079 
5080 	radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
5081 	radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
5082 	radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
5083 	radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
5084 }
5085 
5086 static void
radv_compute_generate_pm4(struct radv_pipeline * pipeline)5087 radv_compute_generate_pm4(struct radv_pipeline *pipeline)
5088 {
5089 	struct radv_device *device = pipeline->device;
5090 	struct radeon_cmdbuf *cs = &pipeline->cs;
5091 
5092 	cs->max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 19 : 16;
5093 	cs->buf = malloc(cs->max_dw * 4);
5094 
5095 	radv_pipeline_generate_hw_cs(cs, pipeline);
5096 	radv_pipeline_generate_compute_state(cs, pipeline);
5097 
5098 	assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
5099 }
5100 
5101 static struct radv_pipeline_key
radv_generate_compute_pipeline_key(struct radv_pipeline * pipeline,const VkComputePipelineCreateInfo * pCreateInfo)5102 radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
5103 				   const VkComputePipelineCreateInfo *pCreateInfo)
5104 {
5105 	const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage;
5106 	struct radv_pipeline_key key;
5107 	memset(&key, 0, sizeof(key));
5108 
5109 	if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
5110 		key.optimisations_disabled = 1;
5111 
5112 	const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *subgroup_size =
5113 		vk_find_struct_const(stage->pNext,
5114 				     PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
5115 
5116 	if (subgroup_size) {
5117 		assert(subgroup_size->requiredSubgroupSize == 32 ||
5118 		       subgroup_size->requiredSubgroupSize == 64);
5119 		key.compute_subgroup_size = subgroup_size->requiredSubgroupSize;
5120 	}
5121 
5122 	return key;
5123 }
5124 
radv_compute_pipeline_create(VkDevice _device,VkPipelineCache _cache,const VkComputePipelineCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipeline)5125 static VkResult radv_compute_pipeline_create(
5126 	VkDevice                                    _device,
5127 	VkPipelineCache                             _cache,
5128 	const VkComputePipelineCreateInfo*          pCreateInfo,
5129 	const VkAllocationCallbacks*                pAllocator,
5130 	VkPipeline*                                 pPipeline)
5131 {
5132 	RADV_FROM_HANDLE(radv_device, device, _device);
5133 	RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
5134 	const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, };
5135 	VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
5136 	struct radv_pipeline *pipeline;
5137 	VkResult result;
5138 
5139 	pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
5140 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5141 	if (pipeline == NULL)
5142 		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
5143 
5144 	vk_object_base_init(&device->vk, &pipeline->base,
5145 			    VK_OBJECT_TYPE_PIPELINE);
5146 
5147 	pipeline->device = device;
5148 	pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
5149 	assert(pipeline->layout);
5150 
5151 	const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
5152 		vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
5153 	radv_init_feedback(creation_feedback);
5154 
5155 	VkPipelineCreationFeedbackEXT *pipeline_feedback = creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
5156 	if (creation_feedback)
5157 		stage_feedbacks[MESA_SHADER_COMPUTE] = &creation_feedback->pPipelineStageCreationFeedbacks[0];
5158 
5159 	pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
5160 
5161 	struct radv_pipeline_key key =
5162 		radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
5163 
5164 	result = radv_create_shaders(pipeline, device, cache, &key, pStages,
5165 		                     pCreateInfo->flags, pipeline_feedback,
5166 				     stage_feedbacks);
5167 	if (result != VK_SUCCESS) {
5168 		radv_pipeline_destroy(device, pipeline, pAllocator);
5169 		return result;
5170 	}
5171 
5172 	pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
5173 	pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
5174 	radv_pipeline_init_scratch(device, pipeline);
5175 
5176 	radv_compute_generate_pm4(pipeline);
5177 
5178 	*pPipeline = radv_pipeline_to_handle(pipeline);
5179 
5180 	return VK_SUCCESS;
5181 }
5182 
radv_CreateComputePipelines(VkDevice _device,VkPipelineCache pipelineCache,uint32_t count,const VkComputePipelineCreateInfo * pCreateInfos,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipelines)5183 VkResult radv_CreateComputePipelines(
5184 	VkDevice                                    _device,
5185 	VkPipelineCache                             pipelineCache,
5186 	uint32_t                                    count,
5187 	const VkComputePipelineCreateInfo*          pCreateInfos,
5188 	const VkAllocationCallbacks*                pAllocator,
5189 	VkPipeline*                                 pPipelines)
5190 {
5191 	VkResult result = VK_SUCCESS;
5192 
5193 	unsigned i = 0;
5194 	for (; i < count; i++) {
5195 		VkResult r;
5196 		r = radv_compute_pipeline_create(_device, pipelineCache,
5197 						 &pCreateInfos[i],
5198 						 pAllocator, &pPipelines[i]);
5199 		if (r != VK_SUCCESS) {
5200 			result = r;
5201 			pPipelines[i] = VK_NULL_HANDLE;
5202 
5203 			if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
5204 				break;
5205 		}
5206 	}
5207 
5208 	for (; i < count; ++i)
5209 		pPipelines[i] = VK_NULL_HANDLE;
5210 
5211 	return result;
5212 }
5213 
5214 
radv_get_executable_count(const struct radv_pipeline * pipeline)5215 static uint32_t radv_get_executable_count(const struct radv_pipeline *pipeline)
5216 {
5217 	uint32_t ret = 0;
5218 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
5219 		if (!pipeline->shaders[i])
5220 			continue;
5221 
5222 		if (i == MESA_SHADER_GEOMETRY &&
5223 		    !radv_pipeline_has_ngg(pipeline)) {
5224 			ret += 2u;
5225 		} else {
5226 			ret += 1u;
5227 		}
5228 
5229 	}
5230 	return ret;
5231 }
5232 
5233 static struct radv_shader_variant *
radv_get_shader_from_executable_index(const struct radv_pipeline * pipeline,int index,gl_shader_stage * stage)5234 radv_get_shader_from_executable_index(const struct radv_pipeline *pipeline, int index, gl_shader_stage *stage)
5235 {
5236 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
5237 		if (!pipeline->shaders[i])
5238 			continue;
5239 		if (!index) {
5240 			*stage = i;
5241 			return pipeline->shaders[i];
5242 		}
5243 
5244 		--index;
5245 
5246 		if (i == MESA_SHADER_GEOMETRY &&
5247 		    !radv_pipeline_has_ngg(pipeline)) {
5248 			if (!index) {
5249 				*stage = i;
5250 				return pipeline->gs_copy_shader;
5251 			}
5252 			--index;
5253 		}
5254 	}
5255 
5256 	*stage = -1;
5257 	return NULL;
5258 }
5259 
5260 /* Basically strlcpy (which does not exist on linux) specialized for
5261  * descriptions. */
desc_copy(char * desc,const char * src)5262 static void desc_copy(char *desc, const char *src) {
5263 	int len = strlen(src);
5264 	assert(len < VK_MAX_DESCRIPTION_SIZE);
5265 	memcpy(desc, src, len);
5266 	memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
5267 }
5268 
radv_GetPipelineExecutablePropertiesKHR(VkDevice _device,const VkPipelineInfoKHR * pPipelineInfo,uint32_t * pExecutableCount,VkPipelineExecutablePropertiesKHR * pProperties)5269 VkResult radv_GetPipelineExecutablePropertiesKHR(
5270     VkDevice                                    _device,
5271     const VkPipelineInfoKHR*                    pPipelineInfo,
5272     uint32_t*                                   pExecutableCount,
5273     VkPipelineExecutablePropertiesKHR*          pProperties)
5274 {
5275 	RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
5276 	const uint32_t total_count = radv_get_executable_count(pipeline);
5277 
5278 	if (!pProperties) {
5279 		*pExecutableCount = total_count;
5280 		return VK_SUCCESS;
5281 	}
5282 
5283 	const uint32_t count = MIN2(total_count, *pExecutableCount);
5284 	for (unsigned i = 0, executable_idx = 0;
5285 	     i < MESA_SHADER_STAGES && executable_idx < count; ++i) {
5286 		if (!pipeline->shaders[i])
5287 			continue;
5288 		pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
5289 		const char *name = NULL;
5290 		const char *description = NULL;
5291 		switch(i) {
5292 		case MESA_SHADER_VERTEX:
5293 			name = "Vertex Shader";
5294 			description = "Vulkan Vertex Shader";
5295 			break;
5296 		case MESA_SHADER_TESS_CTRL:
5297 			if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
5298 				pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
5299 				name = "Vertex + Tessellation Control Shaders";
5300 				description = "Combined Vulkan Vertex and Tessellation Control Shaders";
5301 			} else {
5302 				name = "Tessellation Control Shader";
5303 				description = "Vulkan Tessellation Control Shader";
5304 			}
5305 			break;
5306 		case MESA_SHADER_TESS_EVAL:
5307 			name = "Tessellation Evaluation Shader";
5308 			description = "Vulkan Tessellation Evaluation Shader";
5309 			break;
5310 		case MESA_SHADER_GEOMETRY:
5311 			if (radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
5312 				pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
5313 				name = "Tessellation Evaluation + Geometry Shaders";
5314 				description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
5315 			} else if (!radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_VERTEX]) {
5316 				pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
5317 				name = "Vertex + Geometry Shader";
5318 				description = "Combined Vulkan Vertex and Geometry Shaders";
5319 			} else {
5320 				name = "Geometry Shader";
5321 				description = "Vulkan Geometry Shader";
5322 			}
5323 			break;
5324 		case MESA_SHADER_FRAGMENT:
5325 			name = "Fragment Shader";
5326 			description = "Vulkan Fragment Shader";
5327 			break;
5328 		case MESA_SHADER_COMPUTE:
5329 			name = "Compute Shader";
5330 			description = "Vulkan Compute Shader";
5331 			break;
5332 		}
5333 
5334 		pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size;
5335 		desc_copy(pProperties[executable_idx].name, name);
5336 		desc_copy(pProperties[executable_idx].description, description);
5337 
5338 		++executable_idx;
5339 		if (i == MESA_SHADER_GEOMETRY &&
5340 		    !radv_pipeline_has_ngg(pipeline)) {
5341 			assert(pipeline->gs_copy_shader);
5342 			if (executable_idx >= count)
5343 				break;
5344 
5345 			pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
5346 			pProperties[executable_idx].subgroupSize = 64;
5347 			desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
5348 			desc_copy(pProperties[executable_idx].description,
5349 				  "Extra shader stage that loads the GS output ringbuffer into the rasterizer");
5350 
5351 			++executable_idx;
5352 		}
5353 	}
5354 
5355 	VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
5356 	*pExecutableCount = count;
5357 	return result;
5358 }
5359 
radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,const VkPipelineExecutableInfoKHR * pExecutableInfo,uint32_t * pStatisticCount,VkPipelineExecutableStatisticKHR * pStatistics)5360 VkResult radv_GetPipelineExecutableStatisticsKHR(
5361     VkDevice                                    _device,
5362     const VkPipelineExecutableInfoKHR*          pExecutableInfo,
5363     uint32_t*                                   pStatisticCount,
5364     VkPipelineExecutableStatisticKHR*           pStatistics)
5365 {
5366 	RADV_FROM_HANDLE(radv_device, device, _device);
5367 	RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
5368 	gl_shader_stage stage;
5369 	struct radv_shader_variant *shader = radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
5370 
5371 	enum chip_class chip_class = device->physical_device->rad_info.chip_class;
5372 	unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
5373 	unsigned max_waves = radv_get_max_waves(device, shader, stage);
5374 
5375 	VkPipelineExecutableStatisticKHR *s = pStatistics;
5376 	VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
5377 	VkResult result = VK_SUCCESS;
5378 
5379 	if (s < end) {
5380 		desc_copy(s->name, "SGPRs");
5381 		desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
5382 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5383 		s->value.u64 = shader->config.num_sgprs;
5384 	}
5385 	++s;
5386 
5387 	if (s < end) {
5388 		desc_copy(s->name, "VGPRs");
5389 		desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
5390 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5391 		s->value.u64 = shader->config.num_vgprs;
5392 	}
5393 	++s;
5394 
5395 	if (s < end) {
5396 		desc_copy(s->name, "Spilled SGPRs");
5397 		desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
5398 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5399 		s->value.u64 = shader->config.spilled_sgprs;
5400 	}
5401 	++s;
5402 
5403 	if (s < end) {
5404 		desc_copy(s->name, "Spilled VGPRs");
5405 		desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
5406 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5407 		s->value.u64 = shader->config.spilled_vgprs;
5408 	}
5409 	++s;
5410 
5411 	if (s < end) {
5412 		desc_copy(s->name, "PrivMem VGPRs");
5413 		desc_copy(s->description, "Number of VGPRs stored in private memory per subgroup");
5414 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5415 		s->value.u64 = shader->info.private_mem_vgprs;
5416 	}
5417 	++s;
5418 
5419 	if (s < end) {
5420 		desc_copy(s->name, "Code size");
5421 		desc_copy(s->description, "Code size in bytes");
5422 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5423 		s->value.u64 = shader->exec_size;
5424 	}
5425 	++s;
5426 
5427 	if (s < end) {
5428 		desc_copy(s->name, "LDS size");
5429 		desc_copy(s->description, "LDS size in bytes per workgroup");
5430 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5431 		s->value.u64 = shader->config.lds_size * lds_increment;
5432 	}
5433 	++s;
5434 
5435 	if (s < end) {
5436 		desc_copy(s->name, "Scratch size");
5437 		desc_copy(s->description, "Private memory in bytes per subgroup");
5438 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5439 		s->value.u64 = shader->config.scratch_bytes_per_wave;
5440 	}
5441 	++s;
5442 
5443 	if (s < end) {
5444 		desc_copy(s->name, "Subgroups per SIMD");
5445 		desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
5446 		s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5447 		s->value.u64 = max_waves;
5448 	}
5449 	++s;
5450 
5451 	if (shader->statistics) {
5452 		for (unsigned i = 0; i < shader->statistics->count; i++) {
5453 			struct radv_compiler_statistic_info *info = &shader->statistics->infos[i];
5454 			uint32_t value = shader->statistics->values[i];
5455 			if (s < end) {
5456 				desc_copy(s->name, info->name);
5457 				desc_copy(s->description, info->desc);
5458 				s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
5459 				s->value.u64 = value;
5460 			}
5461 			++s;
5462 		}
5463 	}
5464 
5465 	if (!pStatistics)
5466 		*pStatisticCount = s - pStatistics;
5467 	else if (s > end) {
5468 		*pStatisticCount = end - pStatistics;
5469 		result = VK_INCOMPLETE;
5470 	} else {
5471 		*pStatisticCount = s - pStatistics;
5472 	}
5473 
5474 	return result;
5475 }
5476 
radv_copy_representation(void * data,size_t * data_size,const char * src)5477 static VkResult radv_copy_representation(void *data, size_t *data_size, const char *src)
5478 {
5479 	size_t total_size  = strlen(src) + 1;
5480 
5481 	if (!data) {
5482 		*data_size = total_size;
5483 		return VK_SUCCESS;
5484 	}
5485 
5486 	size_t size = MIN2(total_size, *data_size);
5487 
5488 	memcpy(data, src, size);
5489 	if (size)
5490 		*((char*)data + size - 1) = 0;
5491 	return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
5492 }
5493 
radv_GetPipelineExecutableInternalRepresentationsKHR(VkDevice device,const VkPipelineExecutableInfoKHR * pExecutableInfo,uint32_t * pInternalRepresentationCount,VkPipelineExecutableInternalRepresentationKHR * pInternalRepresentations)5494 VkResult radv_GetPipelineExecutableInternalRepresentationsKHR(
5495     VkDevice                                    device,
5496     const VkPipelineExecutableInfoKHR*          pExecutableInfo,
5497     uint32_t*                                   pInternalRepresentationCount,
5498     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
5499 {
5500 	RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
5501 	gl_shader_stage stage;
5502 	struct radv_shader_variant *shader = radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
5503 
5504 	VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
5505 	VkPipelineExecutableInternalRepresentationKHR *end = p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
5506 	VkResult result = VK_SUCCESS;
5507 	/* optimized NIR */
5508 	if (p < end) {
5509 		p->isText = true;
5510 		desc_copy(p->name, "NIR Shader(s)");
5511 		desc_copy(p->description, "The optimized NIR shader(s)");
5512 		if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
5513 			result = VK_INCOMPLETE;
5514 	}
5515 	++p;
5516 
5517 	/* backend IR */
5518 	if (p < end) {
5519 		p->isText = true;
5520 		if (pipeline->device->physical_device->use_llvm) {
5521 			desc_copy(p->name, "LLVM IR");
5522 			desc_copy(p->description, "The LLVM IR after some optimizations");
5523 		} else {
5524 			desc_copy(p->name, "ACO IR");
5525 			desc_copy(p->description, "The ACO IR after some optimizations");
5526 		}
5527 		if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
5528 			result = VK_INCOMPLETE;
5529 	}
5530 	++p;
5531 
5532 	/* Disassembler */
5533 	if (p < end) {
5534 		p->isText = true;
5535 		desc_copy(p->name, "Assembly");
5536 		desc_copy(p->description, "Final Assembly");
5537 		if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
5538 			result = VK_INCOMPLETE;
5539 	}
5540 	++p;
5541 
5542 	if (!pInternalRepresentations)
5543 		*pInternalRepresentationCount = p - pInternalRepresentations;
5544 	else if(p > end) {
5545 		result = VK_INCOMPLETE;
5546 		*pInternalRepresentationCount = end - pInternalRepresentations;
5547 	} else {
5548 		*pInternalRepresentationCount = p - pInternalRepresentations;
5549 	}
5550 
5551 	return result;
5552 }
5553