1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file crocus_state.c
25 *
26 * ============================= GENXML CODE =============================
27 * [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state. Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times. This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures. However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn. So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs. Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times. Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic. Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible. Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO. At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs. In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time. Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below. First, the CSO hooks
68 * create/bind/track state. The second are the draw-time upload functions,
69 * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
72
73 #include <errno.h>
74 #include <stdio.h>
75
76 #if HAVE_VALGRIND
77 #include <memcheck.h>
78 #include <valgrind.h>
79 #define VG(x) x
80 #ifdef DEBUG
81 #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
82 #endif
83 #else
84 #define VG(x)
85 #endif
86
87 #include "drm-uapi/i915_drm.h"
88 #include "intel/common/intel_l3_config.h"
89 #include "intel/common/intel_sample_positions.h"
90 #include "intel/compiler/brw_compiler.h"
91 #include "compiler/shader_info.h"
92 #include "pipe/p_context.h"
93 #include "pipe/p_defines.h"
94 #include "pipe/p_screen.h"
95 #include "pipe/p_state.h"
96 #include "util/format/u_format.h"
97 #include "util/half_float.h"
98 #include "util/u_dual_blend.h"
99 #include "util/u_framebuffer.h"
100 #include "util/u_helpers.h"
101 #include "util/u_inlines.h"
102 #include "util/u_memory.h"
103 #include "util/u_prim.h"
104 #include "util/u_transfer.h"
105 #include "util/u_upload_mgr.h"
106 #include "util/u_viewport.h"
107 #include "crocus_batch.h"
108 #include "crocus_context.h"
109 #include "crocus_defines.h"
110 #include "crocus_pipe.h"
111 #include "crocus_resource.h"
112
113 #include "crocus_genx_macros.h"
114 #include "intel/common/intel_guardband.h"
115 #include "main/macros.h" /* UNCLAMPED_* */
116
/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 */
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware.  Note that Gallium's
    * plain INCR/DECR are saturating (hardware *SAT variants), while its
    * *_WRAP ops map to the hardware's wrapping INCR/DECR.
    */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}
188
/**
 * Translate a gallium primitive type into a hardware 3DPRIM topology.
 *
 * \param verts_per_patch  control-point count, only used for PATCHES.
 */
static unsigned
translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
{
   static const unsigned map[] = {
      [PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,
      [PIPE_PRIM_LINES] = _3DPRIM_LINELIST,
      [PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
      [PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
      [PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
      [PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
      [PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
      [PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,
      [PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
      [PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,
#if GFX_VER >= 6
      [PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
      [PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
      [PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
#endif
#if GFX_VER >= 7
      /* The _3DPRIM_PATCHLIST_n topologies are assumed consecutive; store
       * the base minus one so adding verts_per_patch below selects
       * _3DPRIM_PATCHLIST_<verts_per_patch>.
       */
      [PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
#endif
   };

   return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
}
216
217 static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)218 translate_compare_func(enum pipe_compare_func pipe_func)
219 {
220 static const unsigned map[] = {
221 [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
222 [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
223 [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
224 [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
225 [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
226 [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
227 [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
228 [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
229 };
230 return map[pipe_func];
231 }
232
233 static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)234 translate_shadow_func(enum pipe_compare_func pipe_func)
235 {
236 /* Gallium specifies the result of shadow comparisons as:
237 *
238 * 1 if ref <op> texel,
239 * 0 otherwise.
240 *
241 * The hardware does:
242 *
243 * 0 if texel <op> ref,
244 * 1 otherwise.
245 *
246 * So we need to flip the operator and also negate.
247 */
248 static const unsigned map[] = {
249 [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
250 [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
251 [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
252 [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
253 [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
254 [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
255 [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
256 [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
257 };
258 return map[pipe_func];
259 }
260
261 static unsigned
translate_cull_mode(unsigned pipe_face)262 translate_cull_mode(unsigned pipe_face)
263 {
264 static const unsigned map[4] = {
265 [PIPE_FACE_NONE] = CULLMODE_NONE,
266 [PIPE_FACE_FRONT] = CULLMODE_FRONT,
267 [PIPE_FACE_BACK] = CULLMODE_BACK,
268 [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
269 };
270 return map[pipe_face];
271 }
272
273 #if GFX_VER >= 6
274 static unsigned
translate_fill_mode(unsigned pipe_polymode)275 translate_fill_mode(unsigned pipe_polymode)
276 {
277 static const unsigned map[4] = {
278 [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
279 [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
280 [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
281 [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
282 };
283 return map[pipe_polymode];
284 }
285 #endif
286
287 static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)288 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
289 {
290 static const unsigned map[] = {
291 [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
292 [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
293 [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
294 };
295 return map[pipe_mip];
296 }
297
/**
 * Translate a gallium texture wrap mode into the hardware texture
 * coordinate mode.
 *
 * \param either_nearest  presumably true when either the min or mag
 *                        filter is nearest — confirm against callers.
 */
static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
#if GFX_VER == 8
      /* Gen8 has a half-border mode for legacy GL_CLAMP. */
      [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
#else
      [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
#endif
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
#if GFX_VER < 8
   /* With nearest filtering, legacy GL_CLAMP never samples the border,
    * so it can be implemented as plain clamp-to-edge.
    */
   if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
      return TCM_CLAMP;
#endif
   return map[pipe_wrap];
}
323
/**
 * Allocate \p size bytes (aligned to \p alignment) in the batch's dynamic
 * state buffer.  (Equivalent of brw_state_batch.)
 *
 * If the allocation would exceed the ordinary state size and wrapping is
 * allowed, the batch is flushed and allocation restarts in a fresh buffer;
 * otherwise the state buffer is grown by 1.5x (capped at MAX_STATE_SIZE).
 *
 * Returns a CPU pointer to the allocated space, and stores its byte offset
 * within the state buffer in *out_offset.
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      crocus_batch_flush(batch);
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   /* The state map is dword-typed, hence offset >> 2. */
   return (uint32_t *)batch->state.map + (offset >> 2);
}
353
/**
 * stream_state() + memcpy: copy \p data into freshly allocated state
 * space and return its offset within the state buffer.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   unsigned offset = 0;
   uint32_t *dest = stream_state(batch, size, alignment, &offset);

   if (dest != NULL)
      memcpy(dest, data, size);

   return offset;
}
369
370 #if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS (gen4/5), pointing each fixed-function
 * unit at its state blob within the batch's state buffer.  The GS stage
 * is optional; clipping is always enabled.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}
394
395 #endif
396 /**
397 * Did field 'x' change between 'old_cso' and 'new_cso'?
398 *
399 * (If so, we may want to set some dirty flags.)
400 */
401 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
402 #define cso_changed_memcmp(x) \
403 (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
404
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
439
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
489
490 #if GFX_VER >= 6
/**
 * Emit MI_STORE_REGISTER_MEM to store a 32-bit MMIO register to a buffer.
 *
 * Predication is only available on Haswell and later; earlier parts hit
 * the unreachable() if it's requested.
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
507
508 static void
crocus_store_register_mem64(struct crocus_batch * batch,uint32_t reg,struct crocus_bo * bo,uint32_t offset,bool predicated)509 crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
510 struct crocus_bo *bo, uint32_t offset,
511 bool predicated)
512 {
513 crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
514 crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
515 }
516 #endif
517
518 #if GFX_VER >= 7
/**
 * Emit MI_LOAD_REGISTER_IMM to write an immediate to an MMIO register.
 */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = val;
   }
}
/* Convenience wrapper taking a genxml register name instead of an offset. */
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
528
529 #if GFX_VERx10 >= 75
/**
 * Emit MI_LOAD_REGISTER_REG to copy one 32-bit MMIO register to another.
 */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
538
/**
 * Copy a 32-bit MMIO register to another via MI_LOAD_REGISTER_REG.
 */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
545
/**
 * Copy a 64-bit MMIO register pair to another, one 32-bit half at a time
 * (low dword first).
 */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   for (unsigned half = 0; half < 8; half += 4)
      _crocus_emit_lrr(batch, dst + half, src + half);
}
553 #endif
554
/**
 * Load an immediate into a 32-bit MMIO register via MI_LOAD_REGISTER_IMM.
 */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
561
/**
 * Load a 64-bit immediate into an MMIO register pair via two
 * MI_LOAD_REGISTER_IMM commands (low dword first).
 */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   const uint32_t lo = val & 0xffffffff;
   const uint32_t hi = val >> 32;

   _crocus_emit_lri(batch, reg, lo);
   _crocus_emit_lri(batch, reg + 4, hi);
}
569
/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}
582
/**
 * Load a 64-bit value from a buffer into an MMIO register pair via
 * two MI_LOAD_REGISTER_MEM commands (low dword first).
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   for (unsigned half = 0; half < 8; half += 4)
      crocus_load_register_mem32(batch, reg + half, bo, offset + half);
}
594
595 #if GFX_VERx10 >= 75
/**
 * Write a 32-bit immediate into a buffer via MI_STORE_DATA_IMM.
 */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
      /* NOTE(review): this function sits inside a GFX_VERx10 >= 75 block,
       * so the guard below is always true here — verify whether it's
       * intentional (e.g. kept for symmetry with other emitters).
       */
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
608
/**
 * Write a 64-bit immediate into a buffer via MI_STORE_DATA_IMM.
 */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      /* DWordLength is encoded as total length minus 2, per MI convention. */
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
626 #endif
627
628 static void
crocus_copy_mem_mem(struct crocus_batch * batch,struct crocus_bo * dst_bo,uint32_t dst_offset,struct crocus_bo * src_bo,uint32_t src_offset,unsigned bytes)629 crocus_copy_mem_mem(struct crocus_batch *batch,
630 struct crocus_bo *dst_bo, uint32_t dst_offset,
631 struct crocus_bo *src_bo, uint32_t src_offset,
632 unsigned bytes)
633 {
634 assert(bytes % 4 == 0);
635 assert(dst_offset % 4 == 0);
636 assert(src_offset % 4 == 0);
637
638 #define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
639 for (unsigned i = 0; i < bytes; i += 4) {
640 crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
641 src_bo, src_offset + i);
642 crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
643 dst_bo, dst_offset + i, false);
644 }
645 }
646 #endif
647
/**
 * Gallium CSO for rasterizer state.
 */
struct crocus_rasterizer_state {
   /* The original gallium state, kept for fields we read at draw time. */
   struct pipe_rasterizer_state cso;
#if GFX_VER >= 6
   /* Pre-packed 3DSTATE command payloads, ready to be OR'd/copied into
    * the batch at draw time.
    */
   uint32_t sf[GENX(3DSTATE_SF_length)];
   uint32_t clip[GENX(3DSTATE_CLIP_length)];
#endif
#if GFX_VER >= 8
   uint32_t raster[GENX(3DSTATE_RASTER_length)];
#endif
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   /* Number of clip-plane constants — presumably derived from
    * cso.clip_plane_enable; confirm against the create hook.
    */
   uint8_t num_clip_plane_consts;
   /* True when the polygon fill mode is point or line. */
   bool fill_mode_point_or_line;
};
665
666 #if GFX_VER <= 5
/* Fixed-function unit indices into the limits[] table below. */
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

/* Per-unit URB allocation limits: minimum and preferred entry counts,
 * and minimum/maximum entry sizes.
 */
static const struct {
   uint32_t min_nr_entries;
   uint32_t preferred_nr_entries;
   uint32_t min_entry_size;
   uint32_t max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8,  1, 5 },                         /* gs */
   { 5, 10, 1, 5 },                         /* clp */
   { 1, 8,  1, 12 },                        /* sf */
   { 1, 4,  1, 32 }                         /* cs */
};
685
/* Recompute the start offset of each URB section from the current entry
 * counts and sizes, and return whether the whole layout fits in the URB.
 * Note that the GS and CLIP sections are sized using vsize as well.
 */
static bool check_urb_layout(struct crocus_context *ice)
{
   ice->urb.vs_start = 0;
   ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
   ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
   ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
   ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;

   return ice->urb.cs_start + ice->urb.nr_cs_entries *
      ice->urb.csize <= ice->urb.size;
}
697
698
/* Recalculate the URB partitioning if the requested CURBE/VS/SF entry
 * sizes changed.  Returns true when the layout was recomputed (so the
 * caller must re-emit URB_FENCE), false when the current layout still
 * suffices.
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   struct crocus_context *ice = batch->ice;
   /* Clamp each requested size to the unit's minimum. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recompute if any entry size grew, or — when operating constrained —
    * if any size shrank, so we can try to escape constrained mode.
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start from each unit's preferred entry count. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      /* Gen5 (and Gen4.5) can try larger VS/SF entry counts first,
       * falling back to the preferred counts if they don't fit.
       */
      if (GFX_VER == 5) {
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (GFX_VERx10 == 45) {
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      /* If the preferred layout doesn't fit, fall back to the minimum
       * entry counts.
       */
      if (!check_urb_layout(ice)) {
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
793
/* Emit URB_FENCE, programming the boundary between each fixed-function
 * unit's URB allocation from the layout computed by check_urb_layout().
 */
static void
crocus_upload_urb_fence(struct crocus_batch *batch)
{
   uint32_t urb_fence[3];
   _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
      urb.VSUnitURBReallocationRequest = 1;
      urb.GSUnitURBReallocationRequest = 1;
      urb.CLIPUnitURBReallocationRequest = 1;
      urb.SFUnitURBReallocationRequest = 1;
      urb.VFEUnitURBReallocationRequest = 1;
      urb.CSUnitURBReallocationRequest = 1;

      /* Each fence is the END of that unit's region (the next one's start). */
      urb.VSFence = batch->ice->urb.gs_start;
      urb.GSFence = batch->ice->urb.clip_start;
      urb.CLIPFence = batch->ice->urb.sf_start;
      urb.SFFence = batch->ice->urb.cs_start;
      urb.CSFence = batch->ice->urb.size;
   }

   /* erratum: URB_FENCE must not cross a 64byte cacheline */
   /* NOTE(review): the historical i965 version of this check counted
    * batch usage in *dwords*, so (used & 15) > 12 detected a 3-dword
    * packet crossing a 16-dword (64-byte) boundary.  If
    * crocus_batch_bytes_used() returns bytes and the batch is always
    * dword-aligned, (bytes & 15) can only be 0/4/8/12 and this padding
    * never triggers — verify the units and the intended granularity.
    */
   if ((crocus_batch_bytes_used(batch) & 15) > 12) {
      int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
      do {
         *(uint32_t *)batch->command.map_next = 0;
         batch->command.map_next += sizeof(uint32_t);
      } while (--pad);
   }

   crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
}
824
825 static bool
calculate_curbe_offsets(struct crocus_batch * batch)826 calculate_curbe_offsets(struct crocus_batch *batch)
827 {
828 struct crocus_context *ice = batch->ice;
829
830 unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
831 unsigned total_regs;
832
833 nr_fp_regs = 0;
834 for (int i = 0; i < 4; i++) {
835 const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
836 if (range->length == 0)
837 continue;
838
839 /* ubo range tracks at 256-bit, we need 512-bit */
840 nr_fp_regs += (range->length + 1) / 2;
841 }
842
843 if (ice->state.cso_rast->cso.clip_plane_enable) {
844 unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
845 nr_clip_regs = (nr_planes * 4 + 15) / 16;
846 }
847
848 nr_vp_regs = 0;
849 for (int i = 0; i < 4; i++) {
850 const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
851 if (range->length == 0)
852 continue;
853
854 /* ubo range tracks at 256-bit, we need 512-bit */
855 nr_vp_regs += (range->length + 1) / 2;
856 }
857 if (nr_vp_regs == 0) {
858 /* The pre-gen6 VS requires that some push constants get loaded no
859 * matter what, or the GPU would hang.
860 */
861 nr_vp_regs = 1;
862 }
863 total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
864
865 /* The CURBE allocation size is limited to 32 512-bit units (128 EU
866 * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5
867 * (volume 1, part 1) PRMs.
868 *
869 * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
870 * values as push constants before spilling to pull constants, and in
871 * brw_vec4.cpp we're loading up to 32 registers of push constants. An EU
872 * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
873 * regs for clip.
874 */
875 assert(total_regs <= 32);
876
877 /* Lazy resize:
878 */
879 if (nr_fp_regs > ice->curbe.wm_size ||
880 nr_vp_regs > ice->curbe.vs_size ||
881 nr_clip_regs != ice->curbe.clip_size ||
882 (total_regs < ice->curbe.total_size / 4 &&
883 ice->curbe.total_size > 16)) {
884
885 GLuint reg = 0;
886
887 /* Calculate a new layout:
888 */
889 reg = 0;
890 ice->curbe.wm_start = reg;
891 ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
892 ice->curbe.clip_start = reg;
893 ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
894 ice->curbe.vs_start = reg;
895 ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
896 ice->curbe.total_size = reg;
897
898 if (0)
899 fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
900 ice->curbe.wm_start,
901 ice->curbe.wm_size,
902 ice->curbe.clip_start,
903 ice->curbe.clip_size,
904 ice->curbe.vs_start,
905 ice->curbe.vs_size );
906 return true;
907 }
908 return false;
909 }
910
911 static void
upload_shader_consts(struct crocus_context * ice,gl_shader_stage stage,uint32_t * map,unsigned start)912 upload_shader_consts(struct crocus_context *ice,
913 gl_shader_stage stage,
914 uint32_t *map,
915 unsigned start)
916 {
917 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
918 struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
919 uint32_t *cmap;
920 bool found = false;
921 unsigned offset = start * 16;
922 int total = 0;
923 for (int i = 0; i < 4; i++) {
924 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
925
926 if (range->length == 0)
927 continue;
928
929 unsigned block_index = crocus_bti_to_group_index(
930 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
931 unsigned len = range->length * 8 * sizeof(float);
932 unsigned start = range->start * 8 * sizeof(float);
933 struct pipe_transfer *transfer;
934
935 cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
936 ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
937 PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
938 if (cmap)
939 memcpy(&map[offset + (total * 8)], cmap, len);
940 pipe_buffer_unmap(&ice->ctx, transfer);
941 total += range->length;
942 found = true;
943 }
944
945 if (stage == MESA_SHADER_VERTEX && !found) {
946 /* The pre-gen6 VS requires that some push constants get loaded no
947 * matter what, or the GPU would hang.
948 */
949 unsigned len = 16;
950 memset(&map[offset], 0, len);
951 }
952 }
953
/* The six fixed clip planes of the view volume (the +/-z, +/-y, and +/-x
 * faces, as (a, b, c, d) plane equations), copied into the clipper CURBE
 * region ahead of any user-defined clip planes (see gen4_upload_curbe()).
 */
static const float fixed_plane[6][4] = {
   { 0, 0, -1, 1 },
   { 0, 0, 1, 1 },
   { 0, -1, 0, 1 },
   { 0, 1, 0, 1 },
   {-1, 0, 0, 1 },
   { 1, 0, 0, 1 }
};
962
/**
 * Upload the gen4/5 CURBE (push constant) buffer contents and emit the
 * CONSTANT_BUFFER packet pointing at it.
 *
 * The regions (wm/clip/vs) and total size come from the layout computed
 * by calculate_curbe_offsets().
 */
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;
   /* total_size is in 512-bit CURBE units; each unit holds 16 floats. */
   const unsigned sz = ice->curbe.total_size;
   const unsigned buf_sz = sz * 16 * sizeof(float);

   /* Nothing to push: skip the upload but still emit CONSTANT_BUFFER. */
   if (sz == 0)
      goto emit;

   uint32_t *map;
   /* 64-byte-aligned allocation from the constant uploader; curbe_res and
    * curbe_offset record where the data lives for the packet below.
    */
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;   /* units -> floats */
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Append the enabled user clip planes after the six fixed ones. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   /* Debug dump of the uploaded CURBE contents (disabled by default). */
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

 emit:
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         /* Length encoded as (size in 512-bit units) - 1. */
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug. The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set. We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1055 #endif
1056
1057 #if GFX_VER >= 7
1058
1059 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
1060 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
1061 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
1062
1063 static void
setup_l3_config(struct crocus_batch * batch,const struct intel_l3_config * cfg)1064 setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
1065 {
1066 #if GFX_VER == 7
1067 const struct intel_device_info *devinfo = &batch->screen->devinfo;
1068 const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
1069 const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
1070 cfg->n[INTEL_L3P_ALL];
1071 const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
1072 cfg->n[INTEL_L3P_ALL];
1073 const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
1074 cfg->n[INTEL_L3P_ALL];
1075 const bool has_slm = cfg->n[INTEL_L3P_SLM];
1076 #endif
1077
1078 /* According to the hardware docs, the L3 partitioning can only be changed
1079 * while the pipeline is completely drained and the caches are flushed,
1080 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1081 */
1082 crocus_emit_pipe_control_flush(batch, "l3_config",
1083 PIPE_CONTROL_DATA_CACHE_FLUSH |
1084 PIPE_CONTROL_CS_STALL);
1085
1086 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1087 * invalidation of the relevant caches. Note that because RO invalidation
1088 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1089 * command is processed by the CS) we cannot combine it with the previous
1090 * stalling flush as the hardware documentation suggests, because that
1091 * would cause the CS to stall on previous rendering *after* RO
1092 * invalidation and wouldn't prevent the RO caches from being polluted by
1093 * concurrent rendering before the stall completes. This intentionally
1094 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1095 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1096 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1097 * already guarantee that there is no concurrent GPGPU kernel execution
1098 * (see SKL HSD 2132585).
1099 */
1100 crocus_emit_pipe_control_flush(batch, "l3 config",
1101 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1102 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1103 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1104 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1105
1106 /* Now send a third stalling flush to make sure that invalidation is
1107 * complete when the L3 configuration registers are modified.
1108 */
1109 crocus_emit_pipe_control_flush(batch, "l3 config",
1110 PIPE_CONTROL_DATA_CACHE_FLUSH |
1111 PIPE_CONTROL_CS_STALL);
1112
1113 #if GFX_VER == 8
1114 assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
1115 crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
1116 reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
1117 reg.URBAllocation = cfg->n[INTEL_L3P_URB];
1118 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1119 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1120 reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
1121 }
1122 #else
1123 assert(!cfg->n[INTEL_L3P_ALL]);
1124
1125 /* When enabled SLM only uses a portion of the L3 on half of the banks,
1126 * the matching space on the remaining banks has to be allocated to a
1127 * client (URB for all validated configurations) set to the
1128 * lower-bandwidth 2-bank address hashing mode.
1129 */
1130 const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
1131 assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
1132
1133 /* Minimum number of ways that can be allocated to the URB. */
1134 const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
1135 assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
1136
1137 uint32_t l3sqcr1, l3cr2, l3cr3;
1138
1139 crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
1140 reg.ConvertDC_UC = !has_dc;
1141 reg.ConvertIS_UC = !has_is;
1142 reg.ConvertC_UC = !has_c;
1143 reg.ConvertT_UC = !has_t;
1144 #if GFX_VERx10 == 75
1145 reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
1146 #else
1147 reg.L3SQGeneralPriorityCreditInitialization =
1148 devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
1149 #endif
1150 reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
1151 };
1152
1153 crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
1154 reg.SLMEnable = has_slm;
1155 reg.URBLowBandwidth = urb_low_bw;
1156 reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
1157 #if !(GFX_VERx10 == 75)
1158 reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
1159 #endif
1160 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1161 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1162 };
1163
1164 crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
1165 reg.ISAllocation = cfg->n[INTEL_L3P_IS];
1166 reg.ISLowBandwidth = 0;
1167 reg.CAllocation = cfg->n[INTEL_L3P_C];
1168 reg.CLowBandwidth = 0;
1169 reg.TAllocation = cfg->n[INTEL_L3P_T];
1170 reg.TLowBandwidth = 0;
1171 };
1172
1173 /* Set up the L3 partitioning. */
1174 crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
1175 crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
1176 crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
1177
1178 #if GFX_VERSIONx10 == 75
1179 /* TODO: Fail screen creation if command parser version < 4 */
1180 uint32_t scratch1, chicken3;
1181 crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
1182 reg.L3AtomicDisable = !has_dc;
1183 }
1184 crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
1185 reg.L3AtomicDisableMask = true;
1186 reg.L3AtomicDisable = !has_dc;
1187 }
1188 crocus_emit_lri(batch, SCRATCH1, scratch1);
1189 crocus_emit_lri(batch, CHICKEN3, chicken3);
1190 #endif
1191 #endif
1192 }
1193
1194 static void
emit_l3_state(struct crocus_batch * batch,bool compute)1195 emit_l3_state(struct crocus_batch *batch, bool compute)
1196 {
1197 const struct intel_l3_config *const cfg =
1198 compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1199
1200 setup_l3_config(batch, cfg);
1201 if (INTEL_DEBUG(DEBUG_L3)) {
1202 intel_dump_l3_config(cfg, stderr);
1203 }
1204 }
1205
/**
 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 *
 * The stall is paired with a post-sync immediate write of zero into the
 * context's dedicated workaround BO; a CS-stalling PIPE_CONTROL on this
 * hardware generally needs such a post-sync operation -- NOTE(review):
 * confirm against the gen7 PIPE_CONTROL programming notes.
 */
static void
gen7_emit_cs_stall_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_write(batch,
                                  "workaround",
                                  PIPE_CONTROL_CS_STALL
                                  | PIPE_CONTROL_WRITE_IMMEDIATE,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
1219 #endif
1220
/**
 * Emit PIPELINE_SELECT to switch between the 3D and GPGPU pipelines,
 * surrounded by the flushes the various generations require around the
 * switch.
 */
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    * Software must clear the COLOR_CALC_STATE Valid field in
    * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    * with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    * "Project: DEVSNB+
    *
    * Software must ensure all the write caches are flushed through a
    * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    * command to invalidate read only caches prior to programming
    * MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    * Project: PRE-DEVSNB
    *
    * Software must ensure the current pipeline is flushed via an
    * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   if (pipeline == _3D) {
      /* After switching to 3D on IVB/BYT, stall and emit a dummy
       * point-list 3DPRIMITIVE -- NOTE(review): presumably a hardware
       * workaround inherited from i965; confirm against the gen7 docs.
       */
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1290
1291 /**
1292 * The following diagram shows how we partition the URB:
1293 *
1294 * 16kB or 32kB Rest of the URB space
1295 * __________-__________ _________________-_________________
1296 * / \ / \
1297 * +-------------------------------------------------------------+
1298 * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
1299 * | Constants | Entries |
1300 * +-------------------------------------------------------------+
1301 *
1302 * Notably, push constants must be stored at the beginning of the URB
1303 * space, while entries can be stored anywhere. Ivybridge and Haswell
1304 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1305 * doubles this (32kB).
1306 *
1307 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1308 * sized) in increments of 1kB. Haswell GT3 requires them to be located and
1309 * sized in increments of 2kB.
1310 *
1311 * Currently we split the constant buffer space evenly among whatever stages
1312 * are active. This is probably not ideal, but simple.
1313 *
1314 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1315 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1316 * Haswell GT3 has 512kB of URB space.
1317 *
1318 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1319 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1320 */
1321 #if GFX_VER >= 7
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
   /* Total push constant area, in kB (see the partitioning diagram above). */
   const unsigned push_constant_kb =
      batch->screen->devinfo.max_constant_urb_size_kb;
   /* Split evenly across the five stages: VS, HS, DS, GS, FS. */
   unsigned size_per_stage = push_constant_kb / 5;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    * see if that improves performance by offering more space to
    * the VS/FS when those aren't in use. Also, try dynamically
    * enabling/disabling it like i965 does. This would be more
    * stalls and may not actually help; we don't know yet.
    */
   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         /* 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,HS,DS,GS,PS} have consecutive
          * sub-opcodes starting at 18 (VS), so reuse the VS pack function
          * and patch the sub-opcode for each stage.
          */
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = size_per_stage * i;
         /* The FS takes whatever is left, so the whole area is covered
          * even when push_constant_kb is not divisible by 5.
          */
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
      }
   }

   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
    *
    * A PIPE_CONTROL command with the CS Stall bit set must be programmed
    * in the ring after this instruction.
    *
    * No such restriction exists for Haswell or Baytrail.
    */
   if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
      gen7_emit_cs_stall_flush(batch);
}
1356 #endif
1357
/**
 * Upload the initial GPU state for a render context.
 *
 * This sets some invariant state that needs to be programmed a particular
 * way, but we never actually change.
 */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   /* All-zero STATE_SIP: no system routine installed. */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   /* Disable INSTPM's CONSTANT_BUFFER address offsetting on gen7.0/gen8
    * (set the bit and its write-enable mask).
    */
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
1411
1412 #if GFX_VER >= 7
/**
 * One-time setup for a compute context: select the GPGPU pipeline and
 * program the compute L3 configuration.
 */
static void
crocus_init_compute_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, GPGPU);

   /* Note: redundant guard -- this function is already compiled only
    * when GFX_VER >= 7 (see the surrounding #if).
    */
#if GFX_VER >= 7
   emit_l3_state(batch, true);
#endif
}
1424 #endif
1425
/**
 * Generation-specific context state (ice->state.genx->...).
 *
 * Most state can go in crocus_context directly, but these encode hardware
 * packets which vary by generation.
 */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      /* Per-binding image surface parameters for shader images (gen7+). */
      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   /* Cached enable state of the gen8 PMA depth-stall workaround --
    * presumably so it is only reprogrammed when the computed value
    * changes (see CROCUS_DIRTY_GEN8_PMA_FIX); NOTE(review): confirm.
    */
   bool pma_fix_enabled;
#endif
};
1443
1444 /**
1445 * The pipe->set_blend_color() driver hook.
1446 *
1447 * This corresponds to our COLOR_CALC_STATE.
1448 */
1449 static void
crocus_set_blend_color(struct pipe_context * ctx,const struct pipe_blend_color * state)1450 crocus_set_blend_color(struct pipe_context *ctx,
1451 const struct pipe_blend_color *state)
1452 {
1453 struct crocus_context *ice = (struct crocus_context *) ctx;
1454
1455 /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1456 memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1457 #if GFX_VER <= 5
1458 ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1459 #else
1460 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1461 #endif
1462 }
1463
/**
 * Gallium CSO for blend state (see pipe_blend_state).
 */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND packet, pre-packed at create time. */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** Complete copy of the gallium BLEND_STATE, consumed at draw time. */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending? */
   bool dual_color_blending;
};
1485
1486 static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f,bool alpha_to_one)1487 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1488 {
1489 if (alpha_to_one) {
1490 if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1491 return PIPE_BLENDFACTOR_ONE;
1492
1493 if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1494 return PIPE_BLENDFACTOR_ZERO;
1495 }
1496
1497 return f;
1498 }
1499
1500 #if GFX_VER >= 6
1501 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
1502 #else
1503 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
1504 #endif
1505
1506 static bool
1507 can_emit_logic_op(struct crocus_context *ice)
1508 {
1509 /* all pre gen8 have logicop restricted to unorm */
1510 enum pipe_format pformat = PIPE_FORMAT_NONE;
1511 for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1512 if (ice->state.framebuffer.cbufs[i]) {
1513 pformat = ice->state.framebuffer.cbufs[i]->format;
1514 break;
1515 }
1516 }
1517 return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1518 }
1519
/**
 * Fill in the blend fields of one render target's hardware entry
 * (BLEND_STATE_ENTRY on gen6+, COLOR_CALC_STATE on gen4/5 -- see the
 * BLEND_ENTRY_GENXML typedef above) from the bound blend CSO.
 *
 * Returns true if RGB and alpha use different blend functions or
 * factors, so the caller can set IndependentAlphaBlendEnable.
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* With independent blend disabled, every RT uses RT[0]'s state. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   /* Rewrite SRC1 alpha factors when alpha-to-one is enabled. */
   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-gen8 can only do logic ops on UNORM targets; skip otherwise. */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* Don't enable blending on RT[0] when the state asks for dual
          * source blending but the bound FS doesn't actually do it.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction = rt->rgb_func;
      entry->AlphaBlendFunction = rt->alpha_func;
      /* The casts prevent warnings about implicit enum conversions. */
      entry->SourceBlendFactor = (int) src_rgb;
      entry->SourceAlphaBlendFactor = (int) src_alpha;
      entry->DestinationBlendFactor = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle have ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1583
1584 /**
1585 * The pipe->create_blend_state() driver hook.
1586 *
1587 * Translates a pipe_blend_state into crocus_blend_state.
1588 */
1589 static void *
crocus_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)1590 crocus_create_blend_state(struct pipe_context *ctx,
1591 const struct pipe_blend_state *state)
1592 {
1593 struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1594
1595 cso->blend_enables = 0;
1596 cso->color_write_enables = 0;
1597 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
1598
1599 cso->cso = *state;
1600 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1601
1602 #if GFX_VER == 8
1603 bool indep_alpha_blend = false;
1604 #endif
1605 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
1606 const struct pipe_rt_blend_state *rt =
1607 &state->rt[state->independent_blend_enable ? i : 0];
1608 if (rt->blend_enable)
1609 cso->blend_enables |= 1u << i;
1610 if (rt->colormask)
1611 cso->color_write_enables |= 1u << i;
1612 #if GFX_VER == 8
1613 enum pipe_blendfactor src_rgb =
1614 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1615 enum pipe_blendfactor src_alpha =
1616 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1617 enum pipe_blendfactor dst_rgb =
1618 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1619 enum pipe_blendfactor dst_alpha =
1620 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1621
1622 if (rt->rgb_func != rt->alpha_func ||
1623 src_rgb != src_alpha || dst_rgb != dst_alpha)
1624 indep_alpha_blend = true;
1625 #endif
1626 }
1627
1628 #if GFX_VER == 8
1629 crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1630 /* pb.HasWriteableRT is filled in at draw time.
1631 * pb.AlphaTestEnable is filled in at draw time.
1632 *
1633 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1634 * setting it when dual color blending without an appropriate shader.
1635 */
1636
1637 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1638 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1639
1640 /* The casts prevent warnings about implicit enum type conversions. */
1641 pb.SourceBlendFactor =
1642 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1643 pb.SourceAlphaBlendFactor =
1644 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1645 pb.DestinationBlendFactor =
1646 (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1647 pb.DestinationAlphaBlendFactor =
1648 (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1649 }
1650 #endif
1651 return cso;
1652 }
1653
1654 /**
1655 * The pipe->bind_blend_state() driver hook.
1656 *
1657 * Bind a blending CSO and flag related dirty bits.
1658 */
1659 static void
crocus_bind_blend_state(struct pipe_context * ctx,void * state)1660 crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1661 {
1662 struct crocus_context *ice = (struct crocus_context *) ctx;
1663 struct crocus_blend_state *cso = state;
1664
1665 ice->state.cso_blend = cso;
1666 ice->state.blend_enables = cso ? cso->blend_enables : 0;
1667
1668 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1669 ice->state.dirty |= CROCUS_DIRTY_WM;
1670 #if GFX_VER >= 6
1671 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1672 #endif
1673 #if GFX_VER >= 7
1674 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1675 #endif
1676 #if GFX_VER == 8
1677 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1678 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1679 #endif
1680 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1681 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1682 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1683 }
1684
1685 /**
1686 * Return true if the FS writes to any color outputs which are not disabled
1687 * via color masking.
1688 */
1689 static bool
has_writeable_rt(const struct crocus_blend_state * cso_blend,const struct shader_info * fs_info)1690 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1691 const struct shader_info *fs_info)
1692 {
1693 if (!fs_info)
1694 return false;
1695
1696 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1697
1698 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1699 rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
1700
1701 return cso_blend->color_write_enables & rt_outputs;
1702 }
1703
/**
 * Gallium CSO for depth, stencil, and alpha testing state.
 */
struct crocus_depth_stencil_alpha_state {
   /** Complete copy of the gallium depth/stencil/alpha state. */
   struct pipe_depth_stencil_alpha_state cso;

   /** Whether the bound state can write depth (depth_writemask). */
   bool depth_writes_enabled;

   /** Whether either stencil side has a non-zero writemask. */
   bool stencil_writes_enabled;
};
1713
1714 /**
1715 * The pipe->create_depth_stencil_alpha_state() driver hook.
1716 *
1717 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1718 * testing state since we need pieces of it in a variety of places.
1719 */
1720 static void *
crocus_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1721 crocus_create_zsa_state(struct pipe_context *ctx,
1722 const struct pipe_depth_stencil_alpha_state *state)
1723 {
1724 struct crocus_depth_stencil_alpha_state *cso =
1725 malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1726
1727 bool two_sided_stencil = state->stencil[1].enabled;
1728 cso->cso = *state;
1729
1730 cso->depth_writes_enabled = state->depth_writemask;
1731 cso->stencil_writes_enabled =
1732 state->stencil[0].writemask != 0 ||
1733 (two_sided_stencil && state->stencil[1].writemask != 0);
1734
1735 /* The state tracker needs to optimize away EQUAL writes for us. */
1736 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1737
1738 return cso;
1739 }
1740
1741 /**
1742 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1743 *
1744 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1745 */
1746 static void
crocus_bind_zsa_state(struct pipe_context * ctx,void * state)1747 crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
1748 {
1749 struct crocus_context *ice = (struct crocus_context *) ctx;
1750 struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1751 struct crocus_depth_stencil_alpha_state *new_cso = state;
1752
1753 if (new_cso) {
1754 if (cso_changed(cso.alpha_ref_value))
1755 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1756
1757 if (cso_changed(cso.alpha_enabled))
1758 ice->state.dirty |= CROCUS_DIRTY_WM;
1759 #if GFX_VER >= 6
1760 if (cso_changed(cso.alpha_enabled))
1761 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1762
1763 if (cso_changed(cso.alpha_func))
1764 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1765 #endif
1766 #if GFX_VER == 8
1767 if (cso_changed(cso.alpha_enabled))
1768 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1769 #endif
1770
1771 if (cso_changed(depth_writes_enabled))
1772 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1773
1774 ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1775 ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1776
1777 #if GFX_VER <= 5
1778 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1779 #endif
1780 }
1781
1782 ice->state.cso_zsa = new_cso;
1783 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
1784 #if GFX_VER >= 6
1785 ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
1786 #endif
1787 #if GFX_VER == 8
1788 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1789 #endif
1790 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
1791 }
1792
#if GFX_VER == 8
/**
 * Evaluate whether the Gfx8 depth PMA (pixel mask array) stall-avoidance
 * optimization should currently be enabled, per the Z_PMA_OPT equation
 * documented in the large comment below.  The result is applied via
 * genX(crocus_update_pma_fix).
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1916 void
genX(crocus_update_pma_fix)1917 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1918 struct crocus_batch *batch,
1919 bool enable)
1920 {
1921 #if GFX_VER == 8
1922 struct crocus_genx_state *genx = ice->state.genx;
1923
1924 if (genx->pma_fix_enabled == enable)
1925 return;
1926
1927 genx->pma_fix_enabled = enable;
1928
1929 /* According to the Broadwell PIPE_CONTROL documentation, software should
1930 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1931 * prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary.
1932 *
1933 * The Gfx9 docs say to use a depth stall rather than a command streamer
1934 * stall. However, the hardware seems to violently disagree. A full
1935 * command streamer stall seems to be needed in both cases.
1936 */
1937 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1938 PIPE_CONTROL_CS_STALL |
1939 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1940 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1941
1942 crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1943 reg.NPPMAFixEnable = enable;
1944 reg.NPEarlyZFailsDisable = enable;
1945 reg.NPPMAFixEnableMask = true;
1946 reg.NPEarlyZFailsDisableMask = true;
1947 }
1948
1949 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1950 * Flush bits is often necessary. We do it regardless because it's easier.
1951 * The render cache flush is also necessary if stencil writes are enabled.
1952 *
1953 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1954 * flushes seem to work just as well.
1955 */
1956 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1957 PIPE_CONTROL_DEPTH_STALL |
1958 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1959 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1960 #endif
1961 }
1962
1963 static float
get_line_width(const struct pipe_rasterizer_state * state)1964 get_line_width(const struct pipe_rasterizer_state *state)
1965 {
1966 float line_width = state->line_width;
1967
1968 /* From the OpenGL 4.4 spec:
1969 *
1970 * "The actual width of non-antialiased lines is determined by rounding
1971 * the supplied width to the nearest integer, then clamping it to the
1972 * implementation-dependent maximum non-antialiased line width."
1973 */
1974 if (!state->multisample && !state->line_smooth)
1975 line_width = roundf(state->line_width);
1976
1977 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1978 /* For 1 pixel line thickness or less, the general anti-aliasing
1979 * algorithm gives up, and a garbage line is generated. Setting a
1980 * Line Width of 0.0 specifies the rasterization of the "thinnest"
1981 * (one-pixel-wide), non-antialiased lines.
1982 *
1983 * Lines rendered with zero Line Width are rasterized using the
1984 * "Grid Intersection Quantization" rules as specified by the
1985 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1986 */
1987 /* hack around this for gfx4/5 fps counters in hud. */
1988 line_width = GFX_VER < 6 ? 1.5f : 0.0f;
1989 }
1990 return line_width;
1991 }
1992
1993 /**
1994 * The pipe->create_rasterizer_state() driver hook.
1995 */
1996 static void *
crocus_create_rasterizer_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * state)1997 crocus_create_rasterizer_state(struct pipe_context *ctx,
1998 const struct pipe_rasterizer_state *state)
1999 {
2000 struct crocus_rasterizer_state *cso =
2001 malloc(sizeof(struct crocus_rasterizer_state));
2002
2003 cso->fill_mode_point_or_line =
2004 state->fill_front == PIPE_POLYGON_MODE_LINE ||
2005 state->fill_front == PIPE_POLYGON_MODE_POINT ||
2006 state->fill_back == PIPE_POLYGON_MODE_LINE ||
2007 state->fill_back == PIPE_POLYGON_MODE_POINT;
2008
2009 if (state->clip_plane_enable != 0)
2010 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2011 else
2012 cso->num_clip_plane_consts = 0;
2013
2014 cso->cso = *state;
2015
2016 #if GFX_VER >= 6
2017 float line_width = get_line_width(state);
2018
2019 crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2020 sf.StatisticsEnable = true;
2021 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2022 sf.LineEndCapAntialiasingRegionWidth =
2023 state->line_smooth ? _10pixels : _05pixels;
2024 sf.LastPixelEnable = state->line_last_pixel;
2025 #if GFX_VER <= 7
2026 sf.AntialiasingEnable = state->line_smooth;
2027 #endif
2028 #if GFX_VER == 8
2029 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2030 if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
2031 sf.CHVLineWidth = line_width;
2032 else
2033 sf.LineWidth = line_width;
2034 #else
2035 sf.LineWidth = line_width;
2036 #endif
2037 sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2038 sf.PointWidth = state->point_size;
2039
2040 if (state->flatshade_first) {
2041 sf.TriangleFanProvokingVertexSelect = 1;
2042 } else {
2043 sf.TriangleStripListProvokingVertexSelect = 2;
2044 sf.TriangleFanProvokingVertexSelect = 2;
2045 sf.LineStripListProvokingVertexSelect = 1;
2046 }
2047
2048 #if GFX_VER == 6
2049 sf.AttributeSwizzleEnable = true;
2050 if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2051 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2052 else
2053 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2054 #endif
2055
2056 #if GFX_VER <= 7
2057 sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2058
2059 #if GFX_VER >= 6
2060 sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2061 sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2062 sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2063 sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2064 sf.GlobalDepthOffsetScale = state->offset_scale;
2065 sf.GlobalDepthOffsetClamp = state->offset_clamp;
2066
2067 sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2068 sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2069 #endif
2070
2071 sf.CullMode = translate_cull_mode(state->cull_face);
2072 sf.ScissorRectangleEnable = true;
2073
2074 #if GFX_VERx10 == 75
2075 sf.LineStippleEnable = state->line_stipple_enable;
2076 #endif
2077 #endif
2078 }
2079 #endif
2080
2081 #if GFX_VER == 8
2082 crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2083 rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2084 rr.CullMode = translate_cull_mode(state->cull_face);
2085 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2086 rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2087 rr.DXMultisampleRasterizationEnable = state->multisample;
2088 rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2089 rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2090 rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2091 rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2092 rr.GlobalDepthOffsetScale = state->offset_scale;
2093 rr.GlobalDepthOffsetClamp = state->offset_clamp;
2094 rr.SmoothPointEnable = state->point_smooth;
2095 rr.AntialiasingEnable = state->line_smooth;
2096 rr.ScissorRectangleEnable = state->scissor;
2097 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2098 }
2099 #endif
2100
2101 #if GFX_VER >= 6
2102 crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2103 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2104 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2105 */
2106 #if GFX_VER >= 7
2107 cl.EarlyCullEnable = true;
2108 #endif
2109
2110 #if GFX_VER == 7
2111 cl.FrontWinding = state->front_ccw ? 1 : 0;
2112 cl.CullMode = translate_cull_mode(state->cull_face);
2113 #endif
2114 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2115 #if GFX_VER < 8
2116 cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2117 #endif
2118 cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2119 cl.GuardbandClipTestEnable = true;
2120 cl.ClipEnable = true;
2121 cl.MinimumPointWidth = 0.125;
2122 cl.MaximumPointWidth = 255.875;
2123
2124 #if GFX_VER == 8
2125 cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2126 #endif
2127
2128 if (state->flatshade_first) {
2129 cl.TriangleFanProvokingVertexSelect = 1;
2130 } else {
2131 cl.TriangleStripListProvokingVertexSelect = 2;
2132 cl.TriangleFanProvokingVertexSelect = 2;
2133 cl.LineStripListProvokingVertexSelect = 1;
2134 }
2135 }
2136 #endif
2137
2138 /* Remap from 0..255 back to 1..256 */
2139 const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2140
2141 crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2142 if (state->line_stipple_enable) {
2143 line.LineStipplePattern = state->line_stipple_pattern;
2144 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2145 line.LineStippleRepeatCount = line_stipple_factor;
2146 }
2147 }
2148
2149 return cso;
2150 }
2151
2152 /**
2153 * The pipe->bind_rasterizer_state() driver hook.
2154 *
2155 * Bind a rasterizer CSO and flag related dirty bits.
2156 */
2157 static void
crocus_bind_rasterizer_state(struct pipe_context * ctx,void * state)2158 crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2159 {
2160 struct crocus_context *ice = (struct crocus_context *) ctx;
2161 struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
2162 struct crocus_rasterizer_state *new_cso = state;
2163
2164 if (new_cso) {
2165 /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2166 if (cso_changed_memcmp(line_stipple))
2167 ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
2168 #if GFX_VER >= 6
2169 if (cso_changed(cso.half_pixel_center))
2170 ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
2171 if (cso_changed(cso.scissor))
2172 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
2173 if (cso_changed(cso.multisample))
2174 ice->state.dirty |= CROCUS_DIRTY_WM;
2175 #else
2176 if (cso_changed(cso.scissor))
2177 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
2178 #endif
2179
2180 if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
2181 ice->state.dirty |= CROCUS_DIRTY_WM;
2182
2183 #if GFX_VER >= 6
2184 if (cso_changed(cso.rasterizer_discard))
2185 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
2186
2187 if (cso_changed(cso.flatshade_first))
2188 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
2189 #endif
2190
2191 if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
2192 cso_changed(cso.clip_halfz))
2193 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
2194
2195 #if GFX_VER >= 7
2196 if (cso_changed(cso.sprite_coord_enable) ||
2197 cso_changed(cso.sprite_coord_mode) ||
2198 cso_changed(cso.light_twoside))
2199 ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
2200 #endif
2201 #if GFX_VER <= 5
2202 if (cso_changed(cso.clip_plane_enable))
2203 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
2204 #endif
2205 }
2206
2207 ice->state.cso_rast = new_cso;
2208 ice->state.dirty |= CROCUS_DIRTY_RASTER;
2209 ice->state.dirty |= CROCUS_DIRTY_CLIP;
2210 #if GFX_VER <= 5
2211 ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
2212 ice->state.dirty |= CROCUS_DIRTY_WM;
2213 #endif
2214 #if GFX_VER <= 6
2215 ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
2216 #endif
2217 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
2218 }
2219
2220 /**
2221 * Return true if the given wrap mode requires the border color to exist.
2222 *
2223 * (We can skip uploading it if the sampler isn't going to use it.)
2224 */
2225 static bool
wrap_mode_needs_border_color(unsigned wrap_mode)2226 wrap_mode_needs_border_color(unsigned wrap_mode)
2227 {
2228 #if GFX_VER == 8
2229 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2230 #else
2231 return wrap_mode == TCM_CLAMP_BORDER;
2232 #endif
2233 }
2234
2235 /**
2236 * Gallium CSO for sampler state.
2237 */
2238 struct crocus_sampler_state {
2239 struct pipe_sampler_state pstate;
2240 union pipe_color_union border_color;
2241 bool needs_border_color;
2242 unsigned wrap_s;
2243 unsigned wrap_t;
2244 unsigned wrap_r;
2245 unsigned mag_img_filter;
2246 float min_lod;
2247 };
2248
2249 /**
2250 * The pipe->create_sampler_state() driver hook.
2251 *
2252 * We fill out SAMPLER_STATE (except for the border color pointer), and
2253 * store that on the CPU. It doesn't make sense to upload it to a GPU
2254 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2255 * all bound sampler states to be in contiguous memor.
2256 */
2257 static void *
crocus_create_sampler_state(struct pipe_context * ctx,const struct pipe_sampler_state * state)2258 crocus_create_sampler_state(struct pipe_context *ctx,
2259 const struct pipe_sampler_state *state)
2260 {
2261 struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
2262
2263 if (!cso)
2264 return NULL;
2265
2266 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2267 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2268
2269 bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2270 state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2271 cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2272 cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2273 cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2274
2275 cso->pstate = *state;
2276
2277 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2278
2279 cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2280 wrap_mode_needs_border_color(cso->wrap_t) ||
2281 wrap_mode_needs_border_color(cso->wrap_r);
2282
2283 cso->min_lod = state->min_lod;
2284 cso->mag_img_filter = state->mag_img_filter;
2285
2286 // XXX: explain this code ported from ilo...I don't get it at all...
2287 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2288 state->min_lod > 0.0f) {
2289 cso->min_lod = 0.0f;
2290 cso->mag_img_filter = state->min_img_filter;
2291 }
2292
2293 return cso;
2294 }
2295
2296 /**
2297 * The pipe->bind_sampler_states() driver hook.
2298 */
2299 static void
crocus_bind_sampler_states(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,void ** states)2300 crocus_bind_sampler_states(struct pipe_context *ctx,
2301 enum pipe_shader_type p_stage,
2302 unsigned start, unsigned count,
2303 void **states)
2304 {
2305 struct crocus_context *ice = (struct crocus_context *) ctx;
2306 gl_shader_stage stage = stage_from_pipe(p_stage);
2307 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2308
2309 assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2310
2311 bool dirty = false;
2312
2313 for (int i = 0; i < count; i++) {
2314 if (shs->samplers[start + i] != states[i]) {
2315 shs->samplers[start + i] = states[i];
2316 dirty = true;
2317 }
2318 }
2319
2320 if (dirty) {
2321 #if GFX_VER <= 5
2322 if (p_stage == PIPE_SHADER_FRAGMENT)
2323 ice->state.dirty |= CROCUS_DIRTY_WM;
2324 else if (p_stage == PIPE_SHADER_VERTEX)
2325 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2326 #endif
2327 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2328 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2329 }
2330 }
2331
/**
 * Wrap-mode override applied while packing SAMPLER_STATE (see
 * crocus_upload_sampler_state).
 */
enum samp_workaround {
   SAMP_NORMAL,       /* no override; use the CSO's translated wrap modes */
   SAMP_CUBE_CLAMP,   /* force all three coordinates to TCM_CLAMP */
   SAMP_CUBE_CUBE,    /* force all three coordinates to TCM_CUBE */
   SAMP_T_WRAP,       /* force wrap_t to TCM_WRAP (1D-texture quirk) */
};
2338
/**
 * Pack a single SAMPLER_STATE structure into \p map.
 *
 * \param border_color_offset  location of the uploaded border color — a
 *                             state-BO relocation on Gfx<6, a plain offset
 *                             on Gfx6+
 * \param samp_workaround      wrap-mode override to apply (see the enum)
 * \param first_level          base mip level (only consumed on Gfx6)
 * \param map                  destination for the packed dwords
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* Apply any requested wrap-mode workaround before packing. */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      /* Switch linear filters to anisotropic when anisotropy is requested. */
      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* Hardware LOD clamp range differs by generation. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

#if GFX_VER < 6
      /* Pre-Gfx6: the border color pointer is a relocation into the
       * batch's state buffer.
       */
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2440
/**
 * Upload a SAMPLER_BORDER_COLOR_STATE entry for the given sampler (and
 * optionally the texture it samples), returning its offset in the batch's
 * state buffer via \p bc_offset.
 *
 * Handles the per-generation layout differences and swizzles the color
 * for formats that are faked with a different hardware format.
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Alignment varies: 64B on Gfx8, 512B for Haswell integer formats,
    * 32B otherwise.
    */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

/* Assignment helpers so BORDER_COLOR_ATTR can truncate per channel size. */
#define ASSIGN(dst, src) \
   do {                  \
      dst = src;         \
   } while (0)

#define ASSIGNu16(dst, src) \
   do {                     \
      dst = (uint16_t)src;  \
   } while (0)

#define ASSIGNu8(dst, src) \
   do {                    \
      dst = (uint8_t)src;  \
   } while (0)

#define BORDER_COLOR_ATTR(macro, _color_type, src)           \
   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
   macro(state.BorderColor ## _color_type ## Green, src[1]); \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);  \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       *  corresponding border color should be programmed as zero and if
       *  alpha channel is missing, corresponding Alpha border color should
       *  be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gfx5/6 store the border color in every format the sampler might use. */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src) \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2588
2589 /**
2590 * Upload the sampler states into a contiguous area of GPU memory, for
2591 * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
2592 *
2593 * Also fill out the border color state pointers.
2594 */
static void
crocus_upload_sampler_states(struct crocus_context *ice,
                             struct crocus_batch *batch, gl_shader_stage stage)
{
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   const struct shader_info *info = crocus_get_shader_info(ice, stage);

   /* We assume the state tracker will call pipe->bind_sampler_states()
    * if the program's number of textures changes.
    */
   unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;

   if (!count)
      return;

   /* Assemble the SAMPLER_STATEs into a contiguous table that lives
    * in the dynamic state memory zone, so we can point to it via the
    * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
    */
   unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
   uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);

   if (unlikely(!map))
      return;

   for (int i = 0; i < count; i++) {
      struct crocus_sampler_state *state = shs->samplers[i];
      struct crocus_sampler_view *tex = shs->textures[i];

      if (!state || !tex) {
         /* Unbound slot: emit a zeroed (disabled) SAMPLER_STATE. */
         memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
      } else {
         /* Stream out border color state when the sampler needs it, and
          * remember its offset so SAMPLER_STATE can point at it.
          */
         unsigned border_color_offset = 0;
         if (state->needs_border_color) {
            crocus_upload_border_color(batch, state, tex, &border_color_offset);
         }

         enum samp_workaround wa = SAMP_NORMAL;
         /* There's a bug in 1D texture sampling - it actually pays
          * attention to the wrap_t value, though it should not.
          * Override the wrap_t value here to GL_REPEAT to keep
          * any nonexistent border pixels from floating in.
          */
         if (tex->base.target == PIPE_TEXTURE_1D)
            wa = SAMP_T_WRAP;
         else if (tex->base.target == PIPE_TEXTURE_CUBE ||
                  tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
            /* Cube maps must use the same wrap mode for all three coordinate
             * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
             *
             * Ivybridge and Baytrail seem to have problems with CUBE mode and
             * integer formats.  Fall back to CLAMP for now.
             */
            if (state->pstate.seamless_cube_map &&
                !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
               wa = SAMP_CUBE_CUBE;
            else
               wa = SAMP_CUBE_CLAMP;
         }

         /* Buffer textures have no miplevels; everything else starts at
          * the view's first level.
          */
         uint32_t first_level = 0;
         if (tex->base.target != PIPE_BUFFER)
            first_level = tex->base.u.tex.first_level;

         crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
      }

      /* SAMPLER_STATE_length is in dwords, matching the uint32_t map. */
      map += GENX(SAMPLER_STATE_length);
   }
}
2665
2666 /**
2667 * The pipe->create_sampler_view() driver hook.
2668 */
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
                           struct pipe_resource *tex,
                           const struct pipe_sampler_view *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));

   if (!isv)
      return NULL;

   /* initialize base object */
   isv->base = *tmpl;
   isv->base.context = ctx;
   isv->base.texture = NULL;
   pipe_reference_init(&isv->base.reference, 1);
   pipe_resource_reference(&isv->base.texture, tex);

   if (util_format_is_depth_or_stencil(tmpl->format)) {
      struct crocus_resource *zres, *sres;
      const struct util_format_description *desc =
         util_format_description(tmpl->format);

      /* Depth and stencil may live in separate resources; retarget the
       * view at the one matching the requested format.
       */
      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);

      tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;

      /* On gen7, stencil is sampled from a shadow copy when one exists. */
      if (tex->format == PIPE_FORMAT_S8_UINT)
         if (GFX_VER == 7 && sres->shadow)
            tex = &sres->shadow->base.b;
   }

   isv->res = (struct crocus_resource *) tex;

   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;

   if (isv->base.target == PIPE_TEXTURE_CUBE ||
       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
      usage |= ISL_SURF_USAGE_CUBE_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   /* Compose the view's swizzle with any format-mandated swizzle. */
   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);

   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
   if (GFX_VER < 6 &&
       (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
        tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
      isv->swizzle[0] = tmpl->swizzle_g;
      isv->swizzle[1] = tmpl->swizzle_g;
      isv->swizzle[2] = tmpl->swizzle_g;
      isv->swizzle[3] = tmpl->swizzle_g;
   }

   isv->clear_color = isv->res->aux.clear_color;

   isv->view = (struct isl_view) {
      .format = fmt.fmt,
#if GFX_VERx10 >= 75
      .swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
      },
#else
      /* swizzling handled in shader code */
      .swizzle = ISL_SWIZZLE_IDENTITY,
#endif
      .usage = usage,
   };

   /* Fill out SURFACE_STATE for this view. */
   if (tmpl->target != PIPE_BUFFER) {
      isv->view.base_level = tmpl->u.tex.first_level;
      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;

      /* Hardware older than skylake ignores this value */
      assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);

      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
      isv->view.base_array_layer = tmpl->u.tex.first_layer;
      isv->view.array_len =
         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
   }
#if GFX_VER >= 6
   /* just create a second view struct for texture gather just in case */
   isv->gather_view = isv->view;

#if GFX_VER == 7
   /* For gather on RG32 formats, switch to the _LD format variant. */
   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
      isv->gather_view.swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
      };
#endif
   }
#endif
#if GFX_VER == 6
   /* Sandybridge's gather4 message is broken for integer formats.
    * To work around this, we pretend the surface is UNORM for
    * 8 or 16-bit formats, and emit shader instructions to recover
    * the real INT/UINT value. For 32-bit formats, we pretend
    * the surface is FLOAT, and simply reinterpret the resulting
    * bits.
    */
   switch (fmt.fmt) {
   case ISL_FORMAT_R8_SINT:
   case ISL_FORMAT_R8_UINT:
      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
      break;

   case ISL_FORMAT_R16_SINT:
   case ISL_FORMAT_R16_UINT:
      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
      break;

   case ISL_FORMAT_R32_SINT:
   case ISL_FORMAT_R32_UINT:
      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
      break;

   default:
      break;
   }
#endif
#endif
   /* Finish any deferred aux import before the view can be used. */
   if (tmpl->target != PIPE_BUFFER) {
      if (crocus_resource_unfinished_aux_import(isv->res))
         crocus_resource_finish_aux_import(&screen->base, isv->res);

   }

   return &isv->base;
}
2814
2815 static void
crocus_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)2816 crocus_sampler_view_destroy(struct pipe_context *ctx,
2817 struct pipe_sampler_view *state)
2818 {
2819 struct crocus_sampler_view *isv = (void *) state;
2820 pipe_resource_reference(&state->texture, NULL);
2821 free(isv);
2822 }
2823
2824 /**
2825 * The pipe->create_surface() driver hook.
2826 *
2827 * In Gallium nomenclature, "surfaces" are a view of a resource that
2828 * can be bound as a render target or depth/stencil buffer.
2829 */
2830 static struct pipe_surface *
crocus_create_surface(struct pipe_context * ctx,struct pipe_resource * tex,const struct pipe_surface * tmpl)2831 crocus_create_surface(struct pipe_context *ctx,
2832 struct pipe_resource *tex,
2833 const struct pipe_surface *tmpl)
2834 {
2835 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2836 const struct intel_device_info *devinfo = &screen->devinfo;
2837
2838 isl_surf_usage_flags_t usage = 0;
2839 if (tmpl->writable)
2840 usage = ISL_SURF_USAGE_STORAGE_BIT;
2841 else if (util_format_is_depth_or_stencil(tmpl->format))
2842 usage = ISL_SURF_USAGE_DEPTH_BIT;
2843 else
2844 usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2845
2846 const struct crocus_format_info fmt =
2847 crocus_format_for_usage(devinfo, tmpl->format, usage);
2848
2849 if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2850 !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2851 /* Framebuffer validation will reject this invalid case, but it
2852 * hasn't had the opportunity yet. In the meantime, we need to
2853 * avoid hitting ISL asserts about unsupported formats below.
2854 */
2855 return NULL;
2856 }
2857
2858 struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
2859 struct pipe_surface *psurf = &surf->base;
2860 struct crocus_resource *res = (struct crocus_resource *) tex;
2861
2862 if (!surf)
2863 return NULL;
2864
2865 pipe_reference_init(&psurf->reference, 1);
2866 pipe_resource_reference(&psurf->texture, tex);
2867 psurf->context = ctx;
2868 psurf->format = tmpl->format;
2869 psurf->width = tex->width0;
2870 psurf->height = tex->height0;
2871 psurf->texture = tex;
2872 psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2873 psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2874 psurf->u.tex.level = tmpl->u.tex.level;
2875
2876 uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2877
2878 struct isl_view *view = &surf->view;
2879 *view = (struct isl_view) {
2880 .format = fmt.fmt,
2881 .base_level = tmpl->u.tex.level,
2882 .levels = 1,
2883 .base_array_layer = tmpl->u.tex.first_layer,
2884 .array_len = array_len,
2885 .swizzle = ISL_SWIZZLE_IDENTITY,
2886 .usage = usage,
2887 };
2888
2889 #if GFX_VER >= 6
2890 struct isl_view *read_view = &surf->read_view;
2891 *read_view = (struct isl_view) {
2892 .format = fmt.fmt,
2893 .base_level = tmpl->u.tex.level,
2894 .levels = 1,
2895 .base_array_layer = tmpl->u.tex.first_layer,
2896 .array_len = array_len,
2897 .swizzle = ISL_SWIZZLE_IDENTITY,
2898 .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2899 };
2900 #endif
2901
2902 surf->clear_color = res->aux.clear_color;
2903
2904 /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2905 if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2906 ISL_SURF_USAGE_STENCIL_BIT))
2907 return psurf;
2908
2909 if (!isl_format_is_compressed(res->surf.format)) {
2910 if (crocus_resource_unfinished_aux_import(res))
2911 crocus_resource_finish_aux_import(&screen->base, res);
2912
2913 memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2914 uint64_t temp_offset;
2915 uint32_t temp_x, temp_y;
2916
2917 isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
2918 res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
2919 res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
2920 &temp_offset, &temp_x, &temp_y);
2921 if (!devinfo->has_surface_tile_offset &&
2922 (temp_x || temp_y)) {
2923 /* Original gfx4 hardware couldn't draw to a non-tile-aligned
2924 * destination.
2925 */
2926 /* move to temp */
2927 struct pipe_resource wa_templ = (struct pipe_resource) {
2928 .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
2929 .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
2930 .depth0 = 1,
2931 .array_size = 1,
2932 .format = res->base.b.format,
2933 .target = PIPE_TEXTURE_2D,
2934 .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
2935 };
2936 surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
2937 view->base_level = 0;
2938 view->base_array_layer = 0;
2939 view->array_len = 1;
2940 struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
2941 memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
2942 }
2943 return psurf;
2944 }
2945
2946 /* The resource has a compressed format, which is not renderable, but we
2947 * have a renderable view format. We must be attempting to upload blocks
2948 * of compressed data via an uncompressed view.
2949 *
2950 * In this case, we can assume there are no auxiliary buffers, a single
2951 * miplevel, and that the resource is single-sampled. Gallium may try
2952 * and create an uncompressed view with multiple layers, however.
2953 */
2954 assert(!isl_format_is_compressed(fmt.fmt));
2955 assert(res->surf.samples == 1);
2956 assert(view->levels == 1);
2957
2958 /* TODO: compressed pbo uploads aren't working here */
2959 return NULL;
2960
2961 uint64_t offset_B = 0;
2962 uint32_t tile_x_sa = 0, tile_y_sa = 0;
2963
2964 if (view->base_level > 0) {
2965 /* We can't rely on the hardware's miplevel selection with such
2966 * a substantial lie about the format, so we select a single image
2967 * using the Tile X/Y Offset fields. In this case, we can't handle
2968 * multiple array slices.
2969 *
2970 * On Broadwell, HALIGN and VALIGN are specified in pixels and are
2971 * hard-coded to align to exactly the block size of the compressed
2972 * texture. This means that, when reinterpreted as a non-compressed
2973 * texture, the tile offsets may be anything and we can't rely on
2974 * X/Y Offset.
2975 *
2976 * Return NULL to force the state tracker to take fallback paths.
2977 */
2978 // TODO: check if the gen7 check is right, originally gen8
2979 if (view->array_len > 1 || GFX_VER == 7)
2980 return NULL;
2981
2982 const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
2983 isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2984 view->base_level,
2985 is_3d ? 0 : view->base_array_layer,
2986 is_3d ? view->base_array_layer : 0,
2987 &surf->surf,
2988 &offset_B, &tile_x_sa, &tile_y_sa);
2989
2990 /* We use address and tile offsets to access a single level/layer
2991 * as a subimage, so reset level/layer so it doesn't offset again.
2992 */
2993 view->base_array_layer = 0;
2994 view->base_level = 0;
2995 } else {
2996 /* Level 0 doesn't require tile offsets, and the hardware can find
2997 * array slices using QPitch even with the format override, so we
2998 * can allow layers in this case. Copy the original ISL surface.
2999 */
3000 memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
3001 }
3002
3003 /* Scale down the image dimensions by the block size. */
3004 const struct isl_format_layout *fmtl =
3005 isl_format_get_layout(res->surf.format);
3006 surf->surf.format = fmt.fmt;
3007 surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
3008 surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
3009 tile_x_sa /= fmtl->bw;
3010 tile_y_sa /= fmtl->bh;
3011
3012 psurf->width = surf->surf.logical_level0_px.width;
3013 psurf->height = surf->surf.logical_level0_px.height;
3014
3015 return psurf;
3016 }
3017
3018 #if GFX_VER >= 7
3019 static void
fill_default_image_param(struct brw_image_param * param)3020 fill_default_image_param(struct brw_image_param *param)
3021 {
3022 memset(param, 0, sizeof(*param));
3023 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3024 * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3025 * detailed explanation of these parameters.
3026 */
3027 param->swizzling[0] = 0xff;
3028 param->swizzling[1] = 0xff;
3029 }
3030
3031 static void
fill_buffer_image_param(struct brw_image_param * param,enum pipe_format pfmt,unsigned size)3032 fill_buffer_image_param(struct brw_image_param *param,
3033 enum pipe_format pfmt,
3034 unsigned size)
3035 {
3036 const unsigned cpp = util_format_get_blocksize(pfmt);
3037
3038 fill_default_image_param(param);
3039 param->size[0] = size / cpp;
3040 param->stride[0] = cpp;
3041 }
3042
3043 #endif
3044
3045 /**
3046 * The pipe->set_shader_images() driver hook.
3047 */
static void
crocus_set_shader_images(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start_slot, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         const struct pipe_image_view *p_images)
{
#if GFX_VER >= 7
   /* NOTE(review): unbind_num_trailing_slots is currently ignored here --
    * confirm whether trailing slots need explicit unbinding.
    */
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_genx_state *genx = ice->state.genx;
   struct brw_image_param *image_params = genx->shaders[stage].image_param;

   /* All slots in [start_slot, start_slot + count) are rebound below. */
   shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);

   for (unsigned i = 0; i < count; i++) {
      struct crocus_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct crocus_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
         struct crocus_format_info fmt =
            crocus_format_for_usage(devinfo, img->format, usage);

         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
            /* Try to use typed surface reads (which support a limited
             * number of formats), and if not possible, fall back to
             * untyped (RAW) reads.
             */
            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
               fmt.fmt = ISL_FORMAT_RAW;
            else
               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
         }

         if (res->base.b.target != PIPE_BUFFER) {
            /* Texture image: view the selected level and layer range. */
            struct isl_view view = {
               .format = fmt.fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = swiz,
               .usage = usage,
            };

            iv->view = view;

            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: no level/layer selection. */
            struct isl_view view = {
               .format = fmt.fmt,
               .swizzle = swiz,
               .usage = usage,
            };
            iv->view = view;

            /* Writable images make this buffer range valid data. */
            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }
      } else {
         /* Unbind the slot and reset its image params. */
         pipe_resource_reference(&iv->base.resource, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* The image params feed shader system values, which are delivered via
    * the constant buffer -- flag the stage's constants for re-upload.
    */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
   shs->sysvals_need_upload = true;
#endif
}
3141
3142
3143 /**
3144 * The pipe->set_sampler_views() driver hook.
3145 */
static void
crocus_set_sampler_views(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         bool take_ownership,
                         struct pipe_sampler_view **views)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   /* All slots in [start, start + count) are rebound below. */
   shs->bound_sampler_views &= ~u_bit_consecutive(start, count);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_sampler_view *pview = views ? views[i] : NULL;

      if (take_ownership) {
         /* Caller transfers its reference: drop ours on the old view and
          * adopt the new pointer without taking an extra reference.
          */
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], NULL);
         shs->textures[start + i] = (struct crocus_sampler_view *)pview;
      } else {
         /* Normal path: reference-counted swap of old for new. */
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], pview);
      }

      struct crocus_sampler_view *view = (void *) pview;
      if (view) {
         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
         view->res->bind_stages |= 1 << stage;

         shs->bound_sampler_views |= 1 << (start + i);
      }
   }
#if GFX_VER == 6
   /* first level parameters to crocus_upload_sampler_state is gfx6 only */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
#endif
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
}
3190
3191 /**
3192 * The pipe->set_tess_state() driver hook.
3193 */
3194 static void
crocus_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])3195 crocus_set_tess_state(struct pipe_context *ctx,
3196 const float default_outer_level[4],
3197 const float default_inner_level[2])
3198 {
3199 struct crocus_context *ice = (struct crocus_context *) ctx;
3200 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3201
3202 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3203 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3204
3205 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3206 shs->sysvals_need_upload = true;
3207 }
3208
3209 static void
crocus_set_patch_vertices(struct pipe_context * ctx,uint8_t patch_vertices)3210 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3211 {
3212 struct crocus_context *ice = (struct crocus_context *) ctx;
3213
3214 ice->state.patch_vertices = patch_vertices;
3215 }
3216
3217 static void
crocus_surface_destroy(struct pipe_context * ctx,struct pipe_surface * p_surf)3218 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3219 {
3220 struct crocus_surface *surf = (void *) p_surf;
3221 pipe_resource_reference(&p_surf->texture, NULL);
3222
3223 pipe_resource_reference(&surf->align_res, NULL);
3224 free(surf);
3225 }
3226
3227 static void
crocus_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)3228 crocus_set_clip_state(struct pipe_context *ctx,
3229 const struct pipe_clip_state *state)
3230 {
3231 struct crocus_context *ice = (struct crocus_context *) ctx;
3232 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3233 struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3234 struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3235
3236 memcpy(&ice->state.clip_planes, state, sizeof(*state));
3237
3238 #if GFX_VER <= 5
3239 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3240 #endif
3241 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3242 CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3243 shs->sysvals_need_upload = true;
3244 gshs->sysvals_need_upload = true;
3245 tshs->sysvals_need_upload = true;
3246 }
3247
3248 /**
3249 * The pipe->set_polygon_stipple() driver hook.
3250 */
3251 static void
crocus_set_polygon_stipple(struct pipe_context * ctx,const struct pipe_poly_stipple * state)3252 crocus_set_polygon_stipple(struct pipe_context *ctx,
3253 const struct pipe_poly_stipple *state)
3254 {
3255 struct crocus_context *ice = (struct crocus_context *) ctx;
3256 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3257 ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3258 }
3259
3260 /**
3261 * The pipe->set_sample_mask() driver hook.
3262 */
static void
crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* st/mesa may pass us 0xffffffff, meaning "enable all samples", so trim
    * the mask to the bits the hardware supports.
    *
    * NOTE(review): the old comment claimed "16x MSAA / 16 bits of sample
    * mask", but the code keeps only the low 8 bits -- presumably 8x MSAA
    * is the maximum here.  Confirm against the hardware docs whether
    * 0xffff was intended instead.
    */
   ice->state.sample_mask = sample_mask & 0xff;
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
}
3274
static void
crocus_fill_scissor_rect(struct crocus_context *ice,
                         int idx,
                         struct pipe_scissor_state *ss)
{
   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
   /* Start from the viewport's screen-space footprint (translate is the
    * center, |scale| the half-extent), clamped to the framebuffer; the
    * stored maximums are inclusive, hence the -1.
    */
   struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
   };
   /* If the rasterizer scissor test is enabled, intersect with the
    * application's scissor rectangle for this viewport index.
    */
   if (cso_state->scissor) {
      struct pipe_scissor_state *s = &ice->state.scissors[idx];
      scissor.minx = MAX2(scissor.minx, s->minx);
      scissor.miny = MAX2(scissor.miny, s->miny);
      scissor.maxx = MIN2(scissor.maxx, s->maxx);
      scissor.maxy = MIN2(scissor.maxy, s->maxy);
   }
   *ss = scissor;
}
3298
3299 /**
3300 * The pipe->set_scissor_states() driver hook.
3301 *
3302 * This corresponds to our SCISSOR_RECT state structures. It's an
3303 * exact match, so we just store them, and memcpy them out later.
3304 */
3305 static void
crocus_set_scissor_states(struct pipe_context * ctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * rects)3306 crocus_set_scissor_states(struct pipe_context *ctx,
3307 unsigned start_slot,
3308 unsigned num_scissors,
3309 const struct pipe_scissor_state *rects)
3310 {
3311 struct crocus_context *ice = (struct crocus_context *) ctx;
3312
3313 for (unsigned i = 0; i < num_scissors; i++) {
3314 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3315 /* If the scissor was out of bounds and got clamped to 0 width/height
3316 * at the bounds, the subtraction of 1 from maximums could produce a
3317 * negative number and thus not clip anything. Instead, just provide
3318 * a min > max scissor inside the bounds, which produces the expected
3319 * no rendering.
3320 */
3321 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3322 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3323 };
3324 } else {
3325 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3326 .minx = rects[i].minx, .miny = rects[i].miny,
3327 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3328 };
3329 }
3330 }
3331
3332 #if GFX_VER < 6
3333 ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3334 #else
3335 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3336 #endif
3337 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3338
3339 }
3340
3341 /**
3342 * The pipe->set_stencil_ref() driver hook.
3343 *
3344 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3345 */
3346 static void
crocus_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref ref)3347 crocus_set_stencil_ref(struct pipe_context *ctx,
3348 const struct pipe_stencil_ref ref)
3349 {
3350 struct crocus_context *ice = (struct crocus_context *) ctx;
3351 ice->state.stencil_ref = ref;
3352 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3353 }
3354
3355 #if GFX_VER == 8
3356 static float
viewport_extent(const struct pipe_viewport_state * state,int axis,float sign)3357 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3358 {
3359 return copysignf(state->scale[axis], sign) + state->translate[axis];
3360 }
3361 #endif
3362
3363 /**
3364 * The pipe->set_viewport_states() driver hook.
3365 *
3366 * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
3367 * the guardband yet, as we need the framebuffer dimensions, but we can
3368 * at least fill out the rest.
3369 */
3370 static void
crocus_set_viewport_states(struct pipe_context * ctx,unsigned start_slot,unsigned count,const struct pipe_viewport_state * states)3371 crocus_set_viewport_states(struct pipe_context *ctx,
3372 unsigned start_slot,
3373 unsigned count,
3374 const struct pipe_viewport_state *states)
3375 {
3376 struct crocus_context *ice = (struct crocus_context *) ctx;
3377
3378 memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3379
3380 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3381 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3382 #if GFX_VER >= 6
3383 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3384 #endif
3385
3386 if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3387 !ice->state.cso_rast->cso.depth_clip_far))
3388 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3389 }
3390
3391 /**
3392 * The pipe->set_framebuffer_state() driver hook.
3393 *
3394 * Sets the current draw FBO, including color render targets, depth,
3395 * and stencil buffers.
3396 */
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
                             const struct pipe_framebuffer_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
   struct isl_device *isl_dev = &screen->isl_dev;
   struct crocus_resource *zres;
   struct crocus_resource *stencil_res;
#endif

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

#if GFX_VER >= 6
   /* A sample-count change affects multisample and raster state. */
   if (cso->samples != samples) {
      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
   }
#endif

#if GFX_VER >= 6 && GFX_VER < 8
   /* Blend state is unconditionally re-emitted on gen6/7. */
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif

   /* Switching between layered and non-layered rendering affects CLIP. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= CROCUS_DIRTY_CLIP;
   }

   /* Dimension changes invalidate viewport/raster/drawing-rectangle state. */
   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
   }

   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;

      /* update SF's depth buffer format */
      if (GFX_VER == 7 && cso->zsbuf)
         ice->state.dirty |= CROCUS_DIRTY_RASTER;
   }

   /* wm thread dispatch enable */
   ice->state.dirty |= CROCUS_DIRTY_WM;
   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   /* Track HiZ aux usage for the newly bound depth level, if any. */
   if (cso->zsbuf) {
      struct crocus_resource *zres;
      struct crocus_resource *stencil_res;
      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
                                         &stencil_res);
      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
         aux_usage = zres->aux.usage;
      }
      ice->state.hiz_usage = aux_usage;
   }

   /* Render target change */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}
3475
3476 /**
3477 * The pipe->set_constant_buffer() driver hook.
3478 *
3479 * This uploads any constant data in user buffers, and references
3480 * any UBO resources containing constant data.
3481 */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   /* Note: cbuf aliases shs->constbufs[index], the slot copied into below. */
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User memory: copy the data into a GPU-visible upload buffer,
          * replacing whatever resource the copy above put in the slot.
          */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound size so the shader can never read past the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      /* NULL or empty input unbinds the slot. */
      shs->bound_cbufs &= ~(1u << index);
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
3526
3527 static void
upload_sysvals(struct crocus_context * ice,gl_shader_stage stage)3528 upload_sysvals(struct crocus_context *ice,
3529 gl_shader_stage stage)
3530 {
3531 UNUSED struct crocus_genx_state *genx = ice->state.genx;
3532 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3533
3534 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
3535 if (!shader || shader->num_system_values == 0)
3536 return;
3537
3538 assert(shader->num_cbufs > 0);
3539
3540 unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3541 struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
3542 unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
3543 uint32_t *map = NULL;
3544
3545 assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3546 u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3547 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3548
3549 for (int i = 0; i < shader->num_system_values; i++) {
3550 uint32_t sysval = shader->system_values[i];
3551 uint32_t value = 0;
3552
3553 if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
3554 #if GFX_VER >= 7
3555 unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
3556 unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
3557 struct brw_image_param *param =
3558 &genx->shaders[stage].image_param[img];
3559
3560 assert(offset < sizeof(struct brw_image_param));
3561 value = ((uint32_t *) param)[offset];
3562 #endif
3563 } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
3564 value = 0;
3565 } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
3566 int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
3567 int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
3568 value = fui(ice->state.clip_planes.ucp[plane][comp]);
3569 } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
3570 if (stage == MESA_SHADER_TESS_CTRL) {
3571 value = ice->state.vertices_per_patch;
3572 } else {
3573 assert(stage == MESA_SHADER_TESS_EVAL);
3574 const struct shader_info *tcs_info =
3575 crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3576 if (tcs_info)
3577 value = tcs_info->tess.tcs_vertices_out;
3578 else
3579 value = ice->state.vertices_per_patch;
3580 }
3581 } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
3582 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
3583 unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
3584 value = fui(ice->state.default_outer_level[i]);
3585 } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
3586 value = fui(ice->state.default_inner_level[0]);
3587 } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
3588 value = fui(ice->state.default_inner_level[1]);
3589 } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
3590 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
3591 unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
3592 value = ice->state.last_block[i];
3593 } else {
3594 assert(!"unhandled system value");
3595 }
3596
3597 *map++ = value;
3598 }
3599
3600 cbuf->buffer_size = upload_size;
3601 shs->sysvals_need_upload = false;
3602 }
3603
3604 /**
3605 * The pipe->set_shader_buffers() driver hook.
3606 *
3607 * This binds SSBOs and ABOs. Unfortunately, we need to stream out
3608 * SURFACE_STATE here, as the buffer offset may change each time.
3609 */
3610 static void
crocus_set_shader_buffers(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start_slot,unsigned count,const struct pipe_shader_buffer * buffers,unsigned writable_bitmask)3611 crocus_set_shader_buffers(struct pipe_context *ctx,
3612 enum pipe_shader_type p_stage,
3613 unsigned start_slot, unsigned count,
3614 const struct pipe_shader_buffer *buffers,
3615 unsigned writable_bitmask)
3616 {
3617 struct crocus_context *ice = (struct crocus_context *) ctx;
3618 gl_shader_stage stage = stage_from_pipe(p_stage);
3619 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3620
3621 unsigned modified_bits = u_bit_consecutive(start_slot, count);
3622
3623 shs->bound_ssbos &= ~modified_bits;
3624 shs->writable_ssbos &= ~modified_bits;
3625 shs->writable_ssbos |= writable_bitmask << start_slot;
3626
3627 for (unsigned i = 0; i < count; i++) {
3628 if (buffers && buffers[i].buffer) {
3629 struct crocus_resource *res = (void *) buffers[i].buffer;
3630 struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3631 pipe_resource_reference(&ssbo->buffer, &res->base.b);
3632 ssbo->buffer_offset = buffers[i].buffer_offset;
3633 ssbo->buffer_size =
3634 MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3635
3636 shs->bound_ssbos |= 1 << (start_slot + i);
3637
3638 res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3639 res->bind_stages |= 1 << stage;
3640
3641 util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3642 ssbo->buffer_offset + ssbo->buffer_size);
3643 } else {
3644 pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3645 }
3646 }
3647
3648 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3649 }
3650
/* Generic destructor for CSOs that hold no GPU resources. */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}
3656
3657 /**
3658 * The pipe->set_vertex_buffers() driver hook.
3659 *
3660 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3661 */
static void
crocus_set_vertex_buffers(struct pipe_context *ctx,
                          unsigned start_slot, unsigned count,
                          unsigned unbind_num_trailing_slots,
                          bool take_ownership,
                          const struct pipe_vertex_buffer *buffers)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
   /* NOTE(review): on pre-Haswell parts (except Baytrail), 2 extra bytes
    * are added to the recorded buffer end address — presumably a vertex
    * fetch workaround; confirm against the hardware docs.
    */
   const unsigned padding =
      (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
   /* Clear the bound bits for every slot being rebound or unbound. */
   ice->state.bound_vertex_buffers &=
      ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);

   util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
                                buffers, start_slot, count, unbind_num_trailing_slots,
                                take_ownership);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_vertex_buffer *state =
         &ice->state.vertex_buffers[start_slot + i];

      if (!state->is_user_buffer && state->buffer.resource) {
         struct crocus_resource *res = (void *)state->buffer.resource;
         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
      }

      /* Record each buffer's end address for 3DSTATE_VERTEX_BUFFERS
       * emission (0 for empty slots).
       */
      uint32_t end = 0;
      if (state->buffer.resource)
         end = state->buffer.resource->width0 + padding;
      ice->state.vb_end[start_slot + i] = end;
   }
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}
3696
3697 #if GFX_VERx10 < 75
get_wa_flags(enum isl_format format)3698 static uint8_t get_wa_flags(enum isl_format format)
3699 {
3700 uint8_t wa_flags = 0;
3701
3702 switch (format) {
3703 case ISL_FORMAT_R10G10B10A2_USCALED:
3704 wa_flags = BRW_ATTRIB_WA_SCALE;
3705 break;
3706 case ISL_FORMAT_R10G10B10A2_SSCALED:
3707 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
3708 break;
3709 case ISL_FORMAT_R10G10B10A2_UNORM:
3710 wa_flags = BRW_ATTRIB_WA_NORMALIZE;
3711 break;
3712 case ISL_FORMAT_R10G10B10A2_SNORM:
3713 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
3714 break;
3715 case ISL_FORMAT_R10G10B10A2_SINT:
3716 wa_flags = BRW_ATTRIB_WA_SIGN;
3717 break;
3718 case ISL_FORMAT_B10G10R10A2_USCALED:
3719 wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3720 break;
3721 case ISL_FORMAT_B10G10R10A2_SSCALED:
3722 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3723 break;
3724 case ISL_FORMAT_B10G10R10A2_UNORM:
3725 wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3726 break;
3727 case ISL_FORMAT_B10G10R10A2_SNORM:
3728 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3729 break;
3730 case ISL_FORMAT_B10G10R10A2_SINT:
3731 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
3732 break;
3733 case ISL_FORMAT_B10G10R10A2_UINT:
3734 wa_flags = BRW_ATTRIB_WA_BGRA;
3735 break;
3736 default:
3737 break;
3738 }
3739 return wa_flags;
3740 }
3741 #endif
3742
3743 /**
3744 * Gallium CSO for vertex elements.
3745 */
struct crocus_vertex_element_state {
   /* Packed 3DSTATE_VERTEX_ELEMENTS: one header DWord plus room for up to
    * 33 VERTEX_ELEMENT_STATE entries.
    */
   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* Matching 3DSTATE_VF_INSTANCING packets, one per element (gen8). */
   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Alternate last element, swapped in at draw time when the vertex
    * shader uses EdgeFlag (see crocus_create_vertex_elements).
    */
   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Instance divisor, indexed by vertex buffer slot. */
   uint32_t step_rate[16];
   /* Pre-Haswell attribute workaround flags per element (see get_wa_flags). */
   uint8_t wa_flags[33];
   /* Number of pipe_vertex_elements in this CSO. */
   unsigned count;
};
3759
3760 /**
3761 * The pipe->create_vertex_elements() driver hook.
3762 *
3763 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3764 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3765 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3766 * needed. In these cases we will need information available at draw time.
3767 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3768 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3769 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3770 */
3771 static void *
crocus_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * state)3772 crocus_create_vertex_elements(struct pipe_context *ctx,
3773 unsigned count,
3774 const struct pipe_vertex_element *state)
3775 {
3776 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3777 const struct intel_device_info *devinfo = &screen->devinfo;
3778 struct crocus_vertex_element_state *cso =
3779 malloc(sizeof(struct crocus_vertex_element_state));
3780
3781 cso->count = count;
3782
3783 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3784 ve.DWordLength =
3785 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3786 }
3787
3788 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3789 #if GFX_VER == 8
3790 uint32_t *vfi_pack_dest = cso->vf_instancing;
3791 #endif
3792
3793 if (count == 0) {
3794 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3795 ve.Valid = true;
3796 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3797 ve.Component0Control = VFCOMP_STORE_0;
3798 ve.Component1Control = VFCOMP_STORE_0;
3799 ve.Component2Control = VFCOMP_STORE_0;
3800 ve.Component3Control = VFCOMP_STORE_1_FP;
3801 }
3802 #if GFX_VER == 8
3803 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3804 }
3805 #endif
3806 }
3807
3808 for (int i = 0; i < count; i++) {
3809 const struct crocus_format_info fmt =
3810 crocus_format_for_usage(devinfo, state[i].src_format, 0);
3811 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3812 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3813 enum isl_format actual_fmt = fmt.fmt;
3814
3815 #if GFX_VERx10 < 75
3816 cso->wa_flags[i] = get_wa_flags(fmt.fmt);
3817
3818 if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3819 fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3820 fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3821 fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3822 fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3823 fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3824 fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3825 fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3826 fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3827 fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3828 fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3829 actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3830 if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3831 actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3832 if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3833 actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3834 if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3835 actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3836 if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3837 actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3838 #endif
3839
3840 cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
3841
3842 switch (isl_format_get_num_channels(fmt.fmt)) {
3843 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3844 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3845 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3846 case 3:
3847 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3848 : VFCOMP_STORE_1_FP;
3849 break;
3850 }
3851 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3852 #if GFX_VER >= 6
3853 ve.EdgeFlagEnable = false;
3854 #endif
3855 ve.VertexBufferIndex = state[i].vertex_buffer_index;
3856 ve.Valid = true;
3857 ve.SourceElementOffset = state[i].src_offset;
3858 ve.SourceElementFormat = actual_fmt;
3859 ve.Component0Control = comp[0];
3860 ve.Component1Control = comp[1];
3861 ve.Component2Control = comp[2];
3862 ve.Component3Control = comp[3];
3863 #if GFX_VER < 5
3864 ve.DestinationElementOffset = i * 4;
3865 #endif
3866 }
3867
3868 #if GFX_VER == 8
3869 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3870 vi.VertexElementIndex = i;
3871 vi.InstancingEnable = state[i].instance_divisor > 0;
3872 vi.InstanceDataStepRate = state[i].instance_divisor;
3873 }
3874 #endif
3875 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3876 #if GFX_VER == 8
3877 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3878 #endif
3879 }
3880
3881 /* An alternative version of the last VE and VFI is stored so it
3882 * can be used at draw time in case Vertex Shader uses EdgeFlag
3883 */
3884 if (count) {
3885 const unsigned edgeflag_index = count - 1;
3886 const struct crocus_format_info fmt =
3887 crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3888 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3889 #if GFX_VER >= 6
3890 ve.EdgeFlagEnable = true;
3891 #endif
3892 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3893 ve.Valid = true;
3894 ve.SourceElementOffset = state[edgeflag_index].src_offset;
3895 ve.SourceElementFormat = fmt.fmt;
3896 ve.Component0Control = VFCOMP_STORE_SRC;
3897 ve.Component1Control = VFCOMP_STORE_0;
3898 ve.Component2Control = VFCOMP_STORE_0;
3899 ve.Component3Control = VFCOMP_STORE_0;
3900 }
3901 #if GFX_VER == 8
3902 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3903 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3904 * at draw time, as it should change if SGVs are emitted.
3905 */
3906 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3907 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3908 }
3909 #endif
3910 }
3911
3912 return cso;
3913 }
3914
3915 /**
3916 * The pipe->bind_vertex_elements_state() driver hook.
3917 */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* NOTE(review): a change in element count dirties the gen8 VF SGVS
    * state — SGV setup appears to depend on the element count; confirm
    * against the VF_SGVS emission code.
    */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
3933
3934 #if GFX_VER >= 6
struct crocus_streamout_counter {
   /* Byte range within the prim-count scratch buffer holding pending
    * (start, end) snapshot pairs not yet folded into accum
    * (see aggregate_stream_counter).
    */
   uint32_t offset_start;
   uint32_t offset_end;

   /* Running total of primitives written, accumulated on the CPU. */
   uint64_t accum;
};
3941
3942 /**
3943 * Gallium CSO for stream output (transform feedback) targets.
3944 */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /* BO holding the saved SO write offset (gen7+) or the prim-count
    * snapshots (gen6), plus the byte offset of our data within it.
    */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /* CPU mapping of offset_res, used to read prim-count snapshots. */
   void *prim_map;
   /* Counter snapshot from before the current streamout session
    * (consumed by the gen6 crocus_get_so_offset).
    */
   struct crocus_streamout_counter prev_count;
   /* Counter state for the current streamout session. */
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
   bool zero_offset;
#endif
};
3967
3968 #if GFX_VER >= 7
3969 static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target * so)3970 crocus_get_so_offset(struct pipe_stream_output_target *so)
3971 {
3972 struct crocus_stream_output_target *tgt = (void *)so;
3973 struct pipe_transfer *transfer;
3974 struct pipe_box box;
3975 uint32_t result;
3976 u_box_1d(tgt->offset_offset, 4, &box);
3977 void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
3978 0, PIPE_MAP_DIRECTLY,
3979 &box, &transfer);
3980 assert(val);
3981 result = *(uint32_t *)val;
3982 so->context->buffer_unmap(so->context, transfer);
3983
3984 return result / tgt->stride;
3985 }
3986 #endif
3987
3988 #if GFX_VER == 6
3989 static void
3990 compute_vertices_written_so_far(struct crocus_context *ice,
3991 struct crocus_stream_output_target *tgt,
3992 struct crocus_streamout_counter *count,
3993 uint64_t *svbi);
3994
3995 static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target * so)3996 crocus_get_so_offset(struct pipe_stream_output_target *so)
3997 {
3998 struct crocus_stream_output_target *tgt = (void *)so;
3999 struct crocus_context *ice = (void *)so->context;
4000
4001 uint64_t vert_written;
4002 compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
4003 return vert_written;
4004 }
4005 #endif
4006
4007 /**
4008 * The pipe->create_stream_output_target() driver hook.
4009 *
4010 * "Target" here refers to a destination buffer. We translate this into
4011 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
4012 * know which buffer this represents, or whether we ought to zero the
4013 * write-offsets, or append. Those are handled in the set() hook.
4014 */
4015 static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context * ctx,struct pipe_resource * p_res,unsigned buffer_offset,unsigned buffer_size)4016 crocus_create_stream_output_target(struct pipe_context *ctx,
4017 struct pipe_resource *p_res,
4018 unsigned buffer_offset,
4019 unsigned buffer_size)
4020 {
4021 struct crocus_resource *res = (void *) p_res;
4022 struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
4023 if (!cso)
4024 return NULL;
4025
4026 res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4027
4028 pipe_reference_init(&cso->base.reference, 1);
4029 pipe_resource_reference(&cso->base.buffer, p_res);
4030 cso->base.buffer_offset = buffer_offset;
4031 cso->base.buffer_size = buffer_size;
4032 cso->base.context = ctx;
4033
4034 util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4035 buffer_offset + buffer_size);
4036 #if GFX_VER >= 7
4037 struct crocus_context *ice = (struct crocus_context *) ctx;
4038 void *temp;
4039 u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
4040 &cso->offset_offset,
4041 (struct pipe_resource **)&cso->offset_res,
4042 &temp);
4043 #endif
4044
4045 return &cso->base;
4046 }
4047
4048 static void
crocus_stream_output_target_destroy(struct pipe_context * ctx,struct pipe_stream_output_target * state)4049 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4050 struct pipe_stream_output_target *state)
4051 {
4052 struct crocus_stream_output_target *cso = (void *) state;
4053
4054 pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4055 pipe_resource_reference(&cso->base.buffer, NULL);
4056
4057 free(cso);
4058 }
4059
4060 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
4061 #define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
4062
4063 #if GFX_VER == 6
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   /* If the batch still references the counter BO, flush and wait so the
    * CPU-side reads below see the GPU-written snapshot values.
    */
   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   /* Each pending entry is a pair of 64-bit prim counts; fold the deltas
    * (end - start) into the running total.
    */
   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   /* NOTE(review): this resets tgt->count even when called with
    * &tgt->prev_count — verify that is intentional.
    */
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4082
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   /* Lazily allocate a 4k scratch buffer to hold prim-count snapshots. */
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* About to run out of scratch space: fold the pending snapshots into
    * the CPU-side accumulators, which frees the whole buffer for reuse.
    */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush, then snapshot SO_NUM_PRIMS_WRITTEN into the next 8-byte slot. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4106
/* Fold any pending GPU prim-count snapshots into counter->accum, then
 * convert primitives to vertices using the last draw's output topology.
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
4118 #endif
4119 /**
4120 * The pipe->set_stream_output_targets() driver hook.
4121 *
4122 * At this point, we know which targets are bound to a particular index,
4123 * and also whether we want to append or start over. We can finish the
4124 * 3DSTATE_SO_BUFFER packets we started earlier.
4125 */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command. If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now. (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Streamout is being turned off: flush so later reads of the SO
          * buffers (e.g. as vertex or texture sources) see the results.
          */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new targets, keeping the old ones alive (old_tgt) until
    * we've finished saving their state below.
    */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

#if GFX_VER == 6
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: recover the vertex count written so far so the
             * SVBI index can continue from it.
             */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         /* Snapshot SO_NUM_PRIMS_WRITTEN once at session start. */
         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         /* Unbinding: snapshot the final prim count for the old target. */
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            /* Starting over: zero the hardware SO write offset register. */
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            /* Appending: reload the offset saved when last unbound. */
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         /* Unbinding: save the current SO write offset for later append. */
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
4258
4259 #endif
4260
4261 #if GFX_VER >= 7
4262 /**
4263 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4264 * 3DSTATE_STREAMOUT packets.
4265 *
4266 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4267 * hardware to record. We can create it entirely based on the shader, with
4268 * no dynamic state dependencies.
4269 *
4270 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4271 * state-based settings. We capture the shader-related ones here, and merge
4272 * the rest in at draw time.
4273 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct brw_vue_map *vue_map)
{
   /* Scratch decls; so_decl[stream][n] is the n'th SO_DECL for that stream. */
   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
   /* Bitmask of output buffers used, per stream. */
   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   /* Next unwritten dword offset, indexed by output buffer (not stream). */
   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   /* Number of SO_DECLs emitted so far, per stream. */
   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            /* Mask covering min(skip_components, 4) components. */
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      /* The real decl: which VUE slot to read and which components of it. */
      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* Pack 3DSTATE_STREAMOUT followed by 3DSTATE_SO_DECL_LIST into one
    * ralloc'd buffer.  The ralloc context is NULL, so the caller owns the
    * returned memory.
    */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      /* Variable-length packet: 3 header dwords + one dword pair per entry,
       * minus the standard bias of 2.
       */
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   /* Each SO_DECL_ENTRY dword pair holds one SO_DECL for each stream. */
   for (int i = 0; i < max_decls; i++) {
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
4397 #endif
4398
4399 #if GFX_VER == 6
4400 static void
crocus_emit_so_svbi(struct crocus_context * ice)4401 crocus_emit_so_svbi(struct crocus_context *ice)
4402 {
4403 struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
4404
4405 unsigned max_vertex = 0xffffffff;
4406 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4407 struct crocus_stream_output_target *tgt =
4408 (void *) ice->state.so_target[i];
4409 if (tgt)
4410 max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
4411 }
4412
4413 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4414 svbi.IndexNumber = 0;
4415 svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
4416 svbi.MaximumIndex = max_vertex;
4417 }
4418
4419 /* initialize the rest of the SVBI's to reasonable values so that we don't
4420 * run out of room writing the regular data.
4421 */
4422 for (int i = 1; i < 4; i++) {
4423 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4424 svbi.IndexNumber = i;
4425 svbi.StreamedVertexBufferIndex = 0;
4426 svbi.MaximumIndex = 0xffffffff;
4427 }
4428 }
4429 }
4430
4431 #endif
4432
4433
4434 #if GFX_VER >= 6
4435 static bool
crocus_is_drawing_points(const struct crocus_context * ice)4436 crocus_is_drawing_points(const struct crocus_context *ice)
4437 {
4438 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4439
4440 if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4441 cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
4442 return true;
4443
4444 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4445 const struct brw_gs_prog_data *gs_prog_data =
4446 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4447 return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4448 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4449 const struct brw_tes_prog_data *tes_data =
4450 (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4451 return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
4452 } else {
4453 return ice->state.prim_mode == PIPE_PRIM_POINTS;
4454 }
4455 }
4456 #endif
4457
4458 #if GFX_VER >= 6
/**
 * Compute the SF_OUTPUT_ATTRIBUTE_DETAIL (attribute override) for a single
 * fragment-shader input: which VUE slot the SF should source it from,
 * whether any components must be replaced with constants, and whether
 * two-sided color swizzling applies.  Bumps *max_source_attr when this
 * attribute reads a higher VUE slot than any seen so far.
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct brw_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      /* Only override the Layer (Y) / Viewport (Z) components that were
       * not actually written by an earlier stage.
       */
      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it.  This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined.  Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot.  In every other case, the
       * attribute override we supply doesn't matter.  So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4552
/**
 * Build the SF/SBE attribute override table for every FS input, plus the
 * point-sprite enable bitmask and the URB vertex entry read offset/length.
 *
 * \param attr_overrides       out: per-input override array (first 16 inputs)
 * \param point_sprite_enables out: bitmask of inputs replaced by point coords
 * \param urb_entry_read_length out: URB read length in 256-bit units
 * \param urb_entry_read_offset out: URB read offset in 256-bit units
 */
static void
calculate_attr_overrides(
   const struct crocus_context *ice,
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
   uint32_t *point_sprite_enables,
   uint32_t *urb_entry_read_length,
   uint32_t *urb_entry_read_offset)
{
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   uint32_t max_source_attr = 0;
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   /* Skip VUE slots before the first one the fragment shader reads. */
   int first_slot =
      brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);

   /* Each URB offset packs two varying slots */
   assert(first_slot % 2 == 0);
   *urb_entry_read_offset = first_slot / 2;
   *point_sprite_enables = 0;

   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
      const int input_index = wm_prog_data->urb_setup[fs_attr];

      /* Negative means the FS doesn't read this varying. */
      if (input_index < 0)
         continue;

      /* Point-sprite inputs get hardware-generated texcoords, so they
       * need no VUE-sourced override -- just the enable bit.
       */
      bool point_sprite = false;
      if (crocus_is_drawing_points(ice)) {
         if (fs_attr >= VARYING_SLOT_TEX0 &&
             fs_attr <= VARYING_SLOT_TEX7 &&
             cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
            point_sprite = true;

         if (fs_attr == VARYING_SLOT_PNTC)
            point_sprite = true;

         if (point_sprite)
            *point_sprite_enables |= 1U << input_index;
      }

      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
      if (!point_sprite) {
         get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
                           cso_rast->cso.light_twoside, &max_source_attr);
      }

      /* The hardware can only do the overrides on 16 overrides at a
       * time, and the other up to 16 have to be lined up so that the
       * input index = the output index.  We'll need to do some
       * tweaking to make sure that's the case.
       */
      if (input_index < 16)
         attr_overrides[input_index] = attribute;
      else
         assert(attribute.SourceAttribute == input_index);
   }

   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
    *
    * "This field should be set to the minimum length required to read the
    *  maximum source attribute.  The maximum source attribute is indicated
    *  by the maximum value of the enabled Attribute # Source Attribute if
    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
    *  enable is not set.
    *  read_length = ceiling((max_source_attr + 1) / 2)
    *
    *  [errata] Corruption/Hang possible if length programmed larger than
    *  recommended"
    *
    * Similar text exists for Ivy Bridge.
    */
   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
}
4631 #endif
4632
4633 #if GFX_VER >= 7
/**
 * Emit 3DSTATE_SBE (setup backend) with the FS attribute overrides,
 * point-sprite enables, and URB read offset/length computed by
 * calculate_attr_overrides().
 */
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
   /* Before Gen8 the overrides live directly inside the 3DSTATE_SBE packet;
    * alias the name so calculate_attr_overrides() fills them in place.
    * NOTE(review): this #define is never #undef'd, so it leaks past this
    * function -- confirm nothing later in the file reuses the identifier.
    */
#define attr_overrides sbe.Attribute
#endif

   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;

      /* Fills attr_overrides (either the local array or sbe.Attribute). */
      calculate_attr_overrides(ice,
                               attr_overrides,
                               &point_sprite_enables,
                               &urb_entry_read_length,
                               &urb_entry_read_offset);
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#endif
   }
#if GFX_VER >= 8
   /* On Gen8+ the override table goes in a separate 3DSTATE_SBE_SWIZ packet.
    * NOTE(review): crocus targets Gen4-7.5, so this branch appears dead --
    * presumably retained from the iris copy it derives from.
    */
   crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
   }
#endif
}
4676 #endif
4677
4678 /* ------------------------------------------------------------------- */
4679
4680 /**
4681 * Populate VS program key fields based on the current state.
4682 */
4683 static void
crocus_populate_vs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct brw_vs_prog_key * key)4684 crocus_populate_vs_key(const struct crocus_context *ice,
4685 const struct shader_info *info,
4686 gl_shader_stage last_stage,
4687 struct brw_vs_prog_key *key)
4688 {
4689 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4690
4691 if (info->clip_distance_array_size == 0 &&
4692 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4693 last_stage == MESA_SHADER_VERTEX)
4694 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4695
4696 if (last_stage == MESA_SHADER_VERTEX &&
4697 info->outputs_written & (VARYING_BIT_PSIZ))
4698 key->clamp_pointsize = 1;
4699
4700 #if GFX_VER <= 5
4701 key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4702 cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4703 key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4704 #endif
4705
4706 key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
4707
4708 #if GFX_VERx10 < 75
4709 uint64_t inputs_read = info->inputs_read;
4710 int ve_idx = 0;
4711 while (inputs_read) {
4712 int i = u_bit_scan64(&inputs_read);
4713 key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4714 ve_idx++;
4715 }
4716 #endif
4717 }
4718
/**
 * Populate TCS program key fields based on the current state.
 *
 * The TCS key currently has no state-dependent fields in this driver,
 * so there is nothing to fill in.
 */
static void
crocus_populate_tcs_key(const struct crocus_context *ice,
                        struct brw_tcs_prog_key *key)
{
}
4727
/**
 * Populate TES program key fields based on the current state.
 *
 * Mirrors crocus_populate_vs_key(): clip-plane lowering and point-size
 * clamping apply only when the TES is the last pre-rasterizer stage.
 */
static void
crocus_populate_tes_key(const struct crocus_context *ice,
                        const struct shader_info *info,
                        gl_shader_stage last_stage,
                        struct brw_tes_prog_key *key)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

   /* Lower user clip planes here only if the shader doesn't write
    * gl_ClipDistance itself and no later stage will handle it.
    */
   if (info->clip_distance_array_size == 0 &&
       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
       last_stage == MESA_SHADER_TESS_EVAL)
      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;

   if (last_stage == MESA_SHADER_TESS_EVAL &&
       info->outputs_written & (VARYING_BIT_PSIZ))
      key->clamp_pointsize = 1;
}
4748
/**
 * Populate GS program key fields based on the current state.
 *
 * Mirrors crocus_populate_vs_key(): clip-plane lowering and point-size
 * clamping apply only when the GS is the last pre-rasterizer stage.
 */
static void
crocus_populate_gs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       gl_shader_stage last_stage,
                       struct brw_gs_prog_key *key)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

   /* Lower user clip planes here only if the shader doesn't write
    * gl_ClipDistance itself and no later stage will handle it.
    */
   if (info->clip_distance_array_size == 0 &&
       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
       last_stage == MESA_SHADER_GEOMETRY)
      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;

   if (last_stage == MESA_SHADER_GEOMETRY &&
       info->outputs_written & (VARYING_BIT_PSIZ))
      key->clamp_pointsize = 1;
}
4769
/**
 * Populate FS program key fields based on the current state.
 *
 * Gathers framebuffer, depth/stencil/alpha, rasterizer, and blend state
 * into the brw_wm_prog_key used for shader variant lookup/compilation.
 */
static void
crocus_populate_fs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       struct brw_wm_prog_key *key)
{
   struct crocus_screen *screen = (void *) ice->ctx.screen;
   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
   const struct crocus_blend_state *blend = ice->state.cso_blend;

#if GFX_VER < 6
   /* Pre-Gen6: build the BRW_WM_IZ_* lookup bitmask describing the
    * depth/stencil/kill configuration the WM variant must match.
    */
   uint32_t lookup = 0;

   if (info->fs.uses_discard || zsa->cso.alpha_enabled)
      lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;

   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;

   if (fb->zsbuf && zsa->cso.depth_enabled) {
      lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;

      if (zsa->cso.depth_writemask)
         lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;

   }
   if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
      lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
      if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
         lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
   }
   key->iz_lookup = lookup;
   key->stats_wm = ice->state.stats_wm;
#endif

   /* Classify line antialiasing as never/always/sometimes needed: with
    * line-smoothing on, triangles drawn in line fill mode may or may not
    * produce AA lines depending on which faces are culled.
    */
   uint32_t line_aa = BRW_WM_AA_NEVER;
   if (rast->cso.line_smooth) {
      int reduced_prim = ice->state.reduced_prim_mode;
      if (reduced_prim == PIPE_PRIM_LINES)
         line_aa = BRW_WM_AA_ALWAYS;
      else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
         if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
            line_aa = BRW_WM_AA_SOMETIMES;

            if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
                rast->cso.cull_face == PIPE_FACE_BACK)
               line_aa = BRW_WM_AA_ALWAYS;
         } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
            line_aa = BRW_WM_AA_SOMETIMES;

            if (rast->cso.cull_face == PIPE_FACE_FRONT)
               line_aa = BRW_WM_AA_ALWAYS;
         }
      }
   }
   key->line_aa = line_aa;

   key->nr_color_regions = fb->nr_cbufs;

   key->clamp_fragment_color = rast->cso.clamp_fragment_color;

   key->alpha_to_coverage = blend->cso.alpha_to_coverage;

   /* With multiple render targets and alpha test, use RT0's alpha for all. */
   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;

   key->flat_shade = rast->cso.flatshade &&
      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));

   key->persample_interp = rast->cso.force_persample_interp;
   key->multisample_fbo = rast->cso.multisample && fb->samples > 1;

   key->ignore_sample_mask_out = !key->multisample_fbo;
   key->coherent_fb_fetch = false; // TODO: needed?

   key->force_dual_color_blend =
      screen->driconf.dual_color_blend_by_location &&
      (blend->blend_enables & 1) && blend->dual_color_blending;

#if GFX_VER <= 5
   /* Have the shader perform the alpha test itself when multiple color
    * buffers are bound together with alpha testing.
    */
   if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
      key->emit_alpha_test = true;
      key->alpha_test_func = zsa->cso.alpha_func;
      key->alpha_test_ref = zsa->cso.alpha_ref_value;
   }
#endif
}
4860
/**
 * Populate CS program key fields based on the current state.
 *
 * The CS key currently has no state-dependent fields in this driver.
 */
static void
crocus_populate_cs_key(const struct crocus_context *ice,
                       struct brw_cs_prog_key *key)
{
}
4866
/* Kernel start pointer for fixed-function shader packets.  On Gen4 the
 * kernel address is a relocation into the shader cache BO; on Gen5+ it is
 * a plain offset into the cache.
 *
 * Fix: the Gen4 macro previously ended with a stray semicolon, which
 * double-terminated statements and would break any expression-context use.
 * Since every call site must also compile against the Gen5+ function
 * variant, all of them already supply their own semicolon.
 */
#if GFX_VER == 4
#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset)
#elif GFX_VER >= 5
static uint64_t
KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
{
   return shader->offset;
}
#endif
4876
/* NOTE(review): comment inherited from iris -- Gen11 workaround table #2056
 * WABTPPrefetchDisable suggests disabling binding table prefetch on A0/B0
 * steppings.  crocus only targets Gen4-7.5, so this likely doesn't apply
 * here; kept for reference.
 *
 * TODO: Fill out SamplerCount for prefetching?
 */

/* Fill the fields common to the fixed-function 3DSTATE_VS/HS/DS/GS packets:
 * kernel address, binding table size, URB read length/offset, and scratch
 * space for register spilling.  Expects `ice`, `shader`, `prog_data`, and
 * `vue_prog_data` to be in scope at the expansion site.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
   pkt.KernelStartPointer = KSP(ice, shader);                             \
   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
   pkt.FloatingPointMode = prog_data->use_alt_mode;                       \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      prog_data->dispatch_grf_start_reg;                                  \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
                                                                          \
   pkt.StatisticsEnable = true;                                           \
   pkt.Enable = true;                                                     \
                                                                          \
   if (prog_data->total_scratch) {                                        \
      struct crocus_bo *bo =                                              \
         crocus_get_scratch_space(ice, prog_data->total_scratch, stage);  \
      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;     \
      pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                         \
   }
4903
4904 /* ------------------------------------------------------------------- */
4905 #if GFX_VER >= 6
/* 3DSTATE_CONSTANT_* command sub-opcodes, indexed by shader stage.
 * Compute is 0 because its push constants don't use these packets.
 * NOTE(review): values presumably match the 3DSTATE_CONSTANT_{VS,HS,DS,GS,PS}
 * opcodes in genxml -- confirm there.
 */
static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX] = 21,
   [MESA_SHADER_TESS_CTRL] = 25, /* HS */
   [MESA_SHADER_TESS_EVAL] = 26, /* DS */
   [MESA_SHADER_GEOMETRY] = 22,
   [MESA_SHADER_FRAGMENT] = 23,
   [MESA_SHADER_COMPUTE] = 0,
};
4914 #endif
4915
4916 static void
emit_sized_null_surface(struct crocus_batch * batch,unsigned width,unsigned height,unsigned layers,unsigned levels,unsigned minimum_array_element,uint32_t * out_offset)4917 emit_sized_null_surface(struct crocus_batch *batch,
4918 unsigned width, unsigned height,
4919 unsigned layers, unsigned levels,
4920 unsigned minimum_array_element,
4921 uint32_t *out_offset)
4922 {
4923 struct isl_device *isl_dev = &batch->screen->isl_dev;
4924 uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4925 isl_dev->ss.align,
4926 out_offset);
4927 //TODO gen 6 multisample crash
4928 isl_null_fill_state(isl_dev, surf,
4929 .size = isl_extent3d(width, height, layers),
4930 .levels = levels,
4931 .minimum_array_element = minimum_array_element);
4932 }
/* Emit a minimal 1x1x1 null surface (used when nothing real is bound). */
static void
emit_null_surface(struct crocus_batch *batch,
                  uint32_t *out_offset)
{
   emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
}
4939
/**
 * Emit a null surface sized to match the current framebuffer, so that
 * writes to unbound render targets behave consistently with bound ones.
 */
static void
emit_null_fb_surface(struct crocus_batch *batch,
                     struct crocus_context *ice,
                     uint32_t *out_offset)
{
   uint32_t width, height, layers, level, layer;
   /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
   if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
      emit_null_surface(batch, out_offset);
      return;
   }

   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   width = MAX2(cso->width, 1);
   height = MAX2(cso->height, 1);
   layers = cso->layers ? cso->layers : 1;
   level = 0;
   layer = 0;

   /* Depth-only FBO: take the dimensions from the depth/stencil surface. */
   if (cso->nr_cbufs == 0 && cso->zsbuf) {
      width = cso->zsbuf->width;
      height = cso->zsbuf->height;
      level = cso->zsbuf->u.tex.level;
      layer = cso->zsbuf->u.tex.first_layer;
   }
   /* NOTE(review): `level` (a mip level index) is passed as the `levels`
    * count and `layer` as `minimum_array_element` -- looks deliberate for
    * null-surface sizing, but worth confirming against
    * emit_sized_null_surface()'s parameters.
    */
   emit_sized_null_surface(batch, width, height,
                           layers, level, layer,
                           out_offset);
}
4969
/**
 * Fill a SURFACE_STATE (already allocated at surf_state / addr_offset in
 * the batch state stream) describing @res through @in_view.
 *
 * When adjust_surf is set, 3D, Gen4 cube, and 1D-array surfaces are
 * rewritten to a single-image view that the hardware can address directly.
 * blend_enable/write_disables are only used on Gen5 and earlier, where
 * they live in the surface state itself.
 */
static void
emit_surface_state(struct crocus_batch *batch,
                   struct crocus_resource *res,
                   const struct isl_surf *in_surf,
                   bool adjust_surf,
                   struct isl_view *in_view,
                   bool writeable,
                   enum isl_aux_usage aux_usage,
                   bool blend_enable,
                   uint32_t write_disables,
                   uint32_t *surf_state,
                   uint32_t addr_offset)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t reloc = RELOC_32BIT;
   uint64_t offset_B = res->offset;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (writeable)
      reloc |= RELOC_WRITE;

   /* Work on copies: the adjustments below may rewrite the surf/view to
    * point at a single image inside the resource.
    */
   struct isl_surf surf = *in_surf;
   struct isl_view view = *in_view;
   if (adjust_surf) {
      if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
         /* Single-slice view of a 3D texture: rebase onto that slice. */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, 0,
                                 view.base_array_layer,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
         /* Gen4 cube surfaces: rebase onto the selected face/level. */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, view.base_array_layer,
                                 0,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
         surf.dim = ISL_SURF_DIM_2D;
   }

   /* Pick up auxiliary (compression) surface info, if any. */
   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
   struct crocus_bo *aux_bo = NULL;
   uint32_t aux_offset = 0;
   struct isl_surf *aux_surf = NULL;
   if (aux_usage != ISL_AUX_USAGE_NONE) {
      aux_surf = &res->aux.surf;
      aux_offset = res->aux.offset;
      aux_bo = res->aux.bo;

      clear_color = crocus_resource_get_clear_color(res);
   }

   isl_surf_fill_state(isl_dev, surf_state,
                       .surf = &surf,
                       .view = &view,
                       /* The main surface address is relocated in place. */
                       .address = crocus_state_reloc(batch,
                                                     addr_offset + isl_dev->ss.addr_offset,
                                                     res->bo, offset_B, reloc),
                       .aux_surf = aux_surf,
                       .aux_usage = aux_usage,
                       .aux_address = aux_offset,
                       .mocs = crocus_mocs(res->bo, isl_dev),
                       .clear_color = clear_color,
                       .use_clear_address = false,
                       .clear_address = 0,
                       .x_offset_sa = tile_x_sa,
                       .y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
                       .blend_enable = blend_enable,
                       .write_disables = write_disables,
#endif
                       );

   if (aux_surf) {
      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
       * contain other control information.  Since buffer addresses are always
       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
       * an ordinary reloc to do the necessary address translation.
       *
       * FIXME: move to the point of assignment.
       */
      if (GFX_VER == 8) {
         uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      } else {
         uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      }
   }

}
5072
5073 static uint32_t
emit_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables)5074 emit_surface(struct crocus_batch *batch,
5075 struct crocus_surface *surf,
5076 enum isl_aux_usage aux_usage,
5077 bool blend_enable,
5078 uint32_t write_disables)
5079 {
5080 struct isl_device *isl_dev = &batch->screen->isl_dev;
5081 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5082 struct isl_view *view = &surf->view;
5083 uint32_t offset = 0;
5084 enum pipe_texture_target target = res->base.b.target;
5085 bool adjust_surf = false;
5086
5087 if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
5088 adjust_surf = true;
5089
5090 if (surf->align_res)
5091 res = (struct crocus_resource *)surf->align_res;
5092
5093 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5094
5095 emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5096 aux_usage, blend_enable,
5097 write_disables,
5098 surf_state, offset);
5099 return offset;
5100 }
5101
5102 static uint32_t
emit_rt_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage)5103 emit_rt_surface(struct crocus_batch *batch,
5104 struct crocus_surface *surf,
5105 enum isl_aux_usage aux_usage)
5106 {
5107 struct isl_device *isl_dev = &batch->screen->isl_dev;
5108 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5109 struct isl_view *view = &surf->read_view;
5110 uint32_t offset = 0;
5111 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5112
5113 emit_surface_state(batch, res, &surf->surf, true, view, false,
5114 aux_usage, 0, false,
5115 surf_state, offset);
5116 return offset;
5117 }
5118
/**
 * Upload a raw-buffer SURFACE_STATE pointing at the 12-byte compute grid
 * size buffer (three dwords -- presumably the x/y/z workgroup counts for
 * gl_NumWorkGroups; confirm against the dispatch code).  Returns the
 * surface state's offset in the batch state stream.
 */
static uint32_t
emit_grid(struct crocus_context *ice,
          struct crocus_batch *batch)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   struct crocus_state_ref *grid_ref = &ice->state.grid_size;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(grid_ref->res),
                                                       grid_ref->offset,
                                                       RELOC_32BIT),
                         .size_B = 12, /* 3 dwords */
                         .format = ISL_FORMAT_RAW,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
   return offset;
}
5139
5140 static uint32_t
emit_ubo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_constant_buffer * buffer)5141 emit_ubo_buffer(struct crocus_context *ice,
5142 struct crocus_batch *batch,
5143 struct pipe_constant_buffer *buffer)
5144 {
5145 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5146 uint32_t offset = 0;
5147
5148 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5149 isl_dev->ss.align, &offset);
5150 isl_buffer_fill_state(isl_dev, surf_state,
5151 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5152 crocus_resource_bo(buffer->buffer),
5153 buffer->buffer_offset,
5154 RELOC_32BIT),
5155 .size_B = buffer->buffer_size,
5156 .format = 0,
5157 .swizzle = ISL_SWIZZLE_IDENTITY,
5158 .stride_B = 1,
5159 .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5160
5161 return offset;
5162 }
5163
static uint32_t
emit_ssbo_buffer(struct crocus_context *ice,
                 struct crocus_batch *batch,
                 struct pipe_shader_buffer *buffer, bool writeable)
{
   /* Stream a RAW-format buffer SURFACE_STATE for a shader storage buffer
    * and return its offset.  Writable SSBOs get a RELOC_WRITE relocation so
    * the kernel knows the BO may be written by the GPU.
    */
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   uint32_t reloc = RELOC_32BIT;

   if (writeable)
      reloc |= RELOC_WRITE;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       reloc),
                         .size_B = buffer->buffer_size,
                         .format = ISL_FORMAT_RAW,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}
5190
static uint32_t
emit_sampler_view(struct crocus_context *ice,
                  struct crocus_batch *batch,
                  bool for_gather,
                  struct crocus_sampler_view *isv)
{
   /* Stream a SURFACE_STATE for a texture sampler view and return its
    * offset.  Buffer textures take the isl_buffer_fill_state path; images
    * go through emit_surface_state, using the gather-specific view when
    * for_gather is set.
    */
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);

   if (isv->base.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the smallest of: requested size, what's left in the BO,
       * and the per-generation texture buffer limit (in texels * cpp).
       */
      unsigned final_size =
         MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          isv->res->bo,
                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
                            .size_B = final_size,
                            .format = isv->view.format,
                            .swizzle = isv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
         );
   } else {
      enum isl_aux_usage aux_usage =
         crocus_resource_texture_aux_usage(isv->res);

      emit_surface_state(batch, isv->res, &isv->res->surf, false,
                         for_gather ? &isv->gather_view : &isv->view,
                         false, aux_usage, false,
                         0, surf_state, offset);
   }
   return offset;
}
5230
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   /* Stream a SURFACE_STATE for a shader image and return its offset.
    * Three cases: buffer images, typeless (RAW) image views of a texture
    * BO, and ordinary typed image surfaces.
    */
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   /* Writable images need a write relocation for the kernel. */
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the smallest of: requested size, remaining BO space past
       * the view offset, and the texture-buffer limit (texels * cpp).
       */
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         /* Untyped access: expose the whole BO from the resource offset as
          * a raw buffer.
          */
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );
      } else {
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}
5284
5285 #if GFX_VER == 6
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   /* Gen6 stream output (transform feedback) writes through binding table
    * surfaces.  Build the SURFACE_STATE for output slot @idx and return
    * its offset, or 0 when the slot is unused or streamout is inactive.
    */
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   /* Gen6 only supports stream 0. */
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   // assert((size_dwords - offset_dwords) / stride_dwords
   //        <= BRW_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer. We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow. But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* One R32-based float format per output width (1..4 components). */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
5363 #endif
5364
/* Iterate @index over every binding table slot of surface group @group
 * that is actually in use.  Relies on a `struct crocus_binding_table *bt`
 * being in scope at the point of use.
 */
#define foreach_surface_used(index, group) \
   for (int index = 0; index < bt->sizes[group]; index++) \
      if (crocus_group_index_to_bti(bt, group, index) != \
          CROCUS_SURFACE_NOT_USED)
5369
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   /* Fill in shader->surf_offset[] with surface state offsets for every
    * binding table entry this shader uses, in binding table order:
    * render targets (FS), RT-read surfaces, CS work groups, Gen6 SOL
    * surfaces (GS), textures, gather textures, images, UBOs, SSBOs.
    *
    * ff_gs selects the fixed-function GS program, which has no
    * crocus_shader_state (shs == NULL — it must not reach the
    * texture/image/UBO/SSBO loops below).
    */
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;                                /* running binding table slot */
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            /* Gen4/5 bake the color write mask and blend enable into the
             * surface state rather than blend state.
             */
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers bound: a single null render target. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      /* Surfaces for reading back render targets (e.g. blending reads). */
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* Stream output info comes from the GS when present, otherwise the
       * VS feeds streamout directly.
       */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Separate gather-view surfaces for textureGather() on older gens. */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5502 /* ------------------------------------------------------------------- */
5503 static uint32_t
crocus_upload_binding_table(struct crocus_context * ice,struct crocus_batch * batch,uint32_t * table,uint32_t size)5504 crocus_upload_binding_table(struct crocus_context *ice,
5505 struct crocus_batch *batch,
5506 uint32_t *table,
5507 uint32_t size)
5508
5509 {
5510 if (size == 0)
5511 return 0;
5512 return emit_state(batch, table, size, 32);
5513 }
5514
5515 /**
5516 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5517 */
5518
static void
crocus_update_surface_base_address(struct crocus_batch *batch)
{
   /* Emit STATE_BASE_ADDRESS once per batch so surface/dynamic state
    * offsets resolve against the batch's state BO.  No-op if it has
    * already been emitted for this batch.
    */
   if (batch->state_base_address_emitted)
      return;

   UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;

   /* SBA changes require surrounding flushes — see the helpers. */
   flush_before_state_base_change(batch);

   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
      /* Set base addresses */
      sba.GeneralStateBaseAddressModifyEnable = true;

#if GFX_VER >= 6
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
#endif

      sba.SurfaceStateBaseAddressModifyEnable = true;
      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);

      sba.IndirectObjectBaseAddressModifyEnable = true;

#if GFX_VER >= 5
      /* Kernels are stored in the shader cache BO. */
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
#endif

      /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
#if GFX_VER == 8
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.InstructionBufferSize = 0xfffff;
      sba.DynamicStateBufferSize = MAX_STATE_SIZE;

      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#else
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.IndirectObjectAccessUpperBoundModifyEnable = true;

#if GFX_VER >= 5
      sba.InstructionAccessUpperBoundModifyEnable = true;
#endif

#if GFX_VER >= 6
      /* Dynamic state upper bound.  Although the documentation says that
       * programming it to zero will cause it to be ignored, that is a lie.
       * If this isn't programmed to a real bound, the sampler border color
       * pointer is rejected, causing border color to mysteriously fail.
       */
      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
#else
      /* Same idea but using General State Base Address on Gen4-5 */
      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#endif

#if GFX_VER >= 6
      /* The hardware appears to pay attention to the MOCS fields even
       * if you don't set the "Address Modify Enable" bit for the base.
       */
      sba.GeneralStateMOCS = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
      sba.DynamicStateMOCS = mocs;
      sba.IndirectObjectMOCS = mocs;
      sba.InstructionMOCS = mocs;
      sba.SurfaceStateMOCS = mocs;
#endif
   }

   flush_after_state_base_change(batch);

   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
    * 3DSTATE_PIPELINE_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
    * 1 part 1 says that the folowing packets must be reissued:
    *
    * 3DSTATE_CC_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * 3DSTATE_SAMPLER_STATE_POINTERS
    * 3DSTATE_VIEWPORT_STATE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * Those are always reissued following SBA updates anyway (new
    * batch time), except in the case of the program cache BO
    * changing.  Having a separate state flag makes the sequence more
    * obvious.
    */
#if GFX_VER <= 5
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#elif GFX_VER == 6
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
#endif
   batch->state_base_address_emitted = true;
}
5624
5625 static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state * vp,bool halfz,bool window_space_position,float * zmin,float * zmax)5626 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5627 bool window_space_position, float *zmin, float *zmax)
5628 {
5629 if (window_space_position) {
5630 *zmin = 0.f;
5631 *zmax = 1.f;
5632 return;
5633 }
5634 util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
5635 }
5636
/* Collected push-constant ranges for one shader stage, filled in by
 * setup_constant_buffers() and consumed by emit_push_constant_packets().
 */
struct push_bos {
   struct {
      /* Start address of the range within its BO. */
      struct crocus_address addr;
      /* Range length, in the units used by brw_ubo_range (32-byte
       * registers — the "read length sums to at most 64" assert below
       * presumes this; confirm against brw_compiler).
       */
      uint32_t length;
   } buffers[4];
   /* Number of valid entries in buffers[]. */
   int buffer_count;
   /* Largest single range length seen. */
   uint32_t max_length;
};
5645
5646 #if GFX_VER >= 6
static void
setup_constant_buffers(struct crocus_context *ice,
                       struct crocus_batch *batch,
                       int stage,
                       struct push_bos *push_bos)
{
   /* Gather the compiler's UBO push ranges for @stage into @push_bos:
    * for each non-empty range, record its BO address and length so
    * emit_push_constant_packets() can program 3DSTATE_CONSTANT_XS.
    */
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;

   uint32_t push_range_sum = 0;

   int n = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];

      if (range->length == 0)
         continue;

      push_range_sum += range->length;

      if (range->length > push_bos->max_length)
         push_bos->max_length = range->length;

      /* Range block is a binding table index, map back to UBO index. */
      unsigned block_index = crocus_bti_to_group_index(
         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
      assert(block_index != CROCUS_SURFACE_NOT_USED);

      struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
      struct crocus_resource *res = (void *) cbuf->buffer;

      assert(cbuf->buffer_offset % 32 == 0);

      /* No bound buffer: point at the workaround BO instead so the
       * hardware still has a valid address to read from.
       */
      push_bos->buffers[n].length = range->length;
      push_bos->buffers[n].addr =
         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
         : ro_bo(batch->ice->workaround_bo,
                 batch->ice->workaround_offset);
      n++;
   }

   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
    *
    *    "The sum of all four read length fields must be less than or
    *    equal to the size of 64."
    */
   assert(push_range_sum <= 64);

   push_bos->buffer_count = n;
}
5698
5699 #if GFX_VER == 7
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   /* Ivybridge workaround: a depth-stall pipe control with a dummy
    * immediate write to the workaround BO, required before changing VS
    * constant state (emitted by the 3DSTATE_CONSTANT_VS path below).
    */
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
5710 #endif
5711
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   /* Emit the 3DSTATE_CONSTANT_XS packet for @stage, programming the
    * push-constant buffer addresses and read lengths collected in
    * @push_bos.  The packet opcode is selected per-stage via
    * push_constant_opcodes[].
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
   UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);

#if GFX_VER == 7
   /* IVB requires a flush workaround before touching VS constants. */
   if (stage == MESA_SHADER_VERTEX) {
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
#if GFX_VER != 8
      /* MOCS is MBZ on Gen8 so we skip it there */
      pkt.ConstantBody.MOCS = mocs;
#endif

      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6: a single constant buffer slot; read length is encoded
       * minus one.
       */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}
5774
5775 #endif
5776
/* Per-generation container for depth/stencil state: Gen8 packs it into the
 * 3DSTATE_WM_DEPTH_STENCIL command, Gen6-7 use a separate
 * DEPTH_STENCIL_STATE structure, and Gen4-5 fold it into COLOR_CALC_STATE.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
#endif
5784
static inline void
set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
{
   /* Translate the bound Gallium depth/stencil/alpha CSO into the
    * generation-appropriate GENXML depth/stencil fields.  stencil[0] is
    * the front face, stencil[1] the back face.
    */
   struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
   ds->DepthTestEnable = cso->cso.depth_enabled;
   ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
   ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);

   ds->StencilFailOp = cso->cso.stencil[0].fail_op;
   ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
   ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
   ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);

   ds->StencilTestMask = cso->cso.stencil[0].valuemask;
   ds->StencilWriteMask = cso->cso.stencil[0].writemask;

   ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
   ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
   ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
   ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);

   ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
   ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
   ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
   ds->StencilTestEnable = cso->cso.stencil[0].enabled;
   /* Only enable stencil writes if some face's writemask can change bits. */
   ds->StencilBufferWriteEnable =
      cso->cso.stencil[0].writemask != 0 ||
      (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
}
5814
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   /* Pack one VERTEX_BUFFER_STATE element at *map and advance *map past
    * it.  Gen8 describes the buffer by size; Gen4-7 by an inclusive end
    * address instead, plus an access type derived from the instancing
    * step rate.
    */
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      /* End address is inclusive on Gen5-7, hence the - 1. */
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5849
5850 #if GFX_VER >= 6
5851 static uint32_t
determine_sample_mask(struct crocus_context * ice)5852 determine_sample_mask(struct crocus_context *ice)
5853 {
5854 uint32_t num_samples = ice->state.framebuffer.samples;
5855
5856 if (num_samples <= 1)
5857 return 1;
5858
5859 uint32_t fb_mask = (1 << num_samples) - 1;
5860 return ice->state.sample_mask & fb_mask;
5861 }
5862 #endif
5863
5864 static void
crocus_upload_dirty_render_state(struct crocus_context * ice,struct crocus_batch * batch,const struct pipe_draw_info * draw)5865 crocus_upload_dirty_render_state(struct crocus_context *ice,
5866 struct crocus_batch *batch,
5867 const struct pipe_draw_info *draw)
5868 {
5869 uint64_t dirty = ice->state.dirty;
5870 uint64_t stage_dirty = ice->state.stage_dirty;
5871
5872 if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5873 !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5874 return;
5875
5876 if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5877 crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5878 vf.StatisticsEnable = true;
5879 }
5880 }
5881
5882 #if GFX_VER <= 5
5883 if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5884 CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5885 bool ret = calculate_curbe_offsets(batch);
5886 if (ret) {
5887 dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5888 stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5889 }
5890 }
5891
5892 if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5893 stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5894 bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5895 brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5896 ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5897 if (ret) {
5898 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5899 stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5900 }
5901 }
5902 #endif
5903 if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5904 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5905 uint32_t cc_vp_address;
5906
5907 /* XXX: could avoid streaming for depth_clip [0,1] case. */
5908 uint32_t *cc_vp_map =
5909 stream_state(batch,
5910 4 * ice->state.num_viewports *
5911 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5912 for (int i = 0; i < ice->state.num_viewports; i++) {
5913 float zmin, zmax;
5914 crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5915 ice->state.window_space_position,
5916 &zmin, &zmax);
5917 if (cso_rast->cso.depth_clip_near)
5918 zmin = 0.0;
5919 if (cso_rast->cso.depth_clip_far)
5920 zmax = 1.0;
5921
5922 crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5923 ccv.MinimumDepth = zmin;
5924 ccv.MaximumDepth = zmax;
5925 }
5926
5927 cc_vp_map += GENX(CC_VIEWPORT_length);
5928 }
5929
5930 #if GFX_VER >= 7
5931 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5932 ptr.CCViewportPointer = cc_vp_address;
5933 }
5934 #elif GFX_VER == 6
5935 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5936 vp.CCViewportStateChange = 1;
5937 vp.PointertoCC_VIEWPORT = cc_vp_address;
5938 }
5939 #else
5940 ice->state.cc_vp_address = cc_vp_address;
5941 dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5942 #endif
5943 }
5944
5945 if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5946 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5947 #if GFX_VER >= 7
5948 uint32_t sf_cl_vp_address;
5949 uint32_t *vp_map =
5950 stream_state(batch,
5951 4 * ice->state.num_viewports *
5952 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5953 #else
5954 uint32_t *vp_map =
5955 stream_state(batch,
5956 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5957 32, &ice->state.sf_vp_address);
5958 uint32_t *clip_map =
5959 stream_state(batch,
5960 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5961 32, &ice->state.clip_vp_address);
5962 #endif
5963
5964 for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5965 const struct pipe_viewport_state *state = &ice->state.viewports[i];
5966 float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5967
5968 #if GFX_VER == 8
5969 float vp_xmin = viewport_extent(state, 0, -1.0f);
5970 float vp_xmax = viewport_extent(state, 0, 1.0f);
5971 float vp_ymin = viewport_extent(state, 1, -1.0f);
5972 float vp_ymax = viewport_extent(state, 1, 1.0f);
5973 #endif
5974 intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5975 state->scale[0], state->scale[1],
5976 state->translate[0], state->translate[1],
5977 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5978 #if GFX_VER >= 7
5979 crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5980 #else
5981 crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5982 #endif
5983 {
5984 vp.ViewportMatrixElementm00 = state->scale[0];
5985 vp.ViewportMatrixElementm11 = state->scale[1];
5986 vp.ViewportMatrixElementm22 = state->scale[2];
5987 vp.ViewportMatrixElementm30 = state->translate[0];
5988 vp.ViewportMatrixElementm31 = state->translate[1];
5989 vp.ViewportMatrixElementm32 = state->translate[2];
5990 #if GFX_VER < 6
5991 struct pipe_scissor_state scissor;
5992 crocus_fill_scissor_rect(ice, 0, &scissor);
5993 vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5994 vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5995 vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5996 vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
5997 #endif
5998
5999 #if GFX_VER >= 7
6000 vp.XMinClipGuardband = gb_xmin;
6001 vp.XMaxClipGuardband = gb_xmax;
6002 vp.YMinClipGuardband = gb_ymin;
6003 vp.YMaxClipGuardband = gb_ymax;
6004 #endif
6005 #if GFX_VER == 8
6006 vp.XMinViewPort = MAX2(vp_xmin, 0);
6007 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6008 vp.YMinViewPort = MAX2(vp_ymin, 0);
6009 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6010 #endif
6011 }
6012 #if GFX_VER < 7
6013 crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6014 clip.XMinClipGuardband = gb_xmin;
6015 clip.XMaxClipGuardband = gb_xmax;
6016 clip.YMinClipGuardband = gb_ymin;
6017 clip.YMaxClipGuardband = gb_ymax;
6018 }
6019 #endif
6020 #if GFX_VER >= 7
6021 vp_map += GENX(SF_CLIP_VIEWPORT_length);
6022 #else
6023 vp_map += GENX(SF_VIEWPORT_length);
6024 clip_map += GENX(CLIP_VIEWPORT_length);
6025 #endif
6026 }
6027 #if GFX_VER >= 7
6028 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6029 ptr.SFClipViewportPointer = sf_cl_vp_address;
6030 }
6031 #elif GFX_VER == 6
6032 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6033 vp.SFViewportStateChange = 1;
6034 vp.CLIPViewportStateChange = 1;
6035 vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6036 vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6037 }
6038 #endif
6039 }
6040
6041 #if GFX_VER >= 6
6042 if (dirty & CROCUS_DIRTY_GEN6_URB) {
6043 #if GFX_VER == 6
6044 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6045 || ice->shaders.ff_gs_prog;
6046
6047 struct brw_vue_prog_data *vue_prog_data =
6048 (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6049 const unsigned vs_size = vue_prog_data->urb_entry_size;
6050 unsigned gs_size = vs_size;
6051 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6052 struct brw_vue_prog_data *gs_vue_prog_data =
6053 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6054 gs_size = gs_vue_prog_data->urb_entry_size;
6055 }
6056
6057 genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6058 #endif
6059 #if GFX_VER >= 7
6060 const struct intel_device_info *devinfo = &batch->screen->devinfo;
6061 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6062 bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6063 unsigned entry_size[4];
6064
6065 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6066 if (!ice->shaders.prog[i]) {
6067 entry_size[i] = 1;
6068 } else {
6069 struct brw_vue_prog_data *vue_prog_data =
6070 (void *) ice->shaders.prog[i]->prog_data;
6071 entry_size[i] = vue_prog_data->urb_entry_size;
6072 }
6073 assert(entry_size[i] != 0);
6074 }
6075
6076 /* If we're just switching between programs with the same URB requirements,
6077 * skip the rest of the logic.
6078 */
6079 bool no_change = false;
6080 if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
6081 ice->urb.gs_present == gs_present &&
6082 ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
6083 ice->urb.tess_present == tess_present &&
6084 ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
6085 ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
6086 no_change = true;
6087 }
6088
6089 if (!no_change) {
6090 ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
6091 ice->urb.gs_present = gs_present;
6092 ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
6093 ice->urb.tess_present = tess_present;
6094 ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
6095 ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
6096
6097 unsigned entries[4];
6098 unsigned start[4];
6099 bool constrained;
6100 intel_get_urb_config(devinfo,
6101 batch->screen->l3_config_3d,
6102 tess_present,
6103 gs_present,
6104 entry_size,
6105 entries, start, NULL, &constrained);
6106
6107 #if GFX_VER == 7
6108 if (devinfo->platform == INTEL_PLATFORM_IVB)
6109 gen7_emit_vs_workaround_flush(batch);
6110 #endif
6111 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6112 crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6113 urb._3DCommandSubOpcode += i;
6114 urb.VSURBStartingAddress = start[i];
6115 urb.VSURBEntryAllocationSize = entry_size[i] - 1;
6116 urb.VSNumberofURBEntries = entries[i];
6117 }
6118 }
6119 }
6120 #endif
6121 }
6122
6123 if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6124 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6125 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6126 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6127
6128 STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6129 int rt_dwords =
6130 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6131 #if GFX_VER >= 8
6132 rt_dwords += GENX(BLEND_STATE_length);
6133 #endif
6134 uint32_t blend_offset;
6135 uint32_t *blend_map =
6136 stream_state(batch,
6137 4 * rt_dwords, 64, &blend_offset);
6138
6139 #if GFX_VER >= 8
6140 struct GENX(BLEND_STATE) be = { 0 };
6141 {
6142 #else
6143 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
6144 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6145 #define be entry
6146 #endif
6147
6148 be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6149 be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6150 be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6151 be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6152 be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
6153 be.ColorDitherEnable = cso_blend->cso.dither;
6154
6155 #if GFX_VER >= 8
6156 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
6157 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6158 #else
6159 {
6160 #endif
6161 const struct pipe_rt_blend_state *rt =
6162 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6163
6164 be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6165 be.IndependentAlphaBlendEnable;
6166
6167 if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6168 entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6169 entry.LogicOpFunction = cso_blend->cso.logicop_func;
6170 }
6171
6172 entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6173 entry.PreBlendColorClampEnable = true;
6174 entry.PostBlendColorClampEnable = true;
6175
6176 entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
6177 entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6178 entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
6179 entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6180
6181 #if GFX_VER >= 8
6182 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6183 #else
6184 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6185 #endif
6186 }
6187 }
6188 #if GFX_VER >= 8
6189 GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6190 #endif
6191 #if GFX_VER < 7
6192 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6193 ptr.PointertoBLEND_STATE = blend_offset;
6194 ptr.BLEND_STATEChange = true;
6195 }
6196 #else
6197 crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6198 ptr.BlendStatePointer = blend_offset;
6199 #if GFX_VER >= 8
6200 ptr.BlendStatePointerValid = true;
6201 #endif
6202 }
6203 #endif
6204 }
6205 #endif
6206
6207 if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6208 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6209 UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6210 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6211 uint32_t cc_offset;
6212 void *cc_map =
6213 stream_state(batch,
6214 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6215 64, &cc_offset);
6216 #if GFX_VER <= 5
6217 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6218 #endif
6219 _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6220 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6221 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6222
6223 #if GFX_VER <= 5
6224
6225 set_depth_stencil_bits(ice, &cc);
6226
6227 if (cso_blend->cso.logicop_enable) {
6228 if (can_emit_logic_op(ice)) {
6229 cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6230 cc.LogicOpFunction = cso_blend->cso.logicop_func;
6231 }
6232 }
6233 cc.ColorDitherEnable = cso_blend->cso.dither;
6234
6235 cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6236
6237 if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6238 cc.AlphaTestEnable = cso->cso.alpha_enabled;
6239 cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6240 }
6241 cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6242 cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6243 #else
6244 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6245 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6246
6247 cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6248 cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6249 cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6250 cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6251 #endif
6252 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6253 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6254 }
6255 ice->shaders.cc_offset = cc_offset;
6256 #if GFX_VER >= 6
6257 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6258 ptr.ColorCalcStatePointer = cc_offset;
6259 #if GFX_VER != 7
6260 ptr.ColorCalcStatePointerValid = true;
6261 #endif
6262 }
6263 #endif
6264 }
6265 #if GFX_VER <= 5
6266 if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6267 crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6268 blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6269 blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6270 blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6271 blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6272 }
6273 }
6274 #endif
6275 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6276 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6277 continue;
6278
6279 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6280 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6281
6282 if (!shader)
6283 continue;
6284
6285 if (shs->sysvals_need_upload)
6286 upload_sysvals(ice, stage);
6287
6288 #if GFX_VER <= 5
6289 dirty |= CROCUS_DIRTY_GEN4_CURBE;
6290 #endif
6291 #if GFX_VER >= 7
6292 struct push_bos push_bos = {};
6293 setup_constant_buffers(ice, batch, stage, &push_bos);
6294
6295 emit_push_constant_packets(ice, batch, stage, &push_bos);
6296 #endif
6297 }
6298
6299 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6300 if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6301 if (ice->shaders.prog[stage]) {
6302 #if GFX_VER <= 6
6303 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6304 #endif
6305 crocus_populate_binding_table(ice, batch, stage, false);
6306 ice->shaders.prog[stage]->bind_bo_offset =
6307 crocus_upload_binding_table(ice, batch,
6308 ice->shaders.prog[stage]->surf_offset,
6309 ice->shaders.prog[stage]->bt.size_bytes);
6310
6311 #if GFX_VER >= 7
6312 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6313 ptr._3DCommandSubOpcode = 38 + stage;
6314 ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6315 }
6316 #endif
6317 #if GFX_VER == 6
6318 } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6319 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6320 crocus_populate_binding_table(ice, batch, stage, true);
6321 ice->shaders.ff_gs_prog->bind_bo_offset =
6322 crocus_upload_binding_table(ice, batch,
6323 ice->shaders.ff_gs_prog->surf_offset,
6324 ice->shaders.ff_gs_prog->bt.size_bytes);
6325 #endif
6326 }
6327 }
6328 }
6329 #if GFX_VER <= 6
6330 if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6331 struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6332 if (gs == NULL)
6333 gs = ice->shaders.ff_gs_prog;
6334 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6335 ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6336 ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6337 #if GFX_VER == 6
6338 ptr.VSBindingTableChange = true;
6339 ptr.PSBindingTableChange = true;
6340 ptr.GSBindingTableChange = gs ? true : false;
6341 ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6342 #endif
6343 }
6344 }
6345 #endif
6346
6347 bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6348 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6349 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6350 !ice->shaders.prog[stage])
6351 continue;
6352
6353 crocus_upload_sampler_states(ice, batch, stage);
6354
6355 sampler_updates = true;
6356
6357 #if GFX_VER >= 7
6358 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6359
6360 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6361 ptr._3DCommandSubOpcode = 43 + stage;
6362 ptr.PointertoVSSamplerState = shs->sampler_offset;
6363 }
6364 #endif
6365 }
6366
6367 if (sampler_updates) {
6368 #if GFX_VER == 6
6369 struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
6370 struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
6371 struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
6372 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
6373 if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
6374 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6375 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
6376 ptr.VSSamplerStateChange = true;
6377 ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
6378 }
6379 if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
6380 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6381 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
6382 ptr.GSSamplerStateChange = true;
6383 ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
6384 }
6385 if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
6386 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6387 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
6388 ptr.PSSamplerStateChange = true;
6389 ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
6390 }
6391 }
6392 #endif
6393 }
6394
6395 #if GFX_VER >= 6
6396 if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
6397 crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
6398 ms.PixelLocation =
6399 ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
6400 if (ice->state.framebuffer.samples > 0)
6401 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
6402 #if GFX_VER == 6
6403 INTEL_SAMPLE_POS_4X(ms.Sample);
6404 #elif GFX_VER == 7
6405 switch (ice->state.framebuffer.samples) {
6406 case 1:
6407 INTEL_SAMPLE_POS_1X(ms.Sample);
6408 break;
6409 case 2:
6410 INTEL_SAMPLE_POS_2X(ms.Sample);
6411 break;
6412 case 4:
6413 INTEL_SAMPLE_POS_4X(ms.Sample);
6414 break;
6415 case 8:
6416 INTEL_SAMPLE_POS_8X(ms.Sample);
6417 break;
6418 default:
6419 break;
6420 }
6421 #endif
6422 }
6423 }
6424
6425 if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
6426 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
6427 ms.SampleMask = determine_sample_mask(ice);
6428 }
6429 }
6430 #endif
6431
6432 #if GFX_VER >= 7
6433 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
6434 if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
6435 struct brw_stage_prog_data *prog_data = shader->prog_data;
6436 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
6437
6438 crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
6439
6440 /* Initialize the execution mask with VMask. Otherwise, derivatives are
6441 * incorrect for subspans where some of the pixels are unlit. We believe
6442 * the bit just didn't take effect in previous generations.
6443 */
6444 ps.VectorMaskEnable = GFX_VER >= 8;
6445
6446 ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
6447 ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
6448 ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
6449
6450 ps.DispatchGRFStartRegisterForConstantSetupData0 =
6451 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
6452 ps.DispatchGRFStartRegisterForConstantSetupData1 =
6453 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
6454 ps.DispatchGRFStartRegisterForConstantSetupData2 =
6455 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
6456
6457 ps.KernelStartPointer0 = KSP(ice, shader) +
6458 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6459 ps.KernelStartPointer1 = KSP(ice, shader) +
6460 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6461 ps.KernelStartPointer2 = KSP(ice, shader) +
6462 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6463
6464 #if GFX_VERx10 == 75
6465 ps.SampleMask = determine_sample_mask(ice);
6466 #endif
6467 // XXX: WABTPPrefetchDisable, see above, drop at C0
6468 ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
6469 ps.FloatingPointMode = prog_data->use_alt_mode;
6470 #if GFX_VER >= 8
6471 ps.MaximumNumberofThreadsPerPSD =
6472 batch->screen->devinfo.max_threads_per_psd - 2;
6473 #else
6474 ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
6475 #endif
6476
6477 ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
6478
6479 #if GFX_VER < 8
6480 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6481 ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
6482 ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
6483 #endif
6484 /* From the documentation for this packet:
6485 * "If the PS kernel does not need the Position XY Offsets to
6486 * compute a Position Value, then this field should be programmed
6487 * to POSOFFSET_NONE."
6488 *
6489 * "SW Recommendation: If the PS kernel needs the Position Offsets
6490 * to compute a Position XY value, this field should match Position
6491 * ZW Interpolation Mode to ensure a consistent position.xyzw
6492 * computation."
6493 *
6494 * We only require XY sample offsets. So, this recommendation doesn't
6495 * look useful at the moment. We might need this in future.
6496 */
6497 ps.PositionXYOffsetSelect =
6498 wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
6499
6500 if (wm_prog_data->base.total_scratch) {
6501 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
6502 ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
6503 ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
6504 }
6505 }
6506 #if GFX_VER == 8
6507 const struct shader_info *fs_info =
6508 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6509 crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
6510 psx.PixelShaderValid = true;
6511 psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
6512 psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
6513 psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
6514 psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
6515 psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
6516 psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
6517
6518 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
6519 if (wm_prog_data->uses_sample_mask)
6520 psx.PixelShaderUsesInputCoverageMask = true;
6521
6522 psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6523
6524 /* The stricter cross-primitive coherency guarantees that the hardware
6525 * gives us with the "Accesses UAV" bit set for at least one shader stage
6526 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
6527 * are redundant within the current image, atomic counter and SSBO GL
6528 * APIs, which all have very loose ordering and coherency requirements
6529 * and generally rely on the application to insert explicit barriers when
6530 * a shader invocation is expected to see the memory writes performed by
6531 * the invocations of some previous primitive. Regardless of the value
6532 * of "UAV coherency required", the "Accesses UAV" bits will implicitly
6533 * cause an in most cases useless DC flush when the lowermost stage with
6534 * the bit set finishes execution.
6535 *
6536 * It would be nice to disable it, but in some cases we can't because on
6537 * Gfx8+ it also has an influence on rasterization via the PS UAV-only
6538 * signal (which could be set independently from the coherency mechanism
6539 * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
6540 * determine whether the hardware skips execution of the fragment shader
6541 * or not via the ThreadDispatchEnable signal. However if we know that
6542 * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
6543 * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
6544 * difference so we may just disable it here.
6545 *
6546 * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
6547 * take into account KillPixels when no depth or stencil writes are
6548 * enabled. In order for occlusion queries to work correctly with no
6549 * attachments, we need to force-enable here.
6550 *
6551 */
6552 if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
6553 !(has_writeable_rt(ice->state.cso_blend, fs_info)))
6554 psx.PixelShaderHasUAV = true;
6555 }
6556 #endif
6557 }
6558 #endif
6559
6560 #if GFX_VER >= 7
6561 if (ice->state.streamout_active) {
6562 if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
6563 for (int i = 0; i < 4; i++) {
6564 struct crocus_stream_output_target *tgt =
6565 (void *) ice->state.so_target[i];
6566
6567 if (!tgt) {
6568 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6569 sob.SOBufferIndex = i;
6570 sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
6571 }
6572 continue;
6573 }
6574 struct crocus_resource *res = (void *) tgt->base.buffer;
6575 uint32_t start = tgt->base.buffer_offset;
6576 #if GFX_VER < 8
6577 uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
6578 #endif
6579 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6580 sob.SOBufferIndex = i;
6581
6582 sob.SurfaceBaseAddress = rw_bo(res->bo, start);
6583 sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
6584 #if GFX_VER < 8
6585 sob.SurfacePitch = tgt->stride;
6586 sob.SurfaceEndAddress = rw_bo(res->bo, end);
6587 #else
6588 sob.SOBufferEnable = true;
6589 sob.StreamOffsetWriteEnable = true;
6590 sob.StreamOutputBufferOffsetAddressEnable = true;
6591
6592 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
6593 sob.StreamOutputBufferOffsetAddress =
6594 rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
6595 if (tgt->zero_offset) {
6596 sob.StreamOffset = 0;
6597 tgt->zero_offset = false;
6598 } else
6599 sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
6600 #endif
6601 }
6602 }
6603 }
6604
6605 if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6606 uint32_t *decl_list =
6607 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
6608 crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6609 }
6610
6611 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6612 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6613
6614 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6615 crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6616 sol.SOFunctionEnable = true;
6617 sol.SOStatisticsEnable = true;
6618
6619 sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
6620 !ice->state.prims_generated_query_active;
6621 sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
6622 }
6623
6624 assert(ice->state.streamout);
6625
6626 crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
6627 GENX(3DSTATE_STREAMOUT_length));
6628 }
6629 } else {
6630 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6631 crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6632 }
6633 }
6634 #endif
6635 #if GFX_VER == 6
6636 if (ice->state.streamout_active) {
6637 if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
6638 crocus_emit_so_svbi(ice);
6639 }
6640 }
6641 #endif
6642
6643 if (dirty & CROCUS_DIRTY_CLIP) {
6644 #if GFX_VER < 6
6645 const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6646 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6647
6648 uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6649 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6650 _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6651 clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6652 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6653 clip.SingleProgramFlow = true;
6654 clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6655
6656 clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6657 clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6658
6659 clip.DispatchGRFStartRegisterForURBData = 1;
6660 clip.VertexURBEntryReadOffset = 0;
6661 clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6662
6663 clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6664 clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6665
6666 if (batch->ice->urb.nr_clip_entries >= 10) {
6667 /* Half of the URB entries go to each thread, and it has to be an
6668 * even number.
6669 */
6670 assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6671
6672 /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6673 * only 2 threads can output VUEs at a time.
6674 */
6675 clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6676 } else {
6677 assert(batch->ice->urb.nr_clip_entries >= 5);
6678 clip.MaximumNumberofThreads = 1 - 1;
6679 }
6680 clip.VertexPositionSpace = VPOS_NDCSPACE;
6681 clip.UserClipFlagsMustClipEnable = true;
6682 clip.GuardbandClipTestEnable = true;
6683
6684 clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6685 clip.ScreenSpaceViewportXMin = -1.0;
6686 clip.ScreenSpaceViewportXMax = 1.0;
6687 clip.ScreenSpaceViewportYMin = -1.0;
6688 clip.ScreenSpaceViewportYMax = 1.0;
6689 clip.ViewportXYClipTestEnable = true;
6690 clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6691
6692 #if GFX_VER == 5 || GFX_VERx10 == 45
6693 clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6694 #else
6695 /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6696 * workaround.
6697 */
6698 clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6699 #endif
6700
6701 clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6702 clip.GuardbandClipTestEnable = true;
6703
6704 clip.ClipMode = clip_prog_data->clip_mode;
6705 #if GFX_VERx10 == 45
6706 clip.NegativeWClipTestEnable = true;
6707 #endif
6708 }
6709
6710 #else //if GFX_VER >= 6
6711 struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6712 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
6713 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6714 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6715 ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6716 bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6717 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6718 : ice->state.prim_is_points_or_lines);
6719 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6720 crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6721 cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6722 if (cso_rast->cso.rasterizer_discard)
6723 cl.ClipMode = CLIPMODE_REJECT_ALL;
6724 else if (ice->state.window_space_position)
6725 cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6726 else
6727 cl.ClipMode = CLIPMODE_NORMAL;
6728
6729 cl.PerspectiveDivideDisable = ice->state.window_space_position;
6730 cl.ViewportXYClipTestEnable = !points_or_lines;
6731
6732 cl.UserClipDistanceCullTestEnableBitmask =
6733 brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6734
6735 if (wm_prog_data->barycentric_interp_modes &
6736 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
6737 cl.NonPerspectiveBarycentricEnable = true;
6738
6739 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6740 cl.MaximumVPIndex = ice->state.num_viewports - 1;
6741 }
6742 crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6743 ARRAY_SIZE(cso_rast->clip));
6744 #endif
6745 }
6746
6747 if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6748 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6749 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6750 const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
6751 #if GFX_VER == 7
6752 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
6753 gen7_emit_vs_workaround_flush(batch);
6754 #endif
6755
6756
6757 #if GFX_VER == 6
6758 struct push_bos push_bos = {};
6759 setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6760
6761 emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6762 #endif
6763 #if GFX_VER >= 6
6764 crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6765 #else
6766 uint32_t *vs_ptr = stream_state(batch,
6767 GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6768 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6769 _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6770 #endif
6771 {
6772 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6773
6774 vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6775
6776 #if GFX_VER < 6
6777 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6778 vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6779 vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6780
6781 vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6782 vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6783
6784 vs.MaximumNumberofThreads =
6785 CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6786 vs.StatisticsEnable = false;
6787 vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6788 #endif
6789 #if GFX_VER == 5
6790 /* Force single program flow on Ironlake. We cannot reliably get
6791 * all applications working without it. See:
6792 * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6793 *
6794 * The most notable and reliably failing application is the Humus
6795 * demo "CelShading"
6796 */
6797 vs.SingleProgramFlow = true;
6798 vs.SamplerCount = 0; /* hardware requirement */
6799
6800 #endif
6801 #if GFX_VER >= 8
6802 vs.SIMD8DispatchEnable =
6803 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6804
6805 vs.UserClipDistanceCullTestEnableBitmask =
6806 vue_prog_data->cull_distance_mask;
6807 #endif
6808 }
6809
6810 #if GFX_VER == 6
6811 crocus_emit_pipe_control_flush(batch,
6812 "post VS const",
6813 PIPE_CONTROL_DEPTH_STALL |
6814 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6815 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6816 #endif
6817 }
6818
6819 if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6820 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6821 bool active = GFX_VER >= 6 && shader;
6822 #if GFX_VER == 6
6823 struct push_bos push_bos = {};
6824 if (shader)
6825 setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6826
6827 emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6828 #endif
6829 #if GFX_VERx10 == 70
6830 /**
6831 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6832 * Geometry > Geometry Shader > State:
6833 *
6834 * "Note: Because of corruption in IVB:GT2, software needs to flush the
6835 * whole fixed function pipeline when the GS enable changes value in
6836 * the 3DSTATE_GS."
6837 *
6838 * The hardware architects have clarified that in this context "flush the
6839 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6840 * Stall" bit set.
6841 */
6842 if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6843 gen7_emit_cs_stall_flush(batch);
6844 #endif
6845 #if GFX_VER >= 6
6846 crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6847 #else
6848 uint32_t *gs_ptr = stream_state(batch,
6849 GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6850 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6851 _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6852 #endif
6853 {
6854 #if GFX_VER >= 6
6855 if (active) {
6856 const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
6857 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6858 const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;
6859
6860 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6861 #if GFX_VER >= 7
6862 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6863 gs.OutputTopology = gs_prog_data->output_topology;
6864 gs.ControlDataHeaderSize =
6865 gs_prog_data->control_data_header_size_hwords;
6866
6867 gs.InstanceControl = gs_prog_data->invocations - 1;
6868 gs.DispatchMode = vue_prog_data->dispatch_mode;
6869
6870 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6871
6872 gs.ControlDataFormat = gs_prog_data->control_data_format;
6873 #endif
6874
6875 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6876 * Ivy Bridge and Haswell.
6877 *
6878 * On Ivy Bridge, setting this bit causes the vertices of a triangle
6879 * strip to be delivered to the geometry shader in an order that does
6880 * not strictly follow the OpenGL spec, but preserves triangle
6881 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
6882 * the geometry shader sees triangles:
6883 *
6884 * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6885 *
6886 * (Clearing the bit is even worse, because it fails to preserve
6887 * orientation).
6888 *
6889            * Triangle strips with adjacency are always ordered in a way that preserves
6890 * triangle orientation but does not strictly follow the OpenGL spec,
6891 * regardless of the setting of this bit.
6892 *
6893 * On Haswell, both triangle strips and triangle strips with adjacency
6894 * are always ordered in a way that preserves triangle orientation.
6895 * Setting this bit causes the ordering to strictly follow the OpenGL
6896 * spec.
6897 *
6898 * So in either case we want to set the bit. Unfortunately on Ivy
6899 * Bridge this will get the order close to correct but not perfect.
6900 */
6901 gs.ReorderMode = TRAILING;
6902 gs.MaximumNumberofThreads =
6903 GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6904 (batch->screen->devinfo.max_gs_threads - 1);
6905 #if GFX_VER < 7
6906 gs.SOStatisticsEnable = true;
6907 if (gs_prog_data->num_transform_feedback_bindings)
6908 gs.SVBIPayloadEnable = ice->state.streamout_active;
6909
6910            /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled, as
6911             * was previously done for gen6.
6912 *
6913 * TODO: test with both disabled to see if the HW is behaving
6914 * as expected, like in gen7.
6915 */
6916 gs.SingleProgramFlow = true;
6917 gs.VectorMaskEnable = true;
6918 #endif
6919 #if GFX_VER >= 8
6920 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6921
6922 if (gs_prog_data->static_vertex_count != -1) {
6923 gs.StaticOutput = true;
6924 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6925 }
6926 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6927
6928 gs.UserClipDistanceCullTestEnableBitmask =
6929 vue_prog_data->cull_distance_mask;
6930
6931 const int urb_entry_write_offset = 1;
6932 const uint32_t urb_entry_output_length =
6933 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6934 urb_entry_write_offset;
6935
6936 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6937 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6938 #endif
6939 }
6940 #endif
6941 #if GFX_VER <= 6
6942 if (!active && ice->shaders.ff_gs_prog) {
6943 const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6944 /* In gen6, transform feedback for the VS stage is done with an
6945 * ad-hoc GS program. This function provides the needed 3DSTATE_GS
6946 * for this.
6947 */
6948 gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6949 gs.SingleProgramFlow = true;
6950 gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6951 gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6952
6953 #if GFX_VER <= 5
6954 gs.GRFRegisterCount =
6955 DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6956 /* BRW_NEW_URB_FENCE */
6957 gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6958 gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6959 gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6960 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6961 #else
6962 gs.Enable = true;
6963 gs.VectorMaskEnable = true;
6964 gs.SVBIPayloadEnable = true;
6965 gs.SVBIPostIncrementEnable = true;
6966 gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6967 gs.SOStatisticsEnable = true;
6968 gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6969 #endif
6970 }
6971 #endif
6972 if (!active && !ice->shaders.ff_gs_prog) {
6973 #if GFX_VER < 8
6974 gs.DispatchGRFStartRegisterForURBData = 1;
6975 #if GFX_VER >= 7
6976 gs.IncludeVertexHandles = true;
6977 #endif
6978 #endif
6979 }
6980 #if GFX_VER >= 6
6981 gs.StatisticsEnable = true;
6982 #endif
6983 #if GFX_VER == 5 || GFX_VER == 6
6984 gs.RenderingEnabled = true;
6985 #endif
6986 #if GFX_VER <= 5
6987 gs.MaximumVPIndex = ice->state.num_viewports - 1;
6988 #endif
6989 }
6990 ice->state.gs_enabled = active;
6991 }
6992
6993 #if GFX_VER >= 7
6994 if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6995 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
6996
6997 if (shader) {
6998 const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
6999 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
7000 const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;
7001
7002 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
7003 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
7004 hs.InstanceCount = tcs_prog_data->instances - 1;
7005 hs.IncludeVertexHandles = true;
7006 hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7007 }
7008 } else {
7009 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7010 }
7011
7012 }
7013
7014 if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7015 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7016 if (shader) {
7017 const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
7018 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
7019 const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;
7020
7021 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7022 te.Partitioning = tes_prog_data->partitioning;
7023 te.OutputTopology = tes_prog_data->output_topology;
7024 te.TEDomain = tes_prog_data->domain;
7025 te.TEEnable = true;
7026 te.MaximumTessellationFactorOdd = 63.0;
7027 te.MaximumTessellationFactorNotOdd = 64.0;
7028 };
7029 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7030 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7031
7032 ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7033 ds.ComputeWCoordinateEnable =
7034 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
7035
7036 #if GFX_VER >= 8
7037 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7038 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7039 ds.UserClipDistanceCullTestEnableBitmask =
7040 vue_prog_data->cull_distance_mask;
7041 #endif
7042 };
7043 } else {
7044 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7045 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7046 }
7047 }
7048 #endif
7049 if (dirty & CROCUS_DIRTY_RASTER) {
7050
7051 #if GFX_VER < 6
7052 const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7053 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7054 uint32_t *sf_ptr = stream_state(batch,
7055 GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7056 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7057 _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7058 sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7059 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7060 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7061 sf.DispatchGRFStartRegisterForURBData = 3;
7062 sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
7063 sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7064 sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7065 sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7066 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7067
7068 sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7069
7070 sf.MaximumNumberofThreads =
7071 MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7072
7073 sf.SpritePointEnable = cso_state->point_quad_rasterization;
7074 sf.DestinationOriginHorizontalBias = 0.5;
7075 sf.DestinationOriginVerticalBias = 0.5;
7076
7077 sf.LineEndCapAntialiasingRegionWidth =
7078 cso_state->line_smooth ? _10pixels : _05pixels;
7079 sf.LastPixelEnable = cso_state->line_last_pixel;
7080 sf.AntialiasingEnable = cso_state->line_smooth;
7081
7082 sf.LineWidth = get_line_width(cso_state);
7083 sf.PointWidth = cso_state->point_size;
7084 sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7085 #if GFX_VERx10 >= 45
7086 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7087 #endif
7088 sf.ViewportTransformEnable = true;
7089 sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7090 sf.ScissorRectangleEnable = true;
7091 sf.CullMode = translate_cull_mode(cso_state->cull_face);
7092
7093 if (cso_state->flatshade_first) {
7094 sf.TriangleFanProvokingVertexSelect = 1;
7095 } else {
7096 sf.TriangleStripListProvokingVertexSelect = 2;
7097 sf.TriangleFanProvokingVertexSelect = 2;
7098 sf.LineStripListProvokingVertexSelect = 1;
7099 }
7100 }
7101 #else
7102 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7103 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7104 crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7105 sf.ViewportTransformEnable = !ice->state.window_space_position;
7106
7107 #if GFX_VER == 6
7108 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7109 uint32_t urb_entry_read_length;
7110 uint32_t urb_entry_read_offset;
7111 uint32_t point_sprite_enables;
7112 calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7113 &urb_entry_read_length,
7114 &urb_entry_read_offset);
7115 sf.VertexURBEntryReadLength = urb_entry_read_length;
7116 sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7117 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7118 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7119 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7120 #endif
7121
7122 #if GFX_VER >= 6 && GFX_VER < 8
7123 if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7124 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7125 #endif
7126 #if GFX_VER == 7
7127 if (ice->state.framebuffer.zsbuf) {
7128 struct crocus_resource *zres, *sres;
7129 crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7130 ice->state.framebuffer.zsbuf->texture,
7131 &zres, &sres);
7132 /* ANV thinks that the stencil-ness doesn't matter, this is just
7133 * about handling polygon offset scaling.
7134 */
7135 sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7136 }
7137 #endif
7138 }
7139 crocus_emit_merge(batch, cso->sf, dynamic_sf,
7140 ARRAY_SIZE(dynamic_sf));
7141 #if GFX_VER == 8
7142 crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7143 #endif
7144 #endif
7145 }
7146
7147 if (dirty & CROCUS_DIRTY_WM) {
7148 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7149 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7150 UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
7151 UNUSED const struct shader_info *fs_info =
7152 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7153
7154 #if GFX_VER == 6
7155 struct push_bos push_bos = {};
7156 setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7157
7158 emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7159 #endif
7160 #if GFX_VER >= 6
7161 crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7162 #else
7163 uint32_t *wm_ptr = stream_state(batch,
7164 GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7165
7166 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7167
7168 _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7169 #endif
7170 {
7171 #if GFX_VER <= 6
7172 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7173 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7174 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7175 #endif
7176 #if GFX_VER == 4
7177 /* On gen4, we only have one shader kernel */
7178 if (brw_wm_state_has_ksp(wm, 0)) {
7179 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7180 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7181 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7182 wm_prog_data->base.dispatch_grf_start_reg;
7183 }
7184 #elif GFX_VER == 5
7185 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7186 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7187 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7188 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7189 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7190 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7191
7192 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7193 wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7194 wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7195
7196 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7197 wm_prog_data->base.dispatch_grf_start_reg;
7198 #elif GFX_VER == 6
7199 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7200 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7201 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7202 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7203 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7204 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7205
7206 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7207 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7208 wm.DispatchGRFStartRegisterForConstantSetupData1 =
7209 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7210 wm.DispatchGRFStartRegisterForConstantSetupData2 =
7211 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7212 #endif
7213 #if GFX_VER <= 5
7214 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7215 wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7216 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7217 wm.SetupURBEntryReadOffset = 0;
7218 wm.EarlyDepthTestEnable = true;
7219 wm.LineAntialiasingRegionWidth = _05pixels;
7220 wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7221 wm.DepthCoefficientURBReadOffset = 1;
7222
7223 if (cso->cso.offset_tri) {
7224 wm.GlobalDepthOffsetEnable = true;
7225
7226 /* Something weird going on with legacy_global_depth_bias,
7227 * offset_constant, scaling and MRD. This value passes glean
7228             * but gives some odd results elsewhere (e.g. the
7229 * quad-offset-units test).
7230 */
7231 wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7232 wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7233 }
7234 wm.SamplerStatePointer = ro_bo(batch->state.bo,
7235 ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7236 #endif
7237
7238 wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7239 ice->state.statistics_counters_enabled : 0;
7240
7241 #if GFX_VER >= 6
7242 wm.LineAntialiasingRegionWidth = _10pixels;
7243 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7244
7245 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7246 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7247 #endif
7248 #if GFX_VER == 6
7249 wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7250 ice->state.cso_blend->dual_color_blending;
7251 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7252 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7253
7254 /* From the SNB PRM, volume 2 part 1, page 281:
7255 * "If the PS kernel does not need the Position XY Offsets
7256 * to compute a Position XY value, then this field should be
7257 * programmed to POSOFFSET_NONE."
7258 *
7259 * "SW Recommendation: If the PS kernel needs the Position Offsets
7260 * to compute a Position XY value, this field should match Position
7261 * ZW Interpolation Mode to ensure a consistent position.xyzw
7262 * computation."
7263 * We only require XY sample offsets. So, this recommendation doesn't
7264 * look useful at the moment. We might need this in future.
7265 */
7266 if (wm_prog_data->uses_pos_offset)
7267 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7268 else
7269 wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7270 #endif
7271 wm.LineStippleEnable = cso->cso.line_stipple_enable;
7272 wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7273
7274 #if GFX_VER < 7
7275 if (wm_prog_data->base.use_alt_mode)
7276 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7277 wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7278 wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7279 #endif
7280
7281 #if GFX_VER < 8
7282 #if GFX_VER >= 6
7283 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7284
7285 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7286 if (fb->samples > 1) {
7287 if (cso->cso.multisample)
7288 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7289 else
7290 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7291
7292 if (wm_prog_data->persample_dispatch)
7293 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7294 else
7295 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7296 } else {
7297 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7298 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7299 }
7300 #endif
7301
7302 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7303
7304 if (wm_prog_data->uses_kill ||
7305 ice->state.cso_zsa->cso.alpha_enabled ||
7306 ice->state.cso_blend->cso.alpha_to_coverage ||
7307 (GFX_VER >= 6 && wm_prog_data->uses_omask))
7308 wm.PixelShaderKillsPixel = true;
7309
7310 if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7311 writes_depth || wm.PixelShaderKillsPixel ||
7312 (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7313 wm.ThreadDispatchEnable = true;
7314
7315 #if GFX_VER >= 7
7316 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7317 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7318 #else
7319 if (wm_prog_data->base.total_scratch) {
7320 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7321 MESA_SHADER_FRAGMENT);
7322 wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7323 wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7324 }
7325
7326 wm.PixelShaderComputedDepth = writes_depth;
7327
7328 #endif
7329 /* The "UAV access enable" bits are unnecessary on HSW because they only
7330 * seem to have an effect on the HW-assisted coherency mechanism which we
7331 * don't need, and the rasterization-related UAV_ONLY flag and the
7332 * DISPATCH_ENABLE bit can be set independently from it.
7333 * C.f. gen8_upload_ps_extra().
7334 *
7335 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7336 * _NEW_COLOR
7337 */
7338 #if GFX_VERx10 == 75
7339 if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7340 wm_prog_data->has_side_effects)
7341 wm.PSUAVonly = ON;
7342 #endif
7343 #endif
7344 #if GFX_VER >= 7
7345 /* BRW_NEW_FS_PROG_DATA */
7346 if (wm_prog_data->early_fragment_tests)
7347 wm.EarlyDepthStencilControl = EDSC_PREPS;
7348 else if (wm_prog_data->has_side_effects)
7349 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7350 #endif
7351 #if GFX_VER == 8
7352 /* We could skip this bit if color writes are enabled. */
7353 if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7354 wm.ForceThreadDispatchEnable = ForceON;
7355 #endif
7356 };
7357
7358 #if GFX_VER <= 5
7359 if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7360 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7361 clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7362 }
7363 ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7364 }
7365 #endif
7366 }
7367
7368 #if GFX_VER >= 7
7369 if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7370 crocus_emit_sbe(batch, ice);
7371 }
7372 #endif
7373
7374 #if GFX_VER >= 8
7375 if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7376 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7377 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7378 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7379 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7380 const struct shader_info *fs_info =
7381 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7382 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7383 crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7384 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7385 pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7386 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7387 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7388 }
7389 crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7390 ARRAY_SIZE(cso_blend->ps_blend));
7391 }
7392 #endif
7393
7394 #if GFX_VER >= 6
7395 if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7396
7397 #if GFX_VER >= 8
7398 crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7399 set_depth_stencil_bits(ice, &wmds);
7400 }
7401 #else
7402 uint32_t ds_offset;
7403 void *ds_map = stream_state(batch,
7404 sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7405 64, &ds_offset);
7406 _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7407 set_depth_stencil_bits(ice, &ds);
7408 }
7409
7410 #if GFX_VER == 6
7411 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7412 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7413 ptr.DEPTH_STENCIL_STATEChange = true;
7414 }
7415 #else
7416 crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7417 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7418 }
7419 #endif
7420 #endif
7421 }
7422
7423 if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7424 /* Align to 64-byte boundary as per anv. */
7425 uint32_t scissor_offset;
7426 struct pipe_scissor_state *scissor_map = (void *)
7427 stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7428 64, &scissor_offset);
7429 for (int i = 0; i < ice->state.num_viewports; i++) {
7430 struct pipe_scissor_state scissor;
7431 crocus_fill_scissor_rect(ice, i, &scissor);
7432 scissor_map[i] = scissor;
7433 }
7434
7435 crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7436 ptr.ScissorRectPointer = scissor_offset;
7437 }
7438 }
7439 #endif
7440
7441 if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7442 struct isl_device *isl_dev = &batch->screen->isl_dev;
7443 #if GFX_VER >= 6
7444 crocus_emit_depth_stall_flushes(batch);
7445 #endif
7446 void *batch_ptr;
7447 struct crocus_resource *zres, *sres;
7448 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7449 batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7450
7451 struct isl_view view = {
7452 .base_level = 0,
7453 .levels = 1,
7454 .base_array_layer = 0,
7455 .array_len = 1,
7456 .swizzle = ISL_SWIZZLE_IDENTITY,
7457 };
7458 struct isl_depth_stencil_hiz_emit_info info = {
7459 .view = &view,
7460 .mocs = crocus_mocs(NULL, isl_dev),
7461 };
7462
7463 if (cso->zsbuf) {
7464 crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7465 struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7466 if (zsbuf->align_res) {
7467 zres = (struct crocus_resource *)zsbuf->align_res;
7468 }
7469 view.base_level = cso->zsbuf->u.tex.level;
7470 view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7471 view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7472
7473 if (zres) {
7474 view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7475
7476 info.depth_surf = &zres->surf;
7477 info.depth_address = crocus_command_reloc(batch,
7478 (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7479 zres->bo, 0, RELOC_32BIT);
7480
7481 info.mocs = crocus_mocs(zres->bo, isl_dev);
7482 view.format = zres->surf.format;
7483
7484 if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7485 info.hiz_usage = zres->aux.usage;
7486 info.hiz_surf = &zres->aux.surf;
7487 uint64_t hiz_offset = 0;
7488
7489 #if GFX_VER == 6
7490 /* HiZ surfaces on Sandy Bridge technically don't support
7491 * mip-mapping. However, we can fake it by offsetting to the
7492 * first slice of LOD0 in the HiZ surface.
7493 */
7494 isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7495 view.base_level, 0, 0,
7496 &hiz_offset, NULL, NULL);
7497 #endif
7498 info.hiz_address = crocus_command_reloc(batch,
7499 (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7500 zres->aux.bo, zres->aux.offset + hiz_offset,
7501 RELOC_32BIT);
7502 info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7503 }
7504 }
7505
7506 #if GFX_VER >= 6
7507 if (sres) {
7508 view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7509 info.stencil_aux_usage = sres->aux.usage;
7510 info.stencil_surf = &sres->surf;
7511
7512 uint64_t stencil_offset = 0;
7513 #if GFX_VER == 6
7514 /* Stencil surfaces on Sandy Bridge technically don't support
7515 * mip-mapping. However, we can fake it by offsetting to the
7516 * first slice of LOD0 in the stencil surface.
7517 */
7518 isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7519 view.base_level, 0, 0,
7520 &stencil_offset, NULL, NULL);
7521 #endif
7522
7523 info.stencil_address = crocus_command_reloc(batch,
7524 (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7525 sres->bo, stencil_offset, RELOC_32BIT);
7526 if (!zres) {
7527 view.format = sres->surf.format;
7528 info.mocs = crocus_mocs(sres->bo, isl_dev);
7529 }
7530 }
7531 #endif
7532 }
7533 isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7534 }
7535
7536 /* TODO: Disable emitting this until something uses a stipple. */
7537 if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7538 crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7539 for (int i = 0; i < 32; i++) {
7540 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7541 }
7542 }
7543 }
7544
7545 if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7546 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7547 crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7548 }
7549
7550 #if GFX_VER >= 8
7551 if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7552 crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7553 topo.PrimitiveTopologyType =
7554 translate_prim_type(draw->mode, ice->state.patch_vertices);
7555 }
7556 }
7557 #endif
7558
7559 #if GFX_VER <= 5
7560 if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7561 upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7562 ice->shaders.vs_offset, ice->shaders.sf_offset,
7563 ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7564 crocus_upload_urb_fence(batch);
7565
7566 crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7567 cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7568 cs.URBEntryAllocationSize = ice->urb.csize - 1;
7569 }
7570 dirty |= CROCUS_DIRTY_GEN4_CURBE;
7571 }
7572 #endif
7573 if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7574 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7575 if (fb->width && fb->height) {
7576 crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7577 rect.ClippedDrawingRectangleXMax = fb->width - 1;
7578 rect.ClippedDrawingRectangleYMax = fb->height - 1;
7579 }
7580 }
7581 }
7582
7583 if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7584 const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7585 const uint32_t count = user_count +
7586 ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7587 uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7588
7589 if (count) {
7590 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7591
7592 uint32_t *map =
7593 crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7594 _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7595 vb.DWordLength = (vb_dwords * count + 1) - 2;
7596 }
7597 map += 1;
7598
7599 uint32_t bound = dynamic_bound;
7600 int i;
7601 while (bound) {
7602 i = u_bit_scan(&bound);
7603 struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7604 struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7605 uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7606
7607 emit_vertex_buffer_state(batch, i, bo,
7608 buf->buffer_offset,
7609 ice->state.vb_end[i],
7610 buf->stride,
7611 step_rate,
7612 &map);
7613 }
7614 i = user_count;
7615 if (ice->state.vs_uses_draw_params) {
7616 struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7617 emit_vertex_buffer_state(batch, i++,
7618 res->bo,
7619 ice->draw.draw_params.offset,
7620 ice->draw.draw_params.res->width0,
7621 0, 0, &map);
7622 }
7623 if (ice->state.vs_uses_derived_draw_params) {
7624 struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7625 emit_vertex_buffer_state(batch, i++,
7626 res->bo,
7627 ice->draw.derived_draw_params.offset,
7628 ice->draw.derived_draw_params.res->width0,
7629 0, 0, &map);
7630 }
7631 }
7632 }
7633
7634 if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7635 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7636 const unsigned entries = MAX2(cso->count, 1);
7637 if (!(ice->state.vs_needs_sgvs_element ||
7638 ice->state.vs_uses_derived_draw_params ||
7639 ice->state.vs_needs_edge_flag)) {
7640 crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7641 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7642 } else {
7643 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7644 const unsigned dyn_count = cso->count +
7645 ice->state.vs_needs_sgvs_element +
7646 ice->state.vs_uses_derived_draw_params;
7647
7648 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7649 &dynamic_ves, ve) {
7650 ve.DWordLength =
7651 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7652 }
7653 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7654 (cso->count - ice->state.vs_needs_edge_flag) *
7655 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7656 uint32_t *ve_pack_dest =
7657 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7658 GENX(VERTEX_ELEMENT_STATE_length)];
7659
7660 if (ice->state.vs_needs_sgvs_element) {
7661 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7662 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7663 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7664 ve.Valid = true;
7665 ve.VertexBufferIndex =
7666 util_bitcount64(ice->state.bound_vertex_buffers);
7667 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7668 ve.Component0Control = base_ctrl;
7669 ve.Component1Control = base_ctrl;
7670 #if GFX_VER < 8
7671 ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7672 ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7673 #else
7674 ve.Component2Control = VFCOMP_STORE_0;
7675 ve.Component3Control = VFCOMP_STORE_0;
7676 #endif
7677 #if GFX_VER < 5
7678 ve.DestinationElementOffset = cso->count * 4;
7679 #endif
7680 }
7681 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7682 }
7683 if (ice->state.vs_uses_derived_draw_params) {
7684 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7685 ve.Valid = true;
7686 ve.VertexBufferIndex =
7687 util_bitcount64(ice->state.bound_vertex_buffers) +
7688 ice->state.vs_uses_draw_params;
7689 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7690 ve.Component0Control = VFCOMP_STORE_SRC;
7691 ve.Component1Control = VFCOMP_STORE_SRC;
7692 ve.Component2Control = VFCOMP_STORE_0;
7693 ve.Component3Control = VFCOMP_STORE_0;
7694 #if GFX_VER < 5
7695 ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7696 #endif
7697 }
7698 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7699 }
7700 if (ice->state.vs_needs_edge_flag) {
7701 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
7702 ve_pack_dest[i] = cso->edgeflag_ve[i];
7703 }
7704
7705 crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7706 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7707 }
7708
7709 #if GFX_VER == 8
7710 if (!ice->state.vs_needs_edge_flag) {
7711 crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7712 entries * GENX(3DSTATE_VF_INSTANCING_length));
7713 } else {
7714 assert(cso->count > 0);
7715 const unsigned edgeflag_index = cso->count - 1;
7716 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7717 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7718 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7719
7720 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7721 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7722 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7723 vi.VertexElementIndex = edgeflag_index +
7724 ice->state.vs_needs_sgvs_element +
7725 ice->state.vs_uses_derived_draw_params;
7726 }
7727 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
7728 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7729
7730 crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7731 entries * GENX(3DSTATE_VF_INSTANCING_length));
7732 }
7733 #endif
7734 }
7735
7736 #if GFX_VER == 8
7737 if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7738 const struct brw_vs_prog_data *vs_prog_data = (void *)
7739 ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7740 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7741
7742 crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7743 if (vs_prog_data->uses_vertexid) {
7744 sgv.VertexIDEnable = true;
7745 sgv.VertexIDComponentNumber = 2;
7746 sgv.VertexIDElementOffset =
7747 cso->count - ice->state.vs_needs_edge_flag;
7748 }
7749
7750 if (vs_prog_data->uses_instanceid) {
7751 sgv.InstanceIDEnable = true;
7752 sgv.InstanceIDComponentNumber = 3;
7753 sgv.InstanceIDElementOffset =
7754 cso->count - ice->state.vs_needs_edge_flag;
7755 }
7756 }
7757 }
7758 #endif
7759 #if GFX_VERx10 >= 75
7760 if (dirty & CROCUS_DIRTY_GEN75_VF) {
7761 crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7762 if (draw->primitive_restart) {
7763 vf.IndexedDrawCutIndexEnable = true;
7764 vf.CutIndex = draw->restart_index;
7765 }
7766 }
7767 }
7768 #endif
7769
7770 #if GFX_VER == 8
7771 if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7772 bool enable = want_pma_fix(ice);
7773 genX(crocus_update_pma_fix)(ice, batch, enable);
7774 }
7775 #endif
7776
7777 #if GFX_VER <= 5
7778 if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7779 gen4_upload_curbe(batch);
7780 }
7781 #endif
7782 }
7783
/**
 * Upload all dirty render state and emit a 3DPRIMITIVE command for a draw.
 *
 * This (re-)emits the index buffer when needed, loads indirect draw
 * parameters into the _3DPRIM_* registers (either from an indirect buffer
 * or from a stream output target), sets up MI_PREDICATE for
 * multi-draw-indirect with a GPU-side draw count, and finally emits the
 * 3DPRIMITIVE command itself.
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   /* Conditional rendering: predicate the draw when a query result said so. */
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* The dirty-state upload below must not be split across a batch wrap. */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* Copy user-memory indices into a GPU buffer.  u_upload_data
          * returns the offset of the uploaded bytes; subtract start_offset
          * so that addressing with sc->start still lands on the data.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even for the same buffer, re-emit if the size, index format, or
       * (pre-Haswell, where it lives in 3DSTATE_INDEX_BUFFER) the
       * primitive-restart enable changed since the last emission.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
           )
          )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* index_size of 1/2/4 bytes maps to hardware format 0/1/2. */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.BufferSize = bo->size - offset;
#else
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
#if GFX_VER >= 6
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
         }
         /* Cache what we emitted so matching draws can skip the re-emit. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

   /* MMIO registers holding the 3DPRIMITIVE parameters; for indirect draws
    * they are filled via MI_LOAD_REGISTER_MEM/IMM below.
    */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw-indirect with a GPU-side draw count: predicate away
          * draws whose index is >= the count in the draw-count buffer.
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
#if GFX_VERx10 >= 75
            /* Combine the draw-count test with the conditional-render
             * predicate value kept in CS_GPR(15).
             */
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                     mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                                    MI_PREDICATE_COMBINEOP_SET |
                                    MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               /* First draw: predicate = !(draw_index == draw_count). */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                              MI_PREDICATE_COMBINEOP_SET |
                              MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                              MI_PREDICATE_COMBINEOP_XOR |
                              MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      /* Load the 3DPRIMITIVE parameters from the indirect buffer.  The
       * buffer layout differs between indexed draws (which carry a base
       * vertex) and non-indexed draws.
       */
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         /* Non-indexed draws have no base vertex in the buffer; force 0. */
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* Draw-auto: derive the vertex count from how much data a previous
       * transform-feedback pass wrote to the stream output target.
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      /* vertex_count = (SO write offset - buffer start offset) / stride */
      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8044
8045 #if GFX_VER >= 7
8046
/**
 * Upload all state for a compute dispatch and emit GPGPU_WALKER.
 *
 * Handles sysval/binding-table/sampler uploads, MEDIA_VFE_STATE (with its
 * stalling-PIPE_CONTROL workaround and per-gen scratch-space encodings),
 * the CURBE push-constant buffer, the interface descriptor, indirect
 * dispatch register loads (with gen7 empty-workgroup predication), and the
 * walker itself.
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct brw_stage_prog_data *prog_data = shader->prog_data;
   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   /* A variable local group size (local_size[0] == 0) forces re-emission
    * every dispatch since the thread layout can change per grid.
    */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         /* NOTE(review): duplicates the unconditional assignment above —
          * looks redundant; confirm against the original intent.
          */
         vfe.BypassGatewayControl = true;
#endif
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         /* CURBE space in 256-bit register units, rounded up to a pair. */
         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      uint32_t curbe_data_offset = 0;
      /* The only push constant expected here is the per-thread subgroup id. */
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* Poison the buffer so unread padding is recognizable when debugging. */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   /* Re-emit the interface descriptor whenever anything it points at
    * (samplers, binding table, constants, or the kernel) changed.
    */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                                     prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

   /* MMIO registers holding the GPGPU_WALKER dimensions for indirect
    * dispatch (loaded via MI_LOAD_REGISTER_MEM below).
    */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
      }

#if GFX_VER == 7
      /* Gen7: predicate the walker off entirely when any indirect
       * dispatch dimension is zero (an empty workgroup).
       */
      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);

      /* Load compute_dispatch_indirect_x_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);

      /* predicate = (compute_dispatch_indirect_x_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_y_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);

      /* predicate = (compute_dispatch_indirect_y_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_z_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);

      /* predicate = (compute_dispatch_indirect_z_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* predicate = !predicate; */
#define COMPARE_FALSE 1
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_FALSE;
      }
#endif
   }

   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = grid->indirect != NULL;
      /* Predication set up above only exists on the gen7 indirect path. */
      ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = grid->grid[0];
      ggw.ThreadGroupIDYDimension = grid->grid[1];
      ggw.ThreadGroupIDZDimension = grid->grid[2];
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   batch->contains_draw = true;
}
8272
8273 #endif /* GFX_VER >= 7 */
8274
8275 /**
8276 * State module teardown.
8277 */
8278 static void
8279 crocus_destroy_state(struct crocus_context *ice)
8280 {
8281 pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8282 pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8283
8284 free(ice->state.genx);
8285
8286 for (int i = 0; i < 4; i++) {
8287 pipe_so_target_reference(&ice->state.so_target[i], NULL);
8288 }
8289
8290 for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
8291 pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
8292 }
8293 pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
8294
8295 for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8296 struct crocus_shader_state *shs = &ice->state.shaders[stage];
8297 for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8298 pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8299 }
8300 for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8301 pipe_resource_reference(&shs->image[i].base.resource, NULL);
8302 }
8303 for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8304 pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8305 }
8306 for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8307 pipe_sampler_view_reference((struct pipe_sampler_view **)
8308 &shs->textures[i], NULL);
8309 }
8310 }
8311
8312 for (int i = 0; i < 16; i++)
8313 pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8314 pipe_resource_reference(&ice->state.grid_size.res, NULL);
8315
8316 pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8317 }
8318
8319 /* ------------------------------------------------------------------- */
8320
/**
 * Re-dirty any state that references a buffer whose backing BO changed
 * (e.g. after an invalidate-and-replace), so stale GPU addresses get
 * re-emitted on the next draw/dispatch.
 *
 * Walks the resource's bind_history/bind_stages bitfields to only touch
 * binding points the buffer was ever attached to.
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      /* Dirty vertex buffer state if this buffer is bound at any slot. */
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      /* Dropping the cached reference forces 3DSTATE_INDEX_BUFFER to be
       * re-emitted on the next indexed draw.
       */
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   /* Per-stage bindings; skip stages the buffer was never bound in. */
   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Re-set the binding so the new BO address is picked up. */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}
8439
8440 /* ------------------------------------------------------------------- */
8441
8442 static unsigned
8443 flags_to_post_sync_op(uint32_t flags)
8444 {
8445 if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8446 return WriteImmediateData;
8447
8448 if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8449 return WritePSDepthCount;
8450
8451 if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8452 return WriteTimestamp;
8453
8454 return 0;
8455 }
8456
8457 /*
8458 * Do the given flags have a Post Sync or LRI Post Sync operation?
8459 */
8460 static enum pipe_control_flags
8461 get_post_sync_flags(enum pipe_control_flags flags)
8462 {
8463 flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8464 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8465 PIPE_CONTROL_WRITE_TIMESTAMP |
8466 PIPE_CONTROL_LRI_POST_SYNC_OP;
8467
8468 /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8469 * "LRI Post Sync Operation". So more than one bit set would be illegal.
8470 */
8471 assert(util_bitcount(flags) <= 1);
8472
8473 return flags;
8474 }
8475
8476 #define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
8477
8478 /**
8479 * Emit a series of PIPE_CONTROL commands, taking into account any
8480 * workarounds necessary to actually accomplish the caller's request.
8481 *
8482 * Unless otherwise noted, spec quotations in this function come from:
8483 *
8484 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8485 * Restrictions for PIPE_CONTROL.
8486 *
8487 * You should not use this function directly. Use the helpers in
8488 * crocus_pipe_control.c instead, which may split the pipe control further.
8489 */
8490 static void
crocus_emit_raw_pipe_control(struct crocus_batch *batch,
                             const char *reason,
                             uint32_t flags,
                             struct crocus_bo *bo,
                             uint32_t offset,
                             uint64_t imm)
{
   /* Emit a PIPE_CONTROL with exactly the given flags, first augmenting
    * `flags` with all generation-specific workarounds the hardware
    * requires.  `reason` is only used for debug output; `bo`/`offset`/`imm`
    * describe the optional post-sync write target.
    */
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
   /* Post-sync operation bits the caller requested (write immediate,
    * depth count, timestamp, LRI); several workarounds key off these. */
   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
   /* Same, minus the LRI op — some workarounds specifically require a
    * memory-writing post-sync operation, for which LRI doesn't count. */
   UNUSED enum pipe_control_flags non_lri_post_sync_flags =
      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;

   /* Recursive PIPE_CONTROL workarounds --------------------------------
    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
    *
    * We do these first because we want to look at the original operation,
    * rather than any workarounds we set.
    */

   /* "Flush Types" workarounds ---------------------------------------------
    * We do these now because they may add post-sync operations or CS stalls.
    */

   if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
      /* Hardware workaround: SNB B-Spec says:
       *
       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
       *     required."
       */
      crocus_emit_post_sync_nonzero_flush(batch);
   }

#if GFX_VER == 8
   if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
       *
       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
       *  'Write PS Depth Count' or 'Write Timestamp'."
       */
      if (!bo) {
         /* No write target supplied by the caller — satisfy the rule with
          * a dummy immediate write to the context's scratch workaround BO.
          */
         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
         bo = batch->ice->workaround_bo;
         offset = batch->ice->workaround_offset;
      }
   }
#endif

#if GFX_VERx10 < 75
   if (flags & PIPE_CONTROL_DEPTH_STALL) {
      /* Project: PRE-HSW / Argument: Depth Stall
       *
       * "The following bits must be clear:
       *  - Render Target Cache Flush Enable ([12] of DW1)
       *  - Depth Cache Flush Enable ([0] of DW1)"
       */
      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
   }
#endif
   if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
       *
       *    "This bit must be DISABLED for operations other than writing
       *     PS_DEPTH_COUNT."
       *
       * This seems like nonsense.  An Ivybridge workaround requires us to
       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
       * operation.  Gen8+ requires us to emit depth stalls and depth cache
       * flushes together.  So, it's hard to imagine this means anything other
       * than "we originally intended this to be used for PS_DEPTH_COUNT".
       *
       * We ignore the supposed restriction and do nothing.
       */
   }

   if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
      /* Project: PRE-HSW / Argument: Depth Cache Flush
       *
       * "Depth Stall must be clear ([13] of DW1)."
       */
      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
   }

   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
       *
       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
       *     PS_DEPTH_COUNT or TIMESTAMP queries."
       *
       * TODO: Implement end-of-pipe checking.
       */
      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
   }

   if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
      /* From the PIPE_CONTROL instruction table, bit 1:
       *
       *    "This bit is ignored if Depth Stall Enable is set.
       *     Further, the render cache is not flushed even if Write Cache
       *     Flush Enable bit is set."
       *
       * We assert that the caller doesn't do this combination, to try and
       * prevent mistakes.  It shouldn't hurt the GPU, though.
       *
       * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
       * and "Render Target Flush" combo is explicitly required for BTI
       * update workarounds.
       */
      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
   }

   /* PIPE_CONTROL page workarounds ------------------------------------- */

   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
      /* From the PIPE_CONTROL page itself:
       *
       *    "IVB, HSW, BDW
       *     Restriction: Pipe_control with CS-stall bit set must be issued
       *     before a pipe-control command that has the State Cache
       *     Invalidate bit set."
       */
      flags |= PIPE_CONTROL_CS_STALL;
   }

   if ((GFX_VERx10 == 75)) {
      /* From the PIPE_CONTROL page itself:
       *
       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
       *     Prior to programming a PIPECONTROL command with any of the RO
       *     cache invalidation bit set, program a PIPECONTROL flush command
       *     with “CS stall” bit and “HDC Flush” bit set."
       *
       * TODO: Actually implement this.  What's an HDC Flush?
       */
   }

   if (flags & PIPE_CONTROL_FLUSH_LLC) {
      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
       *
       *    "Project: ALL
       *     SW must always program Post-Sync Operation to "Write Immediate
       *     Data" when Flush LLC is set."
       *
       * For now, we just require the caller to do it.
       */
      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
   }

   /* "Post-Sync Operation" workarounds -------------------------------- */

   /* Project: All / Argument: Global Snapshot Count Reset [19]
    *
    * "This bit must not be exercised on any product.
    *  Requires stall bit ([20] of DW1) set."
    *
    * We don't use this, so we just assert that it isn't used.  The
    * PIPE_CONTROL instruction page indicates that they intended this
    * as a debug feature and don't think it is useful in production,
    * but it may actually be usable, should we ever want to.
    */
   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);

   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
      /* Project: All / Arguments:
       *
       * - Generic Media State Clear [16]
       * - Indirect State Pointers Disable [16]
       *
       *    "Requires stall bit ([20] of DW1) set."
       *
       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
       * State Clear) says:
       *
       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
       *     programmed prior to programming a PIPECONTROL command with "Media
       *     State Clear" set in GPGPU mode of operation"
       *
       * This is a subset of the earlier rule, so there's nothing to do.
       */
      flags |= PIPE_CONTROL_CS_STALL;
   }

   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
      /* Project: All / Argument: Store Data Index
       *
       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
       *  than '0'."
       *
       * For now, we just assert that the caller does this.  We might want to
       * automatically add a write to the workaround BO...
       */
      assert(non_lri_post_sync_flags != 0);
   }

   if (flags & PIPE_CONTROL_SYNC_GFDT) {
      /* Project: All / Argument: Sync GFDT
       *
       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
       *  than '0' or 0x2520[13] must be set."
       *
       * For now, we just assert that the caller does this.
       */
      assert(non_lri_post_sync_flags != 0);
   }

   if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
      /* Project: SNB, IVB, HSW / Argument: TLB inv
       *
       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
       *  must be set to something other than '0'."
       *
       * For now, we just assert that the caller does this.
       */
      assert(non_lri_post_sync_flags != 0);
   }

   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
      /* Project: IVB+ / Argument: TLB inv
       *
       *    "Requires stall bit ([20] of DW1) set."
       *
       * Also, from the PIPE_CONTROL instruction table:
       *
       *    "Project: SKL+
       *     Post Sync Operation or CS stall must be set to ensure a TLB
       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
       *     cache to invalidate."
       *
       * This is not a subset of the earlier rule, so there's nothing to do.
       */
      flags |= PIPE_CONTROL_CS_STALL;
   }
#if GFX_VER == 8
   if (IS_COMPUTE_PIPELINE(batch)) {
      if (post_sync_flags ||
          (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
                    PIPE_CONTROL_DEPTH_STALL |
                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                    PIPE_CONTROL_DATA_CACHE_FLUSH))) {
         /* Project: BDW / Arguments:
          *
          * - LRI Post Sync Operation   [23]
          * - Post Sync Op              [15:14]
          * - Notify En                 [8]
          * - Depth Stall               [13]
          * - Render Target Cache Flush [12]
          * - Depth Cache Flush         [0]
          * - DC Flush Enable           [5]
          *
          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
          *     Workloads."
          *
          * (The docs have separate table rows for each bit, with essentially
          * the same workaround text.  We've combined them here.)
          */
         flags |= PIPE_CONTROL_CS_STALL;

         /* Also, from the PIPE_CONTROL instruction table, bit 20:
          *
          *    "Project: BDW
          *     This bit must be always set when PIPE_CONTROL command is
          *     programmed by GPGPU and MEDIA workloads, except for the cases
          *     when only Read Only Cache Invalidation bits are set (State
          *     Cache Invalidation Enable, Instruction cache Invalidation
          *     Enable, Texture Cache Invalidation Enable, Constant Cache
          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
          *     need not implemented when FF_DOP_CG is disable via "Fixed
          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
          *
          * It sounds like we could avoid CS stalls in some cases, but we
          * don't currently bother.  This list isn't exactly the list above,
          * either...
          */
      }
   }
#endif
   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
    *
    * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
    *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
    *
    * Note that the kernel does CS stalls between batches, so we only need
    * to count them within a batch.  We currently naively count every 4, and
    * don't skip the ones with only read-cache-invalidate bits set.  This
    * may or may not be a problem...
    */
   if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         batch->pipe_controls_since_last_cs_stall = 0;
      }

      /* If this is the fourth pipe control without a CS stall, do one now. */
      if (++batch->pipe_controls_since_last_cs_stall == 4) {
         batch->pipe_controls_since_last_cs_stall = 0;
         flags |= PIPE_CONTROL_CS_STALL;
      }
   }

   /* "Stall" workarounds ----------------------------------------------
    * These have to come after the earlier ones because we may have added
    * some additional CS stalls above.
    */

   if (flags & PIPE_CONTROL_CS_STALL) {
      /* Project: PRE-SKL, VLV, CHV
       *
       * "[All Stepping][All SKUs]:
       *
       *  One of the following must also be set:
       *
       *  - Render Target Cache Flush Enable ([12] of DW1)
       *  - Depth Cache Flush Enable ([0] of DW1)
       *  - Stall at Pixel Scoreboard ([1] of DW1)
       *  - Depth Stall ([13] of DW1)
       *  - Post-Sync Operation ([13] of DW1)
       *  - DC Flush Enable ([5] of DW1)"
       *
       * If we don't already have one of those bits set, we choose to add
       * "Stall at Pixel Scoreboard".  Some of the other bits require a
       * CS stall as a workaround (see above), which would send us into
       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
       * appears to be safe, so we choose that.
       */
      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                               PIPE_CONTROL_WRITE_IMMEDIATE |
                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
                               PIPE_CONTROL_WRITE_TIMESTAMP |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_DATA_CACHE_FLUSH;
      if (!(flags & wa_bits))
         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
   }

   /* Emit --------------------------------------------------------------- */

   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
      /* Decode the final (workaround-augmented) flags for debug output. */
      fprintf(stderr,
              "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
              (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
              (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
              (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
              (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
              (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
              (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
              (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
              (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
              (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
              (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
              (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
              (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
              (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
              (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
              (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
              (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
                 "SnapRes" : "",
              (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
                 "ISPDis" : "",
              (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
              (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
              (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
              imm, reason);
   }

   /* Translate the accumulated flag bits into the packet fields for this
    * generation's PIPE_CONTROL layout.
    */
   crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 7
      pc.LRIPostSyncOperation = NoLRIOperation;
      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
#endif
#if GFX_VER >= 6
      pc.StoreDataIndex = 0;
      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
      pc.GlobalSnapshotCountReset =
         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
      pc.RenderTargetCacheFlushEnable =
         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
      pc.StateCacheInvalidationEnable =
         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
      pc.ConstantCacheInvalidationEnable =
         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
#else
      pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
#endif
      pc.PostSyncOperation = flags_to_post_sync_op(flags);
      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
      pc.InstructionCacheInvalidateEnable =
         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
#if GFX_VER >= 5 || GFX_VERx10 == 45
      pc.IndirectStatePointersDisable =
         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
#endif
#if GFX_VER >= 6
      pc.TextureCacheInvalidationEnable =
         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#elif GFX_VER == 5 || GFX_VERx10 == 45
      pc.TextureCacheFlushEnable =
         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#endif
      pc.Address = ggtt_bo(bo, offset);
      /* Pre-gen7 packets carry an explicit destination address type;
       * the post-sync write target here lives in the GGTT. */
      if (GFX_VER < 7 && bo)
         pc.DestinationAddressType = DAT_GGTT;
      pc.ImmediateData = imm;
   }
}
8912
8913 #if GFX_VER == 6
8914 void
8915 genX(crocus_upload_urb)(struct crocus_batch *batch,
8916 unsigned vs_size,
8917 bool gs_present,
8918 unsigned gs_size)
8919 {
8920 struct crocus_context *ice = batch->ice;
8921 int nr_vs_entries, nr_gs_entries;
8922 int total_urb_size = ice->urb.size * 1024; /* in bytes */
8923 const struct intel_device_info *devinfo = &batch->screen->devinfo;
8924
8925 /* Calculate how many entries fit in each stage's section of the URB */
8926 if (gs_present) {
8927 nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
8928 nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
8929 } else {
8930 nr_vs_entries = total_urb_size / (vs_size * 128);
8931 nr_gs_entries = 0;
8932 }
8933
8934 /* Then clamp to the maximum allowed by the hardware */
8935 if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
8936 nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
8937
8938 if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
8939 nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
8940
8941 /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
8942 ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
8943 ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
8944
8945 assert(ice->urb.nr_vs_entries >=
8946 devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
8947 assert(ice->urb.nr_vs_entries % 4 == 0);
8948 assert(ice->urb.nr_gs_entries % 4 == 0);
8949 assert(vs_size <= 5);
8950 assert(gs_size <= 5);
8951
8952 crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
8953 urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
8954 urb.VSURBEntryAllocationSize = vs_size - 1;
8955
8956 urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
8957 urb.GSURBEntryAllocationSize = gs_size - 1;
8958 };
8959 /* From the PRM Volume 2 part 1, section 1.4.7:
8960 *
8961 * Because of a urb corruption caused by allocating a previous gsunit’s
8962 * urb entry to vsunit software is required to send a "GS NULL
8963 * Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
8964 * a dummy DRAW call before any case where VS will be taking over GS URB
8965 * space.
8966 *
8967 * It is not clear exactly what this means ("URB fence" is a command that
8968 * doesn't exist on Gen6). So for now we just do a full pipeline flush as
8969 * a workaround.
8970 */
8971 if (ice->urb.gs_present && !gs_present)
8972 crocus_emit_mi_flush(batch);
8973 ice->urb.gs_present = gs_present;
8974 }
8975 #endif
8976
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
   /* vtbl hook invoked when batch state is lost (see lost_genx_state wiring
    * below).  crocus keeps no per-generation state that needs re-emission
    * here, so this is intentionally a no-op.
    */
}
8981
/**
 * Emit an MI_REPORT_PERF_COUNT, writing a performance counter snapshot
 * tagged with \p report_id to \p bo at \p offset_in_bytes.
 *
 * Only available on Gen7+; compiles to a no-op on earlier generations.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}
8995
8996 /**
8997 * From the PRM, Volume 2a:
8998 *
8999 * "Indirect State Pointers Disable
9000 *
9001 * At the completion of the post-sync operation associated with this pipe
9002 * control packet, the indirect state pointers in the hardware are
9003 * considered invalid; the indirect pointers are not saved in the context.
9004 * If any new indirect state commands are executed in the command stream
9005 * while the pipe control is pending, the new indirect state commands are
9006 * preserved.
9007 *
9008 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9009 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9010 * commands are only considered as Indirect State Pointers. Once ISP is
9011 * issued in a context, SW must initialize by programming push constant
9012 * commands for all the shaders (at least to zero length) before attempting
9013 * any rendering operation for the same context."
9014 *
9015 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9016 * even though they point to a BO that has been already unreferenced at
9017 * the end of the previous batch buffer. This has been fine so far since
9018 * we are protected by these scratch page (every address not covered by
9019 * a BO should be pointing to the scratch page). But on CNL, it is
9020 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9021 * instruction.
9022 *
9023 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9024 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9025 * context restore, so the mentioned hang doesn't happen. However,
9026 * software must program push constant commands for all stages prior to
9027 * rendering anything, so we flag them as dirty.
9028 *
9029 * Finally, we also make sure to stall at pixel scoreboard to make sure the
9030 * constants have been loaded into the EUs prior to disable the push constants
9031 * so that it doesn't hang a previous 3DPRIMITIVE.
9032 */
9033 #if GFX_VER >= 7
9034 static void
9035 gen7_emit_isp_disable(struct crocus_batch *batch)
9036 {
9037 crocus_emit_raw_pipe_control(batch, "isp disable",
9038 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9039 PIPE_CONTROL_CS_STALL,
9040 NULL, 0, 0);
9041 crocus_emit_raw_pipe_control(batch, "isp disable",
9042 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
9043 PIPE_CONTROL_CS_STALL,
9044 NULL, 0, 0);
9045
9046 struct crocus_context *ice = batch->ice;
9047 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
9048 CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
9049 CROCUS_STAGE_DIRTY_CONSTANTS_TES |
9050 CROCUS_STAGE_DIRTY_CONSTANTS_GS |
9051 CROCUS_STAGE_DIRTY_CONSTANTS_FS);
9052 }
9053 #endif
9054
9055 #if GFX_VER >= 7
/* Per-generation hook run just before a batch is submitted. */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   /* Haswell workaround: re-emit CC_STATE_POINTERS (with a flush before and
    * a render-target-flush + CS stall after) at the end of render batches.
    */
   if (batch->name == CROCUS_BATCH_RENDER) {
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   /* Invalidate indirect state pointers so stale push constants are not
    * replayed on context restore (see gen7_emit_isp_disable above).
    */
   gen7_emit_isp_disable(batch);
}
9072 #endif
9073
9074 static void
9075 crocus_batch_reset_dirty(struct crocus_batch *batch)
9076 {
9077 /* unreference any index buffer so it get reemitted. */
9078 pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9079
9080 /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9081 * as the old state batch won't still be available.
9082 */
9083 batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9084 CROCUS_DIRTY_COLOR_CALC_STATE;
9085
9086 batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9087
9088 batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9089 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9090 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9091 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9092 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9093 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9094 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9095
9096 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9097 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9098 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9099 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9100 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9101 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9102
9103 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9104 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9105 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9106 batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9107
9108 #if GFX_VER >= 6
9109 /* SCISSOR_STATE */
9110 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9111 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9112 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9113
9114 #endif
9115 #if GFX_VER <= 5
9116 /* dirty the SF state on gen4/5 */
9117 batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9118 batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9119 batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9120 batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9121 #endif
9122 #if GFX_VER >= 7
9123 /* Streamout dirty */
9124 batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9125 batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9126 batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9127 #endif
9128 }
9129
9130 #if GFX_VERx10 == 75
/* Return the gallium rasterizer state backing the currently-bound CSO
 * (Haswell only; used by callers needing raw pipe_rasterizer_state).
 */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
9135 #endif
9136
9137 #if GFX_VER >= 6
9138 static void update_so_strides(struct crocus_context *ice,
9139 uint16_t *strides)
9140 {
9141 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
9142 struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
9143 if (so)
9144 so->stride = strides[i] * sizeof(uint32_t);
9145 }
9146 }
9147 #endif
9148
9149 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9150 int s,
9151 uint32_t *clamp_mask)
9152 {
9153 #if GFX_VER < 8
9154 if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9155 samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9156 if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9157 clamp_mask[0] |= (1 << s);
9158 if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9159 clamp_mask[1] |= (1 << s);
9160 if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9161 clamp_mask[2] |= (1 << s);
9162 }
9163 #endif
9164 }
9165
9166 static void
9167 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9168 {
9169 struct crocus_context *ice = (struct crocus_context *) ctx;
9170
9171 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9172 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9173 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9174 }
9175
9176 if (ice->batch_count == 1)
9177 return;
9178
9179 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9180 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9181 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9182 }
9183 }
9184
/**
 * Populate the per-generation screen vtbl with this file's GENX-compiled
 * implementations.  Entries guarded by GFX_VER/GFX_VERx10 are only wired
 * up on generations that support the corresponding hardware feature
 * (e.g. compute on Gen7+, MI math/register ops on HSW+).
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   /* Sanity-check that this genxml-compiled file matches the device. */
   assert(screen->devinfo.verx10 == GFX_VERx10);
   assert(screen->devinfo.ver == GFX_VER);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9243
/**
 * Install the Gallium pipe_context hooks and initialize the context's
 * default state values (dirty bits, sample mask, viewports, scissors).
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream output (transform feedback) is only supported on Gen6+. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Start with everything dirty so the first draw emits all state. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* PIPE_PRIM_MAX marks "no primitive seen yet". */
   ice->state.prim_mode = PIPE_PRIM_MAX;
   ice->state.reduced_prim_mode = PIPE_PRIM_MAX;
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9313