1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file crocus_state.c
25 *
26 * ============================= GENXML CODE =============================
27 * [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state. Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times. This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures. However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn. So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs. Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times. Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic. Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible. Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO. At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs. In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time. Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below. First, the CSO hooks
68 * create/bind/track state. The second are the draw-time upload functions,
69 * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
72
73 #include <errno.h>
74 #include <stdio.h>
75
76 #if HAVE_VALGRIND
77 #include <memcheck.h>
78 #include <valgrind.h>
79 #define VG(x) x
80 #ifdef DEBUG
81 #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
82 #endif
83 #else
84 #define VG(x)
85 #endif
86
87 #include "drm-uapi/i915_drm.h"
88 #include "intel/common/intel_l3_config.h"
89 #include "intel/common/intel_sample_positions.h"
90 #include "intel/compiler/brw_compiler.h"
91 #include "pipe/p_context.h"
92 #include "pipe/p_defines.h"
93 #include "pipe/p_screen.h"
94 #include "pipe/p_state.h"
95 #include "util/format/u_format.h"
96 #include "util/half_float.h"
97 #include "util/u_dual_blend.h"
98 #include "util/u_framebuffer.h"
99 #include "util/u_helpers.h"
100 #include "util/u_inlines.h"
101 #include "util/u_memory.h"
102 #include "util/u_prim.h"
103 #include "util/u_transfer.h"
104 #include "util/u_upload_mgr.h"
105 #include "util/u_viewport.h"
106 #include "crocus_batch.h"
107 #include "crocus_context.h"
108 #include "crocus_defines.h"
109 #include "crocus_pipe.h"
110 #include "crocus_resource.h"
111
112 #include "crocus_genx_macros.h"
113 #include "intel/common/intel_guardband.h"
114
115 /**
116 * Statically assert that PIPE_* enums match the hardware packets.
117 * (As long as they match, we don't need to translate them.)
118 */
pipe_asserts()119 UNUSED static void pipe_asserts()
120 {
121 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
122
123 /* pipe_logicop happens to match the hardware. */
124 PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
125 PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
126 PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
127 PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
128 PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
129 PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
130 PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
131 PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
132 PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
133 PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
134 PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
135 PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
136 PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
137 PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
138 PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
139 PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
140
141 /* pipe_blend_func happens to match the hardware. */
142 PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
143 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
144 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
145 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
146 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
147 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
148 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
149 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
150 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
151 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
152 PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
153 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
154 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
155 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
156 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
157 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
158 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
159 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
160 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
161
162 /* pipe_blend_func happens to match the hardware. */
163 PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
164 PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
165 PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
166 PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
167 PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
168
169 /* pipe_stencil_op happens to match the hardware. */
170 PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
171 PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
172 PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
173 PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
174 PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
175 PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
176 PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
177 PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
178
179 #if GFX_VER >= 6
180 /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
181 PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
182 PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
183 #endif
184 #undef PIPE_ASSERT
185 }
186
187 static unsigned
translate_prim_type(enum pipe_prim_type prim,uint8_t verts_per_patch)188 translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
189 {
190 static const unsigned map[] = {
191 [PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,
192 [PIPE_PRIM_LINES] = _3DPRIM_LINELIST,
193 [PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
194 [PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
195 [PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
196 [PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
197 [PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
198 [PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,
199 [PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
200 [PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,
201 #if GFX_VER >= 6
202 [PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
203 [PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
204 [PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
205 [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
206 #endif
207 #if GFX_VER >= 7
208 [PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
209 #endif
210 };
211
212 return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
213 }
214
215 static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)216 translate_compare_func(enum pipe_compare_func pipe_func)
217 {
218 static const unsigned map[] = {
219 [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
220 [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
221 [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
222 [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
223 [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
224 [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
225 [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
226 [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
227 };
228 return map[pipe_func];
229 }
230
231 static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)232 translate_shadow_func(enum pipe_compare_func pipe_func)
233 {
234 /* Gallium specifies the result of shadow comparisons as:
235 *
236 * 1 if ref <op> texel,
237 * 0 otherwise.
238 *
239 * The hardware does:
240 *
241 * 0 if texel <op> ref,
242 * 1 otherwise.
243 *
244 * So we need to flip the operator and also negate.
245 */
246 static const unsigned map[] = {
247 [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
248 [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
249 [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
250 [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
251 [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
252 [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
253 [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
254 [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
255 };
256 return map[pipe_func];
257 }
258
259 static unsigned
translate_cull_mode(unsigned pipe_face)260 translate_cull_mode(unsigned pipe_face)
261 {
262 static const unsigned map[4] = {
263 [PIPE_FACE_NONE] = CULLMODE_NONE,
264 [PIPE_FACE_FRONT] = CULLMODE_FRONT,
265 [PIPE_FACE_BACK] = CULLMODE_BACK,
266 [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
267 };
268 return map[pipe_face];
269 }
270
271 #if GFX_VER >= 6
272 static unsigned
translate_fill_mode(unsigned pipe_polymode)273 translate_fill_mode(unsigned pipe_polymode)
274 {
275 static const unsigned map[4] = {
276 [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
277 [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
278 [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
279 [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
280 };
281 return map[pipe_polymode];
282 }
283 #endif
284
285 static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)286 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
287 {
288 static const unsigned map[] = {
289 [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
290 [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
291 [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
292 };
293 return map[pipe_mip];
294 }
295
296 static uint32_t
translate_wrap(unsigned pipe_wrap,bool either_nearest)297 translate_wrap(unsigned pipe_wrap, bool either_nearest)
298 {
299 static const unsigned map[] = {
300 [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
301 #if GFX_VER == 8
302 [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
303 #else
304 [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
305 #endif
306 [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
307 [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
308 [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
309 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
310
311 /* These are unsupported. */
312 [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
313 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
314 };
315 #if GFX_VER < 8
316 if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
317 return TCM_CLAMP;
318 #endif
319 return map[pipe_wrap];
320 }
321
322 /**
323 * Equiv if brw_state_batch
324 */
325 static uint32_t *
stream_state(struct crocus_batch * batch,unsigned size,unsigned alignment,uint32_t * out_offset)326 stream_state(struct crocus_batch *batch,
327 unsigned size,
328 unsigned alignment,
329 uint32_t *out_offset)
330 {
331 uint32_t offset = ALIGN(batch->state.used, alignment);
332
333 if (offset + size >= STATE_SZ && !batch->no_wrap) {
334 crocus_batch_flush(batch);
335 offset = ALIGN(batch->state.used, alignment);
336 } else if (offset + size >= batch->state.bo->size) {
337 const unsigned new_size =
338 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
339 MAX_STATE_SIZE);
340 crocus_grow_buffer(batch, true, batch->state.used, new_size);
341 assert(offset + size < batch->state.bo->size);
342 }
343
344 crocus_record_state_size(batch->state_sizes, offset, size);
345
346 batch->state.used = offset + size;
347 *out_offset = offset;
348
349 return (uint32_t *)batch->state.map + (offset >> 2);
350 }
351
352 /**
353 * stream_state() + memcpy.
354 */
355 static uint32_t
emit_state(struct crocus_batch * batch,const void * data,unsigned size,unsigned alignment)356 emit_state(struct crocus_batch *batch, const void *data, unsigned size,
357 unsigned alignment)
358 {
359 unsigned offset = 0;
360 uint32_t *map = stream_state(batch, size, alignment, &offset);
361
362 if (map)
363 memcpy(map, data, size);
364
365 return offset;
366 }
367
368 #if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS, pointing the fixed-function units
 * (VS/GS/CLIP/SF/WM/CC) at their unit state, previously streamed into the
 * batch's state buffer at the given offsets.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}
392
393 #endif
394 /**
395 * Did field 'x' change between 'old_cso' and 'new_cso'?
396 *
397 * (If so, we may want to set some dirty flags.)
398 */
399 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
400 #define cso_changed_memcmp(x) \
401 (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
402
/**
 * Emit the end-of-pipe sync required before changing STATE_BASE_ADDRESS.
 */
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
437
/**
 * Emit the cache invalidations required after changing STATE_BASE_ADDRESS.
 */
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
487
488 #if GFX_VER >= 6
/**
 * Emit MI_STORE_REGISTER_MEM to write a 32-bit MMIO register to a buffer.
 *
 * \param predicated  only supported on Haswell (GFX_VERx10 >= 75); asking
 *                    for predication on older parts is a programming error.
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
505
/**
 * Store a 64-bit MMIO register pair to a buffer as two consecutive
 * 32-bit MI_STORE_REGISTER_MEM commands.
 */
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   for (uint32_t half = 0; half < 8; half += 4) {
      crocus_store_register_mem32(batch, reg + half,
                                  bo, offset + half, predicated);
   }
}
514 #endif
515
516 #if GFX_VER >= 7
/* Emit MI_LOAD_REGISTER_IMM to write an immediate into an MMIO register. */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = val;
   }
}
525 #define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
526
527 #if GFX_VERx10 >= 75
/* Emit MI_LOAD_REGISTER_REG to copy one MMIO register into another. */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
536
/* Copy a 32-bit MMIO register to another register. */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
543
/* Copy a 64-bit MMIO register pair to another pair, one dword at a time. */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   for (uint32_t half = 0; half < 8; half += 4)
      _crocus_emit_lrr(batch, dst + half, src + half);
}
551 #endif
552
/* Load a 32-bit immediate into an MMIO register. */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
559
/* Load a 64-bit immediate into an MMIO register pair (low dword first). */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   const uint32_t lo = (uint32_t)(val & 0xffffffff);
   const uint32_t hi = (uint32_t)(val >> 32);

   _crocus_emit_lri(batch, reg, lo);
   _crocus_emit_lri(batch, reg + 4, hi);
}
567
568 /**
569 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
570 */
571 static void
crocus_load_register_mem32(struct crocus_batch * batch,uint32_t reg,struct crocus_bo * bo,uint32_t offset)572 crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
573 struct crocus_bo *bo, uint32_t offset)
574 {
575 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
576 lrm.RegisterAddress = reg;
577 lrm.MemoryAddress = ro_bo(bo, offset);
578 }
579 }
580
581 /**
582 * Load a 64-bit value from a buffer into a MMIO register via
583 * two MI_LOAD_REGISTER_MEM commands.
584 */
585 static void
crocus_load_register_mem64(struct crocus_batch * batch,uint32_t reg,struct crocus_bo * bo,uint32_t offset)586 crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
587 struct crocus_bo *bo, uint32_t offset)
588 {
589 crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
590 crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
591 }
592
593 #if GFX_VERx10 >= 75
/* Emit MI_STORE_DATA_IMM to write a 32-bit immediate into a buffer. */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
      /* NOTE(review): this inner GFX_VER >= 6 check is always true here
       * since the enclosing block requires GFX_VERx10 >= 75.
       */
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
606
/* Emit MI_STORE_DATA_IMM to write a 64-bit immediate into a buffer. */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      /* DWordLength is encoded as total dwords minus 2. */
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
624 #endif
625
/**
 * Copy \p bytes (a multiple of 4) between two buffers on the GPU by
 * bouncing each dword through a scratch MMIO register, since there is no
 * direct mem-to-mem command on these generations.
 */
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   /* Only dword-aligned, dword-sized copies are supported. */
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
644 #endif
645
646 /**
647 * Gallium CSO for rasterizer state.
648 */
649 struct crocus_rasterizer_state {
650 struct pipe_rasterizer_state cso;
651 #if GFX_VER >= 6
652 uint32_t sf[GENX(3DSTATE_SF_length)];
653 uint32_t clip[GENX(3DSTATE_CLIP_length)];
654 #endif
655 #if GFX_VER >= 8
656 uint32_t raster[GENX(3DSTATE_RASTER_length)];
657 #endif
658 uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
659
660 uint8_t num_clip_plane_consts;
661 bool fill_mode_point_or_line;
662 };
663
664 #if GFX_VER <= 5
/* Indices into the URB limits table below, one per fixed-function unit. */
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

/* Per-unit URB allocation limits.  Entry sizes are in URB allocation
 * units (presumably 512-bit rows — see the gen4/5 PRM CS_URB_STATE;
 * TODO confirm).
 */
static const struct {
   uint32_t min_nr_entries;       /* fewest entries the unit can run with */
   uint32_t preferred_nr_entries; /* entries to aim for when space allows */
   uint32_t min_entry_size;
   uint32_t max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8, 1, 5 },                          /* gs */
   { 5, 10, 1, 5 },                         /* clp */
   { 1, 8, 1, 12 },                        /* sf */
   { 1, 4, 1, 32 }                        /* cs */
};
683
check_urb_layout(struct crocus_context * ice)684 static bool check_urb_layout(struct crocus_context *ice)
685 {
686 ice->urb.vs_start = 0;
687 ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
688 ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
689 ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
690 ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
691
692 return ice->urb.cs_start + ice->urb.nr_cs_entries *
693 ice->urb.csize <= ice->urb.size;
694 }
695
696
/**
 * Recalculate the URB partitioning if the requested entry sizes no longer
 * fit the current layout.
 *
 * \param csize   constant (CURBE) entry size
 * \param vsize   vertex entry size
 * \param sfsize  setup/SF entry size
 *
 * Returns true if the layout changed (and a new URB_FENCE needs to be
 * emitted), false if the existing layout is still adequate.
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   struct crocus_context *ice = batch->ice;

   /* Clamp the requested sizes up to the hardware minimums. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recompute when any entry needs to grow, or — in constrained mode —
    * when an entry shrank and we might escape back to full entry counts.
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start optimistic: preferred entry counts for every unit. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      /* Ironlake and G4x have more URB space: try larger VS/SF counts
       * first, falling back to the preferred values if they don't fit.
       */
      if (devinfo->ver == 5) {
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (devinfo->is_g4x) {
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      /* If even the preferred counts don't fit, drop to the minimums. */
      if (!check_urb_layout(ice)) {
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
792
/**
 * Emit URB_FENCE, programming the boundaries between the URB sections
 * computed by crocus_calculate_urb_fence()/check_urb_layout().
 */
static void
crocus_upload_urb_fence(struct crocus_batch *batch)
{
   uint32_t urb_fence[3];
   _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
      urb.VSUnitURBReallocationRequest = 1;
      urb.GSUnitURBReallocationRequest = 1;
      urb.CLIPUnitURBReallocationRequest = 1;
      urb.SFUnitURBReallocationRequest = 1;
      urb.VFEUnitURBReallocationRequest = 1;
      urb.CSUnitURBReallocationRequest = 1;

      /* Each fence is the END of that unit's region, i.e. the start of
       * the next one.
       */
      urb.VSFence = batch->ice->urb.gs_start;
      urb.GSFence = batch->ice->urb.clip_start;
      urb.CLIPFence = batch->ice->urb.sf_start;
      urb.SFFence = batch->ice->urb.cs_start;
      urb.CSFence = batch->ice->urb.size;
   }

   /* erratum: URB_FENCE must not cross a 64byte cacheline.  Pad with
    * zero dwords (MI_NOOP) so the 3-dword packet fits in the current
    * 16-byte chunk.
    */
   if ((crocus_batch_bytes_used(batch) & 15) > 12) {
      int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
      do {
         *(uint32_t *)batch->command.map_next = 0;
         batch->command.map_next += sizeof(uint32_t);
      } while (--pad);
   }

   crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
}
823
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   /* Compute the layout of the gen4/5 CURBE (constant URB entry) space
    * shared by the WM, clip, and VS units, in 512-bit units (16 floats).
    * Returns true if the layout changed, in which case the caller must
    * re-upload the constant buffer; false if the cached layout still fits.
    */
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   /* Fragment shader: sum the push-constant UBO ranges. */
   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   /* Clipper: 6 fixed planes plus one per enabled user clip plane,
    * 4 floats each, rounded up to whole 512-bit units.
    */
   if (ice->state.cso_rast->cso.clip_plane_enable) {
      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   /* Vertex shader: sum the push-constant UBO ranges. */
   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * brw_vec4.cpp we're loading up to 32 registers of push constants. An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize: only recompute the layout when a section grew, the clip
    * section changed size, or the total shrank to under a quarter of the
    * current (and the current allocation is big enough to bother).
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout: WM first, then clip, then VS. */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      /* Debug dump of the new layout (disabled). */
      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}
909
910 static void
upload_shader_consts(struct crocus_context * ice,gl_shader_stage stage,uint32_t * map,unsigned start)911 upload_shader_consts(struct crocus_context *ice,
912 gl_shader_stage stage,
913 uint32_t *map,
914 unsigned start)
915 {
916 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
917 struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
918 uint32_t *cmap;
919 bool found = false;
920 unsigned offset = start * 16;
921 int total = 0;
922 for (int i = 0; i < 4; i++) {
923 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
924
925 if (range->length == 0)
926 continue;
927
928 unsigned block_index = crocus_bti_to_group_index(
929 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
930 unsigned len = range->length * 8 * sizeof(float);
931 unsigned start = range->start * 8 * sizeof(float);
932 struct pipe_transfer *transfer;
933
934 cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
935 ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
936 PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
937 if (cmap)
938 memcpy(&map[offset + (total * 8)], cmap, len);
939 pipe_buffer_unmap(&ice->ctx, transfer);
940 total += range->length;
941 found = true;
942 }
943
944 if (stage == MESA_SHADER_VERTEX && !found) {
945 /* The pre-gen6 VS requires that some push constants get loaded no
946 * matter what, or the GPU would hang.
947 */
948 unsigned len = 16;
949 memset(&map[offset], 0, len);
950 }
951 }
952
/* Plane equations for the six fixed clip-volume faces, uploaded ahead of
 * any user clip planes in the clipper's CURBE section (see
 * gen4_upload_curbe).  Each row is (A, B, C, D) of a clip-space plane.
 */
static const float fixed_plane[6][4] = {
   { 0,    0,   -1, 1 },
   { 0,    0,    1, 1 },
   { 0,   -1,    0, 1 },
   { 0,    1,    0, 1 },
   {-1,    0,    0, 1 },
   { 1,    0,    0, 1 }
};
961
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   /* Upload the combined CURBE buffer (FS, clipper, and VS push constants)
    * according to the layout computed by calculate_curbe_offsets(), then
    * point the hardware at it with a CONSTANT_BUFFER packet.
    */
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;  /* in 512-bit units */
   const unsigned buf_sz = sz * 16 * sizeof(float);

   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Append the enabled user clip planes after the six fixed planes. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   /* Debug dump of the uploaded constants (disabled).
    * NOTE(review): the "i&4" field looks odd (i%8 or i&7?) — harmless since
    * this path is compiled out, but confirm before enabling.
    */
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

 emit:
   /* Emit the packet even when sz == 0 (cb is all-zero unless a buffer is
    * resident in ice->curbe.curbe_res).
    */
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug. The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set. We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1054 #endif
1055
1056 #if GFX_VER >= 7
1057
/* Per-platform default SQ priority-credit initialization values for
 * L3SQCREG1.  NOTE(review): the register packing below uses the genxml
 * SQGPCI_DEFAULT / BYT_SQGPCI_DEFAULT / SQHPCI_DEFAULT values instead;
 * these three constants appear unused in this chunk — confirm before
 * removing.
 */
#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
1061
1062 static void
setup_l3_config(struct crocus_batch * batch,const struct intel_l3_config * cfg)1063 setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
1064 {
1065 #if GFX_VER == 7
1066 const struct intel_device_info *devinfo = &batch->screen->devinfo;
1067 const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
1068 const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
1069 cfg->n[INTEL_L3P_ALL];
1070 const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
1071 cfg->n[INTEL_L3P_ALL];
1072 const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
1073 cfg->n[INTEL_L3P_ALL];
1074 const bool has_slm = cfg->n[INTEL_L3P_SLM];
1075 #endif
1076
1077 /* According to the hardware docs, the L3 partitioning can only be changed
1078 * while the pipeline is completely drained and the caches are flushed,
1079 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1080 */
1081 crocus_emit_pipe_control_flush(batch, "l3_config",
1082 PIPE_CONTROL_DATA_CACHE_FLUSH |
1083 PIPE_CONTROL_CS_STALL);
1084
1085 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1086 * invalidation of the relevant caches. Note that because RO invalidation
1087 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1088 * command is processed by the CS) we cannot combine it with the previous
1089 * stalling flush as the hardware documentation suggests, because that
1090 * would cause the CS to stall on previous rendering *after* RO
1091 * invalidation and wouldn't prevent the RO caches from being polluted by
1092 * concurrent rendering before the stall completes. This intentionally
1093 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1094 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1095 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1096 * already guarantee that there is no concurrent GPGPU kernel execution
1097 * (see SKL HSD 2132585).
1098 */
1099 crocus_emit_pipe_control_flush(batch, "l3 config",
1100 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1101 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1102 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1103 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1104
1105 /* Now send a third stalling flush to make sure that invalidation is
1106 * complete when the L3 configuration registers are modified.
1107 */
1108 crocus_emit_pipe_control_flush(batch, "l3 config",
1109 PIPE_CONTROL_DATA_CACHE_FLUSH |
1110 PIPE_CONTROL_CS_STALL);
1111
1112 #if GFX_VER == 8
1113 assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
1114 crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
1115 reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
1116 reg.URBAllocation = cfg->n[INTEL_L3P_URB];
1117 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1118 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1119 reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
1120 }
1121 #else
1122 assert(!cfg->n[INTEL_L3P_ALL]);
1123
1124 /* When enabled SLM only uses a portion of the L3 on half of the banks,
1125 * the matching space on the remaining banks has to be allocated to a
1126 * client (URB for all validated configurations) set to the
1127 * lower-bandwidth 2-bank address hashing mode.
1128 */
1129 const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
1130 assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
1131
1132 /* Minimum number of ways that can be allocated to the URB. */
1133 const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
1134 assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
1135
1136 uint32_t l3sqcr1, l3cr2, l3cr3;
1137
1138 crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
1139 reg.ConvertDC_UC = !has_dc;
1140 reg.ConvertIS_UC = !has_is;
1141 reg.ConvertC_UC = !has_c;
1142 reg.ConvertT_UC = !has_t;
1143 #if GFX_VERx10 == 75
1144 reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
1145 #else
1146 reg.L3SQGeneralPriorityCreditInitialization =
1147 devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
1148 #endif
1149 reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
1150 };
1151
1152 crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
1153 reg.SLMEnable = has_slm;
1154 reg.URBLowBandwidth = urb_low_bw;
1155 reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
1156 #if !(GFX_VERx10 == 75)
1157 reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
1158 #endif
1159 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1160 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1161 };
1162
1163 crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
1164 reg.ISAllocation = cfg->n[INTEL_L3P_IS];
1165 reg.ISLowBandwidth = 0;
1166 reg.CAllocation = cfg->n[INTEL_L3P_C];
1167 reg.CLowBandwidth = 0;
1168 reg.TAllocation = cfg->n[INTEL_L3P_T];
1169 reg.TLowBandwidth = 0;
1170 };
1171
1172 /* Set up the L3 partitioning. */
1173 crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
1174 crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
1175 crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
1176
1177 #if GFX_VERSIONx10 == 75
1178 /* TODO: Fail screen creation if command parser version < 4 */
1179 uint32_t scratch1, chicken3;
1180 crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
1181 reg.L3AtomicDisable = !has_dc;
1182 }
1183 crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
1184 reg.L3AtomicDisableMask = true;
1185 reg.L3AtomicDisable = !has_dc;
1186 }
1187 crocus_emit_lri(batch, SCRATCH1, scratch1);
1188 crocus_emit_lri(batch, CHICKEN3, chicken3);
1189 #endif
1190 #endif
1191 }
1192
1193 static void
emit_l3_state(struct crocus_batch * batch,bool compute)1194 emit_l3_state(struct crocus_batch *batch, bool compute)
1195 {
1196 const struct intel_l3_config *const cfg =
1197 compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1198
1199 setup_l3_config(batch, cfg);
1200 if (INTEL_DEBUG(DEBUG_L3)) {
1201 intel_dump_l3_config(cfg, stderr);
1202 }
1203 }
1204
1205 /**
1206 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
1207 */
1208 static void
gen7_emit_cs_stall_flush(struct crocus_batch * batch)1209 gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1210 {
1211 crocus_emit_pipe_control_write(batch,
1212 "workaround",
1213 PIPE_CONTROL_CS_STALL
1214 | PIPE_CONTROL_WRITE_IMMEDIATE,
1215 batch->ice->workaround_bo,
1216 batch->ice->workaround_offset, 0);
1217 }
1218 #endif
1219
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
   /* Switch between the 3D and GPGPU pipelines, emitting the
    * generation-specific flush/workaround sequence required around
    * PIPELINE_SELECT.  \p pipeline is _3D or GPGPU.
    */
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    * Software must clear the COLOR_CALC_STATE Valid field in
    * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    * with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    * "Project: DEVSNB+
    *
    * Software must ensure all the write caches are flushed through a
    * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    * command to invalidate read only caches prior to programming
    * MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    * Project: PRE-DEVSNB
    *
    * Software must ensure the current pipeline is flushed via an
    * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   /* Gen7 (non-Haswell): after switching back to 3D, stall and emit a
    * dummy point-list 3DPRIMITIVE to drain the pipeline.
    */
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1289
1290 /**
1291 * The following diagram shows how we partition the URB:
1292 *
1293 * 16kB or 32kB Rest of the URB space
1294 * __________-__________ _________________-_________________
1295 * / \ / \
1296 * +-------------------------------------------------------------+
1297 * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
1298 * | Constants | Entries |
1299 * +-------------------------------------------------------------+
1300 *
1301 * Notably, push constants must be stored at the beginning of the URB
1302 * space, while entries can be stored anywhere. Ivybridge and Haswell
1303 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1304 * doubles this (32kB).
1305 *
1306 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1307 * sized) in increments of 1kB. Haswell GT3 requires them to be located and
1308 * sized in increments of 2kB.
1309 *
1310 * Currently we split the constant buffer space evenly among whatever stages
1311 * are active. This is probably not ideal, but simple.
1312 *
1313 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1314 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1315 * Haswell GT3 has 512kB of URB space.
1316 *
1317 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1318 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1319 */
1320 #if GFX_VER >= 7
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
   /* Statically partition the push-constant space among the five render
    * stages (VS..FS) via 3DSTATE_PUSH_CONSTANT_ALLOC_xS; see the URB
    * partitioning diagram above.
    */
   const unsigned push_constant_kb =
      batch->screen->devinfo.max_constant_urb_size_kb;
   unsigned size_per_stage = push_constant_kb / 5;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    * see if that improves performance by offering more space to
    * the VS/FS when those aren't in use. Also, try dynamically
    * enabling/disabling it like i965 does. This would be more
    * stalls and may not actually help; we don't know yet.
    */
   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         /* Reuse the ALLOC_VS packet template; subopcode 18 + i selects
          * the per-stage variant (VS/HS/DS/GS/PS in stage order).
          */
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = size_per_stage * i;
         /* The last stage (FS) absorbs any remainder from the division. */
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
      }
   }

   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
    *
    * A PIPE_CONTROL command with the CS Stall bit set must be programmed
    * in the ring after this instruction.
    *
    * No such restriction exists for Haswell or Baytrail.
    */
   if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
      gen7_emit_cs_stall_flush(batch);
}
1355 #endif
1356
/**
 * Upload the initial GPU state for a render context.
 *
 * This sets some invariant state that needs to be programmed a particular
 * way, but we never actually change.
 */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   /* Zeroed STATE_SIP: no system instruction pointer. */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   /* Treat CONSTANT_BUFFER addresses as absolute, not offset-relative. */
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
1410
1411 #if GFX_VER >= 7
1412 static void
crocus_init_compute_context(struct crocus_batch * batch)1413 crocus_init_compute_context(struct crocus_batch *batch)
1414 {
1415 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1416
1417 emit_pipeline_select(batch, GPGPU);
1418
1419 #if GFX_VER >= 7
1420 emit_l3_state(batch, true);
1421 #endif
1422 }
1423 #endif
1424
/**
 * Generation-specific context state (ice->state.genx->...).
 *
 * Most state can go in crocus_context directly, but these encode hardware
 * packets which vary by generation.
 */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      /* Per-image parameters pushed for shader image loads/stores. */
      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   /* Whether the gen8 PMA depth-stall fix is currently enabled. */
   bool pma_fix_enabled;
#endif
};
1442
1443 /**
1444 * The pipe->set_blend_color() driver hook.
1445 *
1446 * This corresponds to our COLOR_CALC_STATE.
1447 */
1448 static void
crocus_set_blend_color(struct pipe_context * ctx,const struct pipe_blend_color * state)1449 crocus_set_blend_color(struct pipe_context *ctx,
1450 const struct pipe_blend_color *state)
1451 {
1452 struct crocus_context *ice = (struct crocus_context *) ctx;
1453
1454 /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1455 memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1456 #if GFX_VER <= 5
1457 ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1458 #else
1459 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1460 #endif
1461 }
1462
/**
 * Gallium CSO for blend state (see pipe_blend_state).
 */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND (draw-time fields are filled in later) */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** copy of BLEND_STATE */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending? */
   bool dual_color_blending;
};
1484
1485 static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f,bool alpha_to_one)1486 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1487 {
1488 if (alpha_to_one) {
1489 if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1490 return PIPE_BLENDFACTOR_ONE;
1491
1492 if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1493 return PIPE_BLENDFACTOR_ZERO;
1494 }
1495
1496 return f;
1497 }
1498
1499 #if GFX_VER >= 6
1500 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
1501 #else
1502 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
1503 #endif
1504
1505 static bool
1506 can_emit_logic_op(struct crocus_context *ice)
1507 {
1508 /* all pre gen8 have logicop restricted to unorm */
1509 enum pipe_format pformat = PIPE_FORMAT_NONE;
1510 for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1511 if (ice->state.framebuffer.cbufs[i]) {
1512 pformat = ice->state.framebuffer.cbufs[i]->format;
1513 break;
1514 }
1515 }
1516 return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1517 }
1518
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   /* Fill one blend entry (BLEND_STATE_ENTRY on gen6+, COLOR_CALC_STATE on
    * gen4/5) for render target \p idx.  Returns whether the RGB and alpha
    * halves use different functions/factors (independent alpha blend).
    */
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* Without independent blend, every RT uses the RT[0] settings. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   /* Apply the alpha-to-one factor rewrite (see fix_blendfactor). */
   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-gen8 can only do logic ops on UNORM targets; otherwise the
       * logic op is silently dropped (see can_emit_logic_op).
       */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* RT[0]: don't enable blending if the CSO wants dual-source
          * blending but the bound FS doesn't actually do dual-source.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction = rt->rgb_func;
      entry->AlphaBlendFunction = rt->alpha_func;
      /* The casts prevent warnings about implicit enum type conversions. */
      entry->SourceBlendFactor = (int) src_rgb;
      entry->SourceAlphaBlendFactor = (int) src_alpha;
      entry->DestinationBlendFactor = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle have ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1582
1583 /**
1584 * The pipe->create_blend_state() driver hook.
1585 *
1586 * Translates a pipe_blend_state into crocus_blend_state.
1587 */
1588 static void *
crocus_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)1589 crocus_create_blend_state(struct pipe_context *ctx,
1590 const struct pipe_blend_state *state)
1591 {
1592 struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1593
1594 cso->blend_enables = 0;
1595 cso->color_write_enables = 0;
1596 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
1597
1598 cso->cso = *state;
1599 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1600
1601 #if GFX_VER == 8
1602 bool indep_alpha_blend = false;
1603 #endif
1604 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
1605 const struct pipe_rt_blend_state *rt =
1606 &state->rt[state->independent_blend_enable ? i : 0];
1607 if (rt->blend_enable)
1608 cso->blend_enables |= 1u << i;
1609 if (rt->colormask)
1610 cso->color_write_enables |= 1u << i;
1611 #if GFX_VER == 8
1612 enum pipe_blendfactor src_rgb =
1613 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1614 enum pipe_blendfactor src_alpha =
1615 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1616 enum pipe_blendfactor dst_rgb =
1617 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1618 enum pipe_blendfactor dst_alpha =
1619 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1620
1621 if (rt->rgb_func != rt->alpha_func ||
1622 src_rgb != src_alpha || dst_rgb != dst_alpha)
1623 indep_alpha_blend = true;
1624 #endif
1625 }
1626
1627 #if GFX_VER == 8
1628 crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1629 /* pb.HasWriteableRT is filled in at draw time.
1630 * pb.AlphaTestEnable is filled in at draw time.
1631 *
1632 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1633 * setting it when dual color blending without an appropriate shader.
1634 */
1635
1636 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1637 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1638
1639 /* The casts prevent warnings about implicit enum type conversions. */
1640 pb.SourceBlendFactor =
1641 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1642 pb.SourceAlphaBlendFactor =
1643 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1644 pb.DestinationBlendFactor =
1645 (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1646 pb.DestinationAlphaBlendFactor =
1647 (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1648 }
1649 #endif
1650 return cso;
1651 }
1652
/**
 * The pipe->bind_blend_state() driver hook.
 *
 * Bind a blending CSO and flag related dirty bits.
 */
static void
crocus_bind_blend_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_blend_state *cso = state;

   /* state may be NULL (unbind); treat that as "no RTs blend". */
   ice->state.cso_blend = cso;
   ice->state.blend_enables = cso ? cso->blend_enables : 0;

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER >= 7
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif
   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   /* Re-run any shader-variant selection that depends on blend state. */
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
}
1683
1684 /**
1685 * Return true if the FS writes to any color outputs which are not disabled
1686 * via color masking.
1687 */
1688 static bool
has_writeable_rt(const struct crocus_blend_state * cso_blend,const struct shader_info * fs_info)1689 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1690 const struct shader_info *fs_info)
1691 {
1692 if (!fs_info)
1693 return false;
1694
1695 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1696
1697 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1698 rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
1699
1700 return cso_blend->color_write_enables & rt_outputs;
1701 }
1702
/**
 * Gallium CSO for depth, stencil, and alpha testing state.
 */
struct crocus_depth_stencil_alpha_state {
   /** copy of the pipe depth/stencil/alpha state */
   struct pipe_depth_stencil_alpha_state cso;

   /** Whether depth writes are enabled (depth_writemask). */
   bool depth_writes_enabled;
   /** Whether either face's stencil writemask is non-zero. */
   bool stencil_writes_enabled;
};
1712
1713 /**
1714 * The pipe->create_depth_stencil_alpha_state() driver hook.
1715 *
1716 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1717 * testing state since we need pieces of it in a variety of places.
1718 */
1719 static void *
crocus_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1720 crocus_create_zsa_state(struct pipe_context *ctx,
1721 const struct pipe_depth_stencil_alpha_state *state)
1722 {
1723 struct crocus_depth_stencil_alpha_state *cso =
1724 malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1725
1726 bool two_sided_stencil = state->stencil[1].enabled;
1727 cso->cso = *state;
1728
1729 cso->depth_writes_enabled = state->depth_writemask;
1730 cso->stencil_writes_enabled =
1731 state->stencil[0].writemask != 0 ||
1732 (two_sided_stencil && state->stencil[1].writemask != 0);
1733
1734 /* The state tracker needs to optimize away EQUAL writes for us. */
1735 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1736
1737 return cso;
1738 }
1739
1740 /**
1741 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1742 *
1743 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1744 */
1745 static void
crocus_bind_zsa_state(struct pipe_context * ctx,void * state)1746 crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
1747 {
1748 struct crocus_context *ice = (struct crocus_context *) ctx;
1749 struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1750 struct crocus_depth_stencil_alpha_state *new_cso = state;
1751
1752 if (new_cso) {
1753 if (cso_changed(cso.alpha_ref_value))
1754 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1755
1756 if (cso_changed(cso.alpha_enabled))
1757 ice->state.dirty |= CROCUS_DIRTY_WM;
1758 #if GFX_VER >= 6
1759 if (cso_changed(cso.alpha_enabled))
1760 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1761
1762 if (cso_changed(cso.alpha_func))
1763 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1764 #endif
1765 #if GFX_VER == 8
1766 if (cso_changed(cso.alpha_enabled))
1767 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1768 #endif
1769
1770 if (cso_changed(depth_writes_enabled))
1771 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1772
1773 ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1774 ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1775
1776 #if GFX_VER <= 5
1777 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1778 #endif
1779 }
1780
1781 ice->state.cso_zsa = new_cso;
1782 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
1783 #if GFX_VER >= 6
1784 ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
1785 #endif
1786 #if GFX_VER == 8
1787 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1788 #endif
1789 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
1790 }
1791
#if GFX_VER == 8
/**
 * Decide whether the Gfx8 "PMA fix" (pixel mask array stall avoidance)
 * should be enabled for the current state, per the hardware equations
 * documented in the long comment below.
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    *  - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    *  - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
      cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   /* Final Z_PMA_OPT term; stencil writes only count when a stencil
    * resource (sres) is actually present.
    */
   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1915 void
genX(crocus_update_pma_fix)1916 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1917 struct crocus_batch *batch,
1918 bool enable)
1919 {
1920 #if GFX_VER == 8
1921 struct crocus_genx_state *genx = ice->state.genx;
1922
1923 if (genx->pma_fix_enabled == enable)
1924 return;
1925
1926 genx->pma_fix_enabled = enable;
1927
1928 /* According to the Broadwell PIPE_CONTROL documentation, software should
1929 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1930 * prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary.
1931 *
1932 * The Gfx9 docs say to use a depth stall rather than a command streamer
1933 * stall. However, the hardware seems to violently disagree. A full
1934 * command streamer stall seems to be needed in both cases.
1935 */
1936 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1937 PIPE_CONTROL_CS_STALL |
1938 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1939 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1940
1941 crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1942 reg.NPPMAFixEnable = enable;
1943 reg.NPEarlyZFailsDisable = enable;
1944 reg.NPPMAFixEnableMask = true;
1945 reg.NPEarlyZFailsDisableMask = true;
1946 }
1947
1948 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1949 * Flush bits is often necessary. We do it regardless because it's easier.
1950 * The render cache flush is also necessary if stencil writes are enabled.
1951 *
1952 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1953 * flushes seem to work just as well.
1954 */
1955 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1956 PIPE_CONTROL_DEPTH_STALL |
1957 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1958 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1959 #endif
1960 }
1961
1962 static float
get_line_width(const struct pipe_rasterizer_state * state)1963 get_line_width(const struct pipe_rasterizer_state *state)
1964 {
1965 float line_width = state->line_width;
1966
1967 /* From the OpenGL 4.4 spec:
1968 *
1969 * "The actual width of non-antialiased lines is determined by rounding
1970 * the supplied width to the nearest integer, then clamping it to the
1971 * implementation-dependent maximum non-antialiased line width."
1972 */
1973 if (!state->multisample && !state->line_smooth)
1974 line_width = roundf(state->line_width);
1975
1976 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1977 /* For 1 pixel line thickness or less, the general anti-aliasing
1978 * algorithm gives up, and a garbage line is generated. Setting a
1979 * Line Width of 0.0 specifies the rasterization of the "thinnest"
1980 * (one-pixel-wide), non-antialiased lines.
1981 *
1982 * Lines rendered with zero Line Width are rasterized using the
1983 * "Grid Intersection Quantization" rules as specified by the
1984 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1985 */
1986 line_width = 0.0f;
1987 }
1988
1989 return line_width;
1990 }
1991
1992 /**
1993 * The pipe->create_rasterizer_state() driver hook.
1994 */
1995 static void *
crocus_create_rasterizer_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * state)1996 crocus_create_rasterizer_state(struct pipe_context *ctx,
1997 const struct pipe_rasterizer_state *state)
1998 {
1999 struct crocus_rasterizer_state *cso =
2000 malloc(sizeof(struct crocus_rasterizer_state));
2001
2002 cso->fill_mode_point_or_line =
2003 state->fill_front == PIPE_POLYGON_MODE_LINE ||
2004 state->fill_front == PIPE_POLYGON_MODE_POINT ||
2005 state->fill_back == PIPE_POLYGON_MODE_LINE ||
2006 state->fill_back == PIPE_POLYGON_MODE_POINT;
2007
2008 if (state->clip_plane_enable != 0)
2009 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2010 else
2011 cso->num_clip_plane_consts = 0;
2012
2013 cso->cso = *state;
2014
2015 #if GFX_VER >= 6
2016 float line_width = get_line_width(state);
2017
2018 crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2019 sf.StatisticsEnable = true;
2020 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2021 sf.LineEndCapAntialiasingRegionWidth =
2022 state->line_smooth ? _10pixels : _05pixels;
2023 sf.LastPixelEnable = state->line_last_pixel;
2024 #if GFX_VER <= 7
2025 sf.AntialiasingEnable = state->line_smooth;
2026 #endif
2027 #if GFX_VER == 8
2028 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2029 if (screen->devinfo.is_cherryview)
2030 sf.CHVLineWidth = line_width;
2031 else
2032 sf.LineWidth = line_width;
2033 #else
2034 sf.LineWidth = line_width;
2035 #endif
2036 sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2037 sf.PointWidth = state->point_size;
2038
2039 if (state->flatshade_first) {
2040 sf.TriangleFanProvokingVertexSelect = 1;
2041 } else {
2042 sf.TriangleStripListProvokingVertexSelect = 2;
2043 sf.TriangleFanProvokingVertexSelect = 2;
2044 sf.LineStripListProvokingVertexSelect = 1;
2045 }
2046
2047 #if GFX_VER == 6
2048 sf.AttributeSwizzleEnable = true;
2049 if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2050 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2051 else
2052 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2053 #endif
2054
2055 #if GFX_VER <= 7
2056 sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2057
2058 #if GFX_VER >= 6
2059 sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2060 sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2061 sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2062 sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2063 sf.GlobalDepthOffsetScale = state->offset_scale;
2064 sf.GlobalDepthOffsetClamp = state->offset_clamp;
2065
2066 sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2067 sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2068 #endif
2069
2070 sf.CullMode = translate_cull_mode(state->cull_face);
2071 sf.ScissorRectangleEnable = true;
2072
2073 #if GFX_VERx10 == 75
2074 sf.LineStippleEnable = state->line_stipple_enable;
2075 #endif
2076 #endif
2077 }
2078 #endif
2079
2080 #if GFX_VER == 8
2081 crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2082 rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2083 rr.CullMode = translate_cull_mode(state->cull_face);
2084 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2085 rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2086 rr.DXMultisampleRasterizationEnable = state->multisample;
2087 rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2088 rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2089 rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2090 rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2091 rr.GlobalDepthOffsetScale = state->offset_scale;
2092 rr.GlobalDepthOffsetClamp = state->offset_clamp;
2093 rr.SmoothPointEnable = state->point_smooth;
2094 rr.AntialiasingEnable = state->line_smooth;
2095 rr.ScissorRectangleEnable = state->scissor;
2096 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2097 }
2098 #endif
2099
2100 #if GFX_VER >= 6
2101 crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2102 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2103 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2104 */
2105 #if GFX_VER >= 7
2106 cl.EarlyCullEnable = true;
2107 #endif
2108
2109 #if GFX_VER == 7
2110 cl.FrontWinding = state->front_ccw ? 1 : 0;
2111 cl.CullMode = translate_cull_mode(state->cull_face);
2112 #endif
2113 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2114 #if GFX_VER < 8
2115 cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2116 #endif
2117 cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2118 cl.GuardbandClipTestEnable = true;
2119 cl.ClipEnable = true;
2120 cl.MinimumPointWidth = 0.125;
2121 cl.MaximumPointWidth = 255.875;
2122
2123 #if GFX_VER == 8
2124 cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2125 #endif
2126
2127 if (state->flatshade_first) {
2128 cl.TriangleFanProvokingVertexSelect = 1;
2129 } else {
2130 cl.TriangleStripListProvokingVertexSelect = 2;
2131 cl.TriangleFanProvokingVertexSelect = 2;
2132 cl.LineStripListProvokingVertexSelect = 1;
2133 }
2134 }
2135 #endif
2136
2137 /* Remap from 0..255 back to 1..256 */
2138 const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2139
2140 crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2141 if (state->line_stipple_enable) {
2142 line.LineStipplePattern = state->line_stipple_pattern;
2143 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2144 line.LineStippleRepeatCount = line_stipple_factor;
2145 }
2146 }
2147
2148 return cso;
2149 }
2150
2151 /**
2152 * The pipe->bind_rasterizer_state() driver hook.
2153 *
2154 * Bind a rasterizer CSO and flag related dirty bits.
2155 */
2156 static void
crocus_bind_rasterizer_state(struct pipe_context * ctx,void * state)2157 crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2158 {
2159 struct crocus_context *ice = (struct crocus_context *) ctx;
2160 struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
2161 struct crocus_rasterizer_state *new_cso = state;
2162
2163 if (new_cso) {
2164 /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2165 if (cso_changed_memcmp(line_stipple))
2166 ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
2167 #if GFX_VER >= 6
2168 if (cso_changed(cso.half_pixel_center))
2169 ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
2170 if (cso_changed(cso.scissor))
2171 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
2172 if (cso_changed(cso.multisample))
2173 ice->state.dirty |= CROCUS_DIRTY_WM;
2174 #else
2175 if (cso_changed(cso.scissor))
2176 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
2177 #endif
2178
2179 if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
2180 ice->state.dirty |= CROCUS_DIRTY_WM;
2181
2182 #if GFX_VER >= 6
2183 if (cso_changed(cso.rasterizer_discard))
2184 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
2185
2186 if (cso_changed(cso.flatshade_first))
2187 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
2188 #endif
2189
2190 if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
2191 cso_changed(cso.clip_halfz))
2192 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
2193
2194 #if GFX_VER >= 7
2195 if (cso_changed(cso.sprite_coord_enable) ||
2196 cso_changed(cso.sprite_coord_mode) ||
2197 cso_changed(cso.light_twoside))
2198 ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
2199 #endif
2200 #if GFX_VER <= 5
2201 if (cso_changed(cso.clip_plane_enable))
2202 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
2203 #endif
2204 }
2205
2206 ice->state.cso_rast = new_cso;
2207 ice->state.dirty |= CROCUS_DIRTY_RASTER;
2208 ice->state.dirty |= CROCUS_DIRTY_CLIP;
2209 #if GFX_VER <= 5
2210 ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
2211 ice->state.dirty |= CROCUS_DIRTY_WM;
2212 #endif
2213 #if GFX_VER <= 6
2214 ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
2215 #endif
2216 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
2217 }
2218
2219 /**
2220 * Return true if the given wrap mode requires the border color to exist.
2221 *
2222 * (We can skip uploading it if the sampler isn't going to use it.)
2223 */
2224 static bool
wrap_mode_needs_border_color(unsigned wrap_mode)2225 wrap_mode_needs_border_color(unsigned wrap_mode)
2226 {
2227 #if GFX_VER == 8
2228 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2229 #else
2230 return wrap_mode == TCM_CLAMP_BORDER;
2231 #endif
2232 }
2233
2234 /**
2235 * Gallium CSO for sampler state.
2236 */
2237 struct crocus_sampler_state {
2238 struct pipe_sampler_state pstate;
2239 union pipe_color_union border_color;
2240 bool needs_border_color;
2241 unsigned wrap_s;
2242 unsigned wrap_t;
2243 unsigned wrap_r;
2244 unsigned mag_img_filter;
2245 float min_lod;
2246 };
2247
2248 /**
2249 * The pipe->create_sampler_state() driver hook.
2250 *
2251 * We fill out SAMPLER_STATE (except for the border color pointer), and
2252 * store that on the CPU. It doesn't make sense to upload it to a GPU
2253 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2254 * all bound sampler states to be in contiguous memor.
2255 */
2256 static void *
crocus_create_sampler_state(struct pipe_context * ctx,const struct pipe_sampler_state * state)2257 crocus_create_sampler_state(struct pipe_context *ctx,
2258 const struct pipe_sampler_state *state)
2259 {
2260 struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
2261
2262 if (!cso)
2263 return NULL;
2264
2265 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2266 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2267
2268 bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2269 state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2270 cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2271 cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2272 cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2273
2274 cso->pstate = *state;
2275
2276 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2277
2278 cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2279 wrap_mode_needs_border_color(cso->wrap_t) ||
2280 wrap_mode_needs_border_color(cso->wrap_r);
2281
2282 cso->min_lod = state->min_lod;
2283 cso->mag_img_filter = state->mag_img_filter;
2284
2285 // XXX: explain this code ported from ilo...I don't get it at all...
2286 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2287 state->min_lod > 0.0f) {
2288 cso->min_lod = 0.0f;
2289 cso->mag_img_filter = state->min_img_filter;
2290 }
2291
2292 return cso;
2293 }
2294
2295 /**
2296 * The pipe->bind_sampler_states() driver hook.
2297 */
2298 static void
crocus_bind_sampler_states(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,void ** states)2299 crocus_bind_sampler_states(struct pipe_context *ctx,
2300 enum pipe_shader_type p_stage,
2301 unsigned start, unsigned count,
2302 void **states)
2303 {
2304 struct crocus_context *ice = (struct crocus_context *) ctx;
2305 gl_shader_stage stage = stage_from_pipe(p_stage);
2306 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2307
2308 assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2309
2310 bool dirty = false;
2311
2312 for (int i = 0; i < count; i++) {
2313 if (shs->samplers[start + i] != states[i]) {
2314 shs->samplers[start + i] = states[i];
2315 dirty = true;
2316 }
2317 }
2318
2319 if (dirty) {
2320 #if GFX_VER <= 5
2321 if (p_stage == PIPE_SHADER_FRAGMENT)
2322 ice->state.dirty |= CROCUS_DIRTY_WM;
2323 else if (p_stage == PIPE_SHADER_VERTEX)
2324 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2325 #endif
2326 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2327 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2328 }
2329 }
2330
/* Wrap-mode overrides applied in crocus_upload_sampler_state() to work
 * around hardware sampling quirks for certain texture targets.
 */
enum samp_workaround {
   SAMP_NORMAL,       /* no override; use the CSO's translated wrap modes */
   SAMP_CUBE_CLAMP,   /* force all three wrap modes to TCM_CLAMP */
   SAMP_CUBE_CUBE,    /* force all three wrap modes to TCM_CUBE */
   SAMP_T_WRAP,       /* force wrap_t to TCM_WRAP (1D texture sampling bug) */
};
2337
/**
 * Pack a single SAMPLER_STATE structure into @map, applying any requested
 * wrap-mode workaround and pointing it at the previously-uploaded border
 * color.
 *
 * @param border_color_offset  dynamic-state offset of the border color
 *                             (0 if the sampler doesn't need one)
 * @param samp_workaround      wrap-mode override to apply (see enum)
 * @param first_level          base mip level (used on Gfx6 only)
 * @param map                  destination for the packed SAMPLER_STATE
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* Apply target-specific wrap-mode overrides (see enum samp_workaround). */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      /* Switch to anisotropic filtering when requested and the base
       * filter is linear.
       */
      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* Hardware LOD clamp range grew by one on Gfx7. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

      /* Pre-gen6 the border color pointer is a relocation into the state
       * buffer; gen6+ takes a plain dynamic-state offset.
       */
#if GFX_VER < 6
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2439
/**
 * Upload a SAMPLER_BORDER_COLOR_STATE for @cso into the batch's dynamic
 * state, writing its offset to *bc_offset.  Handles format-faking
 * swizzles and the per-generation border color layouts.
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         /* Alpha-only: move A into every faked channel position. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         /* LA faked as RG: replicate L, keep A. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* NOTE(review): Haswell integer border colors appear to need 512-byte
    * alignment; Gfx8 needs 64; everything else 32 — confirm against PRM.
    */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

/* Helper macros to assign one color channel with an optional conversion. */
#define ASSIGN(dst, src) \
   do {                  \
      dst = src;         \
   } while (0)

#define ASSIGNu16(dst, src) \
   do {                     \
      dst = (uint16_t)src;  \
   } while (0)

#define ASSIGNu8(dst, src) \
   do {                    \
      dst = (uint8_t)src;  \
   } while (0)

/* Apply `macro` to all four channels of the given BorderColor field group. */
#define BORDER_COLOR_ATTR(macro, _color_type, src)           \
   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
   macro(state.BorderColor ## _color_type ## Green, src[1]); \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);  \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       *  corresponding border color should be programmed as zero and if
       *  alpha channel is missing, corresponding Alpha border color should
       *  be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gfx5-6 store the border color pre-converted in every representation
    * the sampler might need (unorm8/16, snorm16, half, float).
    */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src) \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   /* Derive snorm8 from the high byte of snorm16. */
   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2587
2588 /**
2589 * Upload the sampler states into a contiguous area of GPU memory, for
2590 * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
2591 *
2592 * Also fill out the border color state pointers.
2593 */
2594 static void
crocus_upload_sampler_states(struct crocus_context * ice,struct crocus_batch * batch,gl_shader_stage stage)2595 crocus_upload_sampler_states(struct crocus_context *ice,
2596 struct crocus_batch *batch, gl_shader_stage stage)
2597 {
2598 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2599 const struct shader_info *info = crocus_get_shader_info(ice, stage);
2600
2601 /* We assume the state tracker will call pipe->bind_sampler_states()
2602 * if the program's number of textures changes.
2603 */
2604 unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2605
2606 if (!count)
2607 return;
2608
2609 /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2610 * in the dynamic state memory zone, so we can point to it via the
2611 * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2612 */
2613 unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2614 uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2615
2616 if (unlikely(!map))
2617 return;
2618
2619 for (int i = 0; i < count; i++) {
2620 struct crocus_sampler_state *state = shs->samplers[i];
2621 struct crocus_sampler_view *tex = shs->textures[i];
2622
2623 if (!state || !tex) {
2624 memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2625 } else {
2626 unsigned border_color_offset = 0;
2627 if (state->needs_border_color) {
2628 crocus_upload_border_color(batch, state, tex, &border_color_offset);
2629 }
2630
2631 enum samp_workaround wa = SAMP_NORMAL;
2632 /* There's a bug in 1D texture sampling - it actually pays
2633 * attention to the wrap_t value, though it should not.
2634 * Override the wrap_t value here to GL_REPEAT to keep
2635 * any nonexistent border pixels from floating in.
2636 */
2637 if (tex->base.target == PIPE_TEXTURE_1D)
2638 wa = SAMP_T_WRAP;
2639 else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2640 tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2641 /* Cube maps must use the same wrap mode for all three coordinate
2642 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
2643 *
2644 * Ivybridge and Baytrail seem to have problems with CUBE mode and
2645 * integer formats. Fall back to CLAMP for now.
2646 */
2647 if (state->pstate.seamless_cube_map &&
2648 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2649 wa = SAMP_CUBE_CUBE;
2650 else
2651 wa = SAMP_CUBE_CLAMP;
2652 }
2653
2654 uint32_t first_level = 0;
2655 if (tex->base.target != PIPE_BUFFER)
2656 first_level = tex->base.u.tex.first_level;
2657
2658 crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2659 }
2660
2661 map += GENX(SAMPLER_STATE_length);
2662 }
2663 }
2664
2665 /**
2666 * The pipe->create_sampler_view() driver hook.
2667 */
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
                           struct pipe_resource *tex,
                           const struct pipe_sampler_view *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));

   if (!isv)
      return NULL;

   /* initialize base object */
   isv->base = *tmpl;
   isv->base.context = ctx;
   isv->base.texture = NULL;
   pipe_reference_init(&isv->base.reference, 1);
   pipe_resource_reference(&isv->base.texture, tex);

   if (util_format_is_depth_or_stencil(tmpl->format)) {
      struct crocus_resource *zres, *sres;
      const struct util_format_description *desc =
         util_format_description(tmpl->format);

      /* For combined depth/stencil resources, pick the sub-resource that
       * matches the view's format (depth vs. stencil).
       */
      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);

      tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;

      /* On gen7, sample stencil via its shadow copy when one exists. */
      if (tex->format == PIPE_FORMAT_S8_UINT)
         if (devinfo->ver == 7 && sres->shadow)
            tex = &sres->shadow->base.b;
   }

   isv->res = (struct crocus_resource *) tex;

   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;

   if (isv->base.target == PIPE_TEXTURE_CUBE ||
       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
      usage |= ISL_SURF_USAGE_CUBE_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   /* Fold the view's swizzle into the format's workaround swizzle. */
   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);

   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
   if (devinfo->ver < 6 &&
       (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
        tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
      isv->swizzle[0] = tmpl->swizzle_g;
      isv->swizzle[1] = tmpl->swizzle_g;
      isv->swizzle[2] = tmpl->swizzle_g;
      isv->swizzle[3] = tmpl->swizzle_g;
   }

   isv->clear_color = isv->res->aux.clear_color;

   isv->view = (struct isl_view) {
      .format = fmt.fmt,
#if GFX_VERx10 >= 75
      .swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
      },
#else
      /* swizzling handled in shader code */
      .swizzle = ISL_SWIZZLE_IDENTITY,
#endif
      .usage = usage,
   };

   /* Fill out the level/layer range for non-buffer textures. */
   if (tmpl->target != PIPE_BUFFER) {
      isv->view.base_level = tmpl->u.tex.first_level;
      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
      isv->view.base_array_layer = tmpl->u.tex.first_layer;
      isv->view.array_len =
         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
   }
#if GFX_VER >= 6
   /* just create a second view struct for texture gather just in case */
   isv->gather_view = isv->view;

#if GFX_VER == 7
   /* Gather of RG32 formats uses a special "float LD" format. */
   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
      isv->gather_view.swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
      };
#endif
   }
#endif
#if GFX_VER == 6
   /* Sandybridge's gather4 message is broken for integer formats.
    * To work around this, we pretend the surface is UNORM for
    * 8 or 16-bit formats, and emit shader instructions to recover
    * the real INT/UINT value.  For 32-bit formats, we pretend
    * the surface is FLOAT, and simply reinterpret the resulting
    * bits.
    */
   switch (fmt.fmt) {
   case ISL_FORMAT_R8_SINT:
   case ISL_FORMAT_R8_UINT:
      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
      break;

   case ISL_FORMAT_R16_SINT:
   case ISL_FORMAT_R16_UINT:
      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
      break;

   case ISL_FORMAT_R32_SINT:
   case ISL_FORMAT_R32_UINT:
      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
      break;

   default:
      break;
   }
#endif
#endif
   /* Finish any deferred aux-surface import so the view is usable. */
   if (tmpl->target != PIPE_BUFFER) {
      if (crocus_resource_unfinished_aux_import(isv->res))
         crocus_resource_finish_aux_import(&screen->base, isv->res);

   }

   return &isv->base;
}
2809
2810 static void
crocus_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)2811 crocus_sampler_view_destroy(struct pipe_context *ctx,
2812 struct pipe_sampler_view *state)
2813 {
2814 struct crocus_sampler_view *isv = (void *) state;
2815 pipe_resource_reference(&state->texture, NULL);
2816 free(isv);
2817 }
2818
2819 /**
2820 * The pipe->create_surface() driver hook.
2821 *
2822 * In Gallium nomenclature, "surfaces" are a view of a resource that
2823 * can be bound as a render target or depth/stencil buffer.
2824 */
static struct pipe_surface *
crocus_create_surface(struct pipe_context *ctx,
                      struct pipe_resource *tex,
                      const struct pipe_surface *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;

   /* Choose the ISL usage from the template: storage > depth > render. */
   isl_surf_usage_flags_t usage = 0;
   if (tmpl->writable)
      usage = ISL_SURF_USAGE_STORAGE_BIT;
   else if (util_format_is_depth_or_stencil(tmpl->format))
      usage = ISL_SURF_USAGE_DEPTH_BIT;
   else
      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
      /* Framebuffer validation will reject this invalid case, but it
       * hasn't had the opportunity yet. In the meantime, we need to
       * avoid hitting ISL asserts about unsupported formats below.
       */
      return NULL;
   }

   struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
   /* NOTE(review): psurf/res are address computations only; surf is not
    * dereferenced until after the NULL check below.
    */
   struct pipe_surface *psurf = &surf->base;
   struct crocus_resource *res = (struct crocus_resource *) tex;

   if (!surf)
      return NULL;

   pipe_reference_init(&psurf->reference, 1);
   pipe_resource_reference(&psurf->texture, tex);
   psurf->context = ctx;
   psurf->format = tmpl->format;
   psurf->width = tex->width0;
   psurf->height = tex->height0;
   psurf->texture = tex;
   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
   psurf->u.tex.level = tmpl->u.tex.level;

   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;

   /* ISL view of the single mip level / layer range being rendered to. */
   struct isl_view *view = &surf->view;
   *view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = usage,
   };

#if GFX_VER >= 6
   /* A second view with TEXTURE usage, for reading the surface. */
   struct isl_view *read_view = &surf->read_view;
   *read_view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
   };
#endif

   surf->clear_color = res->aux.clear_color;

   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
                          ISL_SURF_USAGE_STENCIL_BIT))
      return psurf;

   if (!isl_format_is_compressed(res->surf.format)) {
      if (crocus_resource_unfinished_aux_import(res))
         crocus_resource_finish_aux_import(&screen->base, res);

      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
      uint64_t temp_offset;
      uint32_t temp_x, temp_y;

      /* For 3D textures the selected layer is a depth slice (z), so it goes
       * in the third coordinate; otherwise it's an array slice.
       */
      isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
                                          res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
                                          res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
                                          &temp_offset, &temp_x, &temp_y);
      if (!devinfo->has_surface_tile_offset &&
          (temp_x || temp_y)) {
         /* Original gfx4 hardware couldn't draw to a non-tile-aligned
          * destination.
          */
         /* move to temp */
         struct pipe_resource wa_templ = (struct pipe_resource) {
            .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
            .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
            .depth0 = 1,
            .array_size = 1,
            .format = res->base.b.format,
            .target = PIPE_TEXTURE_2D,
            .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
         };
         surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
         /* The temporary is a single level/layer, so reset the view. */
         view->base_level = 0;
         view->base_array_layer = 0;
         view->array_len = 1;
         struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
         memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
      }
      return psurf;
   }

   /* The resource has a compressed format, which is not renderable, but we
    * have a renderable view format. We must be attempting to upload blocks
    * of compressed data via an uncompressed view.
    *
    * In this case, we can assume there are no auxiliary buffers, a single
    * miplevel, and that the resource is single-sampled. Gallium may try
    * and create an uncompressed view with multiple layers, however.
    */
   assert(!isl_format_is_compressed(fmt.fmt));
   assert(res->surf.samples == 1);
   assert(view->levels == 1);

   /* TODO: compressed pbo uploads aren't working here */
   return NULL;

   /* NOTE(review): everything below is unreachable because of the early
    * return above; it is kept for when compressed pbo uploads are fixed.
    */
   uint64_t offset_B = 0;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (view->base_level > 0) {
      /* We can't rely on the hardware's miplevel selection with such
       * a substantial lie about the format, so we select a single image
       * using the Tile X/Y Offset fields. In this case, we can't handle
       * multiple array slices.
       *
       * On Broadwell, HALIGN and VALIGN are specified in pixels and are
       * hard-coded to align to exactly the block size of the compressed
       * texture. This means that, when reinterpreted as a non-compressed
       * texture, the tile offsets may be anything and we can't rely on
       * X/Y Offset.
       *
       * Return NULL to force the state tracker to take fallback paths.
       */
      // TODO: check if the gen7 check is right, originally gen8
      if (view->array_len > 1 || GFX_VER == 7)
         return NULL;

      const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
                              view->base_level,
                              is_3d ? 0 : view->base_array_layer,
                              is_3d ? view->base_array_layer : 0,
                              &surf->surf,
                              &offset_B, &tile_x_sa, &tile_y_sa);

      /* We use address and tile offsets to access a single level/layer
       * as a subimage, so reset level/layer so it doesn't offset again.
       */
      view->base_array_layer = 0;
      view->base_level = 0;
   } else {
      /* Level 0 doesn't require tile offsets, and the hardware can find
       * array slices using QPitch even with the format override, so we
       * can allow layers in this case. Copy the original ISL surface.
       */
      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
   }

   /* Scale down the image dimensions by the block size. */
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(res->surf.format);
   surf->surf.format = fmt.fmt;
   surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
   surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
   tile_x_sa /= fmtl->bw;
   tile_y_sa /= fmtl->bh;

   psurf->width = surf->surf.logical_level0_px.width;
   psurf->height = surf->surf.logical_level0_px.height;

   return psurf;
}
3012
3013 #if GFX_VER >= 7
3014 static void
fill_default_image_param(struct brw_image_param * param)3015 fill_default_image_param(struct brw_image_param *param)
3016 {
3017 memset(param, 0, sizeof(*param));
3018 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3019 * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3020 * detailed explanation of these parameters.
3021 */
3022 param->swizzling[0] = 0xff;
3023 param->swizzling[1] = 0xff;
3024 }
3025
3026 static void
fill_buffer_image_param(struct brw_image_param * param,enum pipe_format pfmt,unsigned size)3027 fill_buffer_image_param(struct brw_image_param *param,
3028 enum pipe_format pfmt,
3029 unsigned size)
3030 {
3031 const unsigned cpp = util_format_get_blocksize(pfmt);
3032
3033 fill_default_image_param(param);
3034 param->size[0] = size / cpp;
3035 param->stride[0] = cpp;
3036 }
3037
3038 #endif
3039
3040 /**
3041 * The pipe->set_shader_images() driver hook.
3042 */
3043 static void
crocus_set_shader_images(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start_slot,unsigned count,unsigned unbind_num_trailing_slots,const struct pipe_image_view * p_images)3044 crocus_set_shader_images(struct pipe_context *ctx,
3045 enum pipe_shader_type p_stage,
3046 unsigned start_slot, unsigned count,
3047 unsigned unbind_num_trailing_slots,
3048 const struct pipe_image_view *p_images)
3049 {
3050 #if GFX_VER >= 7
3051 struct crocus_context *ice = (struct crocus_context *) ctx;
3052 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3053 const struct intel_device_info *devinfo = &screen->devinfo;
3054 gl_shader_stage stage = stage_from_pipe(p_stage);
3055 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3056 struct crocus_genx_state *genx = ice->state.genx;
3057 struct brw_image_param *image_params = genx->shaders[stage].image_param;
3058
3059 shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);
3060
3061 for (unsigned i = 0; i < count; i++) {
3062 struct crocus_image_view *iv = &shs->image[start_slot + i];
3063
3064 if (p_images && p_images[i].resource) {
3065 const struct pipe_image_view *img = &p_images[i];
3066 struct crocus_resource *res = (void *) img->resource;
3067
3068 util_copy_image_view(&iv->base, img);
3069
3070 shs->bound_image_views |= 1 << (start_slot + i);
3071
3072 res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3073 res->bind_stages |= 1 << stage;
3074
3075 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3076 struct crocus_format_info fmt =
3077 crocus_format_for_usage(devinfo, img->format, usage);
3078
3079 struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
3080 if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
3081 /* On Gen8, try to use typed surfaces reads (which support a
3082 * limited number of formats), and if not possible, fall back
3083 * to untyped reads.
3084 */
3085 if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
3086 fmt.fmt = ISL_FORMAT_RAW;
3087 else
3088 fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
3089 }
3090
3091 if (res->base.b.target != PIPE_BUFFER) {
3092 struct isl_view view = {
3093 .format = fmt.fmt,
3094 .base_level = img->u.tex.level,
3095 .levels = 1,
3096 .base_array_layer = img->u.tex.first_layer,
3097 .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3098 .swizzle = swiz,
3099 .usage = usage,
3100 };
3101
3102 iv->view = view;
3103
3104 isl_surf_fill_image_param(&screen->isl_dev,
3105 &image_params[start_slot + i],
3106 &res->surf, &view);
3107 } else {
3108 struct isl_view view = {
3109 .format = fmt.fmt,
3110 .swizzle = swiz,
3111 .usage = usage,
3112 };
3113 iv->view = view;
3114
3115 util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3116 img->u.buf.offset + img->u.buf.size);
3117 fill_buffer_image_param(&image_params[start_slot + i],
3118 img->format, img->u.buf.size);
3119 }
3120 } else {
3121 pipe_resource_reference(&iv->base.resource, NULL);
3122 fill_default_image_param(&image_params[start_slot + i]);
3123 }
3124 }
3125
3126 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3127 ice->state.dirty |=
3128 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3129 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3130
3131 /* Broadwell also needs brw_image_params re-uploaded */
3132 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
3133 shs->sysvals_need_upload = true;
3134 #endif
3135 }
3136
3137
3138 /**
3139 * The pipe->set_sampler_views() driver hook.
3140 */
3141 static void
crocus_set_sampler_views(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,unsigned unbind_num_trailing_slots,bool take_ownership,struct pipe_sampler_view ** views)3142 crocus_set_sampler_views(struct pipe_context *ctx,
3143 enum pipe_shader_type p_stage,
3144 unsigned start, unsigned count,
3145 unsigned unbind_num_trailing_slots,
3146 bool take_ownership,
3147 struct pipe_sampler_view **views)
3148 {
3149 struct crocus_context *ice = (struct crocus_context *) ctx;
3150 gl_shader_stage stage = stage_from_pipe(p_stage);
3151 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3152
3153 shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
3154
3155 for (unsigned i = 0; i < count; i++) {
3156 struct pipe_sampler_view *pview = views ? views[i] : NULL;
3157
3158 if (take_ownership) {
3159 pipe_sampler_view_reference((struct pipe_sampler_view **)
3160 &shs->textures[start + i], NULL);
3161 shs->textures[start + i] = (struct crocus_sampler_view *)pview;
3162 } else {
3163 pipe_sampler_view_reference((struct pipe_sampler_view **)
3164 &shs->textures[start + i], pview);
3165 }
3166
3167 struct crocus_sampler_view *view = (void *) pview;
3168 if (view) {
3169 view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3170 view->res->bind_stages |= 1 << stage;
3171
3172 shs->bound_sampler_views |= 1 << (start + i);
3173 }
3174 }
3175 #if GFX_VER == 6
3176 /* first level parameters to crocus_upload_sampler_state is gfx6 only */
3177 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3178 #endif
3179 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
3180 ice->state.dirty |=
3181 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3182 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3183 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
3184 }
3185
3186 /**
3187 * The pipe->set_tess_state() driver hook.
3188 */
3189 static void
crocus_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])3190 crocus_set_tess_state(struct pipe_context *ctx,
3191 const float default_outer_level[4],
3192 const float default_inner_level[2])
3193 {
3194 struct crocus_context *ice = (struct crocus_context *) ctx;
3195 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3196
3197 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3198 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3199
3200 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3201 shs->sysvals_need_upload = true;
3202 }
3203
3204 static void
crocus_set_patch_vertices(struct pipe_context * ctx,uint8_t patch_vertices)3205 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3206 {
3207 struct crocus_context *ice = (struct crocus_context *) ctx;
3208
3209 ice->state.patch_vertices = patch_vertices;
3210 }
3211
3212 static void
crocus_surface_destroy(struct pipe_context * ctx,struct pipe_surface * p_surf)3213 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3214 {
3215 struct crocus_surface *surf = (void *) p_surf;
3216 pipe_resource_reference(&p_surf->texture, NULL);
3217
3218 pipe_resource_reference(&surf->align_res, NULL);
3219 free(surf);
3220 }
3221
3222 static void
crocus_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)3223 crocus_set_clip_state(struct pipe_context *ctx,
3224 const struct pipe_clip_state *state)
3225 {
3226 struct crocus_context *ice = (struct crocus_context *) ctx;
3227 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3228 struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3229 struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3230
3231 memcpy(&ice->state.clip_planes, state, sizeof(*state));
3232
3233 #if GFX_VER <= 5
3234 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3235 #endif
3236 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3237 CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3238 shs->sysvals_need_upload = true;
3239 gshs->sysvals_need_upload = true;
3240 tshs->sysvals_need_upload = true;
3241 }
3242
3243 /**
3244 * The pipe->set_polygon_stipple() driver hook.
3245 */
3246 static void
crocus_set_polygon_stipple(struct pipe_context * ctx,const struct pipe_poly_stipple * state)3247 crocus_set_polygon_stipple(struct pipe_context *ctx,
3248 const struct pipe_poly_stipple *state)
3249 {
3250 struct crocus_context *ice = (struct crocus_context *) ctx;
3251 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3252 ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3253 }
3254
3255 /**
3256 * The pipe->set_sample_mask() driver hook.
3257 */
static void
crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* We keep only 8 bits of sample mask (the 0xff mask below).
    * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
    */
   ice->state.sample_mask = sample_mask & 0xff;
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
}
3269
3270 static void
crocus_fill_scissor_rect(struct crocus_context * ice,int idx,struct pipe_scissor_state * ss)3271 crocus_fill_scissor_rect(struct crocus_context *ice,
3272 int idx,
3273 struct pipe_scissor_state *ss)
3274 {
3275 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
3276 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
3277 const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
3278 struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
3279 .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
3280 .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
3281 .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
3282 .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
3283 };
3284 if (cso_state->scissor) {
3285 struct pipe_scissor_state *s = &ice->state.scissors[idx];
3286 scissor.minx = MAX2(scissor.minx, s->minx);
3287 scissor.miny = MAX2(scissor.miny, s->miny);
3288 scissor.maxx = MIN2(scissor.maxx, s->maxx);
3289 scissor.maxy = MIN2(scissor.maxy, s->maxy);
3290 }
3291 *ss = scissor;
3292 }
3293
3294 /**
3295 * The pipe->set_scissor_states() driver hook.
3296 *
3297 * This corresponds to our SCISSOR_RECT state structures. It's an
3298 * exact match, so we just store them, and memcpy them out later.
3299 */
3300 static void
crocus_set_scissor_states(struct pipe_context * ctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * rects)3301 crocus_set_scissor_states(struct pipe_context *ctx,
3302 unsigned start_slot,
3303 unsigned num_scissors,
3304 const struct pipe_scissor_state *rects)
3305 {
3306 struct crocus_context *ice = (struct crocus_context *) ctx;
3307
3308 for (unsigned i = 0; i < num_scissors; i++) {
3309 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3310 /* If the scissor was out of bounds and got clamped to 0 width/height
3311 * at the bounds, the subtraction of 1 from maximums could produce a
3312 * negative number and thus not clip anything. Instead, just provide
3313 * a min > max scissor inside the bounds, which produces the expected
3314 * no rendering.
3315 */
3316 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3317 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3318 };
3319 } else {
3320 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3321 .minx = rects[i].minx, .miny = rects[i].miny,
3322 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3323 };
3324 }
3325 }
3326
3327 #if GFX_VER < 6
3328 ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3329 #else
3330 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3331 #endif
3332 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3333
3334 }
3335
3336 /**
3337 * The pipe->set_stencil_ref() driver hook.
3338 *
3339 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3340 */
3341 static void
crocus_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref ref)3342 crocus_set_stencil_ref(struct pipe_context *ctx,
3343 const struct pipe_stencil_ref ref)
3344 {
3345 struct crocus_context *ice = (struct crocus_context *) ctx;
3346 ice->state.stencil_ref = ref;
3347 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3348 }
3349
3350 #if GFX_VER == 8
3351 static float
viewport_extent(const struct pipe_viewport_state * state,int axis,float sign)3352 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3353 {
3354 return copysignf(state->scale[axis], sign) + state->translate[axis];
3355 }
3356 #endif
3357
3358 /**
3359 * The pipe->set_viewport_states() driver hook.
3360 *
3361 * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
3362 * the guardband yet, as we need the framebuffer dimensions, but we can
3363 * at least fill out the rest.
3364 */
3365 static void
crocus_set_viewport_states(struct pipe_context * ctx,unsigned start_slot,unsigned count,const struct pipe_viewport_state * states)3366 crocus_set_viewport_states(struct pipe_context *ctx,
3367 unsigned start_slot,
3368 unsigned count,
3369 const struct pipe_viewport_state *states)
3370 {
3371 struct crocus_context *ice = (struct crocus_context *) ctx;
3372
3373 memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3374
3375 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3376 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3377 #if GFX_VER >= 6
3378 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3379 #endif
3380
3381 if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3382 !ice->state.cso_rast->cso.depth_clip_far))
3383 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3384 }
3385
3386 /**
3387 * The pipe->set_framebuffer_state() driver hook.
3388 *
3389 * Sets the current draw FBO, including color render targets, depth,
3390 * and stencil buffers.
3391 */
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
                             const struct pipe_framebuffer_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
   struct isl_device *isl_dev = &screen->isl_dev;
   struct crocus_resource *zres;
   struct crocus_resource *stencil_res;
#endif

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

#if GFX_VER >= 6
   /* A sample-count change affects multisample and raster state. */
   if (cso->samples != samples) {
      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
   }
#endif

#if GFX_VER >= 6 && GFX_VER < 8
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif

   /* Switching between layered and non-layered rendering affects CLIP. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= CROCUS_DIRTY_CLIP;
   }

   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
   }

   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;

      /* update SF's depth buffer format */
      if (GFX_VER == 7 && cso->zsbuf)
         ice->state.dirty |= CROCUS_DIRTY_RASTER;
   }

   /* wm thread dispatch enable */
   ice->state.dirty |= CROCUS_DIRTY_WM;
   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   /* Record whether the new depth buffer can use HiZ at its bound level. */
   if (cso->zsbuf) {
      struct crocus_resource *zres;
      struct crocus_resource *stencil_res;
      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
                                         &stencil_res);
      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
         aux_usage = zres->aux.usage;
      }
      ice->state.hiz_usage = aux_usage;
   }

   /* Render target change */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}
3470
/**
 * The pipe->set_constant_buffer() driver hook.
 *
 * This uploads any constant data in user buffers, and references
 * any UBO resources containing constant data.
 */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   /* Note: cbuf aliases shs->constbufs[index], which the copy below fills. */
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User-memory constants: upload a GPU-visible copy. */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound size so it never extends past the end of the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      shs->bound_cbufs &= ~(1u << index);
   }

   /* Constants for this stage need re-emitting. */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
3521
3522 static void
upload_sysvals(struct crocus_context * ice,gl_shader_stage stage)3523 upload_sysvals(struct crocus_context *ice,
3524 gl_shader_stage stage)
3525 {
3526 UNUSED struct crocus_genx_state *genx = ice->state.genx;
3527 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3528
3529 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
3530 if (!shader || shader->num_system_values == 0)
3531 return;
3532
3533 assert(shader->num_cbufs > 0);
3534
3535 unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3536 struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
3537 unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
3538 uint32_t *map = NULL;
3539
3540 assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3541 u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3542 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3543
3544 for (int i = 0; i < shader->num_system_values; i++) {
3545 uint32_t sysval = shader->system_values[i];
3546 uint32_t value = 0;
3547
3548 if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
3549 #if GFX_VER >= 7
3550 unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
3551 unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
3552 struct brw_image_param *param =
3553 &genx->shaders[stage].image_param[img];
3554
3555 assert(offset < sizeof(struct brw_image_param));
3556 value = ((uint32_t *) param)[offset];
3557 #endif
3558 } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
3559 value = 0;
3560 } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
3561 int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
3562 int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
3563 value = fui(ice->state.clip_planes.ucp[plane][comp]);
3564 } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
3565 if (stage == MESA_SHADER_TESS_CTRL) {
3566 value = ice->state.vertices_per_patch;
3567 } else {
3568 assert(stage == MESA_SHADER_TESS_EVAL);
3569 const struct shader_info *tcs_info =
3570 crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3571 if (tcs_info)
3572 value = tcs_info->tess.tcs_vertices_out;
3573 else
3574 value = ice->state.vertices_per_patch;
3575 }
3576 } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
3577 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
3578 unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
3579 value = fui(ice->state.default_outer_level[i]);
3580 } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
3581 value = fui(ice->state.default_inner_level[0]);
3582 } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
3583 value = fui(ice->state.default_inner_level[1]);
3584 } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
3585 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
3586 unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
3587 value = ice->state.last_block[i];
3588 } else {
3589 assert(!"unhandled system value");
3590 }
3591
3592 *map++ = value;
3593 }
3594
3595 cbuf->buffer_size = upload_size;
3596 shs->sysvals_need_upload = false;
3597 }
3598
/**
 * The pipe->set_shader_buffers() driver hook.
 *
 * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
 * SURFACE_STATE here, as the buffer offset may change each time.
 */
static void
crocus_set_shader_buffers(struct pipe_context *ctx,
                          enum pipe_shader_type p_stage,
                          unsigned start_slot, unsigned count,
                          const struct pipe_shader_buffer *buffers,
                          unsigned writable_bitmask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   unsigned modified_bits = u_bit_consecutive(start_slot, count);

   /* Clear the affected slots, then re-derive writability from the mask. */
   shs->bound_ssbos &= ~modified_bits;
   shs->writable_ssbos &= ~modified_bits;
   shs->writable_ssbos |= writable_bitmask << start_slot;

   for (unsigned i = 0; i < count; i++) {
      if (buffers && buffers[i].buffer) {
         struct crocus_resource *res = (void *) buffers[i].buffer;
         struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
         pipe_resource_reference(&ssbo->buffer, &res->base.b);
         ssbo->buffer_offset = buffers[i].buffer_offset;
         /* Clamp so the binding never extends past the end of the BO. */
         ssbo->buffer_size =
            MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);

         shs->bound_ssbos |= 1 << (start_slot + i);

         res->bind_history |= PIPE_BIND_SHADER_BUFFER;
         res->bind_stages |= 1 << stage;

         /* Grow the resource's valid range to cover the bound window. */
         util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
                        ssbo->buffer_offset + ssbo->buffer_size);
      } else {
         /* Unbind: drop any previous reference for this slot. */
         pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
}
3645
/**
 * Generic pipe->delete_*_state() hook for CSOs that are a single heap
 * allocation: simply frees the allocation.
 */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}
3651
/**
 * The pipe->set_vertex_buffers() driver hook.
 *
 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
 */
static void
crocus_set_vertex_buffers(struct pipe_context *ctx,
                          unsigned start_slot, unsigned count,
                          unsigned unbind_num_trailing_slots,
                          bool take_ownership,
                          const struct pipe_vertex_buffer *buffers)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
   /* NOTE(review): pre-Haswell, non-Baytrail parts record the buffer end
    * 2 bytes past its actual size - presumably a vertex-fetch hardware
    * workaround; confirm against the original workaround rationale.
    */
   const unsigned padding =
      (GFX_VERx10 < 75 && !screen->devinfo.is_baytrail) * 2;
   ice->state.bound_vertex_buffers &=
      ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);

   util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
                                buffers, start_slot, count, unbind_num_trailing_slots,
                                take_ownership);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_vertex_buffer *state =
         &ice->state.vertex_buffers[start_slot + i];

      if (!state->is_user_buffer && state->buffer.resource) {
         struct crocus_resource *res = (void *)state->buffer.resource;
         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
      }

      /* Cache each buffer's end address for 3DSTATE_VERTEX_BUFFERS. */
      uint32_t end = 0;
      if (state->buffer.resource)
         end = state->buffer.resource->width0 + padding;
      ice->state.vb_end[start_slot + i] = end;
   }
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}
3691
3692 #if GFX_VERx10 < 75
get_wa_flags(enum isl_format format)3693 static uint8_t get_wa_flags(enum isl_format format)
3694 {
3695 uint8_t wa_flags = 0;
3696
3697 switch (format) {
3698 case ISL_FORMAT_R10G10B10A2_USCALED:
3699 wa_flags = BRW_ATTRIB_WA_SCALE;
3700 break;
3701 case ISL_FORMAT_R10G10B10A2_SSCALED:
3702 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
3703 break;
3704 case ISL_FORMAT_R10G10B10A2_UNORM:
3705 wa_flags = BRW_ATTRIB_WA_NORMALIZE;
3706 break;
3707 case ISL_FORMAT_R10G10B10A2_SNORM:
3708 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
3709 break;
3710 case ISL_FORMAT_R10G10B10A2_SINT:
3711 wa_flags = BRW_ATTRIB_WA_SIGN;
3712 break;
3713 case ISL_FORMAT_B10G10R10A2_USCALED:
3714 wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3715 break;
3716 case ISL_FORMAT_B10G10R10A2_SSCALED:
3717 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3718 break;
3719 case ISL_FORMAT_B10G10R10A2_UNORM:
3720 wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3721 break;
3722 case ISL_FORMAT_B10G10R10A2_SNORM:
3723 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3724 break;
3725 case ISL_FORMAT_B10G10R10A2_SINT:
3726 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
3727 break;
3728 case ISL_FORMAT_B10G10R10A2_UINT:
3729 wa_flags = BRW_ATTRIB_WA_BGRA;
3730 break;
3731 default:
3732 break;
3733 }
3734 return wa_flags;
3735 }
3736 #endif
3737
/**
 * Gallium CSO for vertex elements.
 */
struct crocus_vertex_element_state {
   /* 3DSTATE_VERTEX_ELEMENTS header dword plus up to 33 packed elements. */
   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* One 3DSTATE_VF_INSTANCING packet per element. */
   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Alternate last element, used at draw time if the VS needs EdgeFlag. */
   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Instance divisor, indexed by vertex buffer slot. */
   uint32_t step_rate[16];
   /* Per-element attribute workaround flags (see get_wa_flags). */
   uint8_t wa_flags[33];
   /* Number of pipe_vertex_elements in this CSO. */
   unsigned count;
};
3754
3755 /**
3756 * The pipe->create_vertex_elements() driver hook.
3757 *
3758 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3759 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3760 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3761 * needed. In these cases we will need information available at draw time.
3762 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3763 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3764 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3765 */
3766 static void *
crocus_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * state)3767 crocus_create_vertex_elements(struct pipe_context *ctx,
3768 unsigned count,
3769 const struct pipe_vertex_element *state)
3770 {
3771 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3772 const struct intel_device_info *devinfo = &screen->devinfo;
3773 struct crocus_vertex_element_state *cso =
3774 malloc(sizeof(struct crocus_vertex_element_state));
3775
3776 cso->count = count;
3777
3778 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3779 ve.DWordLength =
3780 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3781 }
3782
3783 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3784 #if GFX_VER == 8
3785 uint32_t *vfi_pack_dest = cso->vf_instancing;
3786 #endif
3787
3788 if (count == 0) {
3789 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3790 ve.Valid = true;
3791 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3792 ve.Component0Control = VFCOMP_STORE_0;
3793 ve.Component1Control = VFCOMP_STORE_0;
3794 ve.Component2Control = VFCOMP_STORE_0;
3795 ve.Component3Control = VFCOMP_STORE_1_FP;
3796 }
3797 #if GFX_VER == 8
3798 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3799 }
3800 #endif
3801 }
3802
3803 for (int i = 0; i < count; i++) {
3804 const struct crocus_format_info fmt =
3805 crocus_format_for_usage(devinfo, state[i].src_format, 0);
3806 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3807 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3808 enum isl_format actual_fmt = fmt.fmt;
3809
3810 #if GFX_VERx10 < 75
3811 cso->wa_flags[i] = get_wa_flags(fmt.fmt);
3812
3813 if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3814 fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3815 fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3816 fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3817 fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3818 fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3819 fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3820 fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3821 fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3822 fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3823 fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3824 actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3825 if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3826 actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3827 if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3828 actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3829 if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3830 actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3831 if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3832 actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3833 #endif
3834
3835 cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
3836
3837 switch (isl_format_get_num_channels(fmt.fmt)) {
3838 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3839 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3840 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3841 case 3:
3842 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3843 : VFCOMP_STORE_1_FP;
3844 break;
3845 }
3846 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3847 #if GFX_VER >= 6
3848 ve.EdgeFlagEnable = false;
3849 #endif
3850 ve.VertexBufferIndex = state[i].vertex_buffer_index;
3851 ve.Valid = true;
3852 ve.SourceElementOffset = state[i].src_offset;
3853 ve.SourceElementFormat = actual_fmt;
3854 ve.Component0Control = comp[0];
3855 ve.Component1Control = comp[1];
3856 ve.Component2Control = comp[2];
3857 ve.Component3Control = comp[3];
3858 #if GFX_VER < 5
3859 ve.DestinationElementOffset = i * 4;
3860 #endif
3861 }
3862
3863 #if GFX_VER == 8
3864 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3865 vi.VertexElementIndex = i;
3866 vi.InstancingEnable = state[i].instance_divisor > 0;
3867 vi.InstanceDataStepRate = state[i].instance_divisor;
3868 }
3869 #endif
3870 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3871 #if GFX_VER == 8
3872 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3873 #endif
3874 }
3875
3876 /* An alternative version of the last VE and VFI is stored so it
3877 * can be used at draw time in case Vertex Shader uses EdgeFlag
3878 */
3879 if (count) {
3880 const unsigned edgeflag_index = count - 1;
3881 const struct crocus_format_info fmt =
3882 crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3883 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3884 #if GFX_VER >= 6
3885 ve.EdgeFlagEnable = true;
3886 #endif
3887 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3888 ve.Valid = true;
3889 ve.SourceElementOffset = state[edgeflag_index].src_offset;
3890 ve.SourceElementFormat = fmt.fmt;
3891 ve.Component0Control = VFCOMP_STORE_SRC;
3892 ve.Component1Control = VFCOMP_STORE_0;
3893 ve.Component2Control = VFCOMP_STORE_0;
3894 ve.Component3Control = VFCOMP_STORE_0;
3895 }
3896 #if GFX_VER == 8
3897 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3898 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3899 * at draw time, as it should change if SGVs are emitted.
3900 */
3901 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3902 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3903 }
3904 #endif
3905 }
3906
3907 return cso;
3908 }
3909
/**
 * The pipe->bind_vertex_elements_state() driver hook.
 */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   /* NOTE: cso_changed() presumably references the old_cso/new_cso locals
    * by name - keep these identifiers as-is.
    */
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* An element-count change requires re-emitting 3DSTATE_VF_SGVS. */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
3928
3929 #if GFX_VER >= 6
struct crocus_streamout_counter {
   /* Byte window within the prim-count scratch buffer that still needs to
    * be folded into accum (see aggregate_stream_counter).
    */
   uint32_t offset_start;
   uint32_t offset_end;

   /* Running total accumulated from the recorded count deltas. */
   uint64_t accum;
};
3936
/**
 * Gallium CSO for stream output (transform feedback) targets.
 */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /** Scratch BO (and offset within it) tracking streamout progress. */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /** CPU mapping of offset_res, used to read back primitive counts. */
   void *prim_map;
   /** Counter snapshot from the previous bind, and the live counter. */
   struct crocus_streamout_counter prev_count;
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
   bool zero_offset;
#endif
};
3962
3963 #if GFX_VER >= 7
3964 static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target * so)3965 crocus_get_so_offset(struct pipe_stream_output_target *so)
3966 {
3967 struct crocus_stream_output_target *tgt = (void *)so;
3968 struct pipe_transfer *transfer;
3969 struct pipe_box box;
3970 uint32_t result;
3971 u_box_1d(tgt->offset_offset, 4, &box);
3972 void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
3973 0, PIPE_MAP_DIRECTLY,
3974 &box, &transfer);
3975 assert(val);
3976 result = *(uint32_t *)val;
3977 so->context->buffer_unmap(so->context, transfer);
3978
3979 return result / tgt->stride;
3980 }
3981 #endif
3982
3983 #if GFX_VER == 6
3984 static void
3985 compute_vertices_written_so_far(struct crocus_context *ice,
3986 struct crocus_stream_output_target *tgt,
3987 struct crocus_streamout_counter *count,
3988 uint64_t *svbi);
3989
/* Gen6 variant: derive the number of vertices written so far from the
 * accumulated primitive counters (no SO_WRITE_OFFSET register readback).
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct crocus_context *ice = (void *)so->context;

   uint64_t vert_written;
   /* Fold any pending prim-count snapshots into prev_count. */
   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
   return vert_written;
}
4000 #endif
4001
/**
 * The pipe->create_stream_output_target() driver hook.
 *
 * "Target" here refers to a destination buffer.  We translate this into
 * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
 * know which buffer this represents, or whether we ought to zero the
 * write-offsets, or append.  Those are handled in the set() hook.
 */
static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context *ctx,
                                   struct pipe_resource *p_res,
                                   unsigned buffer_offset,
                                   unsigned buffer_size)
{
   struct crocus_resource *res = (void *) p_res;
   struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
   if (!cso)
      return NULL;

   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;

   pipe_reference_init(&cso->base.reference, 1);
   pipe_resource_reference(&cso->base.buffer, p_res);
   cso->base.buffer_offset = buffer_offset;
   cso->base.buffer_size = buffer_size;
   cso->base.context = ctx;

   /* The GPU may write anywhere in the bound range; mark it valid. */
   util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
#if GFX_VER >= 7
   /* Allocate a 4-byte scratch slot used to save/restore the hardware
    * SO_WRITE_OFFSET for this target (contents don't matter yet).
    */
   struct crocus_context *ice = (struct crocus_context *) ctx;
   void *temp;
   u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
                  &cso->offset_offset,
                  (struct pipe_resource **)&cso->offset_res,
                  &temp);
#endif

   return &cso->base;
}
4042
4043 static void
crocus_stream_output_target_destroy(struct pipe_context * ctx,struct pipe_stream_output_target * state)4044 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4045 struct pipe_stream_output_target *state)
4046 {
4047 struct crocus_stream_output_target *cso = (void *) state;
4048
4049 pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4050 pipe_resource_reference(&cso->base.buffer, NULL);
4051
4052 free(cso);
4053 }
4054
4055 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
4056 #define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
4057
4058 #if GFX_VER == 6
/* Fold the pending prim-count snapshots recorded in tgt's scratch buffer
 * into counter->accum, then reset the live counter's window.
 */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   /* The GPU may still be writing snapshots; if the batch references the
    * scratch BO, flush and wait before reading it on the CPU.
    */
   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   /* Snapshots come in (before, after) pairs of 64-bit counts; sum the
    * deltas over the counter's pending window.
    */
   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   /* NOTE(review): this resets tgt->count's window even when 'counter' is
    * prev_count - confirm that's intended.
    */
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4077
/* Snapshot the GEN6_SO_NUM_PRIMS_WRITTEN register into the target's
 * scratch buffer so primitive deltas can be computed on the CPU later.
 */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   /* Lazily allocate the 4096-byte snapshot buffer. */
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* If the buffer is nearly full, fold pending snapshots into the
    * accumulators to reclaim space.
    */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush, then store the 64-bit register value at the window's end. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4101
/* Compute the number of vertices written so far to a streamout target by
 * aggregating the recorded primitive-count deltas and scaling by the
 * vertices-per-primitive of the last transform feedback draw.
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
4113 #endif
/**
 * The pipe->set_stream_output_targets() driver hook.
 *
 * At this point, we know which targets are bound to a particular index,
 * and also whether we want to append or start over.  We can finish the
 * 3DSTATE_SO_BUFFER packets we started earlier.
 */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command.  If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now.  (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Switching streamout off: flush so later consumers of the SO
          * buffers see what was written.
          */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new target references, remembering the old ones. */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

#if GFX_VER == 6
   /* Gen6 tracks streamout progress with CPU-side primitive counters
    * rather than hardware SO_WRITE_OFFSET registers.
    */
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: fold pending counts into svbi, then restart the
             * pending window.
             */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            /* Preserve the counter so a later re-bind can append. */
            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   /* Gen7+: zero or reload the hardware SO_WRITE_OFFSET registers, and
    * save them back when a target is unbound so appending can resume.
    */
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
4253
4254 #endif
4255
4256 #if GFX_VER >= 7
4257 /**
4258 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4259 * 3DSTATE_STREAMOUT packets.
4260 *
4261 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4262 * hardware to record. We can create it entirely based on the shader, with
4263 * no dynamic state dependencies.
4264 *
4265 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4266 * state-based settings. We capture the shader-related ones here, and merge
4267 * the rest in at draw time.
4268 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct brw_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   /* Next unwritten dword offset per buffer.  NOTE(review): this array is
    * indexed by buffer below, not by stream; it works because both counts
    * are four -- confirm if either limit ever changes.
    */
   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      /* The real SO_DECL for this output: which VUE slot to read and which
       * components of it to record.
       */
      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* One ralloc'd (NULL-parented; caller frees) allocation holds both
    * packets: 3DSTATE_STREAMOUT first, then the 3 header dwords plus two
    * dwords per SO_DECL_ENTRY of 3DSTATE_SO_DECL_LIST.
    */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   /* Each entry is a dword pair carrying one SO_DECL per stream. */
   for (int i = 0; i < max_decls; i++) {
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
4392 #endif
4393
4394 #if GFX_VER == 6
4395 static void
crocus_emit_so_svbi(struct crocus_context * ice)4396 crocus_emit_so_svbi(struct crocus_context *ice)
4397 {
4398 struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
4399
4400 unsigned max_vertex = 0xffffffff;
4401 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4402 struct crocus_stream_output_target *tgt =
4403 (void *) ice->state.so_target[i];
4404 if (tgt)
4405 max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
4406 }
4407
4408 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4409 svbi.IndexNumber = 0;
4410 svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
4411 svbi.MaximumIndex = max_vertex;
4412 }
4413
4414 /* initialize the rest of the SVBI's to reasonable values so that we don't
4415 * run out of room writing the regular data.
4416 */
4417 for (int i = 1; i < 4; i++) {
4418 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4419 svbi.IndexNumber = i;
4420 svbi.StreamedVertexBufferIndex = 0;
4421 svbi.MaximumIndex = 0xffffffff;
4422 }
4423 }
4424 }
4425
4426 #endif
4427
4428
4429 #if GFX_VER >= 6
4430 static bool
crocus_is_drawing_points(const struct crocus_context * ice)4431 crocus_is_drawing_points(const struct crocus_context *ice)
4432 {
4433 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4434
4435 if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4436 cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
4437 return true;
4438
4439 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4440 const struct brw_gs_prog_data *gs_prog_data =
4441 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4442 return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4443 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4444 const struct brw_tes_prog_data *tes_data =
4445 (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4446 return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
4447 } else {
4448 return ice->state.prim_mode == PIPE_PRIM_POINTS;
4449 }
4450 }
4451 #endif
4452
4453 #if GFX_VER >= 6
/**
 * Fill one SF_OUTPUT_ATTRIBUTE_DETAIL for the FS input \p fs_attr:
 * component overrides, the source VUE slot, and back-face color swizzling.
 *
 * Updates *max_source_attr to the highest VUE source slot the SF will read
 * (including the +1 slot when two-sided-color swizzling is active).
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct brw_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it.  This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined.  Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot.  In every other case, the
       * attribute override we supply doesn't matter.  So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4547
/**
 * Compute the SF/SBE attribute override table for the bound fragment
 * shader: one SF_OUTPUT_ATTRIBUTE_DETAIL per FS input (first 16 only),
 * plus the point-sprite enable bitmask and the vertex URB entry read
 * offset/length outputs.
 */
static void
calculate_attr_overrides(
   const struct crocus_context *ice,
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
   uint32_t *point_sprite_enables,
   uint32_t *urb_entry_read_length,
   uint32_t *urb_entry_read_offset)
{
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   uint32_t max_source_attr = 0;
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   int first_slot =
      brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);

   /* Each URB offset packs two varying slots */
   assert(first_slot % 2 == 0);
   *urb_entry_read_offset = first_slot / 2;
   *point_sprite_enables = 0;

   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
      const int input_index = wm_prog_data->urb_setup[fs_attr];

      /* Skip varyings the fragment shader doesn't read. */
      if (input_index < 0)
         continue;

      /* When drawing points, texcoords with coord-replace enabled (and
       * gl_PointCoord) become point sprite coordinates: they get a bit in
       * the enables mask instead of an attribute override.
       */
      bool point_sprite = false;
      if (crocus_is_drawing_points(ice)) {
         if (fs_attr >= VARYING_SLOT_TEX0 &&
             fs_attr <= VARYING_SLOT_TEX7 &&
             cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
            point_sprite = true;

         if (fs_attr == VARYING_SLOT_PNTC)
            point_sprite = true;

         if (point_sprite)
            *point_sprite_enables |= 1U << input_index;
      }

      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
      if (!point_sprite) {
         get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
                           cso_rast->cso.light_twoside, &max_source_attr);
      }

      /* The hardware can only do the overrides on 16 overrides at a
       * time, and the other up to 16 have to be lined up so that the
       * input index = the output index.  We'll need to do some
       * tweaking to make sure that's the case.
       */
      if (input_index < 16)
         attr_overrides[input_index] = attribute;
      else
         assert(attribute.SourceAttribute == input_index);
   }

   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
    *
    * "This field should be set to the minimum length required to read the
    *  maximum source attribute.  The maximum source attribute is indicated
    *  by the maximum value of the enabled Attribute # Source Attribute if
    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
    *  enable is not set.
    *  read_length = ceiling((max_source_attr + 1) / 2)
    *
    *  [errata] Corruption/Hang possible if length programmed larger than
    *  recommended"
    *
    * Similar text exists for Ivy Bridge.
    */
   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
}
4626 #endif
4627
4628 #if GFX_VER >= 7
/**
 * Emit 3DSTATE_SBE (and, on Gen8+, 3DSTATE_SBE_SWIZ) describing how the
 * SF/SBE unit feeds varyings to the fragment shader.
 */
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
   /* Gen8+: attribute overrides travel in the separate SBE_SWIZ packet. */
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
   /* Gen7: the override array is embedded directly inside 3DSTATE_SBE, so
    * alias the name onto the packet field for the shared code below.
    */
#define attr_overrides sbe.Attribute
#endif

   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;

      /* Fills attr_overrides and the three out-parameters used below. */
      calculate_attr_overrides(ice,
                               attr_overrides,
                               &point_sprite_enables,
                               &urb_entry_read_length,
                               &urb_entry_read_offset);
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#endif
   }
#if GFX_VER >= 8
   crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
   }
#endif
}
4671 #endif
4672
4673 /* ------------------------------------------------------------------- */
4674
4675 /**
4676 * Populate VS program key fields based on the current state.
4677 */
static void
crocus_populate_vs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       gl_shader_stage last_stage,
                       struct brw_vs_prog_key *key)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

   /* Lowered user clip plane constants are only needed when the VS is the
    * last pre-rasterization stage, writes a clippable position, and doesn't
    * declare gl_ClipDistance itself.
    */
   if (info->clip_distance_array_size == 0 &&
       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
       last_stage == MESA_SHADER_VERTEX)
      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;

#if GFX_VER <= 5
   /* Gen4/5: copy the edge flag whenever either face is drawn unfilled,
    * and record point-coord replacement (low 8 sprite-coord bits only).
    */
   key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
                         cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
   key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
#endif

   key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;

#if GFX_VERx10 < 75
   /* Pre-Haswell: copy per-attribute workaround flags from the bound
    * vertex-element state, walking the enabled VS inputs in bit order so
    * ve_idx tracks the vertex-element slot for each input.
    */
   uint64_t inputs_read = info->inputs_read;
   int ve_idx = 0;
   while (inputs_read) {
      int i = u_bit_scan64(&inputs_read);
      key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
      ve_idx++;
   }
#endif
}
4709
4710 /**
4711 * Populate TCS program key fields based on the current state.
4712 */
static void
crocus_populate_tcs_key(const struct crocus_context *ice,
                        struct brw_tcs_prog_key *key)
{
   /* No TCS key fields currently depend on crocus context state. */
}
4718
4719 /**
4720 * Populate TES program key fields based on the current state.
4721 */
4722 static void
crocus_populate_tes_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct brw_tes_prog_key * key)4723 crocus_populate_tes_key(const struct crocus_context *ice,
4724 const struct shader_info *info,
4725 gl_shader_stage last_stage,
4726 struct brw_tes_prog_key *key)
4727 {
4728 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4729
4730 if (info->clip_distance_array_size == 0 &&
4731 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4732 last_stage == MESA_SHADER_TESS_EVAL)
4733 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4734 }
4735
4736 /**
4737 * Populate GS program key fields based on the current state.
4738 */
4739 static void
crocus_populate_gs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct brw_gs_prog_key * key)4740 crocus_populate_gs_key(const struct crocus_context *ice,
4741 const struct shader_info *info,
4742 gl_shader_stage last_stage,
4743 struct brw_gs_prog_key *key)
4744 {
4745 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4746
4747 if (info->clip_distance_array_size == 0 &&
4748 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4749 last_stage == MESA_SHADER_GEOMETRY)
4750 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4751 }
4752
4753 static inline GLenum
compare_func_to_gl(enum pipe_compare_func pipe_func)4754 compare_func_to_gl(enum pipe_compare_func pipe_func)
4755 {
4756 static const unsigned map[] = {
4757 [PIPE_FUNC_NEVER] = GL_NEVER,
4758 [PIPE_FUNC_LESS] = GL_LESS,
4759 [PIPE_FUNC_EQUAL] = GL_EQUAL,
4760 [PIPE_FUNC_LEQUAL] = GL_LEQUAL,
4761 [PIPE_FUNC_GREATER] = GL_GREATER,
4762 [PIPE_FUNC_NOTEQUAL] = GL_NOTEQUAL,
4763 [PIPE_FUNC_GEQUAL] = GL_GEQUAL,
4764 [PIPE_FUNC_ALWAYS] = GL_ALWAYS,
4765 };
4766 return map[pipe_func];
4767 }
4768
4769 /**
4770 * Populate FS program key fields based on the current state.
4771 */
static void
crocus_populate_fs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       struct brw_wm_prog_key *key)
{
   struct crocus_screen *screen = (void *) ice->ctx.screen;
   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
   const struct crocus_blend_state *blend = ice->state.cso_blend;

#if GFX_VER < 6
   /* Gen4/5: pack depth/stencil/alpha-test/discard state into the WM "IZ"
    * lookup bits of the key.
    */
   uint32_t lookup = 0;

   if (info->fs.uses_discard || zsa->cso.alpha_enabled)
      lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;

   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;

   if (fb->zsbuf && zsa->cso.depth_enabled) {
      lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;

      if (zsa->cso.depth_writemask)
         lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;

   }
   if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
      lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
      if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
         lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
   }
   key->iz_lookup = lookup;
   key->stats_wm = ice->state.stats_wm;
#endif

   /* Classify how often lines need antialiasing: never, always, or only
    * when the triangle's front/back fill modes actually produce lines
    * (taking face culling into account).
    */
   uint32_t line_aa = BRW_WM_AA_NEVER;
   if (rast->cso.line_smooth) {
      int reduced_prim = ice->state.reduced_prim_mode;
      if (reduced_prim == PIPE_PRIM_LINES)
         line_aa = BRW_WM_AA_ALWAYS;
      else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
         if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
            line_aa = BRW_WM_AA_SOMETIMES;

            if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
                rast->cso.cull_face == PIPE_FACE_BACK)
               line_aa = BRW_WM_AA_ALWAYS;
         } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
            line_aa = BRW_WM_AA_SOMETIMES;

            if (rast->cso.cull_face == PIPE_FACE_FRONT)
               line_aa = BRW_WM_AA_ALWAYS;
         }
      }
   }
   key->line_aa = line_aa;

   key->nr_color_regions = fb->nr_cbufs;

   key->clamp_fragment_color = rast->cso.clamp_fragment_color;

   key->alpha_to_coverage = blend->cso.alpha_to_coverage;

   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;

   /* Flat shading only matters if a color varying is actually read. */
   key->flat_shade = rast->cso.flatshade &&
      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));

   key->persample_interp = rast->cso.force_persample_interp;
   key->multisample_fbo = rast->cso.multisample && fb->samples > 1;

   key->ignore_sample_mask_out = !key->multisample_fbo;
   key->coherent_fb_fetch = false; // TODO: needed?

   key->force_dual_color_blend =
      screen->driconf.dual_color_blend_by_location &&
      (blend->blend_enables & 1) && blend->dual_color_blending;

   /* TODO: Respect glHint for key->high_quality_derivatives */

#if GFX_VER <= 5
   /* NOTE(review): presumably the alpha test is emulated in the shader when
    * more than one color buffer is bound (cf. alpha_test_replicate_alpha
    * above) -- confirm against the gen4/5 WM setup.
    */
   if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
      key->alpha_test_func = compare_func_to_gl(zsa->cso.alpha_func);
      key->alpha_test_ref = zsa->cso.alpha_ref_value;
   }
#endif
}
4860
/**
 * Populate CS program key fields based on the current state.
 * (No CS key fields currently depend on crocus context state.)
 */
static void
crocus_populate_cs_key(const struct crocus_context *ice,
                       struct brw_cs_prog_key *key)
{
}
4866
/* Kernel Start Pointer for a compiled shader.  Gen4 emits it as a
 * relocation into the shader cache BO; Gen5+ just uses the offset.
 *
 * NOTE(review): the trailing ';' in the Gen4 macro makes the existing
 * `x = KSP(ice, shader);` uses expand to a harmless double semicolon, but
 * would break KSP() in expression context -- worth confirming/cleaning up.
 */
#if GFX_VER == 4
#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
#elif GFX_VER >= 5
static uint64_t
KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
{
   return shader->offset;
}
#endif
4876
4877 /* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
4878 * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
4879 * this WA on C0 stepping.
4880 *
4881 * TODO: Fill out SamplerCount for prefetching?
4882 */
4883
/* Fill the thread-dispatch fields common to the VUE shader stage packets:
 * kernel start pointer, binding table entry count, FP mode, URB dispatch
 * GRF / read offset+length, statistics, enable, and per-thread scratch
 * space when the program needs any.  `prefix` selects the stage-specific
 * URB field names; `stage` is forwarded to crocus_get_scratch_space().
 * Expects `ice`, `shader`, `prog_data`, and `vue_prog_data` in scope.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
   pkt.KernelStartPointer = KSP(ice, shader);                             \
   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
   pkt.FloatingPointMode = prog_data->use_alt_mode;                       \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      prog_data->dispatch_grf_start_reg;                                  \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
                                                                          \
   pkt.StatisticsEnable = true;                                           \
   pkt.Enable = true;                                                     \
                                                                          \
   if (prog_data->total_scratch) {                                        \
      struct crocus_bo *bo =                                              \
         crocus_get_scratch_space(ice, prog_data->total_scratch, stage);  \
      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;     \
      pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                         \
   }
4903
4904 /* ------------------------------------------------------------------- */
4905 #if GFX_VER >= 6
/* 3DSTATE_CONSTANT_* command sub-opcodes per stage (compute has no such
 * packet, hence 0).
 */
static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX] = 21,    /* 3DSTATE_CONSTANT_VS */
   [MESA_SHADER_TESS_CTRL] = 25, /* 3DSTATE_CONSTANT_HS */
   [MESA_SHADER_TESS_EVAL] = 26, /* 3DSTATE_CONSTANT_DS */
   [MESA_SHADER_GEOMETRY] = 22,  /* 3DSTATE_CONSTANT_GS */
   [MESA_SHADER_FRAGMENT] = 23,  /* 3DSTATE_CONSTANT_PS */
   [MESA_SHADER_COMPUTE] = 0,
};
4914 #endif
4915
4916 static void
emit_sized_null_surface(struct crocus_batch * batch,unsigned width,unsigned height,unsigned layers,unsigned levels,unsigned minimum_array_element,uint32_t * out_offset)4917 emit_sized_null_surface(struct crocus_batch *batch,
4918 unsigned width, unsigned height,
4919 unsigned layers, unsigned levels,
4920 unsigned minimum_array_element,
4921 uint32_t *out_offset)
4922 {
4923 struct isl_device *isl_dev = &batch->screen->isl_dev;
4924 uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4925 isl_dev->ss.align,
4926 out_offset);
4927 //TODO gen 6 multisample crash
4928 isl_null_fill_state(isl_dev, surf,
4929 .size = isl_extent3d(width, height, layers),
4930 .levels = levels,
4931 .minimum_array_element = minimum_array_element);
4932 }
/**
 * Stream out a minimal 1x1x1 null surface (zero mip levels, base array
 * element 0), storing its state offset in *out_offset.
 */
static void
emit_null_surface(struct crocus_batch *batch,
                  uint32_t *out_offset)
{
   emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
}
4939
4940 static void
emit_null_fb_surface(struct crocus_batch * batch,struct crocus_context * ice,uint32_t * out_offset)4941 emit_null_fb_surface(struct crocus_batch *batch,
4942 struct crocus_context *ice,
4943 uint32_t *out_offset)
4944 {
4945 uint32_t width, height, layers, level, layer;
4946 /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4947 if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4948 emit_null_surface(batch, out_offset);
4949 return;
4950 }
4951
4952 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4953 width = MAX2(cso->width, 1);
4954 height = MAX2(cso->height, 1);
4955 layers = cso->layers ? cso->layers : 1;
4956 level = 0;
4957 layer = 0;
4958
4959 if (cso->nr_cbufs == 0 && cso->zsbuf) {
4960 width = cso->zsbuf->width;
4961 height = cso->zsbuf->height;
4962 level = cso->zsbuf->u.tex.level;
4963 layer = cso->zsbuf->u.tex.first_layer;
4964 }
4965 emit_sized_null_surface(batch, width, height,
4966 layers, level, layer,
4967 out_offset);
4968 }
4969
/**
 * Fill a SURFACE_STATE at surf_state (which lives at addr_offset in the
 * batch) for the given resource, emitting relocations for the main BO and,
 * when aux_usage != ISL_AUX_USAGE_NONE, the auxiliary buffer.
 *
 * When adjust_surf is set, 3D-with-single-layer views, gen4 cube faces,
 * and 1D arrays are rewritten: the first two re-derive a single-image
 * surface (plus intra-tile x/y sample offsets) via isl_surf_get_image_surf,
 * the last is retyped as a 2D surface.
 */
static void
emit_surface_state(struct crocus_batch *batch,
                   struct crocus_resource *res,
                   const struct isl_surf *in_surf,
                   bool adjust_surf,
                   struct isl_view *in_view,
                   bool writeable,
                   enum isl_aux_usage aux_usage,
                   bool blend_enable,
                   uint32_t write_disables,
                   uint32_t *surf_state,
                   uint32_t addr_offset)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t reloc = RELOC_32BIT;
   uint64_t offset_B = res->offset;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (writeable)
      reloc |= RELOC_WRITE;

   /* Work on copies: the adjustments below must not mutate the caller's
    * surf/view.
    */
   struct isl_surf surf = *in_surf;
   struct isl_view view = *in_view;
   if (adjust_surf) {
      if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
         /* Single-slice 3D view: re-derive the image surface and bake the
          * level/layer into offset_B + tile offsets.
          */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, 0,
                                 view.base_array_layer,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) {
         /* Gen4 cube faces are addressed as individual images. */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, view.base_array_layer,
                                 0,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
         surf.dim = ISL_SURF_DIM_2D;
   }

   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
   struct crocus_bo *aux_bo = NULL;
   uint32_t aux_offset = 0;
   struct isl_surf *aux_surf = NULL;
   if (aux_usage != ISL_AUX_USAGE_NONE) {
      aux_surf = &res->aux.surf;
      aux_offset = res->aux.offset;
      aux_bo = res->aux.bo;

      clear_color = crocus_resource_get_clear_color(res);
   }

   isl_surf_fill_state(isl_dev, surf_state,
                       .surf = &surf,
                       .view = &view,
                       .address = crocus_state_reloc(batch,
                                                     addr_offset + isl_dev->ss.addr_offset,
                                                     res->bo, offset_B, reloc),
                       .aux_surf = aux_surf,
                       .aux_usage = aux_usage,
                       .aux_address = aux_offset,
                       .mocs = crocus_mocs(res->bo, isl_dev),
                       .clear_color = clear_color,
                       .use_clear_address = false,
                       .clear_address = 0,
                       .x_offset_sa = tile_x_sa,
                       .y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
                       .blend_enable = blend_enable,
                       .write_disables = write_disables,
#endif
                       );

   if (aux_surf) {
      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
       * contain other control information.  Since buffer addresses are always
       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
       * an ordinary reloc to do the necessary address translation.
       *
       * FIXME: move to the point of assignment.
       */
      if (devinfo->ver == 8) {
         /* Gen8 aux addresses are 64-bit. */
         uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      } else {
         uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      }
   }

}
5073
5074 static uint32_t
emit_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables)5075 emit_surface(struct crocus_batch *batch,
5076 struct crocus_surface *surf,
5077 enum isl_aux_usage aux_usage,
5078 bool blend_enable,
5079 uint32_t write_disables)
5080 {
5081 const struct intel_device_info *devinfo = &batch->screen->devinfo;
5082 struct isl_device *isl_dev = &batch->screen->isl_dev;
5083 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5084 struct isl_view *view = &surf->view;
5085 uint32_t offset = 0;
5086 enum pipe_texture_target target = res->base.b.target;
5087 bool adjust_surf = false;
5088
5089 if (devinfo->ver == 4 && target == PIPE_TEXTURE_CUBE)
5090 adjust_surf = true;
5091
5092 if (surf->align_res)
5093 res = (struct crocus_resource *)surf->align_res;
5094
5095 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5096
5097 emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5098 aux_usage, blend_enable,
5099 write_disables,
5100 surf_state, offset);
5101 return offset;
5102 }
5103
5104 static uint32_t
emit_rt_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage)5105 emit_rt_surface(struct crocus_batch *batch,
5106 struct crocus_surface *surf,
5107 enum isl_aux_usage aux_usage)
5108 {
5109 struct isl_device *isl_dev = &batch->screen->isl_dev;
5110 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5111 struct isl_view *view = &surf->read_view;
5112 uint32_t offset = 0;
5113 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5114
5115 emit_surface_state(batch, res, &surf->surf, true, view, false,
5116 aux_usage, 0, false,
5117 surf_state, offset);
5118 return offset;
5119 }
5120
5121 static uint32_t
emit_grid(struct crocus_context * ice,struct crocus_batch * batch)5122 emit_grid(struct crocus_context *ice,
5123 struct crocus_batch *batch)
5124 {
5125 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5126 uint32_t offset = 0;
5127 struct crocus_state_ref *grid_ref = &ice->state.grid_size;
5128 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5129 isl_dev->ss.align, &offset);
5130 isl_buffer_fill_state(isl_dev, surf_state,
5131 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5132 crocus_resource_bo(grid_ref->res),
5133 grid_ref->offset,
5134 RELOC_32BIT),
5135 .size_B = 12,
5136 .format = ISL_FORMAT_RAW,
5137 .stride_B = 1,
5138 .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
5139 return offset;
5140 }
5141
/**
 * Stream out a buffer SURFACE_STATE for a uniform (constant) buffer
 * binding, for pull-constant loads.
 *
 * Returns the offset of the surface state within the batch state buffer.
 */
static uint32_t
emit_ubo_buffer(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct pipe_constant_buffer *buffer)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       RELOC_32BIT),
                         /* .format = 0 differs from the RAW format used for
                          * SSBOs — presumably isl format enum value 0
                          * (a 32-bit RGBA float format) for constant loads;
                          * NOTE(review): verify against isl_format enum.
                          */
                         .size_B = buffer->buffer_size,
                         .format = 0,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}
5165
/**
 * Stream out a raw buffer SURFACE_STATE for a shader storage buffer.
 *
 * \param writeable  when true, the relocation is tagged RELOC_WRITE so the
 *                   kernel knows the GPU may write this BO.
 *
 * Returns the offset of the surface state within the batch state buffer.
 */
static uint32_t
emit_ssbo_buffer(struct crocus_context *ice,
                 struct crocus_batch *batch,
                 struct pipe_shader_buffer *buffer, bool writeable)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   uint32_t reloc = RELOC_32BIT;

   if (writeable)
      reloc |= RELOC_WRITE;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       reloc),
                         .size_B = buffer->buffer_size,
                         .format = ISL_FORMAT_RAW,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}
5192
/**
 * Stream out a SURFACE_STATE for a sampler view (texture binding).
 *
 * Buffer textures get an isl buffer surface clamped to the smallest of the
 * view size, the remaining BO size, and the per-generation maximum texel
 * count (CROCUS_MAX_TEXTURE_BUFFER_SIZE * bytes-per-texel).  Image textures
 * go through emit_surface_state() with the resource's aux usage.
 *
 * \param for_gather  use the gather-specific view (some gens need a
 *                    different channel swizzle for textureGather).
 *
 * Returns the offset of the surface state within the batch state buffer.
 */
static uint32_t
emit_sampler_view(struct crocus_context *ice,
                  struct crocus_batch *batch,
                  bool for_gather,
                  struct crocus_sampler_view *isv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);

   if (isv->base.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
      /* RAW surfaces address individual bytes; otherwise bytes per texel. */
      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      unsigned final_size =
         MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          isv->res->bo,
                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
                            .size_B = final_size,
                            .format = isv->view.format,
                            .swizzle = isv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
         );
   } else {
      enum isl_aux_usage aux_usage =
         crocus_resource_texture_aux_usage(isv->res);

      emit_surface_state(batch, isv->res, &isv->res->surf, false,
                         for_gather ? &isv->gather_view : &isv->view,
                         false, aux_usage, false,
                         0, surf_state, offset);
   }
   return offset;
}
5232
/**
 * Stream out a SURFACE_STATE for a shader image (load/store image binding).
 *
 * Three cases:
 *  - PIPE_BUFFER resources: an isl buffer surface clamped like sampler
 *    buffer views.
 *  - Image resources viewed with ISL_FORMAT_RAW: the whole BO exposed as a
 *    raw byte buffer.
 *  - Otherwise: a normal image surface via emit_surface_state(), with no
 *    aux usage.
 *
 * Write access from PIPE_IMAGE_ACCESS_WRITE tags the relocation RELOC_WRITE.
 *
 * Returns the offset of the surface state within the batch state buffer.
 */
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      /* RAW surfaces address individual bytes; otherwise bytes per texel. */
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );


      } else {
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}
5286
#if GFX_VER == 6
/**
 * Stream out a SURFACE_STATE binding one transform feedback output
 * (Gen6 streamout is done through GS writes to binding-table surfaces).
 *
 * Computes the element count so the surface covers the space remaining in
 * the bound SO buffer past the current write offset, and picks an
 * R32(_G32...)_FLOAT format matching the output's component count.
 *
 * Returns 0 (no surface) if the output index is out of range or streamout
 * is inactive; otherwise the surface state offset.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   /* Gen6 only supports stream 0. */
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   // assert((size_dwords - offset_dwords) / stride_dwords
   //        <= BRW_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer. We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow. But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif
5366
/* Iterate over every slot in a binding table group, skipping slots whose
 * binding table index is CROCUS_SURFACE_NOT_USED.  Expects a local
 * `struct crocus_binding_table *bt` in scope.
 */
#define foreach_surface_used(index, group)                      \
   for (int index = 0; index < bt->sizes[group]; index++)       \
      if (crocus_group_index_to_bti(bt, group, index) !=        \
          CROCUS_SURFACE_NOT_USED)
5371
/**
 * Fill out the binding table surface offsets for one shader stage.
 *
 * Surfaces must be emitted in the same group order the binding table was
 * laid out in: render targets (FS), RT-read surfaces, CS work groups,
 * Gen6 SOL surfaces (GS), textures, gather textures (< Gen8), images,
 * UBOs, SSBOs.  `s` walks the flat surf_offset array across all groups.
 *
 * \param ff_gs  use the fixed-function GS program (Gen6 streamout) instead
 *               of the stage's user shader; it has no crocus_shader_state.
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            /* NOTE(review): this shadows the outer `shader`; intentional,
             * it always wants the FS program here.
             */
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            /* Gen4/5 bake the colormask into the surface as write disables. */
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers: still need one null render target surface. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* Streamout info comes from the GS if present, else the VS. */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Pre-Gen8 needs separate gather surfaces with adjusted swizzles. */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5504 /* ------------------------------------------------------------------- */
5505 static uint32_t
crocus_upload_binding_table(struct crocus_context * ice,struct crocus_batch * batch,uint32_t * table,uint32_t size)5506 crocus_upload_binding_table(struct crocus_context *ice,
5507 struct crocus_batch *batch,
5508 uint32_t *table,
5509 uint32_t size)
5510
5511 {
5512 if (size == 0)
5513 return 0;
5514 return emit_state(batch, table, size, 32);
5515 }
5516
/**
 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
 *
 * Only emitted once per batch (guarded by state_base_address_emitted);
 * surrounded by the required pipeline flushes, and followed by marking the
 * pointer-reissue state dirty on gens that need it.
 */

static void
crocus_update_surface_base_address(struct crocus_batch *batch)
{
   if (batch->state_base_address_emitted)
      return;
#if GFX_VER >= 6
   uint32_t mocs = batch->screen->isl_dev.mocs.internal;
#endif
   /* The hardware requires specific flushes before changing base addresses. */
   flush_before_state_base_change(batch);

   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {

      sba.SurfaceStateBaseAddressModifyEnable = true;
      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);

#if GFX_VER >= 5
      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
#endif

      sba.GeneralStateBaseAddressModifyEnable = true;
      sba.IndirectObjectBaseAddressModifyEnable = true;
#if GFX_VER >= 5
      sba.InstructionBaseAddressModifyEnable = true;
#endif

#if GFX_VER < 8
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER >= 5 && GFX_VER < 8
      sba.IndirectObjectAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER <= 5
      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#if GFX_VER >= 6
      /* The hardware appears to pay attention to the MOCS fields even
       * if you don't set the "Address Modify Enable" bit for the base.
       */
      sba.GeneralStateMOCS = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
#if GFX_VER == 8
      sba.DynamicStateMOCS = mocs;
      sba.IndirectObjectMOCS = mocs;
      sba.InstructionMOCS = mocs;
      sba.SurfaceStateMOCS = mocs;
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.InstructionBufferSize = 0xfffff;
      sba.DynamicStateBufferSize = MAX_STATE_SIZE;

      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#endif

      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);

      /* Dynamic state upper bound. Although the documentation says that
       * programming it to zero will cause it to be ignored, that is a lie.
       * If this isn't programmed to a real bound, the sampler border color
       * pointer is rejected, causing border color to mysteriously fail.
       */
#if GFX_VER < 8
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif

#endif
   }

   flush_after_state_base_change(batch);

   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
    * 3DSTATE_PIPELINE_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * and this continues through Ironlake. The Sandy Bridge PRM, vol
    * 1 part 1 says that the following packets must be reissued:
    *
    * 3DSTATE_CC_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * 3DSTATE_SAMPLER_STATE_POINTERS
    * 3DSTATE_VIEWPORT_STATE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * Those are always reissued following SBA updates anyway (new
    * batch time), except in the case of the program cache BO
    * changing. Having a separate state flag makes the sequence more
    * obvious.
    */
#if GFX_VER <= 5
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#elif GFX_VER == 6
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
#endif
   batch->state_base_address_emitted = true;
}
5625
5626 static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state * vp,bool halfz,bool window_space_position,float * zmin,float * zmax)5627 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5628 bool window_space_position, float *zmin, float *zmax)
5629 {
5630 if (window_space_position) {
5631 *zmin = 0.f;
5632 *zmax = 1.f;
5633 return;
5634 }
5635 util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
5636 }
5637
/* Gathered push-constant buffer ranges for one shader stage, filled in by
 * setup_constant_buffers() and consumed by emit_push_constant_packets().
 */
struct push_bos {
   struct {
      struct crocus_address addr; /* GPU address of the range start */
      uint32_t length;            /* read length, in 256-bit (32B) units */
   } buffers[4];                  /* 3DSTATE_CONSTANT_* has 4 slots */
   int buffer_count;              /* number of valid entries in buffers[] */
   uint32_t max_length;           /* largest single range length seen */
};
5646
#if GFX_VER >= 6
/**
 * Gather the UBO ranges the compiler chose to push (prog_data->ubo_ranges)
 * into a push_bos struct for emit_push_constant_packets().
 *
 * Empty ranges are skipped; a missing buffer resource falls back to the
 * context's workaround BO so the hardware still has a valid address.
 */
static void
setup_constant_buffers(struct crocus_context *ice,
                       struct crocus_batch *batch,
                       int stage,
                       struct push_bos *push_bos)
{
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;

   uint32_t push_range_sum = 0;

   int n = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];

      if (range->length == 0)
         continue;

      push_range_sum += range->length;

      if (range->length > push_bos->max_length)
         push_bos->max_length = range->length;

      /* Range block is a binding table index, map back to UBO index. */
      unsigned block_index = crocus_bti_to_group_index(
         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
      assert(block_index != CROCUS_SURFACE_NOT_USED);

      struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
      struct crocus_resource *res = (void *) cbuf->buffer;

      /* Push ranges are in 32-byte units and must start aligned. */
      assert(cbuf->buffer_offset % 32 == 0);

      push_bos->buffers[n].length = range->length;
      push_bos->buffers[n].addr =
         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
         : ro_bo(batch->ice->workaround_bo,
                 batch->ice->workaround_offset);
      n++;
   }

   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
    *
    *    "The sum of all four read length fields must be less than or
    *    equal to the size of 64."
    */
   assert(push_range_sum <= 64);

   push_bos->buffer_count = n;
}
5699
#if GFX_VER == 7
/**
 * Gen7 (Ivybridge) workaround: emit a depth-stall pipe control with a
 * post-sync write to the scratch workaround BO before certain VS state
 * changes.
 */
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   assert(devinfo->ver == 7);
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif
5715
/**
 * Emit the 3DSTATE_CONSTANT_* packet for a stage from the gathered
 * push_bos ranges.
 *
 * The packet is emitted as CONSTANT_VS and re-targeted to the right stage
 * via push_constant_opcodes[stage].  Gen7 requires the VS workaround flush
 * first (except Haswell and Baytrail).
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;

#if GFX_VER == 7
   if (stage == MESA_SHADER_VERTEX) {
      if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6 only supports a single constant buffer slot. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            /* Read length is encoded as length - 1. */
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}

#endif
5774
/* Per-generation container for depth/stencil state: Gen8 packs it into the
 * 3DSTATE_WM_DEPTH_STENCIL packet, Gen6-7 use a DEPTH_STENCIL_STATE object,
 * and Gen4-5 fold it into COLOR_CALC_STATE.  All share the field names
 * set_depth_stencil_bits() writes.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
#endif
5782
/**
 * Translate the bound pipe_depth_stencil_alpha state into the shared
 * depth/stencil fields of DEPTH_STENCIL_GENXML (see typedef above).
 *
 * stencil[0] is the front face, stencil[1] the back face.
 */
static inline void
set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
{
   struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
   ds->DepthTestEnable = cso->cso.depth_enabled;
   ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
   ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);

   ds->StencilFailOp = cso->cso.stencil[0].fail_op;
   ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
   ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
   ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);

   ds->StencilTestMask = cso->cso.stencil[0].valuemask;
   ds->StencilWriteMask = cso->cso.stencil[0].writemask;

   ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
   ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
   ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
   ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);

   ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
   ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
   ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
   ds->StencilTestEnable = cso->cso.stencil[0].enabled;
   /* Only enable stencil writes if some face's writemask can change bits. */
   ds->StencilBufferWriteEnable =
      cso->cso.stencil[0].writemask != 0 ||
      (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
}
5812
/**
 * Pack one VERTEX_BUFFER_STATE into *map and advance the map pointer past
 * it, handling the per-generation field differences (BufferSize on Gen8+,
 * EndAddress and access type on older gens).
 *
 * \param step_rate  instance data step rate; nonzero selects per-instance
 *                   data on pre-Gen8.
 * \param map        in/out cursor into the 3DSTATE_VERTEX_BUFFERS payload.
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      /* EndAddress is inclusive: last valid byte, not one past. */
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5847
#if GFX_VER >= 6
/**
 * Compute the sample mask for 3DSTATE_SAMPLE_MASK: the user-provided
 * sample mask restricted to the samples that actually exist in the
 * current framebuffer.  Single-sampled targets always use sample 0.
 */
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
   const uint32_t samples = ice->state.framebuffer.samples;

   if (samples <= 1)
      return 1;

   const uint32_t valid_samples = (1 << samples) - 1;
   return ice->state.sample_mask & valid_samples;
}
#endif
5861
5862 static void
crocus_upload_dirty_render_state(struct crocus_context * ice,struct crocus_batch * batch,const struct pipe_draw_info * draw)5863 crocus_upload_dirty_render_state(struct crocus_context *ice,
5864 struct crocus_batch *batch,
5865 const struct pipe_draw_info *draw)
5866 {
5867 uint64_t dirty = ice->state.dirty;
5868 uint64_t stage_dirty = ice->state.stage_dirty;
5869
5870 if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5871 !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5872 return;
5873
5874 if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5875 crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5876 vf.StatisticsEnable = true;
5877 }
5878 }
5879
5880 #if GFX_VER <= 5
5881 if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5882 CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5883 bool ret = calculate_curbe_offsets(batch);
5884 if (ret) {
5885 dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5886 stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5887 }
5888 }
5889
5890 if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5891 stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5892 bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5893 brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5894 ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5895 if (ret) {
5896 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5897 stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5898 }
5899 }
5900 #endif
5901 if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5902 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5903 uint32_t cc_vp_address;
5904
5905 /* XXX: could avoid streaming for depth_clip [0,1] case. */
5906 uint32_t *cc_vp_map =
5907 stream_state(batch,
5908 4 * ice->state.num_viewports *
5909 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5910 for (int i = 0; i < ice->state.num_viewports; i++) {
5911 float zmin, zmax;
5912 crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5913 ice->state.window_space_position,
5914 &zmin, &zmax);
5915 if (cso_rast->cso.depth_clip_near)
5916 zmin = 0.0;
5917 if (cso_rast->cso.depth_clip_far)
5918 zmax = 1.0;
5919
5920 crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5921 ccv.MinimumDepth = zmin;
5922 ccv.MaximumDepth = zmax;
5923 }
5924
5925 cc_vp_map += GENX(CC_VIEWPORT_length);
5926 }
5927
5928 #if GFX_VER >= 7
5929 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5930 ptr.CCViewportPointer = cc_vp_address;
5931 }
5932 #elif GFX_VER == 6
5933 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5934 vp.CCViewportStateChange = 1;
5935 vp.PointertoCC_VIEWPORT = cc_vp_address;
5936 }
5937 #else
5938 ice->state.cc_vp_address = cc_vp_address;
5939 dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5940 #endif
5941 }
5942
5943 if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5944 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5945 #if GFX_VER >= 7
5946 uint32_t sf_cl_vp_address;
5947 uint32_t *vp_map =
5948 stream_state(batch,
5949 4 * ice->state.num_viewports *
5950 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5951 #else
5952 uint32_t *vp_map =
5953 stream_state(batch,
5954 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5955 32, &ice->state.sf_vp_address);
5956 uint32_t *clip_map =
5957 stream_state(batch,
5958 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5959 32, &ice->state.clip_vp_address);
5960 #endif
5961
   /* Build and upload viewport transform state for every active viewport,
    * then (re)point the hardware at it.  Gen7+ packs SF and CLIP viewport
    * data into one SF_CLIP_VIEWPORT element; Gen6 and earlier use separate
    * SF_VIEWPORT and CLIP_VIEWPORT arrays (vp_map / clip_map advance per
    * iteration accordingly).
    */
   for (unsigned i = 0; i < ice->state.num_viewports; i++) {
      const struct pipe_viewport_state *state = &ice->state.viewports[i];
      float gb_xmin, gb_xmax, gb_ymin, gb_ymax;

#if GFX_VER == 8
      /* Gen8 programs explicit viewport extents: evaluate the viewport
       * transform at the NDC corners (+/-1 on each axis).
       */
      float vp_xmin = viewport_extent(state, 0, -1.0f);
      float vp_xmax = viewport_extent(state, 0, 1.0f);
      float vp_ymin = viewport_extent(state, 1, -1.0f);
      float vp_ymax = viewport_extent(state, 1, 1.0f);
#endif
      /* Guardband: region where the clipper may use trivial accept;
       * derived from framebuffer size and the viewport scale/translate.
       */
      intel_calculate_guardband_size(cso_fb->width, cso_fb->height,
                                     state->scale[0], state->scale[1],
                                     state->translate[0], state->translate[1],
                                     &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
#if GFX_VER >= 7
      crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
#else
      crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
#endif
      {
         /* Viewport matrix: scale on the diagonal (m00/m11/m22),
          * translate in the bottom row (m30/m31/m32).
          */
         vp.ViewportMatrixElementm00 = state->scale[0];
         vp.ViewportMatrixElementm11 = state->scale[1];
         vp.ViewportMatrixElementm22 = state->scale[2];
         vp.ViewportMatrixElementm30 = state->translate[0];
         vp.ViewportMatrixElementm31 = state->translate[1];
         vp.ViewportMatrixElementm32 = state->translate[2];
#if GFX_VER < 6
         /* Pre-gen6 embeds scissor rect 0 directly in the SF viewport
          * rather than using separate scissor state.
          */
         struct pipe_scissor_state scissor;
         crocus_fill_scissor_rect(ice, 0, &scissor);
         vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
         vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
         vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
         vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
#endif

#if GFX_VER >= 7
         vp.XMinClipGuardband = gb_xmin;
         vp.XMaxClipGuardband = gb_xmax;
         vp.YMinClipGuardband = gb_ymin;
         vp.YMaxClipGuardband = gb_ymax;
#endif
#if GFX_VER == 8
         /* Clamp extents to the framebuffer; the Max fields are
          * inclusive, hence the "- 1".
          */
         vp.XMinViewPort = MAX2(vp_xmin, 0);
         vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
         vp.YMinViewPort = MAX2(vp_ymin, 0);
         vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
#endif
      }
#if GFX_VER < 7
      /* Pre-gen7 keeps the guardband in a separate CLIP_VIEWPORT. */
      crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
         clip.XMinClipGuardband = gb_xmin;
         clip.XMaxClipGuardband = gb_xmax;
         clip.YMinClipGuardband = gb_ymin;
         clip.YMaxClipGuardband = gb_ymax;
      }
#endif
#if GFX_VER >= 7
      vp_map += GENX(SF_CLIP_VIEWPORT_length);
#else
      vp_map += GENX(SF_VIEWPORT_length);
      clip_map += GENX(CLIP_VIEWPORT_length);
#endif
   }
#if GFX_VER >= 7
   /* Point the hardware at the freshly streamed viewport array. */
   crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
      ptr.SFClipViewportPointer = sf_cl_vp_address;
   }
#elif GFX_VER == 6
   crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
      vp.SFViewportStateChange = 1;
      vp.CLIPViewportStateChange = 1;
      vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
      vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
   }
#endif
6037 }
6038
6039 #if GFX_VER >= 6
   /* (Re)partition the URB (Unified Return Buffer) whenever the per-stage
    * VUE entry sizes or the set of active stages may have changed.
    */
   if (dirty & CROCUS_DIRTY_GEN6_URB) {
#if GFX_VER == 6
      /* Gen6: only VS and (possibly fixed-function) GS share the URB. */
      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
         || ice->shaders.ff_gs_prog;

      struct brw_vue_prog_data *vue_prog_data =
         (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
      const unsigned vs_size = vue_prog_data->urb_entry_size;
      unsigned gs_size = vs_size;
      if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
         struct brw_vue_prog_data *gs_vue_prog_data =
            (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
         gs_size = gs_vue_prog_data->urb_entry_size;
      }

      genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
#endif
#if GFX_VER >= 7
      const struct intel_device_info *devinfo = &batch->screen->devinfo;
      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
      bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
      unsigned entry_size[4];

      /* Gather VUE entry sizes for VS..GS; unbound stages still need a
       * non-zero (1) size for intel_get_urb_config().
       */
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
         if (!ice->shaders.prog[i]) {
            entry_size[i] = 1;
         } else {
            struct brw_vue_prog_data *vue_prog_data =
               (void *) ice->shaders.prog[i]->prog_data;
            entry_size[i] = vue_prog_data->urb_entry_size;
         }
         assert(entry_size[i] != 0);
      }

      /* If we're just switching between programs with the same URB requirements,
       * skip the rest of the logic.
       */
      bool no_change = false;
      if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
          ice->urb.gs_present == gs_present &&
          ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
          ice->urb.tess_present == tess_present &&
          ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
          ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
         no_change = true;
      }

      if (!no_change) {
         /* Cache the new configuration for the comparison above. */
         ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
         ice->urb.gs_present = gs_present;
         ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
         ice->urb.tess_present = tess_present;
         ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
         ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];

         unsigned entries[4];
         unsigned start[4];
         bool constrained;
         intel_get_urb_config(devinfo,
                              batch->screen->l3_config_3d,
                              tess_present,
                              gs_present,
                              entry_size,
                              entries, start, NULL, &constrained);

#if GFX_VER == 7
         /* IVB (not HSW or BYT) needs a workaround flush before URB
          * reprogramming.
          */
         if (GFX_VERx10 < 75 && !devinfo->is_baytrail)
            gen7_emit_vs_workaround_flush(batch);
#endif
         /* 3DSTATE_URB_{VS,HS,DS,GS} share one layout; bump the
          * subopcode to select the stage.
          */
         for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
            crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
               urb._3DCommandSubOpcode += i;
               urb.VSURBStartingAddress = start[i];
               urb.VSURBEntryAllocationSize = entry_size[i] - 1;
               urb.VSNumberofURBEntries = entries[i];
            }
         }
      }
#endif
   }
6120
   /* Stream a fresh BLEND_STATE array (per-RT entries, plus a shared
    * header on Gen8+) and point the 3DSTATE packet at it.
    */
   if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
      struct crocus_blend_state *cso_blend = ice->state.cso_blend;
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;

      STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
      int rt_dwords =
         MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
#if GFX_VER >= 8
      rt_dwords += GENX(BLEND_STATE_length);
#endif
      uint32_t blend_offset;
      uint32_t *blend_map =
         stream_state(batch,
                      4 * rt_dwords, 64, &blend_offset);

      /* On Gen8+ the shared (non-per-RT) bits live in the BLEND_STATE
       * header, named "be"; on earlier gens they are replicated into
       * every BLEND_STATE_ENTRY, so "be" is #defined to the entry and
       * the outer scope becomes the per-RT loop instead.
       */
#if GFX_VER >= 8
      struct GENX(BLEND_STATE) be = { 0 };
      {
#else
      for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#define be entry
#endif

         be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
         be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
         be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
         be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
         be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
         be.ColorDitherEnable = cso_blend->cso.dither;

         /* Per-render-target loop on Gen8+; on older gens the loop is
          * the outer scope above and this is a plain block.
          */
#if GFX_VER >= 8
         for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
            struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#else
         {
#endif
            /* Without independent blend, RT 0's state applies to all. */
            const struct pipe_rt_blend_state *rt =
               &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];

            be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
               be.IndependentAlphaBlendEnable;

            if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
               entry.LogicOpEnable = cso_blend->cso.logicop_enable;
               entry.LogicOpFunction = cso_blend->cso.logicop_func;
            }

            entry.ColorClampRange = COLORCLAMP_RTFORMAT;
            entry.PreBlendColorClampEnable = true;
            entry.PostBlendColorClampEnable = true;

            entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
            entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
            entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
            entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);

            /* Gen8+: entries follow the BLEND_STATE header dword. */
#if GFX_VER >= 8
            GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
#else
            GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
#endif
         }
      }
#if GFX_VER >= 8
      GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
#endif
#if GFX_VER < 7
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.PointertoBLEND_STATE = blend_offset;
         ptr.BLEND_STATEChange = true;
      }
#else
      crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
         ptr.BlendStatePointer = blend_offset;
#if GFX_VER >= 8
         ptr.BlendStatePointerValid = true;
#endif
      }
#endif
   }
6203 #endif
6204
   /* COLOR_CALC_STATE: alpha test reference, blend constant color and
    * stencil reference values (plus, pre-gen6, the depth/stencil/blend
    * state that later gens moved to separate structures).
    */
   if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
      struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
      UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
      uint32_t cc_offset;
      void *cc_map =
         stream_state(batch,
                      sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
                      64, &cc_offset);
#if GFX_VER <= 5
      /* Old gens reference the CC unit via pipelined state pointers. */
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
#endif
      _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
         cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;

#if GFX_VER <= 5

         set_depth_stencil_bits(ice, &cc);

         if (cso_blend->cso.logicop_enable) {
            if (can_emit_logic_op(ice)) {
               cc.LogicOpEnable = cso_blend->cso.logicop_enable;
               cc.LogicOpFunction = cso_blend->cso.logicop_func;
            }
         }
         cc.ColorDitherEnable = cso_blend->cso.dither;

         cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);

         /* NOTE(review): alpha test is only programmed here when at
          * most one color buffer is bound — presumably the multi-RT
          * case is handled elsewhere; confirm.
          */
         if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
            cc.AlphaTestEnable = cso->cso.alpha_enabled;
            cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
         }
         cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
         cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
#else
         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
         cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;

         cc.BlendConstantColorRed = ice->state.blend_color.color[0];
         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
         cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
#endif
         cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
         cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
      }
      ice->shaders.cc_offset = cc_offset;
#if GFX_VER >= 6
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = cc_offset;
#if GFX_VER != 7
         ptr.ColorCalcStatePointerValid = true;
#endif
      }
#endif
   }
#if GFX_VER <= 5
   /* Pre-gen6: the blend constant color has its own dedicated packet. */
   if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
      crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
         blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
         blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
         blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
         blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
      }
   }
#endif
   /* Re-upload push constants for each render stage whose constants are
    * dirty: refresh system values first, then either emit the gen7+
    * push-constant packets or (pre-gen6) mark the CURBE dirty so it is
    * rebuilt later.
    */
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
         continue;

      struct crocus_shader_state *shs = &ice->state.shaders[stage];
      struct crocus_compiled_shader *shader = ice->shaders.prog[stage];

      if (!shader)
         continue;

      if (shs->sysvals_need_upload)
         upload_sysvals(ice, stage);

#if GFX_VER <= 5
      dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
#if GFX_VER >= 7
      struct push_bos push_bos = {};
      setup_constant_buffers(ice, batch, stage, &push_bos);

      emit_push_constant_packets(ice, batch, stage, &push_bos);
#endif
   }
6296
   /* Rebuild and upload binding tables for dirty render stages.  On
    * Gen6 the fixed-function GS program (ff_gs_prog) also gets a table
    * even though no user geometry shader is bound.
    */
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
         if (ice->shaders.prog[stage]) {
#if GFX_VER <= 6
            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#endif
            crocus_populate_binding_table(ice, batch, stage, false);
            ice->shaders.prog[stage]->bind_bo_offset =
               crocus_upload_binding_table(ice, batch,
                                           ice->shaders.prog[stage]->surf_offset,
                                           ice->shaders.prog[stage]->bt.size_bytes);

#if GFX_VER >= 7
            /* 3DSTATE_BINDING_TABLE_POINTERS_{VS..PS} share a layout;
             * subopcode 38 is the VS variant, later stages follow.
             */
            crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
               ptr._3DCommandSubOpcode = 38 + stage;
               ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
            }
#endif
#if GFX_VER == 6
         } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
            crocus_populate_binding_table(ice, batch, stage, true);
            ice->shaders.ff_gs_prog->bind_bo_offset =
               crocus_upload_binding_table(ice, batch,
                                           ice->shaders.ff_gs_prog->surf_offset,
                                           ice->shaders.ff_gs_prog->bt.size_bytes);
#endif
         }
      }
   }
#if GFX_VER <= 6
   /* Pre-gen7: a single packet carries the VS/GS/PS binding table
    * pointers; fall back to the fixed-function GS table when no user
    * geometry shader is bound.
    */
   if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
      struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
      if (gs == NULL)
         gs = ice->shaders.ff_gs_prog;
      crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
         ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
         ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
#if GFX_VER == 6
         ptr.VSBindingTableChange = true;
         ptr.PSBindingTableChange = true;
         ptr.GSBindingTableChange = gs ? true : false;
         ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
#endif
      }
   }
#endif
6344
   /* Upload sampler state tables for dirty stages.  Gen7+ points each
    * stage at its table immediately; Gen6 instead re-emits the combined
    * 3DSTATE_SAMPLER_STATE_POINTERS packet below if anything changed.
    */
   bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
          !ice->shaders.prog[stage])
         continue;

      crocus_upload_sampler_states(ice, batch, stage);

      sampler_updates = true;

#if GFX_VER >= 7
      struct crocus_shader_state *shs = &ice->state.shaders[stage];

      /* Subopcode 43 is the VS variant; later stages follow in order. */
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
         ptr._3DCommandSubOpcode = 43 + stage;
         ptr.PointertoVSSamplerState = shs->sampler_offset;
      }
#endif
   }

   if (sampler_updates) {
#if GFX_VER == 6
      /* Gen6: one packet updates VS/GS/PS sampler pointers, with a
       * per-stage "Change" bit gating which ones the hardware reloads.
       */
      struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
      struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
      struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
         if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
            ptr.VSSamplerStateChange = true;
            ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
         }
         if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
            ptr.GSSamplerStateChange = true;
            ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
         }
         if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
            ptr.PSSamplerStateChange = true;
            ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
         }
      }
#endif
   }
6392
#if GFX_VER >= 6
   /* 3DSTATE_MULTISAMPLE: pixel location, sample count, and (gen6/7)
    * the standard sample positions for the bound sample count.
    */
   if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
      crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
         ms.PixelLocation =
            ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
         /* NumberofMultisamples is the log2 of the sample count. */
         if (ice->state.framebuffer.samples > 0)
            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
#if GFX_VER == 6
         /* Gen6 always uses the 4x sample position pattern here. */
         INTEL_SAMPLE_POS_4X(ms.Sample);
#elif GFX_VER == 7
         switch (ice->state.framebuffer.samples) {
         case 1:
            INTEL_SAMPLE_POS_1X(ms.Sample);
            break;
         case 2:
            INTEL_SAMPLE_POS_2X(ms.Sample);
            break;
         case 4:
            INTEL_SAMPLE_POS_4X(ms.Sample);
            break;
         case 8:
            INTEL_SAMPLE_POS_8X(ms.Sample);
            break;
         default:
            break;
         }
#endif
      }
   }

   if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
         ms.SampleMask = determine_sample_mask(ice);
      }
   }
#endif
6429
#if GFX_VER >= 7
   /* 3DSTATE_PS (plus Gen8's 3DSTATE_PS_EXTRA): program the fragment
    * shader kernels, dispatch widths, thread limits, and scratch space.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
      struct brw_stage_prog_data *prog_data = shader->prog_data;
      struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;

      crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {

         /* Initialize the execution mask with VMask. Otherwise, derivatives are
          * incorrect for subspans where some of the pixels are unlit. We believe
          * the bit just didn't take effect in previous generations.
          */
         ps.VectorMaskEnable = GFX_VER >= 8;

         ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
         ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
         ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;

         /* GRF start registers and kernel entry points for each of the
          * (up to three) enabled dispatch widths.
          */
         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

         ps.KernelStartPointer0 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
         ps.KernelStartPointer1 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
         ps.KernelStartPointer2 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);

#if GFX_VERx10 == 75
         ps.SampleMask = determine_sample_mask(ice);
#endif
         // XXX: WABTPPrefetchDisable, see above, drop at C0
         ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
         ps.FloatingPointMode = prog_data->use_alt_mode;
#if GFX_VER >= 8
         ps.MaximumNumberofThreadsPerPSD = 64 - 2;
#else
         ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
#endif

         ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;

#if GFX_VER < 8
         ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
         ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
         ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
#endif
         /* From the documentation for this packet:
          * "If the PS kernel does not need the Position XY Offsets to
          * compute a Position Value, then this field should be programmed
          * to POSOFFSET_NONE."
          *
          * "SW Recommendation: If the PS kernel needs the Position Offsets
          * to compute a Position XY value, this field should match Position
          * ZW Interpolation Mode to ensure a consistent position.xyzw
          * computation."
          *
          * We only require XY sample offsets. So, this recommendation doesn't
          * look useful at the moment. We might need this in future.
          */
         ps.PositionXYOffsetSelect =
            wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;

         if (wm_prog_data->base.total_scratch) {
            /* PerThreadScratchSpace is a power-of-two encoding:
             * ffs(2^k) == k + 1, so 1KB maps to 0.
             */
            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
            ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
            ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }
      }
#if GFX_VER == 8
      const struct shader_info *fs_info =
         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
      crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
         psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
         psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
         psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
         psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
         psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;

         /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
         if (wm_prog_data->uses_sample_mask)
            psx.PixelShaderUsesInputCoverageMask = true;

         psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;

         /* The stricter cross-primitive coherency guarantees that the hardware
          * gives us with the "Accesses UAV" bit set for at least one shader stage
          * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
          * are redundant within the current image, atomic counter and SSBO GL
          * APIs, which all have very loose ordering and coherency requirements
          * and generally rely on the application to insert explicit barriers when
          * a shader invocation is expected to see the memory writes performed by
          * the invocations of some previous primitive. Regardless of the value
          * of "UAV coherency required", the "Accesses UAV" bits will implicitly
          * cause an in most cases useless DC flush when the lowermost stage with
          * the bit set finishes execution.
          *
          * It would be nice to disable it, but in some cases we can't because on
          * Gfx8+ it also has an influence on rasterization via the PS UAV-only
          * signal (which could be set independently from the coherency mechanism
          * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
          * determine whether the hardware skips execution of the fragment shader
          * or not via the ThreadDispatchEnable signal. However if we know that
          * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
          * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
          * difference so we may just disable it here.
          *
          * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
          * take into account KillPixels when no depth or stencil writes are
          * enabled. In order for occlusion queries to work correctly with no
          * attachments, we need to force-enable here.
          *
          */
         if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
             !(has_writeable_rt(ice->state.cso_blend, fs_info)))
            psx.PixelShaderHasUAV = true;
      }
#endif
   }
#endif
6556
#if GFX_VER >= 7
   /* Stream output (transform feedback): program the SO buffers, replay
    * the pre-packed SO_DECL list, and merge dynamic bits into
    * 3DSTATE_STREAMOUT.  When SO is inactive, a zeroed 3DSTATE_STREAMOUT
    * disables the unit.
    */
   if (ice->state.streamout_active) {
      if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
         for (int i = 0; i < 4; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];

            /* Unbound slots still get a packet to disable the buffer. */
            if (!tgt) {
               crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
                  sob.SOBufferIndex = i;
               }
               continue;
            }
            struct crocus_resource *res = (void *) tgt->base.buffer;
            uint32_t start = tgt->base.buffer_offset;
#if GFX_VER < 8
            uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
#endif
            crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
               sob.SOBufferIndex = i;

               sob.SurfaceBaseAddress = rw_bo(res->bo, start);
#if GFX_VER < 8
               sob.SurfacePitch = tgt->stride;
               sob.SurfaceEndAddress = rw_bo(res->bo, end);
#else
               sob.SOBufferEnable = true;
               sob.StreamOffsetWriteEnable = true;
               sob.StreamOutputBufferOffsetAddressEnable = true;
               sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);

               /* SurfaceSize is in dwords, minus one. */
               sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
               sob.StreamOutputBufferOffsetAddress =
                  rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
               /* 0 resets the write offset for a fresh binding;
                * 0xFFFFFFFF tells the hardware to keep the current
                * offset so subsequent draws append.
                */
               if (tgt->zero_offset) {
                  sob.StreamOffset = 0;
                  tgt->zero_offset = false;
               } else
                  sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
#endif
            }
         }
      }

      if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
         /* The SO_DECL list is pre-packed right after 3DSTATE_STREAMOUT
          * in the CSO; its length lives in the low byte of dword 0.
          */
         uint32_t *decl_list =
            ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
         crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
      }

      if (dirty & CROCUS_DIRTY_STREAMOUT) {
         const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

         /* Merge the dynamic bits with the pre-packed streamout CSO. */
         uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
         crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
            sol.SOFunctionEnable = true;
            sol.SOStatisticsEnable = true;

            /* Keep rasterizing during discard if a "primitives
             * generated" query needs the statistics.
             */
            sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
                                   !ice->state.prims_generated_query_active;
            sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
         }

         assert(ice->state.streamout);

         crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
                           GENX(3DSTATE_STREAMOUT_length));
      }
   } else {
      if (dirty & CROCUS_DIRTY_STREAMOUT) {
         crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
      }
   }
#endif
#if GFX_VER == 6
   /* Gen6 tracks stream-output write offsets via SVBI instead. */
   if (ice->state.streamout_active) {
      if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
         crocus_emit_so_svbi(ice);
      }
   }
#endif
6638
   /* Clipper state: pre-gen6 programs the clip kernel via CLIP_STATE;
    * gen6+ merges dynamic bits into the rasterizer CSO's pre-packed
    * 3DSTATE_CLIP.
    */
   if (dirty & CROCUS_DIRTY_CLIP) {
#if GFX_VER < 6
      const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
      struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;

      uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
         clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
         clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
         clip.SingleProgramFlow = true;
         clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;

         clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
         clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;

         /* CURBE offsets are in units of 16 bytes (hence "* 2"). */
         clip.DispatchGRFStartRegisterForURBData = 1;
         clip.VertexURBEntryReadOffset = 0;
         clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;

         clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
         clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;

         if (batch->ice->urb.nr_clip_entries >= 10) {
            /* Half of the URB entries go to each thread, and it has to be an
             * even number.
             */
            assert(batch->ice->urb.nr_clip_entries % 2 == 0);

            /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
             * only 2 threads can output VUEs at a time.
             */
            clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
         } else {
            assert(batch->ice->urb.nr_clip_entries >= 5);
            clip.MaximumNumberofThreads = 1 - 1;
         }
         clip.VertexPositionSpace = VPOS_NDCSPACE;
         clip.UserClipFlagsMustClipEnable = true;
         clip.GuardbandClipTestEnable = true;

         clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
         clip.ScreenSpaceViewportXMin = -1.0;
         clip.ScreenSpaceViewportXMax = 1.0;
         clip.ScreenSpaceViewportYMin = -1.0;
         clip.ScreenSpaceViewportYMax = 1.0;
         clip.ViewportXYClipTestEnable = true;
         clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);

#if GFX_VER == 5 || GFX_VERx10 == 45
         clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
#else
         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
          * workaround.
          */
         clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
#endif

         clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
         clip.GuardbandClipTestEnable = true;

         clip.ClipMode = clip_prog_data->clip_mode;
#if GFX_VERx10 == 45
         clip.NegativeWClipTestEnable = true;
#endif
      }

#else //if GFX_VER >= 6
      struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
      const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* With a GS or TES, the final topology comes from that stage, not
       * from the input primitive type.
       */
      bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
                       ice->shaders.prog[MESA_SHADER_TESS_EVAL];
      bool points_or_lines = cso_rast->fill_mode_point_or_line ||
         (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
                    : ice->state.prim_is_points_or_lines);
      uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
      crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
         cl.StatisticsEnable = ice->state.statistics_counters_enabled;
         if (cso_rast->cso.rasterizer_discard)
            cl.ClipMode = CLIPMODE_REJECT_ALL;
         else if (ice->state.window_space_position)
            cl.ClipMode = CLIPMODE_ACCEPT_ALL;
         else
            cl.ClipMode = CLIPMODE_NORMAL;

         cl.PerspectiveDivideDisable = ice->state.window_space_position;
         /* NOTE(review): XY clip test disabled for point/line topologies
          * — presumably so wide points/lines aren't discarded at the
          * viewport edge; confirm against the PRM.
          */
         cl.ViewportXYClipTestEnable = !points_or_lines;

         cl.UserClipDistanceCullTestEnableBitmask =
            brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;

         if (wm_prog_data->barycentric_interp_modes &
             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
            cl.NonPerspectiveBarycentricEnable = true;

         cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
         cl.MaximumVPIndex = ice->state.num_viewports - 1;
      }
      crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
                        ARRAY_SIZE(cso_rast->clip));
#endif
   }
6742
   /* Vertex shader stage: 3DSTATE_VS on gen6+, streamed VS_STATE on
    * earlier gens (referenced via pipelined pointers).
    */
   if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
      const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
      const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
#if GFX_VER == 7
      /* IVB requires a workaround flush before touching VS state. */
      if (batch->screen->devinfo.is_ivybridge)
         gen7_emit_vs_workaround_flush(batch);
#endif


#if GFX_VER == 6
      /* Gen6 emits VS push constants here rather than in the shared
       * per-stage constants loop above.
       */
      struct push_bos push_bos = {};
      setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);

      emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
#endif
#if GFX_VER >= 6
      crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
#else
      uint32_t *vs_ptr = stream_state(batch,
                                      GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
#endif
      {
         INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);

         vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;

#if GFX_VER < 6
         vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
         vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
         vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;

         /* Gen5 encodes the URB entry count divided by 4. */
         vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
         vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;

         /* Overrides the devinfo-based value above: cap threads at half
          * the available URB entries (each thread needs entries free).
          */
         vs.MaximumNumberofThreads =
            CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
         vs.StatisticsEnable = false;
         vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
#endif
#if GFX_VER == 5
         /* Force single program flow on Ironlake. We cannot reliably get
          * all applications working without it. See:
          * https://bugs.freedesktop.org/show_bug.cgi?id=29172
          *
          * The most notable and reliably failing application is the Humus
          * demo "CelShading"
          */
         vs.SingleProgramFlow = true;
         vs.SamplerCount = 0; /* hardware requirement */

#endif
#if GFX_VER >= 8
         vs.SIMD8DispatchEnable =
            vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;

         vs.UserClipDistanceCullTestEnableBitmask =
            vue_prog_data->cull_distance_mask;
#endif
      }

#if GFX_VER == 6
      crocus_emit_pipe_control_flush(batch,
                                     "post VS const",
                                     PIPE_CONTROL_DEPTH_STALL |
                                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                     PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
   }
6814
   /* Geometry shader state: 3DSTATE_GS on gen6+, indirect GS_STATE on gen4/5.
    * Also handles the pre-gen7 fixed-function GS program used for transform
    * feedback when no user GS is bound.
    */
   if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
      /* User GS programs only exist on gen6+; "active" means a real user GS
       * is bound, as opposed to the fixed-function ff_gs_prog handled below.
       */
      bool active = GFX_VER >= 6 && shader;
#if GFX_VER == 6
      /* Gen6 pushes GS constants via its own push-constant packets. */
      struct push_bos push_bos = {};
      if (shader)
         setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);

      emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
#endif
#if GFX_VERx10 == 70
      /**
       * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
       * Geometry > Geometry Shader > State:
       *
       * "Note: Because of corruption in IVB:GT2, software needs to flush the
       * whole fixed function pipeline when the GS enable changes value in
       * the 3DSTATE_GS."
       *
       * The hardware architects have clarified that in this context "flush the
       * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
       * Stall" bit set.
       */
      if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
         gen7_emit_cs_stall_flush(batch);
#endif
#if GFX_VER >= 6
      crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
#else
      /* Gen4/5: GS state lives in a pipelined indirect state buffer; remember
       * its offset and re-emit the pipelined pointers packet later.
       */
      uint32_t *gs_ptr = stream_state(batch,
                                      GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
#endif
      {
#if GFX_VER >= 6
         if (active) {
            const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
            const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
            const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;

            INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
#if GFX_VER >= 7
            /* Output vertex size is in 16B units minus one; sizes come from
             * the compiled GS program data.
             */
            gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
            gs.OutputTopology = gs_prog_data->output_topology;
            gs.ControlDataHeaderSize =
               gs_prog_data->control_data_header_size_hwords;

            gs.InstanceControl = gs_prog_data->invocations - 1;
            gs.DispatchMode = vue_prog_data->dispatch_mode;

            gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

            gs.ControlDataFormat = gs_prog_data->control_data_format;
#endif

            /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
             * Ivy Bridge and Haswell.
             *
             * On Ivy Bridge, setting this bit causes the vertices of a triangle
             * strip to be delivered to the geometry shader in an order that does
             * not strictly follow the OpenGL spec, but preserves triangle
             * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
             * the geometry shader sees triangles:
             *
             * (1, 2, 3), (2, 4, 3), (3, 4, 5)
             *
             * (Clearing the bit is even worse, because it fails to preserve
             * orientation).
             *
             * Triangle strips with adjacency are always ordered in a way that
             * preserves triangle orientation but does not strictly follow the
             * OpenGL spec, regardless of the setting of this bit.
             *
             * On Haswell, both triangle strips and triangle strips with adjacency
             * are always ordered in a way that preserves triangle orientation.
             * Setting this bit causes the ordering to strictly follow the OpenGL
             * spec.
             *
             * So in either case we want to set the bit.  Unfortunately on Ivy
             * Bridge this will get the order close to correct but not perfect.
             */
            gs.ReorderMode = TRAILING;
            /* Gen8 halves the thread count here (the other half is reserved). */
            gs.MaximumNumberofThreads =
               GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
               (batch->screen->devinfo.max_gs_threads - 1);
#if GFX_VER < 7
            gs.SOStatisticsEnable = true;
            if (gs_prog_data->num_transform_feedback_bindings)
               gs.SVBIPayloadEnable = ice->state.streamout_active;

            /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
             * was previously done for gen6.
             *
             * TODO: test with both disabled to see if the HW is behaving
             * as expected, like in gen7.
             */
            gs.SingleProgramFlow = true;
            gs.VectorMaskEnable = true;
#endif
#if GFX_VER >= 8
            gs.ExpectedVertexCount = gs_prog_data->vertices_in;

            /* A compile-time-known output vertex count lets the HW allocate
             * URB space up front.
             */
            if (gs_prog_data->static_vertex_count != -1) {
               gs.StaticOutput = true;
               gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
            }
            gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;

            gs.UserClipDistanceCullTestEnableBitmask =
               vue_prog_data->cull_distance_mask;

            /* URB entry lengths are in pairs of slots; clamp to at least 1. */
            const int urb_entry_write_offset = 1;
            const uint32_t urb_entry_output_length =
               DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
               urb_entry_write_offset;

            gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
            gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
#endif
         }
#endif
#if GFX_VER <= 6
         if (!active && ice->shaders.ff_gs_prog) {
            const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
            /* In gen6, transform feedback for the VS stage is done with an
             * ad-hoc GS program. This function provides the needed 3DSTATE_GS
             * for this.
             */
            gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
            gs.SingleProgramFlow = true;
            gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
            gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;

#if GFX_VER <= 5
            gs.GRFRegisterCount =
               DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
            /* BRW_NEW_URB_FENCE */
            gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
            gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
            gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
            gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
#else
            gs.Enable = true;
            gs.VectorMaskEnable = true;
            /* Streamed Vertex Buffer Index bookkeeping for SO writes. */
            gs.SVBIPayloadEnable = true;
            gs.SVBIPostIncrementEnable = true;
            gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
            gs.SOStatisticsEnable = true;
            gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
#endif
         }
#endif
         /* No GS at all: still program passthrough URB handling. */
         if (!active && !ice->shaders.ff_gs_prog) {
#if GFX_VER < 8
            gs.DispatchGRFStartRegisterForURBData = 1;
#if GFX_VER >= 7
            gs.IncludeVertexHandles = true;
#endif
#endif
         }
#if GFX_VER >= 6
         gs.StatisticsEnable = true;
#endif
#if GFX_VER == 5 || GFX_VER == 6
         gs.RenderingEnabled = true;
#endif
#if GFX_VER <= 5
         gs.MaximumVPIndex = ice->state.num_viewports - 1;
#endif
      }
      /* Remembered so the IVB:GT2 GS-enable-change workaround above can
       * detect transitions on the next upload.
       */
      ice->state.gs_enabled = active;
   }
6988
#if GFX_VER >= 7
   /* Tessellation control shader: 3DSTATE_HS (gen7+ only). */
   if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];

      if (shader) {
         const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
         const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
         const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;

         crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
            INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
            hs.InstanceCount = tcs_prog_data->instances - 1;
            hs.IncludeVertexHandles = true;
            hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
         }
      } else {
         /* No TCS bound: emit an all-zero (disabled) 3DSTATE_HS. */
         crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
      }

   }

   /* Tessellation evaluation shader: 3DSTATE_TE + 3DSTATE_DS (gen7+ only). */
   if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
      if (shader) {
         const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
         const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
         const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;

         crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
            te.Partitioning = tes_prog_data->partitioning;
            te.OutputTopology = tes_prog_data->output_topology;
            te.TEDomain = tes_prog_data->domain;
            te.TEEnable = true;
            /* Hardware maxima for odd/even fractional tess factors. */
            te.MaximumTessellationFactorOdd = 63.0;
            te.MaximumTessellationFactorNotOdd = 64.0;
         };
         crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
            INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);

            ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
            /* Triangular domains need the derived W barycentric coordinate. */
            ds.ComputeWCoordinateEnable =
               tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;

#if GFX_VER >= 8
            if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
               ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
            ds.UserClipDistanceCullTestEnableBitmask =
               vue_prog_data->cull_distance_mask;
#endif
         };
      } else {
         /* No TES bound: emit disabled TE and DS packets. */
         crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
         crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
      }
   }
#endif
   /* Strips & Fans / rasterizer state: indirect SF_STATE on gen4/5, merged
    * 3DSTATE_SF (plus 3DSTATE_RASTER on gen8) on gen6+.
    */
   if (dirty & CROCUS_DIRTY_RASTER) {

#if GFX_VER < 6
      /* Gen4/5 runs a fixed-function SF kernel; its program data drives the
       * URB/dispatch fields below.
       */
      const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
      struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
      uint32_t *sf_ptr = stream_state(batch,
                                      GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
         sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
         sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
         sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
         sf.DispatchGRFStartRegisterForURBData = 3;
         sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
         sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
         sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
         sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
         sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

         /* Viewport transform state lives in the dynamic state buffer. */
         sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);

         sf.MaximumNumberofThreads =
            MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;

         sf.SpritePointEnable = cso_state->point_quad_rasterization;
         /* Pixel centers are at half-integer positions. */
         sf.DestinationOriginHorizontalBias = 0.5;
         sf.DestinationOriginVerticalBias = 0.5;

         sf.LineEndCapAntialiasingRegionWidth =
            cso_state->line_smooth ? _10pixels : _05pixels;
         sf.LastPixelEnable = cso_state->line_last_pixel;
         sf.AntialiasingEnable = cso_state->line_smooth;

         sf.LineWidth = get_line_width(cso_state);
         sf.PointWidth = cso_state->point_size;
         sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
#if GFX_VERx10 >= 45
         sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
#endif
         sf.ViewportTransformEnable = true;
         sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
         sf.ScissorRectangleEnable = true;
         sf.CullMode = translate_cull_mode(cso_state->cull_face);

         /* Provoking-vertex selection per pipe_rasterizer_state. */
         if (cso_state->flatshade_first) {
            sf.TriangleFanProvokingVertexSelect = 1;
         } else {
            sf.TriangleStripListProvokingVertexSelect = 2;
            sf.TriangleFanProvokingVertexSelect = 2;
            sf.LineStripListProvokingVertexSelect = 1;
         }
      }
#else
      /* Gen6+: merge dynamic fields into the CSO's pre-packed 3DSTATE_SF. */
      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
      uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
      crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
         sf.ViewportTransformEnable = !ice->state.window_space_position;

#if GFX_VER == 6
         /* Gen6 carries the setup-back-end attribute overrides in SF. */
         const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
         uint32_t urb_entry_read_length;
         uint32_t urb_entry_read_offset;
         uint32_t point_sprite_enables;
         calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
                                  &urb_entry_read_length,
                                  &urb_entry_read_offset);
         sf.VertexURBEntryReadLength = urb_entry_read_length;
         sf.VertexURBEntryReadOffset = urb_entry_read_offset;
         sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
         sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
         sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
#endif

#if GFX_VER >= 6 && GFX_VER < 8
         if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
            sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
#endif
#if GFX_VER == 7
         if (ice->state.framebuffer.zsbuf) {
            struct crocus_resource *zres, *sres;
            crocus_get_depth_stencil_resources(&batch->screen->devinfo,
                                               ice->state.framebuffer.zsbuf->texture,
                                               &zres, &sres);
            /* ANV thinks that the stencil-ness doesn't matter, this is just
             * about handling polygon offset scaling.
             */
            sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
         }
#endif
      }
      /* OR the dynamic dwords into the CSO's static 3DSTATE_SF and emit. */
      crocus_emit_merge(batch, cso->sf, dynamic_sf,
                        ARRAY_SIZE(dynamic_sf));
#if GFX_VER == 8
      crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
#endif
#endif
   }
7142
   /* Windower/pixel-shader state: indirect WM_STATE on gen4/5, 3DSTATE_WM on
    * gen6+.  On gen7+ most PS kernel fields moved to 3DSTATE_PS (elsewhere).
    */
   if (dirty & CROCUS_DIRTY_WM) {
      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
      const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
      /* Only referenced under some GFX_VER branches, hence UNUSED. */
      UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
      UNUSED const struct shader_info *fs_info =
         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

#if GFX_VER == 6
      /* Gen6 pushes FS constants via its own push-constant packets. */
      struct push_bos push_bos = {};
      setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);

      emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
#endif
#if GFX_VER >= 6
      crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
#else
      /* Gen4/5: WM state lives in a pipelined indirect state buffer. */
      uint32_t *wm_ptr = stream_state(batch,
                                      GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);

      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;

      _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
#endif
      {
#if GFX_VER <= 6
         wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
         wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
         wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
#endif
#if GFX_VER == 4
         /* On gen4, we only have one shader kernel */
         if (brw_wm_state_has_ksp(wm, 0)) {
            wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
            wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
            wm.DispatchGRFStartRegisterForConstantSetupData0 =
               wm_prog_data->base.dispatch_grf_start_reg;
         }
#elif GFX_VER == 5
         /* Gen5: up to three kernels (SIMD8/16/32 variants). */
         wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
         wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
         wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);

         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
         wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
         wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);

         wm.DispatchGRFStartRegisterForConstantSetupData0 =
            wm_prog_data->base.dispatch_grf_start_reg;
#elif GFX_VER == 6
         wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
         wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
         wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);

         wm.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
         wm.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
         wm.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
#endif
#if GFX_VER <= 5
         /* Gen4/5 CURBE-based constants and SF->WM attribute plumbing. */
         wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
         wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
         wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
         wm.SetupURBEntryReadOffset = 0;
         wm.EarlyDepthTestEnable = true;
         wm.LineAntialiasingRegionWidth = _05pixels;
         wm.LineEndCapAntialiasingRegionWidth = _10pixels;
         wm.DepthCoefficientURBReadOffset = 1;

         if (cso->cso.offset_tri) {
            wm.GlobalDepthOffsetEnable = true;

            /* Something weird going on with legacy_global_depth_bias,
             * offset_constant, scaling and MRD.  This value passes glean
             * but gives some odd results elsewhere (eg. the
             * quad-offset-units test).
             */
            wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
            wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
         }
         wm.SamplerStatePointer = ro_bo(batch->state.bo,
                                        ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
#endif

         /* Pre-gen6 can only count statistics while stats_wm says so. */
         wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
            ice->state.statistics_counters_enabled : 0;

#if GFX_VER >= 6
         wm.LineAntialiasingRegionWidth = _10pixels;
         wm.LineEndCapAntialiasingRegionWidth = _05pixels;

         wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
         wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
#endif
#if GFX_VER == 6
         wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
            ice->state.cso_blend->dual_color_blending;
         wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
         wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

         /* From the SNB PRM, volume 2 part 1, page 281:
          * "If the PS kernel does not need the Position XY Offsets
          * to compute a Position XY value, then this field should be
          * programmed to POSOFFSET_NONE."
          *
          * "SW Recommendation: If the PS kernel needs the Position Offsets
          * to compute a Position XY value, this field should match Position
          * ZW Interpolation Mode to ensure a consistent position.xyzw
          * computation."
          * We only require XY sample offsets. So, this recommendation doesn't
          * look useful at the moment. We might need this in future.
          */
         if (wm_prog_data->uses_pos_offset)
            wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
         else
            wm.PositionXYOffsetSelect = POSOFFSET_NONE;
#endif
         wm.LineStippleEnable = cso->cso.line_stipple_enable;
         wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;

#if GFX_VER < 7
         if (wm_prog_data->base.use_alt_mode)
            wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
         wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
         wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
#endif

#if GFX_VER < 8
#if GFX_VER >= 6
         wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;

         struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
         if (fb->samples > 1) {
            if (cso->cso.multisample)
               wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
            else
               wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;

            if (wm_prog_data->persample_dispatch)
               wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
            else
               wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
         } else {
            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         }
#endif

         wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;

         /* Any path that can discard pixels must be flagged so the HW does
          * not assume all pixels survive.
          */
         if (wm_prog_data->uses_kill ||
             ice->state.cso_zsa->cso.alpha_enabled ||
             ice->state.cso_blend->cso.alpha_to_coverage ||
             (GFX_VER >= 6 && wm_prog_data->uses_omask))
            wm.PixelShaderKillsPixel = true;

         /* Only dispatch PS threads when their output can be observed. */
         if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
             writes_depth || wm.PixelShaderKillsPixel ||
             (GFX_VER >= 6 && wm_prog_data->has_side_effects))
            wm.ThreadDispatchEnable = true;

#if GFX_VER >= 7
         wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
         wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
#else
         if (wm_prog_data->base.total_scratch) {
            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
                                                            MESA_SHADER_FRAGMENT);
            /* Field encodes power-of-two per-thread size, base 1KB. */
            wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
            wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         wm.PixelShaderComputedDepth = writes_depth;

#endif
         /* The "UAV access enable" bits are unnecessary on HSW because they only
          * seem to have an effect on the HW-assisted coherency mechanism which we
          * don't need, and the rasterization-related UAV_ONLY flag and the
          * DISPATCH_ENABLE bit can be set independently from it.
          * C.f. gen8_upload_ps_extra().
          *
          * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
          * _NEW_COLOR
          */
#if GFX_VERx10 == 75
         if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
             wm_prog_data->has_side_effects)
            wm.PSUAVonly = ON;
#endif
#endif
#if GFX_VER >= 7
         /* BRW_NEW_FS_PROG_DATA */
         if (wm_prog_data->early_fragment_tests)
            wm.EarlyDepthStencilControl = EDSC_PREPS;
         else if (wm_prog_data->has_side_effects)
            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
#endif
#if GFX_VER == 8
         /* We could skip this bit if color writes are enabled. */
         if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
            wm.ForceThreadDispatchEnable = ForceON;
#endif
      };

#if GFX_VER <= 5
      /* Depth-offset clamp is a separate packet pre-gen6; only re-emit on
       * change since it stalls the pipe.
       */
      if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
         crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
            clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
         }
         ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
      }
#endif
   }
7363
#if GFX_VER >= 7
   /* Setup Back End: attribute routing from the last VUE stage to the FS. */
   if (dirty & CROCUS_DIRTY_GEN7_SBE) {
      crocus_emit_sbe(batch, ice);
   }
#endif

#if GFX_VER >= 8
   /* Gen8 fast-path blend enables: merge dynamic bits (which depend on the
    * bound FS and ZSA state) into the blend CSO's pre-packed 3DSTATE_PS_BLEND.
    */
   if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
      struct crocus_blend_state *cso_blend = ice->state.cso_blend;
      struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
      struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
      const struct shader_info *fs_info =
         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
      uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
      crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
         pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
         pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
         /* Dual-source blending only works if the FS actually emits a
          * second source color.
          */
         pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      }
      crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
                        ARRAY_SIZE(cso_blend->ps_blend));
   }
#endif
7389
#if GFX_VER >= 6
   /* Depth/stencil state: inline 3DSTATE_WM_DEPTH_STENCIL on gen8, indirect
    * DEPTH_STENCIL_STATE plus a pointer packet on gen6/7.
    */
   if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {

#if GFX_VER >= 8
      crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
         set_depth_stencil_bits(ice, &wmds);
      }
#else
      uint32_t ds_offset;
      void *ds_map = stream_state(batch,
                                  sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
                                  64, &ds_offset);
      _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
         set_depth_stencil_bits(ice, &ds);
      }

#if GFX_VER == 6
      /* Gen6 uses the combined CC pointers packet with a change flag. */
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
         ptr.DEPTH_STENCIL_STATEChange = true;
      }
#else
      crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
         ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
      }
#endif
#endif
   }

   /* Scissor rectangles: one per viewport, streamed to dynamic state. */
   if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
      /* Align to 64-byte boundary as per anv. */
      uint32_t scissor_offset;
      struct pipe_scissor_state *scissor_map = (void *)
         stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
                      64, &scissor_offset);
      for (int i = 0; i < ice->state.num_viewports; i++) {
         struct pipe_scissor_state scissor;
         crocus_fill_scissor_rect(ice, i, &scissor);
         scissor_map[i] = scissor;
      }

      crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
         ptr.ScissorRectPointer = scissor_offset;
      }
   }
#endif
7436
   /* Depth/stencil/HiZ buffer packets, emitted via ISL into raw command
    * space; relocations are patched in at the offsets ISL publishes.
    */
   if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
      struct isl_device *isl_dev = &batch->screen->isl_dev;
#if GFX_VER >= 6
      /* Hardware requires depth stalls/flushes around depth buffer changes. */
      crocus_emit_depth_stall_flushes(batch);
#endif
      void *batch_ptr;
      struct crocus_resource *zres, *sres;
      struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
      batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);

      struct isl_view view = {
         .base_level = 0,
         .levels = 1,
         .base_array_layer = 0,
         .array_len = 1,
         .swizzle = ISL_SWIZZLE_IDENTITY,
      };
      struct isl_depth_stencil_hiz_emit_info info = { .view = &view };

      if (cso->zsbuf) {
         crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
         struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
         /* Use the re-aligned copy of the depth resource if one was made. */
         if (zsbuf->align_res) {
            zres = (struct crocus_resource *)zsbuf->align_res;
         }
         view.base_level = cso->zsbuf->u.tex.level;
         view.base_array_layer = cso->zsbuf->u.tex.first_layer;
         view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;

         if (zres) {
            view.usage |= ISL_SURF_USAGE_DEPTH_BIT;

            info.depth_surf = &zres->surf;
            /* Relocate the depth address at ISL's published dword offset. */
            info.depth_address = crocus_command_reloc(batch,
                                                     (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
                                                     zres->bo, 0, RELOC_32BIT);

            info.mocs = crocus_mocs(zres->bo, isl_dev);
            view.format = zres->surf.format;

            if (crocus_resource_level_has_hiz(zres, view.base_level)) {
               info.hiz_usage = zres->aux.usage;
               info.hiz_surf = &zres->aux.surf;
               uint64_t hiz_offset = 0;

#if GFX_VER == 6
               /* HiZ surfaces on Sandy Bridge technically don't support
                * mip-mapping. However, we can fake it by offsetting to the
                * first slice of LOD0 in the HiZ surface.
                */
               isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
                                                   view.base_level, 0, 0,
                                                   &hiz_offset, NULL, NULL);
#endif
               info.hiz_address = crocus_command_reloc(batch,
                                                       (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
                                                       zres->aux.bo, zres->aux.offset + hiz_offset,
                                                       RELOC_32BIT);
               info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
            }
         }

#if GFX_VER >= 6
         /* Separate stencil only exists on gen6+. */
         if (sres) {
            view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
            info.stencil_aux_usage = sres->aux.usage;
            info.stencil_surf = &sres->surf;

            uint64_t stencil_offset = 0;
#if GFX_VER == 6
            /* Stencil surfaces on Sandy Bridge technically don't support
             * mip-mapping. However, we can fake it by offsetting to the
             * first slice of LOD0 in the stencil surface.
             */
            isl_surf_get_image_offset_B_tile_sa(&sres->surf,
                                                view.base_level, 0, 0,
                                                &stencil_offset, NULL, NULL);
#endif

            info.stencil_address = crocus_command_reloc(batch,
                                                        (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
                                                        sres->bo, stencil_offset, RELOC_32BIT);
            /* Stencil-only: the view format/mocs come from the stencil res. */
            if (!zres) {
               view.format = sres->surf.format;
               info.mocs = crocus_mocs(sres->bo, isl_dev);
            }
         }
#endif
      }
      isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
   }
7528
   /* TODO: Disable emitting this until something uses a stipple. */
   if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
      /* 32x32 stipple pattern, one dword per row. */
      crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
         for (int i = 0; i < 32; i++) {
            poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
         }
      }
   }

   /* Line stipple was pre-packed into the rasterizer CSO; emit it raw. */
   if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
      crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
   }
7542
#if GFX_VER >= 8
   /* Gen8 moves primitive topology out of 3DPRIMITIVE into its own packet. */
   if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
      crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
         topo.PrimitiveTopologyType =
            translate_prim_type(draw->mode, ice->state.patch_vertices);
      }
   }
#endif

#if GFX_VER <= 5
   /* Gen4/5: re-emit the pipelined pointers to all the indirect state blocks
    * streamed above (VS/GS/SF/CLIP/WM/CC), then refresh the URB fence and
    * CS URB allocation.  Changing URB layout invalidates the CURBE.
    */
   if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
      upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
                                      ice->shaders.vs_offset, ice->shaders.sf_offset,
                                      ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
      crocus_upload_urb_fence(batch);

      crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
         cs.NumberofURBEntries = ice->urb.nr_cs_entries;
         cs.URBEntryAllocationSize = ice->urb.csize - 1;
      }
      dirty |= CROCUS_DIRTY_GEN4_CURBE;
   }
#endif
   /* Clip rendering to the framebuffer bounds (skip if fb is zero-sized). */
   if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
      struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
      if (fb->width && fb->height) {
         crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
            rect.ClippedDrawingRectangleXMax = fb->width - 1;
            rect.ClippedDrawingRectangleYMax = fb->height - 1;
         }
      }
   }
7575
   /* Vertex buffers: one 3DSTATE_VERTEX_BUFFERS packet covering the user
    * buffers plus optional internal buffers for gl_BaseVertex/gl_BaseInstance
    * (draw params) and gl_DrawID (derived draw params).
    */
   if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
      const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
      const uint32_t count = user_count +
         ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
      uint32_t dynamic_bound = ice->state.bound_vertex_buffers;

      if (count) {
         const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);

         /* Header dword + vb_dwords per buffer, packed manually so the
          * variable-length payload can follow the header.
          */
         uint32_t *map =
            crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
         _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
            vb.DWordLength = (vb_dwords * count + 1) - 2;
         }
         map += 1;

         /* Emit each bound user vertex buffer in bit order. */
         uint32_t bound = dynamic_bound;
         int i;
         while (bound) {
            i = u_bit_scan(&bound);
            struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
            struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
            uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];

            emit_vertex_buffer_state(batch, i, bo,
                                     buf->buffer_offset,
                                     ice->state.vb_end[i],
                                     buf->stride,
                                     step_rate,
                                     &map);
         }
         /* Internal parameter buffers occupy the slots after the user ones;
          * stride 0 so every vertex reads the same values.
          */
         i = user_count;
         if (ice->state.vs_uses_draw_params) {
            struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
            emit_vertex_buffer_state(batch, i++,
                                     res->bo,
                                     ice->draw.draw_params.offset,
                                     ice->draw.draw_params.res->width0,
                                     0, 0, &map);
         }
         if (ice->state.vs_uses_derived_draw_params) {
            struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
            emit_vertex_buffer_state(batch, i++,
                                     res->bo,
                                     ice->draw.derived_draw_params.offset,
                                     ice->draw.derived_draw_params.res->width0,
                                     0, 0, &map);
         }
      }
   }
7626
7627 if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7628 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7629 const unsigned entries = MAX2(cso->count, 1);
7630 if (!(ice->state.vs_needs_sgvs_element ||
7631 ice->state.vs_uses_derived_draw_params ||
7632 ice->state.vs_needs_edge_flag)) {
7633 crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7634 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7635 } else {
7636 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7637 const unsigned dyn_count = cso->count +
7638 ice->state.vs_needs_sgvs_element +
7639 ice->state.vs_uses_derived_draw_params;
7640
7641 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7642 &dynamic_ves, ve) {
7643 ve.DWordLength =
7644 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7645 }
7646 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7647 (cso->count - ice->state.vs_needs_edge_flag) *
7648 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7649 uint32_t *ve_pack_dest =
7650 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7651 GENX(VERTEX_ELEMENT_STATE_length)];
7652
7653 if (ice->state.vs_needs_sgvs_element) {
7654 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7655 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7656 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7657 ve.Valid = true;
7658 ve.VertexBufferIndex =
7659 util_bitcount64(ice->state.bound_vertex_buffers);
7660 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7661 ve.Component0Control = base_ctrl;
7662 ve.Component1Control = base_ctrl;
7663 #if GFX_VER < 8
7664 ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7665 ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7666 #else
7667 ve.Component2Control = VFCOMP_STORE_0;
7668 ve.Component3Control = VFCOMP_STORE_0;
7669 #endif
7670 #if GFX_VER < 5
7671 ve.DestinationElementOffset = cso->count * 4;
7672 #endif
7673 }
7674 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7675 }
7676 if (ice->state.vs_uses_derived_draw_params) {
7677 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7678 ve.Valid = true;
7679 ve.VertexBufferIndex =
7680 util_bitcount64(ice->state.bound_vertex_buffers) +
7681 ice->state.vs_uses_draw_params;
7682 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7683 ve.Component0Control = VFCOMP_STORE_SRC;
7684 ve.Component1Control = VFCOMP_STORE_SRC;
7685 ve.Component2Control = VFCOMP_STORE_0;
7686 ve.Component3Control = VFCOMP_STORE_0;
7687 #if GFX_VER < 5
7688 ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7689 #endif
7690 }
7691 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7692 }
7693 if (ice->state.vs_needs_edge_flag) {
7694 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
7695 ve_pack_dest[i] = cso->edgeflag_ve[i];
7696 }
7697
7698 crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7699 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7700 }
7701
7702 #if GFX_VER == 8
7703 if (!ice->state.vs_needs_edge_flag) {
7704 crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7705 entries * GENX(3DSTATE_VF_INSTANCING_length));
7706 } else {
7707 assert(cso->count > 0);
7708 const unsigned edgeflag_index = cso->count - 1;
7709 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7710 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7711 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7712
7713 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7714 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7715 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7716 vi.VertexElementIndex = edgeflag_index +
7717 ice->state.vs_needs_sgvs_element +
7718 ice->state.vs_uses_derived_draw_params;
7719 }
7720 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
7721 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7722
7723 crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7724 entries * GENX(3DSTATE_VF_INSTANCING_length));
7725 }
7726 #endif
7727 }
7728
7729 #if GFX_VER == 8
7730 if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7731 const struct brw_vs_prog_data *vs_prog_data = (void *)
7732 ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7733 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7734
7735 crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7736 if (vs_prog_data->uses_vertexid) {
7737 sgv.VertexIDEnable = true;
7738 sgv.VertexIDComponentNumber = 2;
7739 sgv.VertexIDElementOffset =
7740 cso->count - ice->state.vs_needs_edge_flag;
7741 }
7742
7743 if (vs_prog_data->uses_instanceid) {
7744 sgv.InstanceIDEnable = true;
7745 sgv.InstanceIDComponentNumber = 3;
7746 sgv.InstanceIDElementOffset =
7747 cso->count - ice->state.vs_needs_edge_flag;
7748 }
7749 }
7750 }
7751 #endif
7752 #if GFX_VERx10 >= 75
7753 if (dirty & CROCUS_DIRTY_GEN75_VF) {
7754 crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7755 if (draw->primitive_restart) {
7756 vf.IndexedDrawCutIndexEnable = true;
7757 vf.CutIndex = draw->restart_index;
7758 }
7759 }
7760 }
7761 #endif
7762
7763 #if GFX_VER == 8
7764 if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7765 bool enable = want_pma_fix(ice);
7766 genX(crocus_update_pma_fix)(ice, batch, enable);
7767 }
7768 #endif
7769
7770 #if GFX_VER <= 5
7771 if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7772 gen4_upload_curbe(batch);
7773 }
7774 #endif
7775 }
7776
/**
 * Upload render state for a draw call and emit the 3DPRIMITIVE command.
 *
 * Uploads any dirty derived state via crocus_upload_dirty_render_state(),
 * (re-)emits 3DSTATE_INDEX_BUFFER when needed, programs the indirect-draw
 * registers / MI_PREDICATE for indirect draws, and finally emits the
 * 3DPRIMITIVE packet itself.
 *
 * \param ice            the crocus context
 * \param batch          the render batch to emit commands into
 * \param draw           Gallium draw info (index size, instancing, restart)
 * \param drawid_offset  index of this draw within a multi-draw sequence
 * \param indirect       indirect draw parameters, or NULL for a direct draw
 * \param sc             start/count/index-bias for a direct draw
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   /* Predication for conditional rendering; may also be forced on below for
    * MI_PREDICATE-based indirect draw-count handling.
    */
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* Mark the batch while uploading dirty state so the state emission and
    * the draw end up in the same batch (no_wrap is cleared again before the
    * draw commands below).
    */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* Indices live in user memory: stream them into a GPU buffer.
          * The uploaded range starts at sc->start, so subtract start_offset
          * from the returned offset to keep index values addressed as if
          * the buffer began at index 0.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         /* Only re-emit 3DSTATE_INDEX_BUFFER when the bound resource
          * actually changed; record the binding for rebind tracking.
          */
         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Same resource, but a different size/format (or, pre-Haswell, a
       * different primitive-restart setting, which lives in the index
       * buffer packet there) still requires a re-emit.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
           )
          )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            /* Pre-Haswell, primitive restart is a bit in this packet
             * (Haswell+ uses 3DSTATE_VF instead).
             */
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* index_size is 1/2/4 bytes; >> 1 maps that to the hardware
             * BYTE/WORD/DWORD format encoding (0/1/2).
             */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
            ib.BufferSize = bo->size - offset;
#else
            /* Pre-Gen8 takes an inclusive ending address instead of a size. */
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
         }
         /* Cache what we emitted so the next draw can skip the re-emit. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

/* MMIO offsets of the 3DPRIM draw-parameter registers, loaded directly
 * from the indirect buffer via MI_LOAD_REGISTER_MEM below.
 */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw-indirect with a GPU-side draw count: predicate each
          * sub-draw on (drawid_offset < draw count).
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
            /* Conditional rendering is also active, so the draw-count test
             * must be combined with the existing predicate (kept in GPR15).
             */
#if GFX_VERx10 >= 75
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                     mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            /* LOADINV(SRC0 == 0): predicate is true iff pred was nonzero. */
            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
               MI_PREDICATE_COMBINEOP_SET |
               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               /* First sub-draw: simply predicate on draw_count != 0. */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                  MI_PREDICATE_COMBINEOP_SET |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                  MI_PREDICATE_COMBINEOP_XOR |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      /* Load the 3DPRIM registers straight from the indirect buffer.  The
       * buffer layout matches VkDrawIndexedIndirectCommand /
       * VkDrawIndirectCommand: indexed draws have a base-vertex field at
       * +12 and start-instance at +16; non-indexed draws have
       * start-instance at +12 and no base vertex (so it is zeroed via LRI).
       */
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* Draw-auto: derive the vertex count from the number of bytes the
       * stream output target was written with (offset / stride).
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   /* Pre-Gen7 has no MI register-load path for indirect draws. */
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         /* Draw parameters come from the 3DPRIM registers loaded above. */
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8035
#if GFX_VER >= 7

/**
 * Upload compute state and emit a GPGPU_WALKER dispatch.
 *
 * Handles sysval/binding-table/sampler uploads, MEDIA_VFE_STATE (including
 * scratch space), the CURBE push-constant buffer, the interface descriptor,
 * indirect dispatch register loads (with a Gen7 zero-size predicate
 * workaround), and finally the walker itself.
 *
 * \param ice    the crocus context
 * \param batch  the compute batch to emit commands into
 * \param grid   Gallium grid info (block/grid dimensions, indirect buffer)
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct brw_stage_prog_data *prog_data = shader->prog_data;
   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
   /* SIMD width / thread count for this dispatch, derived from the shader
    * and the (possibly variable) workgroup size.
    */
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   /* Re-emit VFE state whenever the shader changes, or always for
    * variable-workgroup-size shaders (dispatch parameters change per-grid).
    */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         vfe.BypassGatewayControl = true;
#endif
         /* BDW needs a nonzero URB allocation for the media pipeline;
          * earlier gens program zero here.
          */
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         /* CURBE space in 256-bit register units, rounded up to a
          * multiple of 2 as required by the packet.
          */
         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      uint32_t curbe_data_offset = 0;
      /* The only push constant expected here is the per-thread subgroup id. */
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* Poison the buffer so uninitialized reads are recognizable. */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      /* Kernel entry point for the SIMD width chosen by dispatch. */
      const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                                     prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

/* MMIO offsets of the GPGPU dispatch dimension registers, loaded from the
 * indirect buffer via MI_LOAD_REGISTER_MEM for indirect dispatches.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
      }

#if GFX_VER == 7
      /* Gen7 workaround: predicate the walker off entirely when any
       * indirect dispatch dimension is zero, by building up
       * predicate = !(x == 0 || y == 0 || z == 0) with MI_PREDICATE.
       */
      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);

      /* Load compute_dispatch_indirect_x_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);

      /* predicate = (compute_dispatch_indirect_x_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_y_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);

      /* predicate = (compute_dispatch_indirect_y_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_z_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);

      /* predicate = (compute_dispatch_indirect_z_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* predicate = !predicate; */
#define COMPARE_FALSE 1
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_FALSE;
      }
#endif
   }

   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = grid->indirect != NULL;
      /* Predication only needed for the Gen7 zero-dimension workaround. */
      ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = grid->grid[0];
      ggw.ThreadGroupIDYDimension = grid->grid[1];
      ggw.ThreadGroupIDZDimension = grid->grid[2];
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   batch->contains_draw = true;
}

#endif /* GFX_VER >= 7 */
8265
8266 /**
8267 * State module teardown.
8268 */
8269 static void
8270 crocus_destroy_state(struct crocus_context *ice)
8271 {
8272 pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8273 pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8274
8275 free(ice->state.genx);
8276
8277 for (int i = 0; i < 4; i++) {
8278 pipe_so_target_reference(&ice->state.so_target[i], NULL);
8279 }
8280
8281 for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
8282 pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
8283 }
8284 pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
8285
8286 for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8287 struct crocus_shader_state *shs = &ice->state.shaders[stage];
8288 for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8289 pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8290 }
8291 for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8292 pipe_resource_reference(&shs->image[i].base.resource, NULL);
8293 }
8294 for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8295 pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8296 }
8297 for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8298 pipe_sampler_view_reference((struct pipe_sampler_view **)
8299 &shs->textures[i], NULL);
8300 }
8301 }
8302
8303 for (int i = 0; i < 16; i++)
8304 pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8305 pipe_resource_reference(&ice->state.grid_size.res, NULL);
8306
8307 pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8308 }
8309
8310 /* ------------------------------------------------------------------- */
8311
8312 static void
8313 crocus_rebind_buffer(struct crocus_context *ice,
8314 struct crocus_resource *res)
8315 {
8316 struct pipe_context *ctx = &ice->ctx;
8317
8318 assert(res->base.b.target == PIPE_BUFFER);
8319
8320 /* Buffers can't be framebuffer attachments, nor display related,
8321 * and we don't have upstream Clover support.
8322 */
8323 assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
8324 PIPE_BIND_RENDER_TARGET |
8325 PIPE_BIND_BLENDABLE |
8326 PIPE_BIND_DISPLAY_TARGET |
8327 PIPE_BIND_CURSOR |
8328 PIPE_BIND_COMPUTE_RESOURCE |
8329 PIPE_BIND_GLOBAL)));
8330
8331 if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
8332 uint64_t bound_vbs = ice->state.bound_vertex_buffers;
8333 while (bound_vbs) {
8334 const int i = u_bit_scan64(&bound_vbs);
8335 struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];
8336
8337 if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
8338 ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
8339 }
8340 }
8341
8342 if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
8343 ice->state.index_buffer.res) {
8344 if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
8345 pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8346 }
8347 /* There is no need to handle these:
8348 * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
8349 * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
8350 */
8351
8352 if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
8353 /* XXX: be careful about resetting vs appending... */
8354 for (int i = 0; i < 4; i++) {
8355 if (ice->state.so_target[i] &&
8356 (ice->state.so_target[i]->buffer == &res->base.b)) {
8357 #if GFX_VER == 6
8358 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
8359 #else
8360 ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
8361 #endif
8362 }
8363 }
8364 }
8365
8366 for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
8367 struct crocus_shader_state *shs = &ice->state.shaders[s];
8368 enum pipe_shader_type p_stage = stage_to_pipe(s);
8369
8370 if (!(res->bind_stages & (1 << s)))
8371 continue;
8372
8373 if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
8374 /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
8375 uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
8376 while (bound_cbufs) {
8377 const int i = u_bit_scan(&bound_cbufs);
8378 struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
8379
8380 if (res->bo == crocus_resource_bo(cbuf->buffer)) {
8381 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
8382 }
8383 }
8384 }
8385
8386 if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
8387 uint32_t bound_ssbos = shs->bound_ssbos;
8388 while (bound_ssbos) {
8389 const int i = u_bit_scan(&bound_ssbos);
8390 struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
8391
8392 if (res->bo == crocus_resource_bo(ssbo->buffer)) {
8393 struct pipe_shader_buffer buf = {
8394 .buffer = &res->base.b,
8395 .buffer_offset = ssbo->buffer_offset,
8396 .buffer_size = ssbo->buffer_size,
8397 };
8398 crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
8399 (shs->writable_ssbos >> i) & 1);
8400 }
8401 }
8402 }
8403
8404 if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
8405 uint32_t bound_sampler_views = shs->bound_sampler_views;
8406 while (bound_sampler_views) {
8407 const int i = u_bit_scan(&bound_sampler_views);
8408 struct crocus_sampler_view *isv = shs->textures[i];
8409 struct crocus_bo *bo = isv->res->bo;
8410
8411 if (res->bo == bo) {
8412 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
8413 }
8414 }
8415 }
8416
8417 if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
8418 uint32_t bound_image_views = shs->bound_image_views;
8419 while (bound_image_views) {
8420 const int i = u_bit_scan(&bound_image_views);
8421 struct crocus_image_view *iv = &shs->image[i];
8422 struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);
8423
8424 if (res->bo == bo)
8425 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
8426 }
8427 }
8428 }
8429 }
8430
8431 /* ------------------------------------------------------------------- */
8432
8433 static unsigned
8434 flags_to_post_sync_op(uint32_t flags)
8435 {
8436 if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8437 return WriteImmediateData;
8438
8439 if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8440 return WritePSDepthCount;
8441
8442 if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8443 return WriteTimestamp;
8444
8445 return 0;
8446 }
8447
8448 /*
8449 * Do the given flags have a Post Sync or LRI Post Sync operation?
8450 */
8451 static enum pipe_control_flags
8452 get_post_sync_flags(enum pipe_control_flags flags)
8453 {
8454 flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8455 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8456 PIPE_CONTROL_WRITE_TIMESTAMP |
8457 PIPE_CONTROL_LRI_POST_SYNC_OP;
8458
8459 /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8460 * "LRI Post Sync Operation". So more than one bit set would be illegal.
8461 */
8462 assert(util_bitcount(flags) <= 1);
8463
8464 return flags;
8465 }
8466
8467 #define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
8468
8469 /**
8470 * Emit a series of PIPE_CONTROL commands, taking into account any
8471 * workarounds necessary to actually accomplish the caller's request.
8472 *
8473 * Unless otherwise noted, spec quotations in this function come from:
8474 *
8475 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8476 * Restrictions for PIPE_CONTROL.
8477 *
8478 * You should not use this function directly. Use the helpers in
8479 * crocus_pipe_control.c instead, which may split the pipe control further.
8480 */
8481 static void
8482 crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8483 const char *reason,
8484 uint32_t flags,
8485 struct crocus_bo *bo,
8486 uint32_t offset,
8487 uint64_t imm)
8488 {
8489 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8490 enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8491 UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8492 post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8493
8494 /* Recursive PIPE_CONTROL workarounds --------------------------------
8495 * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8496 *
8497 * We do these first because we want to look at the original operation,
8498 * rather than any workarounds we set.
8499 */
8500
8501 /* "Flush Types" workarounds ---------------------------------------------
8502 * We do these now because they may add post-sync operations or CS stalls.
8503 */
8504
8505 if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8506 /* Hardware workaround: SNB B-Spec says:
8507 *
8508 * "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8509 * Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8510 * required."
8511 */
8512 crocus_emit_post_sync_nonzero_flush(batch);
8513 }
8514
8515 #if GFX_VER == 8
8516 if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8517 /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8518 *
8519 * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8520 * 'Write PS Depth Count' or 'Write Timestamp'."
8521 */
8522 if (!bo) {
8523 flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8524 post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8525 non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8526 bo = batch->ice->workaround_bo;
8527 offset = batch->ice->workaround_offset;
8528 }
8529 }
8530 #endif
8531
8532 #if GFX_VERx10 < 75
8533 if (flags & PIPE_CONTROL_DEPTH_STALL) {
8534 /* Project: PRE-HSW / Argument: Depth Stall
8535 *
8536 * "The following bits must be clear:
8537 * - Render Target Cache Flush Enable ([12] of DW1)
8538 * - Depth Cache Flush Enable ([0] of DW1)"
8539 */
8540 assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8541 PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8542 }
8543 #endif
8544 if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8545 /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8546 *
8547 * "This bit must be DISABLED for operations other than writing
8548 * PS_DEPTH_COUNT."
8549 *
8550 * This seems like nonsense. An Ivybridge workaround requires us to
8551 * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8552 * operation. Gen8+ requires us to emit depth stalls and depth cache
8553 * flushes together. So, it's hard to imagine this means anything other
8554 * than "we originally intended this to be used for PS_DEPTH_COUNT".
8555 *
8556 * We ignore the supposed restriction and do nothing.
8557 */
8558 }
8559
8560 if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8561 /* Project: PRE-HSW / Argument: Depth Cache Flush
8562 *
8563 * "Depth Stall must be clear ([13] of DW1)."
8564 */
8565 assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8566 }
8567
8568 if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8569 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8570 /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8571 *
8572 * "This bit must be DISABLED for End-of-pipe (Read) fences,
8573 * PS_DEPTH_COUNT or TIMESTAMP queries."
8574 *
8575 * TODO: Implement end-of-pipe checking.
8576 */
8577 assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8578 PIPE_CONTROL_WRITE_TIMESTAMP)));
8579 }
8580
8581 if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8582 /* From the PIPE_CONTROL instruction table, bit 1:
8583 *
8584 * "This bit is ignored if Depth Stall Enable is set.
8585 * Further, the render cache is not flushed even if Write Cache
8586 * Flush Enable bit is set."
8587 *
8588 * We assert that the caller doesn't do this combination, to try and
8589 * prevent mistakes. It shouldn't hurt the GPU, though.
8590 *
8591 * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
8592 * and "Render Target Flush" combo is explicitly required for BTI
8593 * update workarounds.
8594 */
8595 assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8596 PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8597 }
8598
8599 /* PIPE_CONTROL page workarounds ------------------------------------- */
8600
8601 if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8602 /* From the PIPE_CONTROL page itself:
8603 *
8604 * "IVB, HSW, BDW
8605 * Restriction: Pipe_control with CS-stall bit set must be issued
8606 * before a pipe-control command that has the State Cache
8607 * Invalidate bit set."
8608 */
8609 flags |= PIPE_CONTROL_CS_STALL;
8610 }
8611
8612 if ((GFX_VERx10 == 75)) {
8613 /* From the PIPE_CONTROL page itself:
8614 *
8615 * "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8616 * Prior to programming a PIPECONTROL command with any of the RO
8617 * cache invalidation bit set, program a PIPECONTROL flush command
8618 * with “CS stall” bit and “HDC Flush” bit set."
8619 *
8620 * TODO: Actually implement this. What's an HDC Flush?
8621 */
8622 }
8623
8624 if (flags & PIPE_CONTROL_FLUSH_LLC) {
8625 /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8626 *
8627 * "Project: ALL
8628 * SW must always program Post-Sync Operation to "Write Immediate
8629 * Data" when Flush LLC is set."
8630 *
8631 * For now, we just require the caller to do it.
8632 */
8633 assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8634 }
8635
8636 /* "Post-Sync Operation" workarounds -------------------------------- */
8637
8638 /* Project: All / Argument: Global Snapshot Count Reset [19]
8639 *
8640 * "This bit must not be exercised on any product.
8641 * Requires stall bit ([20] of DW1) set."
8642 *
8643 * We don't use this, so we just assert that it isn't used. The
8644 * PIPE_CONTROL instruction page indicates that they intended this
8645 * as a debug feature and don't think it is useful in production,
8646 * but it may actually be usable, should we ever want to.
8647 */
8648 assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8649
8650 if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8651 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8652 /* Project: All / Arguments:
8653 *
8654 * - Generic Media State Clear [16]
8655 * - Indirect State Pointers Disable [16]
8656 *
8657 * "Requires stall bit ([20] of DW1) set."
8658 *
8659 * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8660 * State Clear) says:
8661 *
8662 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
8663 * programmed prior to programming a PIPECONTROL command with "Media
8664 * State Clear" set in GPGPU mode of operation"
8665 *
8666 * This is a subset of the earlier rule, so there's nothing to do.
8667 */
8668 flags |= PIPE_CONTROL_CS_STALL;
8669 }
8670
8671 if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8672 /* Project: All / Argument: Store Data Index
8673 *
8674 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8675 * than '0'."
8676 *
8677 * For now, we just assert that the caller does this. We might want to
8678 * automatically add a write to the workaround BO...
8679 */
8680 assert(non_lri_post_sync_flags != 0);
8681 }
8682
8683 if (flags & PIPE_CONTROL_SYNC_GFDT) {
8684 /* Project: All / Argument: Sync GFDT
8685 *
8686 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8687 * than '0' or 0x2520[13] must be set."
8688 *
8689 * For now, we just assert that the caller does this.
8690 */
8691 assert(non_lri_post_sync_flags != 0);
8692 }
8693
8694 if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8695 /* Project: SNB, IVB, HSW / Argument: TLB inv
8696 *
8697 * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8698 * must be set to something other than '0'."
8699 *
8700 * For now, we just assert that the caller does this.
8701 */
8702 assert(non_lri_post_sync_flags != 0);
8703 }
8704
8705 if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8706 /* Project: IVB+ / Argument: TLB inv
8707 *
8708 * "Requires stall bit ([20] of DW1) set."
8709 *
8710 * Also, from the PIPE_CONTROL instruction table:
8711 *
8712 * "Project: SKL+
8713 * Post Sync Operation or CS stall must be set to ensure a TLB
8714 * invalidation occurs. Otherwise no cycle will occur to the TLB
8715 * cache to invalidate."
8716 *
8717 * This is not a subset of the earlier rule, so there's nothing to do.
8718 */
8719 flags |= PIPE_CONTROL_CS_STALL;
8720 }
8721 #if GFX_VER == 8
8722 if (IS_COMPUTE_PIPELINE(batch)) {
8723 if (post_sync_flags ||
8724 (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8725 PIPE_CONTROL_DEPTH_STALL |
8726 PIPE_CONTROL_RENDER_TARGET_FLUSH |
8727 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8728 PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8729 /* Project: BDW / Arguments:
8730 *
8731 * - LRI Post Sync Operation [23]
8732 * - Post Sync Op [15:14]
8733 * - Notify En [8]
8734 * - Depth Stall [13]
8735 * - Render Target Cache Flush [12]
8736 * - Depth Cache Flush [0]
8737 * - DC Flush Enable [5]
8738 *
8739 * "Requires stall bit ([20] of DW) set for all GPGPU and Media
8740 * Workloads."
8741 *
8742 * (The docs have separate table rows for each bit, with essentially
8743 * the same workaround text. We've combined them here.)
8744 */
8745 flags |= PIPE_CONTROL_CS_STALL;
8746
8747 /* Also, from the PIPE_CONTROL instruction table, bit 20:
8748 *
8749 * "Project: BDW
8750 * This bit must be always set when PIPE_CONTROL command is
8751 * programmed by GPGPU and MEDIA workloads, except for the cases
8752 * when only Read Only Cache Invalidation bits are set (State
8753 * Cache Invalidation Enable, Instruction cache Invalidation
8754 * Enable, Texture Cache Invalidation Enable, Constant Cache
8755 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
8756 * need not implemented when FF_DOP_CG is disable via "Fixed
8757 * Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8758 *
8759 * It sounds like we could avoid CS stalls in some cases, but we
8760 * don't currently bother. This list isn't exactly the list above,
8761 * either...
8762 */
8763 }
8764 }
8765 #endif
8766 /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8767 *
8768 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8769 * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8770 *
8771 * Note that the kernel does CS stalls between batches, so we only need
8772 * to count them within a batch. We currently naively count every 4, and
8773 * don't skip the ones with only read-cache-invalidate bits set. This
8774 * may or may not be a problem...
8775 */
8776 if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8777 if (flags & PIPE_CONTROL_CS_STALL) {
8778 /* If we're doing a CS stall, reset the counter and carry on. */
8779 batch->pipe_controls_since_last_cs_stall = 0;
8780 }
8781
8782 /* If this is the fourth pipe control without a CS stall, do one now. */
8783 if (++batch->pipe_controls_since_last_cs_stall == 4) {
8784 batch->pipe_controls_since_last_cs_stall = 0;
8785 flags |= PIPE_CONTROL_CS_STALL;
8786 }
8787 }
8788
8789 /* "Stall" workarounds ----------------------------------------------
8790 * These have to come after the earlier ones because we may have added
8791 * some additional CS stalls above.
8792 */
8793
8794 if (flags & PIPE_CONTROL_CS_STALL) {
8795 /* Project: PRE-SKL, VLV, CHV
8796 *
8797 * "[All Stepping][All SKUs]:
8798 *
8799 * One of the following must also be set:
8800 *
8801 * - Render Target Cache Flush Enable ([12] of DW1)
8802 * - Depth Cache Flush Enable ([0] of DW1)
8803 * - Stall at Pixel Scoreboard ([1] of DW1)
8804 * - Depth Stall ([13] of DW1)
8805 * - Post-Sync Operation ([13] of DW1)
8806 * - DC Flush Enable ([5] of DW1)"
8807 *
8808 * If we don't already have one of those bits set, we choose to add
8809 * "Stall at Pixel Scoreboard". Some of the other bits require a
8810 * CS stall as a workaround (see above), which would send us into
8811 * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
8812 * appears to be safe, so we choose that.
8813 */
8814 const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8815 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8816 PIPE_CONTROL_WRITE_IMMEDIATE |
8817 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8818 PIPE_CONTROL_WRITE_TIMESTAMP |
8819 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8820 PIPE_CONTROL_DEPTH_STALL |
8821 PIPE_CONTROL_DATA_CACHE_FLUSH;
8822 if (!(flags & wa_bits))
8823 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8824 }
8825
8826 /* Emit --------------------------------------------------------------- */
8827
8828 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8829 fprintf(stderr,
8830 " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8831 (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8832 (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8833 (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8834 (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8835 (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8836 (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8837 (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8838 (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8839 (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8840 (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8841 (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8842 (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8843 (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8844 (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8845 (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8846 (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8847 "SnapRes" : "",
8848 (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8849 "ISPDis" : "",
8850 (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8851 (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8852 (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8853 imm, reason);
8854 }
8855
8856 crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8857 #if GFX_VER >= 7
8858 pc.LRIPostSyncOperation = NoLRIOperation;
8859 pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8860 pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8861 #endif
8862 #if GFX_VER >= 6
8863 pc.StoreDataIndex = 0;
8864 pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8865 pc.GlobalSnapshotCountReset =
8866 flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8867 pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8868 pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8869 pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8870 pc.RenderTargetCacheFlushEnable =
8871 flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8872 pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8873 pc.StateCacheInvalidationEnable =
8874 flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8875 pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8876 pc.ConstantCacheInvalidationEnable =
8877 flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8878 #else
8879 pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8880 #endif
8881 pc.PostSyncOperation = flags_to_post_sync_op(flags);
8882 pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8883 pc.InstructionCacheInvalidateEnable =
8884 flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8885 pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8886 #if GFX_VER >= 5 || GFX_VERx10 == 45
8887 pc.IndirectStatePointersDisable =
8888 flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8889 #endif
8890 #if GFX_VER >= 6
8891 pc.TextureCacheInvalidationEnable =
8892 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8893 #elif GFX_VER == 5 || GFX_VERx10 == 45
8894 pc.TextureCacheFlushEnable =
8895 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8896 #endif
8897 pc.Address = ggtt_bo(bo, offset);
8898 if (GFX_VER < 7 && bo)
8899 pc.DestinationAddressType = DAT_GGTT;
8900 pc.ImmediateData = imm;
8901 }
8902 }
8903
#if GFX_VER == 6
/**
 * Gen6: partition the URB between the VS and GS stages and emit 3DSTATE_URB.
 *
 * \param vs_size    per-entry VS URB allocation (sized as vs_size * 128
 *                   bytes below; must be <= 5)
 * \param gs_present whether a geometry shader is currently enabled
 * \param gs_size    per-entry GS URB allocation, same units as vs_size
 */
void
genX(crocus_upload_urb)(struct crocus_batch *batch,
                        unsigned vs_size,
                        bool gs_present,
                        unsigned gs_size)
{
   struct crocus_context *ice = batch->ice;
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   const int urb_bytes = ice->urb.size * 1024;
   int vs_entries, gs_entries;

   /* Split the URB evenly when a GS is active; otherwise the VS gets the
    * whole thing.  Entry sizes are in 128-byte units.
    */
   if (gs_present) {
      vs_entries = (urb_bytes / 2) / (vs_size * 128);
      gs_entries = (urb_bytes / 2) / (gs_size * 128);
   } else {
      vs_entries = urb_bytes / (vs_size * 128);
      gs_entries = 0;
   }

   /* Respect the hardware's maximum entry counts... */
   if (vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
      vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];

   if (gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
      gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];

   /* ...and the 3DSTATE_URB requirement (see the PRM) that both counts
    * be multiples of 4.
    */
   ice->urb.nr_vs_entries = ROUND_DOWN_TO(vs_entries, 4);
   ice->urb.nr_gs_entries = ROUND_DOWN_TO(gs_entries, 4);

   assert(ice->urb.nr_vs_entries >=
          devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
   assert(ice->urb.nr_vs_entries % 4 == 0);
   assert(ice->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
      urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
      urb.VSURBEntryAllocationSize = vs_size - 1;

      urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
      urb.GSURBEntryAllocationSize = gs_size - 1;
   }

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a URB corruption caused by allocating a previous GS
    *   unit's URB entry to the VS unit, software is required to send a
    *   "GS NULL Fence" (Send URB fence with VS URB size == 1 and GS URB
    *   size == 0) plus a dummy DRAW call before any case where VS will
    *   be taking over GS URB space.
    *
    * "URB fence" is not a command that exists on Gen6, so the exact
    * meaning is unclear; for now, do a full pipeline flush whenever the
    * GS is being turned off, as a workaround.
    */
   if (ice->urb.gs_present && !gs_present)
      crocus_emit_mi_flush(batch);
   ice->urb.gs_present = gs_present;
}
#endif
8967
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
   /* Deliberately empty: this generation keeps no extra genx state that
    * needs restoring after a lost context.  Stub for the
    * screen->vtbl.lost_genx_state hook.
    */
}
8972
/**
 * Emit MI_REPORT_PERF_COUNT, asking the hardware to write a performance
 * counter snapshot tagged with \p report_id to \p bo at
 * \p offset_in_bytes.
 *
 * Compiles to a no-op on Gen < 7.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}
8986
8987 /**
8988 * From the PRM, Volume 2a:
8989 *
8990 * "Indirect State Pointers Disable
8991 *
8992 * At the completion of the post-sync operation associated with this pipe
8993 * control packet, the indirect state pointers in the hardware are
8994 * considered invalid; the indirect pointers are not saved in the context.
8995 * If any new indirect state commands are executed in the command stream
8996 * while the pipe control is pending, the new indirect state commands are
8997 * preserved.
8998 *
8999 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9000 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9001 * commands are only considered as Indirect State Pointers. Once ISP is
9002 * issued in a context, SW must initialize by programming push constant
9003 * commands for all the shaders (at least to zero length) before attempting
9004 * any rendering operation for the same context."
9005 *
9006 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9007 * even though they point to a BO that has been already unreferenced at
9008 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
 * a BO should be pointing to the scratch page). But on CNL, it is
9011 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9012 * instruction.
9013 *
9014 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9015 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9016 * context restore, so the mentioned hang doesn't happen. However,
9017 * software must program push constant commands for all stages prior to
9018 * rendering anything, so we flag them as dirty.
9019 *
 * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disabling the push
 * constants, so that it doesn't hang a previous 3DPRIMITIVE.
9023 */
#if GFX_VER >= 7
/* Emit the "Indirect State Pointers Disable" sequence described in the
 * comment above, then flag every stage's push constants dirty so they
 * are re-programmed before the next draw.
 */
static void
gen7_emit_isp_disable(struct crocus_batch *batch)
{
   /* Stall at pixel scoreboard (plus CS stall) first, so any in-flight
    * 3DPRIMITIVE has finished consuming the push constants before they
    * are invalidated below.
    */
   crocus_emit_raw_pipe_control(batch, "isp disable",
                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                PIPE_CONTROL_CS_STALL,
                                NULL, 0, 0);
   /* Now actually disable the indirect state pointers. */
   crocus_emit_raw_pipe_control(batch, "isp disable",
                                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
                                PIPE_CONTROL_CS_STALL,
                                NULL, 0, 0);

   /* ISP inhibits context restore of 3DSTATE_CONSTANT_*, so all stages
    * must re-send their push constant commands before rendering.
    */
   struct crocus_context *ice = batch->ice;
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_TES |
                              CROCUS_STAGE_DIRTY_CONSTANTS_GS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_FS);
}
#endif
9045
#if GFX_VER >= 7
/**
 * Emit end-of-batch workarounds before the batch is submitted
 * (screen->vtbl.finish_batch hook).
 */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   /* Haswell-only workaround ("hsw wa"): on render batches, flush, re-emit
    * the color calc state pointer, then do a render-target flush with a CS
    * stall.  NOTE(review): presumably required by an HSW erratum around
    * CC_STATE at batch end — confirm against the PRM/brw driver.
    */
   if (batch->name == CROCUS_BATCH_RENDER) {
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   /* Invalidate indirect state pointers (see gen7_emit_isp_disable). */
   gen7_emit_isp_disable(batch);
}
#endif
9064
9065 static void
9066 crocus_batch_reset_dirty(struct crocus_batch *batch)
9067 {
9068 /* unreference any index buffer so it get reemitted. */
9069 pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9070
9071 /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9072 * as the old state batch won't still be available.
9073 */
9074 batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9075 CROCUS_DIRTY_COLOR_CALC_STATE;
9076
9077 batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9078
9079 batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9080 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9081 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9082 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9083 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9084 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9085 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9086
9087 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9088 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9089 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9090 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9091 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9092 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9093
9094 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9095 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9096 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9097 batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9098
9099 #if GFX_VER >= 6
9100 /* SCISSOR_STATE */
9101 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9102 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9103 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9104
9105 #endif
9106 #if GFX_VER <= 5
9107 /* dirty the SF state on gen4/5 */
9108 batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9109 batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9110 batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9111 batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9112 #endif
9113 #if GFX_VER >= 7
9114 /* Streamout dirty */
9115 batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9116 batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9117 batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9118 #endif
9119 }
9120
#if GFX_VERx10 == 75
/* Return the currently-bound gallium rasterizer state (Haswell only). */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
#endif
9127
#if GFX_VER >= 6
/* Record the stride, in bytes, of each bound stream-output target
 * (screen->vtbl.update_so_strides hook).  Incoming strides[] values are
 * in dwords.
 */
static void update_so_strides(struct crocus_context *ice,
                              uint16_t *strides)
{
   for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];

      if (!tgt)
         continue;

      tgt->stride = strides[i] * sizeof(uint32_t);
   }
}
#endif
9139
9140 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9141 int s,
9142 uint32_t *clamp_mask)
9143 {
9144 #if GFX_VER < 8
9145 if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9146 samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9147 if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9148 clamp_mask[0] |= (1 << s);
9149 if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9150 clamp_mask[1] |= (1 << s);
9151 if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9152 clamp_mask[2] |= (1 << s);
9153 }
9154 #endif
9155 }
9156
9157 static void
9158 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9159 {
9160 struct crocus_context *ice = (struct crocus_context *) ctx;
9161
9162 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9163 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9164 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9165 }
9166
9167 if (ice->batch_count == 1)
9168 return;
9169
9170 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9171 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9172 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9173 }
9174 }
9175
/**
 * Fill in the per-generation function pointer table for this screen.
 *
 * This file is compiled once per generation, so each hook below resolves
 * to the GFX_VER-specific implementation; hooks wrapped in #if blocks are
 * only available on the generations that support the feature.
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   assert(screen->devinfo.verx10 == GFX_VERx10);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   /* Compute is only supported on Gen7+. */
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   /* MI load/store register and store-data-imm helpers (HSW+). */
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   /* Shader key population for each stage. */
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   /* Gen4/5 manage the URB with explicit fences. */
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   /* Stream output is only supported on Gen6+. */
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9233
/**
 * Install the state-related pipe_context hooks and set the context's
 * initial default state.
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   /* CSO create/bind/delete hooks. */
   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   /* Dynamic (streamed) state setters. */
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream output is only supported on Gen6+. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Mark everything dirty so the first draw uploads all state. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* PIPE_PRIM_MAX = "no primitive seen yet". */
   ice->state.prim_mode = PIPE_PRIM_MAX;
   ice->state.reduced_prim_mode = PIPE_PRIM_MAX;
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9303