1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file crocus_state.c
25  *
26  * ============================= GENXML CODE =============================
27  *              [This file is compiled once per generation.]
28  * =======================================================================
29  *
30  * This is the main state upload code.
31  *
32  * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33  * complex, or highly reusable state can be created once, and bound and
34  * rebound multiple times.  This is modeled with the pipe->create_*_state()
35  * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36  * streamed out on the fly, via pipe->set_*_state() hooks.
37  *
38  * OpenGL involves frequently mutating context state, which is mirrored in
39  * core Mesa by highly mutable data structures.  However, most applications
40  * typically draw the same things over and over - from frame to frame, most
41  * of the same objects are still visible and need to be redrawn.  So, rather
42  * than inventing new state all the time, applications usually mutate to swap
43  * between known states that we've seen before.
44  *
45  * Gallium isolates us from this mutation by tracking API state, and
46  * distilling it into a set of Constant State Objects, or CSOs.  Large,
47  * complex, or typically reusable state can be created once, then reused
48  * multiple times.  Drivers can create and store their own associated data.
49  * This create/bind model corresponds to the pipe->create_*_state() and
50  * pipe->bind_*_state() driver hooks.
51  *
52  * Some state is cheap to create, or expected to be highly dynamic.  Rather
53  * than creating and caching piles of CSOs for these, Gallium simply streams
54  * them out, via the pipe->set_*_state() driver hooks.
55  *
56  * To reduce draw time overhead, we try to compute as much state at create
57  * time as possible.  Wherever possible, we translate the Gallium pipe state
58  * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59  * we can simply memcpy them into a batch buffer.
60  *
61  * No hardware matches the abstraction perfectly, so some commands require
62  * information from multiple CSOs.  In this case, we can store two copies
63  * of the packet (one in each CSO), and simply | together their DWords at
64  * draw time.  Sometimes the second set is trivial (one or two fields), so
65  * we simply pack it at draw time.
66  *
67  * There are two main components in the file below.  First, the CSO hooks
68  * create/bind/track state.  The second are the draw-time upload functions,
69  * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70  * the context state and emit the commands into the actual batch.
71  */
72 
73 #include <errno.h>
74 #include <stdio.h>
75 
76 #if HAVE_VALGRIND
77 #include <memcheck.h>
78 #include <valgrind.h>
79 #define VG(x) x
80 #ifdef DEBUG
81 #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
82 #endif
83 #else
84 #define VG(x)
85 #endif
86 
87 #include "drm-uapi/i915_drm.h"
88 #include "intel/common/intel_l3_config.h"
89 #include "intel/common/intel_sample_positions.h"
90 #include "intel/compiler/brw_compiler.h"
91 #include "pipe/p_context.h"
92 #include "pipe/p_defines.h"
93 #include "pipe/p_screen.h"
94 #include "pipe/p_state.h"
95 #include "util/format/u_format.h"
96 #include "util/half_float.h"
97 #include "util/u_dual_blend.h"
98 #include "util/u_framebuffer.h"
99 #include "util/u_helpers.h"
100 #include "util/u_inlines.h"
101 #include "util/u_memory.h"
102 #include "util/u_prim.h"
103 #include "util/u_transfer.h"
104 #include "util/u_upload_mgr.h"
105 #include "util/u_viewport.h"
106 #include "crocus_batch.h"
107 #include "crocus_context.h"
108 #include "crocus_defines.h"
109 #include "crocus_pipe.h"
110 #include "crocus_resource.h"
111 
112 #include "crocus_genx_macros.h"
113 #include "intel/common/intel_guardband.h"
114 
/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 *
 * This function is never called; it exists only to give the compile-time
 * assertions a home.  If any of these ever stop holding, the code that
 * copies PIPE_* values straight into packets would need real translation
 * tables (like translate_compare_func() below).
 */
UNUSED static void pipe_asserts()
{
   /* Fails to compile if the two enum values differ. */
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware.  Note that Gallium's
    * INCR/DECR are saturating (hardware INCRSAT/DECRSAT) and its _WRAP
    * variants are the hardware's wrapping INCR/DECR.
    */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}
186 
187 static unsigned
translate_prim_type(enum pipe_prim_type prim,uint8_t verts_per_patch)188 translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
189 {
190    static const unsigned map[] = {
191       [PIPE_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
192       [PIPE_PRIM_LINES]                    = _3DPRIM_LINELIST,
193       [PIPE_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
194       [PIPE_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
195       [PIPE_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
196       [PIPE_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
197       [PIPE_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
198       [PIPE_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
199       [PIPE_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
200       [PIPE_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
201 #if GFX_VER >= 6
202       [PIPE_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
203       [PIPE_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
204       [PIPE_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
205       [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
206 #endif
207 #if GFX_VER >= 7
208       [PIPE_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
209 #endif
210    };
211 
212    return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
213 }
214 
215 static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)216 translate_compare_func(enum pipe_compare_func pipe_func)
217 {
218    static const unsigned map[] = {
219       [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
220       [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
221       [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
222       [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
223       [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
224       [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
225       [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
226       [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
227    };
228    return map[pipe_func];
229 }
230 
231 static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)232 translate_shadow_func(enum pipe_compare_func pipe_func)
233 {
234    /* Gallium specifies the result of shadow comparisons as:
235     *
236     *    1 if ref <op> texel,
237     *    0 otherwise.
238     *
239     * The hardware does:
240     *
241     *    0 if texel <op> ref,
242     *    1 otherwise.
243     *
244     * So we need to flip the operator and also negate.
245     */
246    static const unsigned map[] = {
247       [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
248       [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
249       [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
250       [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
251       [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
252       [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
253       [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
254       [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
255    };
256    return map[pipe_func];
257 }
258 
259 static unsigned
translate_cull_mode(unsigned pipe_face)260 translate_cull_mode(unsigned pipe_face)
261 {
262    static const unsigned map[4] = {
263       [PIPE_FACE_NONE]           = CULLMODE_NONE,
264       [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
265       [PIPE_FACE_BACK]           = CULLMODE_BACK,
266       [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
267    };
268    return map[pipe_face];
269 }
270 
#if GFX_VER >= 6
/**
 * Translate a Gallium polygon fill mode into the hardware's FILL_MODE.
 * FILL_RECTANGLE has no dedicated hardware mode and is rendered solid.
 */
static unsigned
translate_fill_mode(unsigned pipe_polymode)
{
   switch (pipe_polymode) {
   case PIPE_POLYGON_MODE_FILL:           return FILL_MODE_SOLID;
   case PIPE_POLYGON_MODE_LINE:           return FILL_MODE_WIREFRAME;
   case PIPE_POLYGON_MODE_POINT:          return FILL_MODE_POINT;
   case PIPE_POLYGON_MODE_FILL_RECTANGLE: return FILL_MODE_SOLID;
   default:                               unreachable("invalid polygon mode");
   }
}
#endif
284 
285 static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)286 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
287 {
288    static const unsigned map[] = {
289       [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
290       [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
291       [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
292    };
293    return map[pipe_mip];
294 }
295 
296 static uint32_t
translate_wrap(unsigned pipe_wrap,bool either_nearest)297 translate_wrap(unsigned pipe_wrap, bool either_nearest)
298 {
299    static const unsigned map[] = {
300       [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
301 #if GFX_VER == 8
302       [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
303 #else
304       [PIPE_TEX_WRAP_CLAMP]                  = TCM_CLAMP_BORDER,
305 #endif
306       [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
307       [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
308       [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
309       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,
310 
311       /* These are unsupported. */
312       [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
313       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
314    };
315 #if GFX_VER < 8
316    if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
317       return TCM_CLAMP;
318 #endif
319    return map[pipe_wrap];
320 }
321 
/**
 * Allocate \p size bytes (aligned to \p alignment) in the batch's
 * statebuffer.  Equivalent of brw_state_batch() in i965.
 *
 * Writes the statebuffer-relative offset of the allocation to
 * \p *out_offset and returns a CPU pointer into the mapped statebuffer
 * where the caller can write the state.
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   /* If the allocation would exceed the fixed statebuffer window, flush
    * the batch to start fresh (unless wrapping is forbidden); otherwise,
    * if it merely exceeds the current BO, grow the BO by 1.5x (capped at
    * MAX_STATE_SIZE).
    */
   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      crocus_batch_flush(batch);
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   /* The map is DWord-typed, so convert the byte offset to a DWord index. */
   return (uint32_t *)batch->state.map + (offset >> 2);
}
351 
/**
 * stream_state() + memcpy: reserve space in the statebuffer, copy the
 * given data into it, and return the statebuffer-relative offset.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   uint32_t offset = 0;
   uint32_t *dest = stream_state(batch, size, alignment, &offset);

   if (dest != NULL)
      memcpy(dest, data, size);

   return offset;
}
367 
#if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS, aiming the fixed-function pipeline
 * stages (VS/GS/CLIP/SF/WM/CC) at their indirect state, which lives in
 * the batch statebuffer at the given offsets.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      /* GS state is only referenced when the stage is enabled. */
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}

#endif
/**
 * Did field 'x' change between 'old_cso' and 'new_cso'?
 *
 * (If so, we may want to set some dirty flags.)
 *
 * Both macros expect local variables named 'old_cso' and 'new_cso' in
 * scope at the use site; old_cso may be NULL (first bind), which counts
 * as "changed".  Use the memcmp variant for array/struct fields.
 */
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
#define cso_changed_memcmp(x) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
402 
/**
 * Emit the end-of-pipe sync required before re-emitting
 * STATE_BASE_ADDRESS (no-op on Gfx4/5, which have no PIPE_CONTROL-based
 * base address flushing here).
 */
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   /* The data cache only exists as a distinct flushable cache on Gfx7+. */
   const unsigned dc_flush =
      batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
437 
/**
 * Emit the cache invalidations required after re-emitting
 * STATE_BASE_ADDRESS (no-op on Gfx4/5).
 */
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
487 
#if GFX_VER >= 6
/**
 * Emit MI_STORE_REGISTER_MEM to snapshot a 32-bit MMIO register into a
 * buffer.  Predicated stores are only available on Haswell (gfx7.5) and
 * later; requesting predication on earlier parts is a programming error.
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
505 
506 static void
crocus_store_register_mem64(struct crocus_batch * batch,uint32_t reg,struct crocus_bo * bo,uint32_t offset,bool predicated)507 crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
508                             struct crocus_bo *bo, uint32_t offset,
509                             bool predicated)
510 {
511    crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
512    crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
513 }
514 #endif
515 
#if GFX_VER >= 7
/**
 * Emit MI_LOAD_REGISTER_IMM to write an immediate 32-bit value into an
 * MMIO register.
 */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord      = val;
   }
}
/* Convenience wrapper taking a genxml register name instead of a raw
 * MMIO offset.
 */
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
526 
#if GFX_VERx10 >= 75
/**
 * Emit MI_LOAD_REGISTER_REG to copy one 32-bit MMIO register to another.
 * (The command only exists on Haswell and later.)
 */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
536 
/** Copy a 32-bit MMIO register to another register. */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
543 
/**
 * Copy a 64-bit MMIO register pair to another register pair, one 32-bit
 * half at a time (low DWord first).
 */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
   _crocus_emit_lrr(batch, dst + 4, src + 4);
}
#endif
552 
/** Write an immediate 32-bit value into an MMIO register. */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
559 
/**
 * Write an immediate 64-bit value into an MMIO register pair as two
 * 32-bit loads (low DWord first, then high).
 */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   _crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
   _crocus_emit_lri(batch, reg + 4, val >> 32);
}
567 
/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 * The buffer is referenced read-only (ro_bo).
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}
580 
/**
 * Load a 64-bit value from a buffer into a MMIO register pair via
 * two MI_LOAD_REGISTER_MEM commands (low DWord first, then high).
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
   crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
}
592 
#if GFX_VERx10 >= 75
/**
 * Emit MI_STORE_DATA_IMM to write a 32-bit immediate into a buffer.
 */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      /* NOTE(review): always true inside GFX_VERx10 >= 75; presumably the
       * guard mirrors the genxml field availability — confirm.
       */
      sdi.ImmediateData = imm;
#endif
   }
}
606 
/**
 * Emit MI_STORE_DATA_IMM to write a 64-bit immediate into a buffer.
 */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      /* DWordLength is encoded as (total DWords - 2) per MI convention. */
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
#endif
625 
626 static void
crocus_copy_mem_mem(struct crocus_batch * batch,struct crocus_bo * dst_bo,uint32_t dst_offset,struct crocus_bo * src_bo,uint32_t src_offset,unsigned bytes)627 crocus_copy_mem_mem(struct crocus_batch *batch,
628                     struct crocus_bo *dst_bo, uint32_t dst_offset,
629                     struct crocus_bo *src_bo, uint32_t src_offset,
630                     unsigned bytes)
631 {
632    assert(bytes % 4 == 0);
633    assert(dst_offset % 4 == 0);
634    assert(src_offset % 4 == 0);
635 
636 #define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
637    for (unsigned i = 0; i < bytes; i += 4) {
638       crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
639                                  src_bo, src_offset + i);
640       crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
641                                   dst_bo, dst_offset + i, false);
642    }
643 }
644 #endif
645 
/**
 * Gallium CSO for rasterizer state.
 *
 * Stores the original pipe state plus pre-packed 3DSTATE commands that
 * can be copied into the batch at draw time.
 */
struct crocus_rasterizer_state {
   /* The original Gallium state, kept for cso_changed() comparisons. */
   struct pipe_rasterizer_state cso;
#if GFX_VER >= 6
   /* Pre-packed command payloads, memcpy'd/OR'd into the batch. */
   uint32_t sf[GENX(3DSTATE_SF_length)];
   uint32_t clip[GENX(3DSTATE_CLIP_length)];
#endif
#if GFX_VER >= 8
   uint32_t raster[GENX(3DSTATE_RASTER_length)];
#endif
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   /* Number of user clip plane constants to upload. */
   uint8_t num_clip_plane_consts;
   /* True if either polygon fill mode is POINT or LINE. */
   bool fill_mode_point_or_line;
};
663 
#if GFX_VER <= 5
/* Indices into the limits[] table below, one per fixed-function URB
 * section on Gfx4/5.
 */
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

/* Per-section URB sizing constraints: entry counts and entry sizes.
 * NOTE(review): entry sizes appear to be in URB rows, as in i965's
 * brw_urb.c — confirm units against the Gfx4/5 PRM.
 */
static const struct {
   uint32_t min_nr_entries;
   uint32_t preferred_nr_entries;
   uint32_t min_entry_size;
   uint32_t  max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8,  1, 5 },                        /* gs */
   { 5, 10,  1, 5 },                        /* clp */
   { 1, 8,  1, 12 },                        /* sf */
   { 1, 4,  1, 32 }                        /* cs */
};
683 
check_urb_layout(struct crocus_context * ice)684 static bool check_urb_layout(struct crocus_context *ice)
685 {
686    ice->urb.vs_start = 0;
687    ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
688    ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
689    ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
690    ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
691 
692    return ice->urb.cs_start + ice->urb.nr_cs_entries *
693       ice->urb.csize <= ice->urb.size;
694 }
695 
696 
/**
 * (Re)compute the gen4/5 URB partitioning for the given per-entry sizes.
 *
 * \param csize   constant (CURBE) entry size, in URB rows
 * \param vsize   vertex/clip entry size, in URB rows
 * \param sfsize  SF entry size, in URB rows
 *
 * Entry counts start at each unit's preferred value and are reduced toward
 * the minimums until check_urb_layout() reports that everything fits.
 *
 * Returns true if the cached layout in ice->urb was recomputed (the caller
 * must then re-emit URB_FENCE), false if the existing layout still works.
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   struct crocus_context *ice = batch->ice;
   /* Clamp the requested entry sizes up to the hardware minimums. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recompute if any entry size grew, or — when running in constrained
    * mode — if any size shrank, so we can try to escape constrained mode
    * and get back to the preferred entry counts.
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start optimistic: each unit gets its preferred number of entries. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      /* Ironlake (and G4x below) have more URB space, so first try boosted
       * VS/SF entry counts before falling back to the preferred values.
       */
      if (devinfo->ver == 5) {
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (devinfo->is_g4x) {
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      /* Preferred counts don't fit either: drop every unit to its minimum. */
      if (!check_urb_layout(ice)) {
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
792 
793 static void
crocus_upload_urb_fence(struct crocus_batch * batch)794 crocus_upload_urb_fence(struct crocus_batch *batch)
795 {
796    uint32_t urb_fence[3];
797    _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
798       urb.VSUnitURBReallocationRequest = 1;
799       urb.GSUnitURBReallocationRequest = 1;
800       urb.CLIPUnitURBReallocationRequest = 1;
801       urb.SFUnitURBReallocationRequest = 1;
802       urb.VFEUnitURBReallocationRequest = 1;
803       urb.CSUnitURBReallocationRequest = 1;
804 
805       urb.VSFence = batch->ice->urb.gs_start;
806       urb.GSFence = batch->ice->urb.clip_start;
807       urb.CLIPFence = batch->ice->urb.sf_start;
808       urb.SFFence = batch->ice->urb.cs_start;
809       urb.CSFence = batch->ice->urb.size;
810    }
811 
812    /* erratum: URB_FENCE must not cross a 64byte cacheline */
813    if ((crocus_batch_bytes_used(batch) & 15) > 12) {
814       int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
815       do {
816          *(uint32_t *)batch->command.map_next = 0;
817          batch->command.map_next += sizeof(uint32_t);
818       } while (--pad);
819    }
820 
821    crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
822 }
823 
/**
 * Compute the layout of the CURBE (gen4/5 push constant buffer): a WM
 * section, then clip planes, then VS constants, all measured in 512-bit
 * (16-float) register units.
 *
 * Returns true if the cached layout in ice->curbe changed (the caller must
 * then re-upload the CURBE contents), false otherwise.
 */
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   /* Fragment shader push constants: sum the compiled UBO ranges. */
   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   /* Clip planes: 6 fixed view-volume planes plus one per enabled user
    * plane, 4 floats each, rounded up to whole 512-bit registers.
    */
   if (ice->state.cso_rast->cso.clip_plane_enable) {
      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   /* Vertex shader push constants. */
   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * brw_vec4.cpp we're loading up to 32 registers of push constants.  An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize:
    * Only recompute when a section grew, the clip section changed size,
    * or the buffer shrank to under a quarter of its (non-trivial) size.
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout:
       */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      /* Debug dump of the new layout, normally compiled out. */
      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}
909 
910 static void
upload_shader_consts(struct crocus_context * ice,gl_shader_stage stage,uint32_t * map,unsigned start)911 upload_shader_consts(struct crocus_context *ice,
912                      gl_shader_stage stage,
913                      uint32_t *map,
914                      unsigned start)
915 {
916    struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
917    struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
918    uint32_t *cmap;
919    bool found = false;
920    unsigned offset = start * 16;
921    int total = 0;
922    for (int i = 0; i < 4; i++) {
923       const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
924 
925       if (range->length == 0)
926          continue;
927 
928       unsigned block_index = crocus_bti_to_group_index(
929          &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
930       unsigned len = range->length * 8 * sizeof(float);
931       unsigned start = range->start * 8 * sizeof(float);
932       struct pipe_transfer *transfer;
933 
934       cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
935                                    ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
936                                    PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
937       if (cmap)
938          memcpy(&map[offset + (total * 8)], cmap, len);
939       pipe_buffer_unmap(&ice->ctx, transfer);
940       total += range->length;
941       found = true;
942    }
943 
944    if (stage == MESA_SHADER_VERTEX && !found) {
945       /* The pre-gen6 VS requires that some push constants get loaded no
946        * matter what, or the GPU would hang.
947        */
948       unsigned len = 16;
949       memset(&map[offset], 0, len);
950    }
951 }
952 
/* The six fixed view-volume clip planes (in ±Z, ±Y, ±X order), loaded into
 * the CURBE ahead of any user-defined clip planes.
 */
static const float fixed_plane[6][4] = {
   { 0,    0,   -1, 1 },
   { 0,    0,    1, 1 },
   { 0,   -1,    0, 1 },
   { 0,    1,    0, 1 },
   {-1,    0,    0, 1 },
   { 1,    0,    0, 1 }
};
961 
/**
 * Upload the CURBE contents (FS/clip/VS push constants) using the layout
 * computed by calculate_curbe_offsets(), then emit CONSTANT_BUFFER to point
 * the hardware at it.
 */
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;        /* in 512-bit units */
   const unsigned buf_sz = sz * 16 * sizeof(float);  /* in bytes */

   /* Nothing to upload; skip straight to emitting the packet.
    * NOTE(review): in this path ice->curbe.curbe_res may still hold a
    * previous allocation, in which case BufferLength below would be
    * total_size - 1 == -1 — presumably curbe_res is cleared elsewhere
    * when the CURBE becomes empty; confirm.
    */
   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;  /* dword offset */
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Append the enabled user clip planes after the six fixed ones. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   /* Debug dump of the uploaded constants, normally compiled out. */
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

emit:
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         /* BufferLength uses a length-minus-one encoding (512-bit units). */
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug.  The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set.  We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1054 #endif
1055 
1056 #if GFX_VER >= 7
1057 
/* Default L3SQCREG1 general/high-priority credit values for IVB / VLV / HSW.
 * NOTE(review): setup_l3_config() below uses the genxml-provided
 * SQGPCI_DEFAULT / BYT_SQGPCI_DEFAULT / SQHPCI_DEFAULT constants instead;
 * confirm these are still referenced before removing them.
 */
#define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
#define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
#define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
1061 
1062 static void
setup_l3_config(struct crocus_batch * batch,const struct intel_l3_config * cfg)1063 setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
1064 {
1065 #if GFX_VER == 7
1066    const struct intel_device_info *devinfo = &batch->screen->devinfo;
1067    const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
1068    const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
1069                        cfg->n[INTEL_L3P_ALL];
1070    const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
1071                       cfg->n[INTEL_L3P_ALL];
1072    const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
1073                       cfg->n[INTEL_L3P_ALL];
1074    const bool has_slm = cfg->n[INTEL_L3P_SLM];
1075 #endif
1076 
1077    /* According to the hardware docs, the L3 partitioning can only be changed
1078     * while the pipeline is completely drained and the caches are flushed,
1079     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1080     */
1081    crocus_emit_pipe_control_flush(batch, "l3_config",
1082                                   PIPE_CONTROL_DATA_CACHE_FLUSH |
1083                                   PIPE_CONTROL_CS_STALL);
1084 
1085    /* ...followed by a second pipelined PIPE_CONTROL that initiates
1086     * invalidation of the relevant caches.  Note that because RO invalidation
1087     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1088     * command is processed by the CS) we cannot combine it with the previous
1089     * stalling flush as the hardware documentation suggests, because that
1090     * would cause the CS to stall on previous rendering *after* RO
1091     * invalidation and wouldn't prevent the RO caches from being polluted by
1092     * concurrent rendering before the stall completes.  This intentionally
1093     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1094     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1095     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1096     * already guarantee that there is no concurrent GPGPU kernel execution
1097     * (see SKL HSD 2132585).
1098     */
1099    crocus_emit_pipe_control_flush(batch, "l3 config",
1100                                   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1101                                   PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1102                                   PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1103                                   PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1104 
1105    /* Now send a third stalling flush to make sure that invalidation is
1106     * complete when the L3 configuration registers are modified.
1107     */
1108    crocus_emit_pipe_control_flush(batch, "l3 config",
1109                                   PIPE_CONTROL_DATA_CACHE_FLUSH |
1110                                   PIPE_CONTROL_CS_STALL);
1111 
1112 #if GFX_VER == 8
1113    assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
1114    crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
1115       reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
1116       reg.URBAllocation = cfg->n[INTEL_L3P_URB];
1117       reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1118       reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1119       reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
1120    }
1121 #else
1122    assert(!cfg->n[INTEL_L3P_ALL]);
1123 
1124    /* When enabled SLM only uses a portion of the L3 on half of the banks,
1125     * the matching space on the remaining banks has to be allocated to a
1126     * client (URB for all validated configurations) set to the
1127     * lower-bandwidth 2-bank address hashing mode.
1128     */
1129    const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
1130    assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
1131 
1132    /* Minimum number of ways that can be allocated to the URB. */
1133    const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
1134    assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
1135 
1136    uint32_t l3sqcr1, l3cr2, l3cr3;
1137 
1138    crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
1139       reg.ConvertDC_UC = !has_dc;
1140       reg.ConvertIS_UC = !has_is;
1141       reg.ConvertC_UC = !has_c;
1142       reg.ConvertT_UC = !has_t;
1143 #if GFX_VERx10 == 75
1144       reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
1145 #else
1146       reg.L3SQGeneralPriorityCreditInitialization =
1147          devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
1148 #endif
1149       reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
1150    };
1151 
1152    crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
1153       reg.SLMEnable = has_slm;
1154       reg.URBLowBandwidth = urb_low_bw;
1155       reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
1156 #if !(GFX_VERx10 == 75)
1157       reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
1158 #endif
1159       reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1160       reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1161    };
1162 
1163    crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
1164       reg.ISAllocation = cfg->n[INTEL_L3P_IS];
1165       reg.ISLowBandwidth = 0;
1166       reg.CAllocation = cfg->n[INTEL_L3P_C];
1167       reg.CLowBandwidth = 0;
1168       reg.TAllocation = cfg->n[INTEL_L3P_T];
1169       reg.TLowBandwidth = 0;
1170    };
1171 
1172    /* Set up the L3 partitioning. */
1173    crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
1174    crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
1175    crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
1176 
1177 #if GFX_VERSIONx10 == 75
1178    /* TODO: Fail screen creation if command parser version < 4 */
1179    uint32_t scratch1, chicken3;
1180    crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
1181       reg.L3AtomicDisable = !has_dc;
1182    }
1183    crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
1184       reg.L3AtomicDisableMask = true;
1185       reg.L3AtomicDisable = !has_dc;
1186    }
1187    crocus_emit_lri(batch, SCRATCH1, scratch1);
1188    crocus_emit_lri(batch, CHICKEN3, chicken3);
1189 #endif
1190 #endif
1191 }
1192 
1193 static void
emit_l3_state(struct crocus_batch * batch,bool compute)1194 emit_l3_state(struct crocus_batch *batch, bool compute)
1195 {
1196    const struct intel_l3_config *const cfg =
1197       compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1198 
1199    setup_l3_config(batch, cfg);
1200    if (INTEL_DEBUG(DEBUG_L3)) {
1201       intel_dump_l3_config(cfg, stderr);
1202    }
1203 }
1204 
1205 /**
1206  * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
1207  */
1208 static void
gen7_emit_cs_stall_flush(struct crocus_batch * batch)1209 gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1210 {
1211    crocus_emit_pipe_control_write(batch,
1212                                   "workaround",
1213                                   PIPE_CONTROL_CS_STALL
1214                                   | PIPE_CONTROL_WRITE_IMMEDIATE,
1215                                   batch->ice->workaround_bo,
1216                                   batch->ice->workaround_offset, 0);
1217 }
1218 #endif
1219 
/**
 * Emit PIPELINE_SELECT to switch between the 3D and GPGPU pipelines,
 * surrounded by the flushes/workarounds the various generations require.
 *
 * \param pipeline  _3D or GPGPU
 */
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

   /* Ivybridge (but not Haswell): after switching back to 3D, emit a CS
    * stall and a degenerate (point-list) 3DPRIMITIVE.
    */
#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1289 
1290 /**
1291  * The following diagram shows how we partition the URB:
1292  *
1293  *        16kB or 32kB               Rest of the URB space
1294  *   __________-__________   _________________-_________________
1295  *  /                     \ /                                   \
1296  * +-------------------------------------------------------------+
1297  * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
1298  * |       Constants       |               Entries               |
1299  * +-------------------------------------------------------------+
1300  *
1301  * Notably, push constants must be stored at the beginning of the URB
1302  * space, while entries can be stored anywhere.  Ivybridge and Haswell
1303  * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1304  * doubles this (32kB).
1305  *
1306  * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1307  * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
1308  * sized in increments of 2kB.
1309  *
1310  * Currently we split the constant buffer space evenly among whatever stages
1311  * are active.  This is probably not ideal, but simple.
1312  *
1313  * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1314  * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1315  * Haswell GT3 has 512kB of URB space.
1316  *
1317  * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1318  * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1319  */
1320 #if GFX_VER >= 7
/**
 * Statically partition the push constant space evenly among the five 3D
 * shader stages (VS..FS); the fragment stage receives any remainder.
 */
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
   const unsigned push_constant_kb =
      batch->screen->devinfo.max_constant_urb_size_kb;
   unsigned size_per_stage = push_constant_kb / 5;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    *       see if that improves performance by offering more space to
    *       the VS/FS when those aren't in use.  Also, try dynamically
    *       enabling/disabling it like i965 does.  This would be more
    *       stalls and may not actually help; we don't know yet.
    */
   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         /* Reuse the _VS packet and patch the sub-opcode to address each
          * stage's 3DSTATE_PUSH_CONSTANT_ALLOC_xS command in turn.
          */
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = size_per_stage * i;
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
      }
   }

   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
    *
    *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
    *     in the ring after this instruction.
    *
    * No such restriction exists for Haswell or Baytrail.
    */
   if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
      gen7_emit_cs_stall_flush(batch);
}
1355 #endif
1356 
1357 /**
1358  * Upload the initial GPU state for a render context.
1359  *
1360  * This sets some invariant state that needs to be programmed a particular
1361  * way, but we never actually change.
1362  */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   /* Program the System Instruction Pointer (left at its default). */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   /* Keep CONSTANT_BUFFER addresses absolute rather than offset-relative. */
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
1410 
1411 #if GFX_VER >= 7
/* Upload the initial, invariant GPU state for a compute context:
 * switch to the GPGPU pipeline and program the L3 configuration.
 */
static void
crocus_init_compute_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, GPGPU);

   /* Guard kept for symmetry with crocus_init_render_context(), though the
    * whole function is already compiled only for GFX_VER >= 7.
    */
#if GFX_VER >= 7
   emit_l3_state(batch, true);
#endif
}
1423 #endif
1424 
1425 /**
1426  * Generation-specific context state (ice->state.genx->...).
1427  *
1428  * Most state can go in crocus_context directly, but these encode hardware
1429  * packets which vary by generation.
1430  */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      /* Per-stage image surface parameters (gen7+ shader images). */
      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   /* Whether the gen8 PMA fix is currently enabled. */
   bool pma_fix_enabled;
#endif
};
1442 
1443 /**
1444  * The pipe->set_blend_color() driver hook.
1445  *
1446  * This corresponds to our COLOR_CALC_STATE.
1447  */
1448 static void
crocus_set_blend_color(struct pipe_context * ctx,const struct pipe_blend_color * state)1449 crocus_set_blend_color(struct pipe_context *ctx,
1450                        const struct pipe_blend_color *state)
1451 {
1452    struct crocus_context *ice = (struct crocus_context *) ctx;
1453 
1454    /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1455    memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1456 #if GFX_VER <= 5
1457    ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1458 #else
1459    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1460 #endif
1461 }
1462 
1463 /**
1464  * Gallium CSO for blend state (see pipe_blend_state).
1465  */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** copy of BLEND_STATE (the gallium CSO this state was created from) */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending? */
   bool dual_color_blending;
};
1484 
1485 static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f,bool alpha_to_one)1486 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1487 {
1488    if (alpha_to_one) {
1489       if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1490          return PIPE_BLENDFACTOR_ONE;
1491 
1492       if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1493          return PIPE_BLENDFACTOR_ZERO;
1494    }
1495 
1496    return f;
1497 }
1498 
#if GFX_VER >= 6
/* Per-RT blend controls live in BLEND_STATE_ENTRY on gen6+, but in
 * COLOR_CALC_STATE on gen4/5.  Alias the two so the blend entry setup
 * code can be shared across generations.
 */
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
1504 
1505 static bool
1506 can_emit_logic_op(struct crocus_context *ice)
1507 {
1508    /* all pre gen8 have logicop restricted to unorm */
1509    enum pipe_format pformat = PIPE_FORMAT_NONE;
1510    for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1511       if (ice->state.framebuffer.cbufs[i]) {
1512          pformat = ice->state.framebuffer.cbufs[i]->format;
1513          break;
1514       }
1515    }
1516    return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1517 }
1518 
/**
 * Fill in the blend fields of one BLEND_STATE_ENTRY (Gfx6+) or
 * COLOR_CALC_STATE (Gfx4/5) for render target \p idx.
 *
 * Returns true if RGB and alpha use different blend functions or factors,
 * i.e. independent alpha blending is required.
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* With independent blending disabled, RT[0]'s state applies to all RTs. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-Gfx8 can only do logic ops on UNORM targets
       * (see can_emit_logic_op); otherwise leave the op disabled.
       */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* Only enable blending on RT[0] when dual-source blending is
          * either not requested, or the FS actually provides both sources.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction          = rt->rgb_func;
      entry->AlphaBlendFunction          = rt->alpha_func;
      /* Casts silence implicit enum-conversion warnings. */
      entry->SourceBlendFactor           = (int) src_rgb;
      entry->SourceAlphaBlendFactor      = (int) src_alpha;
      entry->DestinationBlendFactor      = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle having ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1582 
1583 /**
1584  * The pipe->create_blend_state() driver hook.
1585  *
1586  * Translates a pipe_blend_state into crocus_blend_state.
1587  */
1588 static void *
crocus_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)1589 crocus_create_blend_state(struct pipe_context *ctx,
1590                           const struct pipe_blend_state *state)
1591 {
1592    struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1593 
1594    cso->blend_enables = 0;
1595    cso->color_write_enables = 0;
1596    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
1597 
1598    cso->cso = *state;
1599    cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1600 
1601 #if GFX_VER == 8
1602    bool indep_alpha_blend = false;
1603 #endif
1604    for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
1605       const struct pipe_rt_blend_state *rt =
1606          &state->rt[state->independent_blend_enable ? i : 0];
1607       if (rt->blend_enable)
1608          cso->blend_enables |= 1u << i;
1609       if (rt->colormask)
1610          cso->color_write_enables |= 1u << i;
1611 #if GFX_VER == 8
1612       enum pipe_blendfactor src_rgb =
1613          fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1614       enum pipe_blendfactor src_alpha =
1615          fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1616       enum pipe_blendfactor dst_rgb =
1617          fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1618       enum pipe_blendfactor dst_alpha =
1619          fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1620 
1621       if (rt->rgb_func != rt->alpha_func ||
1622           src_rgb != src_alpha || dst_rgb != dst_alpha)
1623          indep_alpha_blend = true;
1624 #endif
1625    }
1626 
1627 #if GFX_VER == 8
1628    crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1629       /* pb.HasWriteableRT is filled in at draw time.
1630        * pb.AlphaTestEnable is filled in at draw time.
1631        *
1632        * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1633        * setting it when dual color blending without an appropriate shader.
1634        */
1635 
1636       pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1637       pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1638 
1639       /* The casts prevent warnings about implicit enum type conversions. */
1640       pb.SourceBlendFactor =
1641          (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1642       pb.SourceAlphaBlendFactor =
1643          (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1644       pb.DestinationBlendFactor =
1645          (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1646       pb.DestinationAlphaBlendFactor =
1647          (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1648    }
1649 #endif
1650    return cso;
1651 }
1652 
1653 /**
1654  * The pipe->bind_blend_state() driver hook.
1655  *
1656  * Bind a blending CSO and flag related dirty bits.
1657  */
1658 static void
crocus_bind_blend_state(struct pipe_context * ctx,void * state)1659 crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1660 {
1661    struct crocus_context *ice = (struct crocus_context *) ctx;
1662    struct crocus_blend_state *cso = state;
1663 
1664    ice->state.cso_blend = cso;
1665    ice->state.blend_enables = cso ? cso->blend_enables : 0;
1666 
1667    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1668    ice->state.dirty |= CROCUS_DIRTY_WM;
1669 #if GFX_VER >= 6
1670    ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1671 #endif
1672 #if GFX_VER >= 7
1673    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1674 #endif
1675 #if GFX_VER == 8
1676    ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1677    ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1678 #endif
1679    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1680    ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1681    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1682 }
1683 
1684 /**
1685  * Return true if the FS writes to any color outputs which are not disabled
1686  * via color masking.
1687  */
1688 static bool
has_writeable_rt(const struct crocus_blend_state * cso_blend,const struct shader_info * fs_info)1689 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1690                  const struct shader_info *fs_info)
1691 {
1692    if (!fs_info)
1693       return false;
1694 
1695    unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1696 
1697    if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1698       rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
1699 
1700    return cso_blend->color_write_enables & rt_outputs;
1701 }
1702 
1703 /**
1704  * Gallium CSO for depth, stencil, and alpha testing state.
1705  */
struct crocus_depth_stencil_alpha_state {
   /** Complete copy of the Gallium CSO. */
   struct pipe_depth_stencil_alpha_state cso;

   /** True if the depth writemask is set. */
   bool depth_writes_enabled;
   /** True if either stencil side has a non-zero writemask. */
   bool stencil_writes_enabled;
};
1712 
1713 /**
1714  * The pipe->create_depth_stencil_alpha_state() driver hook.
1715  *
1716  * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1717  * testing state since we need pieces of it in a variety of places.
1718  */
1719 static void *
crocus_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1720 crocus_create_zsa_state(struct pipe_context *ctx,
1721                         const struct pipe_depth_stencil_alpha_state *state)
1722 {
1723    struct crocus_depth_stencil_alpha_state *cso =
1724       malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1725 
1726    bool two_sided_stencil = state->stencil[1].enabled;
1727    cso->cso = *state;
1728 
1729    cso->depth_writes_enabled = state->depth_writemask;
1730    cso->stencil_writes_enabled =
1731       state->stencil[0].writemask != 0 ||
1732       (two_sided_stencil && state->stencil[1].writemask != 0);
1733 
1734    /* The state tracker needs to optimize away EQUAL writes for us. */
1735    assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1736 
1737    return cso;
1738 }
1739 
1740 /**
1741  * The pipe->bind_depth_stencil_alpha_state() driver hook.
1742  *
1743  * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1744  */
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
   struct crocus_depth_stencil_alpha_state *new_cso = state;

   if (new_cso) {
      /* NOTE: cso_changed() is a macro that presumably compares the named
       * field between old_cso and new_cso; only atoms that actually consume
       * the changed field get flagged dirty.
       */
      if (cso_changed(cso.alpha_ref_value))
         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;

      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;

      if (cso_changed(cso.alpha_func))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif

      /* Depth-write changes can require aux resolves/flushes. */
      if (cso_changed(depth_writes_enabled))
         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;

#if GFX_VER <= 5
      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
   }

   ice->state.cso_zsa = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}
1791 
#if GFX_VER == 8
/**
 * Evaluate the Gfx8 PMA-fix state equation for the current bound state;
 * the result is programmed into CACHE_MODE_1 by genX(crocus_update_pma_fix).
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1915 void
genX(crocus_update_pma_fix)1916 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1917                             struct crocus_batch *batch,
1918                             bool enable)
1919 {
1920 #if GFX_VER == 8
1921    struct crocus_genx_state *genx = ice->state.genx;
1922 
1923    if (genx->pma_fix_enabled == enable)
1924       return;
1925 
1926    genx->pma_fix_enabled = enable;
1927 
1928    /* According to the Broadwell PIPE_CONTROL documentation, software should
1929     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1930     * prior to the LRI.  If stencil buffer writes are enabled, then a Render        * Cache Flush is also necessary.
1931     *
1932     * The Gfx9 docs say to use a depth stall rather than a command streamer
1933     * stall.  However, the hardware seems to violently disagree.  A full
1934     * command streamer stall seems to be needed in both cases.
1935     */
1936    crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1937                                   PIPE_CONTROL_CS_STALL |
1938                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1939                                   PIPE_CONTROL_RENDER_TARGET_FLUSH);
1940 
1941    crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1942       reg.NPPMAFixEnable = enable;
1943       reg.NPEarlyZFailsDisable = enable;
1944       reg.NPPMAFixEnableMask = true;
1945       reg.NPEarlyZFailsDisableMask = true;
1946    }
1947 
1948    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1949     * Flush bits is often necessary.  We do it regardless because it's easier.
1950     * The render cache flush is also necessary if stencil writes are enabled.
1951     *
1952     * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1953     * flushes seem to work just as well.
1954     */
1955    crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1956                                   PIPE_CONTROL_DEPTH_STALL |
1957                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1958                                   PIPE_CONTROL_RENDER_TARGET_FLUSH);
1959 #endif
1960 }
1961 
1962 static float
get_line_width(const struct pipe_rasterizer_state * state)1963 get_line_width(const struct pipe_rasterizer_state *state)
1964 {
1965    float line_width = state->line_width;
1966 
1967    /* From the OpenGL 4.4 spec:
1968     *
1969     * "The actual width of non-antialiased lines is determined by rounding
1970     *  the supplied width to the nearest integer, then clamping it to the
1971     *  implementation-dependent maximum non-antialiased line width."
1972     */
1973    if (!state->multisample && !state->line_smooth)
1974       line_width = roundf(state->line_width);
1975 
1976    if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1977       /* For 1 pixel line thickness or less, the general anti-aliasing
1978        * algorithm gives up, and a garbage line is generated.  Setting a
1979        * Line Width of 0.0 specifies the rasterization of the "thinnest"
1980        * (one-pixel-wide), non-antialiased lines.
1981        *
1982        * Lines rendered with zero Line Width are rasterized using the
1983        * "Grid Intersection Quantization" rules as specified by the
1984        * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1985        */
1986       line_width = 0.0f;
1987    }
1988 
1989    return line_width;
1990 }
1991 
1992 /**
1993  * The pipe->create_rasterizer_state() driver hook.
1994  */
1995 static void *
crocus_create_rasterizer_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * state)1996 crocus_create_rasterizer_state(struct pipe_context *ctx,
1997                                const struct pipe_rasterizer_state *state)
1998 {
1999    struct crocus_rasterizer_state *cso =
2000       malloc(sizeof(struct crocus_rasterizer_state));
2001 
2002    cso->fill_mode_point_or_line =
2003       state->fill_front == PIPE_POLYGON_MODE_LINE ||
2004       state->fill_front == PIPE_POLYGON_MODE_POINT ||
2005       state->fill_back == PIPE_POLYGON_MODE_LINE ||
2006       state->fill_back == PIPE_POLYGON_MODE_POINT;
2007 
2008    if (state->clip_plane_enable != 0)
2009       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2010    else
2011       cso->num_clip_plane_consts = 0;
2012 
2013    cso->cso = *state;
2014 
2015 #if GFX_VER >= 6
2016    float line_width = get_line_width(state);
2017 
2018    crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2019       sf.StatisticsEnable = true;
2020       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2021       sf.LineEndCapAntialiasingRegionWidth =
2022          state->line_smooth ? _10pixels : _05pixels;
2023       sf.LastPixelEnable = state->line_last_pixel;
2024 #if GFX_VER <= 7
2025       sf.AntialiasingEnable = state->line_smooth;
2026 #endif
2027 #if GFX_VER == 8
2028       struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2029       if (screen->devinfo.is_cherryview)
2030          sf.CHVLineWidth = line_width;
2031       else
2032          sf.LineWidth = line_width;
2033 #else
2034       sf.LineWidth = line_width;
2035 #endif
2036       sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2037       sf.PointWidth = state->point_size;
2038 
2039       if (state->flatshade_first) {
2040          sf.TriangleFanProvokingVertexSelect = 1;
2041       } else {
2042          sf.TriangleStripListProvokingVertexSelect = 2;
2043          sf.TriangleFanProvokingVertexSelect = 2;
2044          sf.LineStripListProvokingVertexSelect = 1;
2045       }
2046 
2047 #if GFX_VER == 6
2048       sf.AttributeSwizzleEnable = true;
2049       if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2050          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2051       else
2052          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2053 #endif
2054 
2055 #if GFX_VER <= 7
2056       sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2057 
2058 #if GFX_VER >= 6
2059       sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2060       sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2061       sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2062       sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2063       sf.GlobalDepthOffsetScale = state->offset_scale;
2064       sf.GlobalDepthOffsetClamp = state->offset_clamp;
2065 
2066       sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2067       sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2068 #endif
2069 
2070       sf.CullMode = translate_cull_mode(state->cull_face);
2071       sf.ScissorRectangleEnable = true;
2072 
2073 #if GFX_VERx10 == 75
2074       sf.LineStippleEnable = state->line_stipple_enable;
2075 #endif
2076 #endif
2077    }
2078 #endif
2079 
2080 #if GFX_VER == 8
2081    crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2082       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2083       rr.CullMode = translate_cull_mode(state->cull_face);
2084       rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2085       rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2086       rr.DXMultisampleRasterizationEnable = state->multisample;
2087       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2088       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2089       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2090       rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2091       rr.GlobalDepthOffsetScale = state->offset_scale;
2092       rr.GlobalDepthOffsetClamp = state->offset_clamp;
2093       rr.SmoothPointEnable = state->point_smooth;
2094       rr.AntialiasingEnable = state->line_smooth;
2095       rr.ScissorRectangleEnable = state->scissor;
2096       rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2097    }
2098 #endif
2099 
2100 #if GFX_VER >= 6
2101    crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2102       /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2103        * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2104        */
2105 #if GFX_VER >= 7
2106       cl.EarlyCullEnable = true;
2107 #endif
2108 
2109 #if GFX_VER == 7
2110       cl.FrontWinding = state->front_ccw ? 1 : 0;
2111       cl.CullMode = translate_cull_mode(state->cull_face);
2112 #endif
2113       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2114 #if GFX_VER < 8
2115       cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2116 #endif
2117       cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2118       cl.GuardbandClipTestEnable = true;
2119       cl.ClipEnable = true;
2120       cl.MinimumPointWidth = 0.125;
2121       cl.MaximumPointWidth = 255.875;
2122 
2123 #if GFX_VER == 8
2124       cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2125 #endif
2126 
2127       if (state->flatshade_first) {
2128          cl.TriangleFanProvokingVertexSelect = 1;
2129       } else {
2130          cl.TriangleStripListProvokingVertexSelect = 2;
2131          cl.TriangleFanProvokingVertexSelect = 2;
2132          cl.LineStripListProvokingVertexSelect = 1;
2133       }
2134    }
2135 #endif
2136 
2137    /* Remap from 0..255 back to 1..256 */
2138    const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2139 
2140    crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2141       if (state->line_stipple_enable) {
2142          line.LineStipplePattern = state->line_stipple_pattern;
2143          line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2144          line.LineStippleRepeatCount = line_stipple_factor;
2145       }
2146    }
2147 
2148    return cso;
2149 }
2150 
2151 /**
2152  * The pipe->bind_rasterizer_state() driver hook.
2153  *
2154  * Bind a rasterizer CSO and flag related dirty bits.
2155  */
static void
crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
   struct crocus_rasterizer_state *new_cso = state;

   if (new_cso) {
      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
      if (cso_changed_memcmp(line_stipple))
         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
#if GFX_VER >= 6
      /* NOTE: cso_changed() presumably compares the named field between
       * old_cso and new_cso; each changed field flags only the atoms
       * that consume it.
       */
      if (cso_changed(cso.half_pixel_center))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
      if (cso_changed(cso.multisample))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#else
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
#endif

      if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
         ice->state.dirty |= CROCUS_DIRTY_WM;

#if GFX_VER >= 6
      if (cso_changed(cso.rasterizer_discard))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;

      if (cso_changed(cso.flatshade_first))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#endif

      if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
          cso_changed(cso.clip_halfz))
         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;

#if GFX_VER >= 7
      if (cso_changed(cso.sprite_coord_enable) ||
          cso_changed(cso.sprite_coord_mode) ||
          cso_changed(cso.light_twoside))
         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
#endif
#if GFX_VER <= 5
      if (cso_changed(cso.clip_plane_enable))
         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
   }

   ice->state.cso_rast = new_cso;
   /* These atoms are always packed from rasterizer state, so flag them
    * unconditionally.
    */
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
   ice->state.dirty |= CROCUS_DIRTY_CLIP;
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER <= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
}
2218 
2219 /**
2220  * Return true if the given wrap mode requires the border color to exist.
2221  *
2222  * (We can skip uploading it if the sampler isn't going to use it.)
2223  */
2224 static bool
wrap_mode_needs_border_color(unsigned wrap_mode)2225 wrap_mode_needs_border_color(unsigned wrap_mode)
2226 {
2227 #if GFX_VER == 8
2228    return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2229 #else
2230    return wrap_mode == TCM_CLAMP_BORDER;
2231 #endif
2232 }
2233 
2234 /**
2235  * Gallium CSO for sampler state.
2236  */
struct crocus_sampler_state {
   /** Copy of the Gallium CSO. */
   struct pipe_sampler_state pstate;
   /** Copy of the requested border color. */
   union pipe_color_union border_color;
   /** True if any wrap mode reads the border color
    *  (see wrap_mode_needs_border_color). */
   bool needs_border_color;
   /* Hardware wrap modes, from translate_wrap(). */
   unsigned wrap_s;
   unsigned wrap_t;
   unsigned wrap_r;
   /* Possibly overridden by the mip-filter workaround in
    * crocus_create_sampler_state(). */
   unsigned mag_img_filter;
   float min_lod;
};
2247 
2248 /**
2249  * The pipe->create_sampler_state() driver hook.
2250  *
2251  * We fill out SAMPLER_STATE (except for the border color pointer), and
2252  * store that on the CPU.  It doesn't make sense to upload it to a GPU
2253  * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
 * all bound sampler states to be in contiguous memory.
2255  */
2256 static void *
crocus_create_sampler_state(struct pipe_context * ctx,const struct pipe_sampler_state * state)2257 crocus_create_sampler_state(struct pipe_context *ctx,
2258                             const struct pipe_sampler_state *state)
2259 {
2260    struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
2261 
2262    if (!cso)
2263       return NULL;
2264 
2265    STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2266    STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2267 
2268    bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2269       state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2270    cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2271    cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2272    cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2273 
2274    cso->pstate = *state;
2275 
2276    memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2277 
2278    cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2279                              wrap_mode_needs_border_color(cso->wrap_t) ||
2280                              wrap_mode_needs_border_color(cso->wrap_r);
2281 
2282    cso->min_lod = state->min_lod;
2283    cso->mag_img_filter = state->mag_img_filter;
2284 
2285    // XXX: explain this code ported from ilo...I don't get it at all...
2286    if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2287        state->min_lod > 0.0f) {
2288       cso->min_lod = 0.0f;
2289       cso->mag_img_filter = state->min_img_filter;
2290    }
2291 
2292    return cso;
2293 }
2294 
2295 /**
2296  * The pipe->bind_sampler_states() driver hook.
2297  */
2298 static void
crocus_bind_sampler_states(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,void ** states)2299 crocus_bind_sampler_states(struct pipe_context *ctx,
2300                            enum pipe_shader_type p_stage,
2301                            unsigned start, unsigned count,
2302                            void **states)
2303 {
2304    struct crocus_context *ice = (struct crocus_context *) ctx;
2305    gl_shader_stage stage = stage_from_pipe(p_stage);
2306    struct crocus_shader_state *shs = &ice->state.shaders[stage];
2307 
2308    assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2309 
2310    bool dirty = false;
2311 
2312    for (int i = 0; i < count; i++) {
2313       if (shs->samplers[start + i] != states[i]) {
2314          shs->samplers[start + i] = states[i];
2315          dirty = true;
2316       }
2317    }
2318 
2319    if (dirty) {
2320 #if GFX_VER <= 5
2321       if (p_stage == PIPE_SHADER_FRAGMENT)
2322          ice->state.dirty |= CROCUS_DIRTY_WM;
2323       else if (p_stage == PIPE_SHADER_VERTEX)
2324          ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2325 #endif
2326       ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2327       ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2328    }
2329 }
2330 
/**
 * Wrap-mode overrides applied at SAMPLER_STATE upload time to work
 * around hardware quirks (chosen in crocus_upload_sampler_states()).
 */
enum samp_workaround {
   SAMP_NORMAL,      /* use the CSO's wrap modes as-is */
   SAMP_CUBE_CLAMP,  /* force all three coordinates to TCM_CLAMP */
   SAMP_CUBE_CUBE,   /* force all three coordinates to TCM_CUBE */
   SAMP_T_WRAP,      /* force wrap_t to TCM_WRAP (1D texture quirk) */
};
2337 
/**
 * Pack a single SAMPLER_STATE into the CPU buffer at \p map.
 *
 * Applies the requested wrap-mode workaround (see enum samp_workaround)
 * on top of the CSO's pre-translated wrap modes, then packs the full
 * GENX(SAMPLER_STATE) structure.
 *
 * \param border_color_offset  offset of the SAMPLER_BORDER_COLOR_STATE
 *                             within the batch's state buffer
 * \param first_level          base mip level of the bound view (only
 *                             consumed on Gfx6, for BaseMipLevel)
 * \param map                  CPU destination for the packed dwords
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* Override wrap modes as requested by the caller's workaround. */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      /* 1D textures: ignore the app's wrap_t (see caller's comment). */
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         /* The hardware ratio encoding advances in steps of two. */
         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* The LOD clamp range the hardware accepts differs per generation. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

#if GFX_VER < 6
      /* Gfx4-5 take a relocation to the border color state. */
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2439 
/**
 * Stream out a SAMPLER_BORDER_COLOR_STATE for the given sampler/view
 * pair, returning its offset within the batch's state buffer through
 * \p bc_offset.
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Gfx8 needs 64B alignment; Haswell integer border colors need 512B. */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

/* Helper macros for filling the per-type border color channels below. */
#define ASSIGN(dst, src)                        \
   do {                                         \
      dst = src;                                \
   } while (0)

#define ASSIGNu16(dst, src)                     \
   do {                                         \
      dst = (uint16_t)src;                      \
   } while (0)

#define ASSIGNu8(dst, src)                      \
   do {                                         \
      dst = (uint8_t)src;                       \
   } while (0)

#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
   macro(state.BorderColor ## _color_type ## Red, src[0]);      \
   macro(state.BorderColor ## _color_type ## Green, src[1]);    \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);     \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       *  corresponding border color should be programmed as zero and if
       *  alpha channel is missing, corresponding Alpha border color should
       *  be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      /* Pick the border color field width from the first channel's size. */
      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gfx5-6 store the border color in several encodings at once; fill
    * them all from the float values.
    */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src)            \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   /* Derive the SNORM8 channels from the already-converted SNORM16 ones. */
   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2587 
2588 /**
 * Upload the sampler states into a contiguous area of GPU memory, for
 * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2591  *
2592  * Also fill out the border color state pointers.
2593  */
2594 static void
crocus_upload_sampler_states(struct crocus_context * ice,struct crocus_batch * batch,gl_shader_stage stage)2595 crocus_upload_sampler_states(struct crocus_context *ice,
2596                              struct crocus_batch *batch, gl_shader_stage stage)
2597 {
2598    struct crocus_shader_state *shs = &ice->state.shaders[stage];
2599    const struct shader_info *info = crocus_get_shader_info(ice, stage);
2600 
2601    /* We assume the state tracker will call pipe->bind_sampler_states()
2602     * if the program's number of textures changes.
2603     */
2604    unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2605 
2606    if (!count)
2607       return;
2608 
2609    /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2610     * in the dynamic state memory zone, so we can point to it via the
2611     * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2612     */
2613    unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2614    uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2615 
2616    if (unlikely(!map))
2617       return;
2618 
2619    for (int i = 0; i < count; i++) {
2620       struct crocus_sampler_state *state = shs->samplers[i];
2621       struct crocus_sampler_view *tex = shs->textures[i];
2622 
2623       if (!state || !tex) {
2624          memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2625       } else {
2626          unsigned border_color_offset = 0;
2627          if (state->needs_border_color) {
2628             crocus_upload_border_color(batch, state, tex, &border_color_offset);
2629          }
2630 
2631          enum samp_workaround wa = SAMP_NORMAL;
2632          /* There's a bug in 1D texture sampling - it actually pays
2633           * attention to the wrap_t value, though it should not.
2634           * Override the wrap_t value here to GL_REPEAT to keep
2635           * any nonexistent border pixels from floating in.
2636           */
2637          if (tex->base.target == PIPE_TEXTURE_1D)
2638             wa = SAMP_T_WRAP;
2639          else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2640                   tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2641             /* Cube maps must use the same wrap mode for all three coordinate
2642              * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
2643              *
2644              * Ivybridge and Baytrail seem to have problems with CUBE mode and
2645              * integer formats.  Fall back to CLAMP for now.
2646              */
2647             if (state->pstate.seamless_cube_map &&
2648                 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2649                wa = SAMP_CUBE_CUBE;
2650             else
2651                wa = SAMP_CUBE_CLAMP;
2652          }
2653 
2654          uint32_t first_level = 0;
2655          if (tex->base.target != PIPE_BUFFER)
2656             first_level = tex->base.u.tex.first_level;
2657 
2658          crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2659       }
2660 
2661       map += GENX(SAMPLER_STATE_length);
2662    }
2663 }
2664 
2665 /**
2666  * The pipe->create_sampler_view() driver hook.
2667  */
2668 static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context * ctx,struct pipe_resource * tex,const struct pipe_sampler_view * tmpl)2669 crocus_create_sampler_view(struct pipe_context *ctx,
2670                            struct pipe_resource *tex,
2671                            const struct pipe_sampler_view *tmpl)
2672 {
2673    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2674    const struct intel_device_info *devinfo = &screen->devinfo;
2675    struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
2676 
2677    if (!isv)
2678       return NULL;
2679 
2680    /* initialize base object */
2681    isv->base = *tmpl;
2682    isv->base.context = ctx;
2683    isv->base.texture = NULL;
2684    pipe_reference_init(&isv->base.reference, 1);
2685    pipe_resource_reference(&isv->base.texture, tex);
2686 
2687    if (util_format_is_depth_or_stencil(tmpl->format)) {
2688       struct crocus_resource *zres, *sres;
2689       const struct util_format_description *desc =
2690          util_format_description(tmpl->format);
2691 
2692       crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
2693 
2694       tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
2695 
2696       if (tex->format == PIPE_FORMAT_S8_UINT)
2697          if (devinfo->ver == 7 && sres->shadow)
2698             tex = &sres->shadow->base.b;
2699    }
2700 
2701    isv->res = (struct crocus_resource *) tex;
2702 
2703    isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2704 
2705    if (isv->base.target == PIPE_TEXTURE_CUBE ||
2706        isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2707       usage |= ISL_SURF_USAGE_CUBE_BIT;
2708 
2709    const struct crocus_format_info fmt =
2710       crocus_format_for_usage(devinfo, tmpl->format, usage);
2711 
2712    enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
2713    crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
2714 
2715    /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
2716    if (devinfo->ver < 6 &&
2717        (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
2718         tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
2719       isv->swizzle[0] = tmpl->swizzle_g;
2720       isv->swizzle[1] = tmpl->swizzle_g;
2721       isv->swizzle[2] = tmpl->swizzle_g;
2722       isv->swizzle[3] = tmpl->swizzle_g;
2723    }
2724 
2725    isv->clear_color = isv->res->aux.clear_color;
2726 
2727    isv->view = (struct isl_view) {
2728       .format = fmt.fmt,
2729 #if GFX_VERx10 >= 75
2730       .swizzle = (struct isl_swizzle) {
2731          .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
2732          .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
2733          .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
2734          .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
2735       },
2736 #else
2737       /* swizzling handled in shader code */
2738       .swizzle = ISL_SWIZZLE_IDENTITY,
2739 #endif
2740       .usage = usage,
2741    };
2742 
2743    /* Fill out SURFACE_STATE for this view. */
2744    if (tmpl->target != PIPE_BUFFER) {
2745       isv->view.base_level = tmpl->u.tex.first_level;
2746       isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2747       // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
2748       isv->view.base_array_layer = tmpl->u.tex.first_layer;
2749       isv->view.array_len =
2750          tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2751    }
2752 #if GFX_VER >= 6
2753    /* just create a second view struct for texture gather just in case */
2754    isv->gather_view = isv->view;
2755 
2756 #if GFX_VER == 7
2757    if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
2758        fmt.fmt == ISL_FORMAT_R32G32_SINT ||
2759        fmt.fmt == ISL_FORMAT_R32G32_UINT) {
2760       isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
2761 #if GFX_VERx10 >= 75
2762       isv->gather_view.swizzle = (struct isl_swizzle) {
2763          .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
2764          .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
2765          .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
2766          .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
2767       };
2768 #endif
2769    }
2770 #endif
2771 #if GFX_VER == 6
2772    /* Sandybridge's gather4 message is broken for integer formats.
2773     * To work around this, we pretend the surface is UNORM for
2774     * 8 or 16-bit formats, and emit shader instructions to recover
2775     * the real INT/UINT value.  For 32-bit formats, we pretend
2776     * the surface is FLOAT, and simply reinterpret the resulting
2777     * bits.
2778     */
2779    switch (fmt.fmt) {
2780    case ISL_FORMAT_R8_SINT:
2781    case ISL_FORMAT_R8_UINT:
2782       isv->gather_view.format = ISL_FORMAT_R8_UNORM;
2783       break;
2784 
2785    case ISL_FORMAT_R16_SINT:
2786    case ISL_FORMAT_R16_UINT:
2787       isv->gather_view.format = ISL_FORMAT_R16_UNORM;
2788       break;
2789 
2790    case ISL_FORMAT_R32_SINT:
2791    case ISL_FORMAT_R32_UINT:
2792       isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
2793       break;
2794 
2795    default:
2796       break;
2797    }
2798 #endif
2799 #endif
2800    /* Fill out SURFACE_STATE for this view. */
2801    if (tmpl->target != PIPE_BUFFER) {
2802       if (crocus_resource_unfinished_aux_import(isv->res))
2803          crocus_resource_finish_aux_import(&screen->base, isv->res);
2804 
2805    }
2806 
2807    return &isv->base;
2808 }
2809 
2810 static void
crocus_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)2811 crocus_sampler_view_destroy(struct pipe_context *ctx,
2812                             struct pipe_sampler_view *state)
2813 {
2814    struct crocus_sampler_view *isv = (void *) state;
2815    pipe_resource_reference(&state->texture, NULL);
2816    free(isv);
2817 }
2818 
2819 /**
2820  * The pipe->create_surface() driver hook.
2821  *
2822  * In Gallium nomenclature, "surfaces" are a view of a resource that
2823  * can be bound as a render target or depth/stencil buffer.
2824  */
2825 static struct pipe_surface *
crocus_create_surface(struct pipe_context * ctx,struct pipe_resource * tex,const struct pipe_surface * tmpl)2826 crocus_create_surface(struct pipe_context *ctx,
2827                       struct pipe_resource *tex,
2828                       const struct pipe_surface *tmpl)
2829 {
2830    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2831    const struct intel_device_info *devinfo = &screen->devinfo;
2832 
2833    isl_surf_usage_flags_t usage = 0;
2834    if (tmpl->writable)
2835       usage = ISL_SURF_USAGE_STORAGE_BIT;
2836    else if (util_format_is_depth_or_stencil(tmpl->format))
2837       usage = ISL_SURF_USAGE_DEPTH_BIT;
2838    else
2839       usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2840 
2841    const struct crocus_format_info fmt =
2842       crocus_format_for_usage(devinfo, tmpl->format, usage);
2843 
2844    if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2845        !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2846       /* Framebuffer validation will reject this invalid case, but it
2847        * hasn't had the opportunity yet.  In the meantime, we need to
2848        * avoid hitting ISL asserts about unsupported formats below.
2849        */
2850       return NULL;
2851    }
2852 
2853    struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
2854    struct pipe_surface *psurf = &surf->base;
2855    struct crocus_resource *res = (struct crocus_resource *) tex;
2856 
2857    if (!surf)
2858       return NULL;
2859 
2860    pipe_reference_init(&psurf->reference, 1);
2861    pipe_resource_reference(&psurf->texture, tex);
2862    psurf->context = ctx;
2863    psurf->format = tmpl->format;
2864    psurf->width = tex->width0;
2865    psurf->height = tex->height0;
2866    psurf->texture = tex;
2867    psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2868    psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2869    psurf->u.tex.level = tmpl->u.tex.level;
2870 
2871    uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2872 
2873    struct isl_view *view = &surf->view;
2874    *view = (struct isl_view) {
2875       .format = fmt.fmt,
2876       .base_level = tmpl->u.tex.level,
2877       .levels = 1,
2878       .base_array_layer = tmpl->u.tex.first_layer,
2879       .array_len = array_len,
2880       .swizzle = ISL_SWIZZLE_IDENTITY,
2881       .usage = usage,
2882    };
2883 
2884 #if GFX_VER >= 6
2885    struct isl_view *read_view = &surf->read_view;
2886    *read_view = (struct isl_view) {
2887       .format = fmt.fmt,
2888       .base_level = tmpl->u.tex.level,
2889       .levels = 1,
2890       .base_array_layer = tmpl->u.tex.first_layer,
2891       .array_len = array_len,
2892       .swizzle = ISL_SWIZZLE_IDENTITY,
2893       .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2894    };
2895 #endif
2896 
2897    surf->clear_color = res->aux.clear_color;
2898 
2899    /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2900    if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2901                           ISL_SURF_USAGE_STENCIL_BIT))
2902       return psurf;
2903 
2904    if (!isl_format_is_compressed(res->surf.format)) {
2905       if (crocus_resource_unfinished_aux_import(res))
2906          crocus_resource_finish_aux_import(&screen->base, res);
2907 
2908       memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2909       uint64_t temp_offset;
2910       uint32_t temp_x, temp_y;
2911 
2912       isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
2913                                           res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
2914                                           res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
2915                                           &temp_offset, &temp_x, &temp_y);
2916       if (!devinfo->has_surface_tile_offset &&
2917           (temp_x || temp_y)) {
2918          /* Original gfx4 hardware couldn't draw to a non-tile-aligned
2919           * destination.
2920           */
2921          /* move to temp */
2922          struct pipe_resource wa_templ = (struct pipe_resource) {
2923             .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
2924             .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
2925             .depth0 = 1,
2926             .array_size = 1,
2927             .format = res->base.b.format,
2928             .target = PIPE_TEXTURE_2D,
2929             .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
2930          };
2931          surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
2932          view->base_level = 0;
2933          view->base_array_layer = 0;
2934          view->array_len = 1;
2935          struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
2936          memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
2937       }
2938       return psurf;
2939    }
2940 
2941    /* The resource has a compressed format, which is not renderable, but we
2942     * have a renderable view format.  We must be attempting to upload blocks
2943     * of compressed data via an uncompressed view.
2944     *
2945     * In this case, we can assume there are no auxiliary buffers, a single
2946     * miplevel, and that the resource is single-sampled.  Gallium may try
2947     * and create an uncompressed view with multiple layers, however.
2948     */
2949    assert(!isl_format_is_compressed(fmt.fmt));
2950    assert(res->surf.samples == 1);
2951    assert(view->levels == 1);
2952 
2953    /* TODO: compressed pbo uploads aren't working here */
2954    return NULL;
2955 
2956    uint64_t offset_B = 0;
2957    uint32_t tile_x_sa = 0, tile_y_sa = 0;
2958 
2959    if (view->base_level > 0) {
2960       /* We can't rely on the hardware's miplevel selection with such
2961        * a substantial lie about the format, so we select a single image
2962        * using the Tile X/Y Offset fields.  In this case, we can't handle
2963        * multiple array slices.
2964        *
2965        * On Broadwell, HALIGN and VALIGN are specified in pixels and are
2966        * hard-coded to align to exactly the block size of the compressed
2967        * texture.  This means that, when reinterpreted as a non-compressed
2968        * texture, the tile offsets may be anything and we can't rely on
2969        * X/Y Offset.
2970        *
2971        * Return NULL to force the state tracker to take fallback paths.
2972        */
2973       // TODO: check if the gen7 check is right, originally gen8
2974       if (view->array_len > 1 || GFX_VER == 7)
2975          return NULL;
2976 
2977       const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
2978       isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2979                               view->base_level,
2980                               is_3d ? 0 : view->base_array_layer,
2981                               is_3d ? view->base_array_layer : 0,
2982                               &surf->surf,
2983                               &offset_B, &tile_x_sa, &tile_y_sa);
2984 
2985       /* We use address and tile offsets to access a single level/layer
2986        * as a subimage, so reset level/layer so it doesn't offset again.
2987        */
2988       view->base_array_layer = 0;
2989       view->base_level = 0;
2990    } else {
2991       /* Level 0 doesn't require tile offsets, and the hardware can find
2992        * array slices using QPitch even with the format override, so we
2993        * can allow layers in this case.  Copy the original ISL surface.
2994        */
2995       memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2996    }
2997 
2998    /* Scale down the image dimensions by the block size. */
2999    const struct isl_format_layout *fmtl =
3000       isl_format_get_layout(res->surf.format);
3001    surf->surf.format = fmt.fmt;
3002    surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
3003    surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
3004    tile_x_sa /= fmtl->bw;
3005    tile_y_sa /= fmtl->bh;
3006 
3007    psurf->width = surf->surf.logical_level0_px.width;
3008    psurf->height = surf->surf.logical_level0_px.height;
3009 
3010    return psurf;
3011 }
3012 
3013 #if GFX_VER >= 7
3014 static void
fill_default_image_param(struct brw_image_param * param)3015 fill_default_image_param(struct brw_image_param *param)
3016 {
3017    memset(param, 0, sizeof(*param));
3018    /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3019     * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3020     * detailed explanation of these parameters.
3021     */
3022    param->swizzling[0] = 0xff;
3023    param->swizzling[1] = 0xff;
3024 }
3025 
3026 static void
fill_buffer_image_param(struct brw_image_param * param,enum pipe_format pfmt,unsigned size)3027 fill_buffer_image_param(struct brw_image_param *param,
3028                         enum pipe_format pfmt,
3029                         unsigned size)
3030 {
3031    const unsigned cpp = util_format_get_blocksize(pfmt);
3032 
3033    fill_default_image_param(param);
3034    param->size[0] = size / cpp;
3035    param->stride[0] = cpp;
3036 }
3037 
3038 #endif
3039 
3040 /**
3041  * The pipe->set_shader_images() driver hook.
3042  */
3043 static void
crocus_set_shader_images(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start_slot,unsigned count,unsigned unbind_num_trailing_slots,const struct pipe_image_view * p_images)3044 crocus_set_shader_images(struct pipe_context *ctx,
3045                          enum pipe_shader_type p_stage,
3046                          unsigned start_slot, unsigned count,
3047                          unsigned unbind_num_trailing_slots,
3048                          const struct pipe_image_view *p_images)
3049 {
3050 #if GFX_VER >= 7
3051    struct crocus_context *ice = (struct crocus_context *) ctx;
3052    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3053    const struct intel_device_info *devinfo = &screen->devinfo;
3054    gl_shader_stage stage = stage_from_pipe(p_stage);
3055    struct crocus_shader_state *shs = &ice->state.shaders[stage];
3056    struct crocus_genx_state *genx = ice->state.genx;
3057    struct brw_image_param *image_params = genx->shaders[stage].image_param;
3058 
3059    shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);
3060 
3061    for (unsigned i = 0; i < count; i++) {
3062       struct crocus_image_view *iv = &shs->image[start_slot + i];
3063 
3064       if (p_images && p_images[i].resource) {
3065          const struct pipe_image_view *img = &p_images[i];
3066          struct crocus_resource *res = (void *) img->resource;
3067 
3068          util_copy_image_view(&iv->base, img);
3069 
3070          shs->bound_image_views |= 1 << (start_slot + i);
3071 
3072          res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3073          res->bind_stages |= 1 << stage;
3074 
3075          isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3076          struct crocus_format_info fmt =
3077             crocus_format_for_usage(devinfo, img->format, usage);
3078 
3079          struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
3080          if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
3081             /* On Gen8, try to use typed surfaces reads (which support a
3082              * limited number of formats), and if not possible, fall back
3083              * to untyped reads.
3084              */
3085             if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
3086                fmt.fmt = ISL_FORMAT_RAW;
3087             else
3088                fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
3089          }
3090 
3091          if (res->base.b.target != PIPE_BUFFER) {
3092             struct isl_view view = {
3093                .format = fmt.fmt,
3094                .base_level = img->u.tex.level,
3095                .levels = 1,
3096                .base_array_layer = img->u.tex.first_layer,
3097                .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3098                .swizzle = swiz,
3099                .usage = usage,
3100             };
3101 
3102             iv->view = view;
3103 
3104             isl_surf_fill_image_param(&screen->isl_dev,
3105                                       &image_params[start_slot + i],
3106                                       &res->surf, &view);
3107          } else {
3108             struct isl_view view = {
3109                .format = fmt.fmt,
3110                .swizzle = swiz,
3111                .usage = usage,
3112             };
3113             iv->view = view;
3114 
3115             util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3116                            img->u.buf.offset + img->u.buf.size);
3117             fill_buffer_image_param(&image_params[start_slot + i],
3118                                     img->format, img->u.buf.size);
3119          }
3120       } else {
3121          pipe_resource_reference(&iv->base.resource, NULL);
3122          fill_default_image_param(&image_params[start_slot + i]);
3123       }
3124    }
3125 
3126    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3127    ice->state.dirty |=
3128       stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3129                                    : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3130 
3131    /* Broadwell also needs brw_image_params re-uploaded */
3132    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
3133    shs->sysvals_need_upload = true;
3134 #endif
3135 }
3136 
3137 
3138 /**
3139  * The pipe->set_sampler_views() driver hook.
3140  */
3141 static void
crocus_set_sampler_views(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,unsigned unbind_num_trailing_slots,bool take_ownership,struct pipe_sampler_view ** views)3142 crocus_set_sampler_views(struct pipe_context *ctx,
3143                          enum pipe_shader_type p_stage,
3144                          unsigned start, unsigned count,
3145                          unsigned unbind_num_trailing_slots,
3146                          bool take_ownership,
3147                          struct pipe_sampler_view **views)
3148 {
3149    struct crocus_context *ice = (struct crocus_context *) ctx;
3150    gl_shader_stage stage = stage_from_pipe(p_stage);
3151    struct crocus_shader_state *shs = &ice->state.shaders[stage];
3152 
3153    shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
3154 
3155    for (unsigned i = 0; i < count; i++) {
3156       struct pipe_sampler_view *pview = views ? views[i] : NULL;
3157 
3158       if (take_ownership) {
3159          pipe_sampler_view_reference((struct pipe_sampler_view **)
3160                                      &shs->textures[start + i], NULL);
3161          shs->textures[start + i] = (struct crocus_sampler_view *)pview;
3162       } else {
3163          pipe_sampler_view_reference((struct pipe_sampler_view **)
3164                                      &shs->textures[start + i], pview);
3165       }
3166 
3167       struct crocus_sampler_view *view = (void *) pview;
3168       if (view) {
3169          view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3170          view->res->bind_stages |= 1 << stage;
3171 
3172          shs->bound_sampler_views |= 1 << (start + i);
3173       }
3174    }
3175 #if GFX_VER == 6
3176    /* first level parameters to crocus_upload_sampler_state is gfx6 only */
3177    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3178 #endif
3179    ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
3180    ice->state.dirty |=
3181       stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3182                                    : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3183    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
3184 }
3185 
3186 /**
3187  * The pipe->set_tess_state() driver hook.
3188  */
3189 static void
crocus_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])3190 crocus_set_tess_state(struct pipe_context *ctx,
3191                       const float default_outer_level[4],
3192                       const float default_inner_level[2])
3193 {
3194    struct crocus_context *ice = (struct crocus_context *) ctx;
3195    struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3196 
3197    memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3198    memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3199 
3200    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3201    shs->sysvals_need_upload = true;
3202 }
3203 
3204 static void
crocus_set_patch_vertices(struct pipe_context * ctx,uint8_t patch_vertices)3205 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3206 {
3207    struct crocus_context *ice = (struct crocus_context *) ctx;
3208 
3209    ice->state.patch_vertices = patch_vertices;
3210 }
3211 
3212 static void
crocus_surface_destroy(struct pipe_context * ctx,struct pipe_surface * p_surf)3213 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3214 {
3215    struct crocus_surface *surf = (void *) p_surf;
3216    pipe_resource_reference(&p_surf->texture, NULL);
3217 
3218    pipe_resource_reference(&surf->align_res, NULL);
3219    free(surf);
3220 }
3221 
3222 static void
crocus_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)3223 crocus_set_clip_state(struct pipe_context *ctx,
3224                       const struct pipe_clip_state *state)
3225 {
3226    struct crocus_context *ice = (struct crocus_context *) ctx;
3227    struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3228    struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3229    struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3230 
3231    memcpy(&ice->state.clip_planes, state, sizeof(*state));
3232 
3233 #if GFX_VER <= 5
3234    ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3235 #endif
3236    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3237                              CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3238    shs->sysvals_need_upload = true;
3239    gshs->sysvals_need_upload = true;
3240    tshs->sysvals_need_upload = true;
3241 }
3242 
3243 /**
3244  * The pipe->set_polygon_stipple() driver hook.
3245  */
3246 static void
crocus_set_polygon_stipple(struct pipe_context * ctx,const struct pipe_poly_stipple * state)3247 crocus_set_polygon_stipple(struct pipe_context *ctx,
3248                            const struct pipe_poly_stipple *state)
3249 {
3250    struct crocus_context *ice = (struct crocus_context *) ctx;
3251    memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3252    ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3253 }
3254 
3255 /**
3256  * The pipe->set_sample_mask() driver hook.
3257  */
3258 static void
crocus_set_sample_mask(struct pipe_context * ctx,unsigned sample_mask)3259 crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3260 {
3261    struct crocus_context *ice = (struct crocus_context *) ctx;
3262 
3263    /* We only support 16x MSAA, so we have 16 bits of sample maks.
3264     * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3265     */
3266    ice->state.sample_mask = sample_mask & 0xff;
3267    ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3268 }
3269 
/**
 * Compute the effective scissor rectangle for viewport index @idx.
 *
 * Starts from the rectangle the viewport transform can produce
 * (translate +/- |scale| on each axis, clamped to the framebuffer
 * dimensions), then intersects it with the application's scissor when
 * scissoring is enabled in the current rasterizer state.  Maximums are
 * stored inclusively, hence the "- 1".
 */
static void
crocus_fill_scissor_rect(struct crocus_context *ice,
                         int idx,
                         struct pipe_scissor_state *ss)
{
   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
   /* fabsf() handles negated scales (e.g. flipped Y). */
   struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
   };
   if (cso_state->scissor) {
      /* Intersect with the user-supplied scissor rectangle. */
      struct pipe_scissor_state *s = &ice->state.scissors[idx];
      scissor.minx = MAX2(scissor.minx, s->minx);
      scissor.miny = MAX2(scissor.miny, s->miny);
      scissor.maxx = MIN2(scissor.maxx, s->maxx);
      scissor.maxy = MIN2(scissor.maxy, s->maxy);
   }
   *ss = scissor;
}
3293 
3294 /**
3295  * The pipe->set_scissor_states() driver hook.
3296  *
3297  * This corresponds to our SCISSOR_RECT state structures.  It's an
3298  * exact match, so we just store them, and memcpy them out later.
3299  */
3300 static void
crocus_set_scissor_states(struct pipe_context * ctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * rects)3301 crocus_set_scissor_states(struct pipe_context *ctx,
3302                           unsigned start_slot,
3303                           unsigned num_scissors,
3304                           const struct pipe_scissor_state *rects)
3305 {
3306    struct crocus_context *ice = (struct crocus_context *) ctx;
3307 
3308    for (unsigned i = 0; i < num_scissors; i++) {
3309       if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3310          /* If the scissor was out of bounds and got clamped to 0 width/height
3311           * at the bounds, the subtraction of 1 from maximums could produce a
3312           * negative number and thus not clip anything.  Instead, just provide
3313           * a min > max scissor inside the bounds, which produces the expected
3314           * no rendering.
3315           */
3316          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3317             .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3318          };
3319       } else {
3320          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3321             .minx = rects[i].minx,     .miny = rects[i].miny,
3322             .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3323          };
3324       }
3325    }
3326 
3327 #if GFX_VER < 6
3328    ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3329 #else
3330    ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3331 #endif
3332    ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3333 
3334 }
3335 
3336 /**
3337  * The pipe->set_stencil_ref() driver hook.
3338  *
3339  * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3340  */
3341 static void
crocus_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref ref)3342 crocus_set_stencil_ref(struct pipe_context *ctx,
3343                        const struct pipe_stencil_ref ref)
3344 {
3345    struct crocus_context *ice = (struct crocus_context *) ctx;
3346    ice->state.stencil_ref = ref;
3347    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3348 }
3349 
3350 #if GFX_VER == 8
3351 static float
viewport_extent(const struct pipe_viewport_state * state,int axis,float sign)3352 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3353 {
3354    return copysignf(state->scale[axis], sign) + state->translate[axis];
3355 }
3356 #endif
3357 
3358 /**
3359  * The pipe->set_viewport_states() driver hook.
3360  *
3361  * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3362  * the guardband yet, as we need the framebuffer dimensions, but we can
3363  * at least fill out the rest.
3364  */
3365 static void
crocus_set_viewport_states(struct pipe_context * ctx,unsigned start_slot,unsigned count,const struct pipe_viewport_state * states)3366 crocus_set_viewport_states(struct pipe_context *ctx,
3367                            unsigned start_slot,
3368                            unsigned count,
3369                            const struct pipe_viewport_state *states)
3370 {
3371    struct crocus_context *ice = (struct crocus_context *) ctx;
3372 
3373    memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3374 
3375    ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3376    ice->state.dirty |= CROCUS_DIRTY_RASTER;
3377 #if GFX_VER >= 6
3378    ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3379 #endif
3380 
3381    if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3382                                !ice->state.cso_rast->cso.depth_clip_far))
3383       ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3384 }
3385 
3386 /**
3387  * The pipe->set_framebuffer_state() driver hook.
3388  *
3389  * Sets the current draw FBO, including color render targets, depth,
3390  * and stencil buffers.
3391  */
3392 static void
crocus_set_framebuffer_state(struct pipe_context * ctx,const struct pipe_framebuffer_state * state)3393 crocus_set_framebuffer_state(struct pipe_context *ctx,
3394                              const struct pipe_framebuffer_state *state)
3395 {
3396    struct crocus_context *ice = (struct crocus_context *) ctx;
3397    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3398    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3399    const struct intel_device_info *devinfo = &screen->devinfo;
3400 #if 0
3401    struct isl_device *isl_dev = &screen->isl_dev;
3402    struct crocus_resource *zres;
3403    struct crocus_resource *stencil_res;
3404 #endif
3405 
3406    unsigned samples = util_framebuffer_get_num_samples(state);
3407    unsigned layers = util_framebuffer_get_num_layers(state);
3408 
3409 #if GFX_VER >= 6
3410    if (cso->samples != samples) {
3411       ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
3412       ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3413       ice->state.dirty |= CROCUS_DIRTY_RASTER;
3414 #if GFX_VERx10 == 75
3415       ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
3416 #endif
3417    }
3418 #endif
3419 
3420 #if GFX_VER >= 6 && GFX_VER < 8
3421    ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
3422 #endif
3423 
3424    if ((cso->layers == 0) != (layers == 0)) {
3425       ice->state.dirty |= CROCUS_DIRTY_CLIP;
3426    }
3427 
3428    if (cso->width != state->width || cso->height != state->height) {
3429       ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3430       ice->state.dirty |= CROCUS_DIRTY_RASTER;
3431       ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
3432 #if GFX_VER >= 6
3433       ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3434 #endif
3435    }
3436 
3437    if (cso->zsbuf || state->zsbuf) {
3438       ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
3439 
3440       /* update SF's depth buffer format */
3441       if (GFX_VER == 7 && cso->zsbuf)
3442          ice->state.dirty |= CROCUS_DIRTY_RASTER;
3443    }
3444 
3445    /* wm thread dispatch enable */
3446    ice->state.dirty |= CROCUS_DIRTY_WM;
3447    util_copy_framebuffer_state(cso, state);
3448    cso->samples = samples;
3449    cso->layers = layers;
3450 
3451    if (cso->zsbuf) {
3452       struct crocus_resource *zres;
3453       struct crocus_resource *stencil_res;
3454       enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
3455       crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
3456                                          &stencil_res);
3457       if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
3458          aux_usage = zres->aux.usage;
3459       }
3460       ice->state.hiz_usage = aux_usage;
3461    }
3462 
3463    /* Render target change */
3464    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
3465 
3466    ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3467 
3468    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
3469 }
3470 
3471 /**
3472  * The pipe->set_constant_buffer() driver hook.
3473  *
3474  * This uploads any constant data in user buffers, and references
3475  * any UBO resources containing constant data.
3476  */
3477 static void
crocus_set_constant_buffer(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned index,bool take_ownership,const struct pipe_constant_buffer * input)3478 crocus_set_constant_buffer(struct pipe_context *ctx,
3479                            enum pipe_shader_type p_stage, unsigned index,
3480                            bool take_ownership,
3481                            const struct pipe_constant_buffer *input)
3482 {
3483    struct crocus_context *ice = (struct crocus_context *) ctx;
3484    gl_shader_stage stage = stage_from_pipe(p_stage);
3485    struct crocus_shader_state *shs = &ice->state.shaders[stage];
3486    struct pipe_constant_buffer *cbuf = &shs->constbufs[index];
3487 
3488    util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);
3489 
3490    if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3491       shs->bound_cbufs |= 1u << index;
3492 
3493       if (input->user_buffer) {
3494          void *map = NULL;
3495          pipe_resource_reference(&cbuf->buffer, NULL);
3496          u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3497                         &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3498 
3499          if (!cbuf->buffer) {
3500             /* Allocation was unsuccessful - just unbind */
3501             crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
3502             return;
3503          }
3504 
3505          assert(map);
3506          memcpy(map, input->user_buffer, input->buffer_size);
3507       }
3508       cbuf->buffer_size =
3509          MIN2(input->buffer_size,
3510               crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3511 
3512       struct crocus_resource *res = (void *) cbuf->buffer;
3513       res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3514       res->bind_stages |= 1 << stage;
3515    } else {
3516       shs->bound_cbufs &= ~(1u << index);
3517    }
3518 
3519    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
3520 }
3521 
/**
 * Upload the current values of a stage's system values (built-in uniforms:
 * user clip planes, default tess levels, image params, workgroup sizes)
 * into the shader's trailing constant buffer slot.
 *
 * No-op if the stage has no compiled shader or uses no system values.
 */
static void
upload_sysvals(struct crocus_context *ice,
               gl_shader_stage stage)
{
   UNUSED struct crocus_genx_state *genx = ice->state.genx;
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   if (!shader || shader->num_system_values == 0)
      return;

   assert(shader->num_cbufs > 0);

   /* System values always occupy the last constant buffer slot. */
   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
   uint32_t *map = NULL;

   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

   /* Resolve each system value token to a 32-bit payload dword. */
   for (int i = 0; i < shader->num_system_values; i++) {
      uint32_t sysval = shader->system_values[i];
      uint32_t value = 0;

      if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
         /* Pull the requested dword straight out of the stored
          * brw_image_param for the referenced image slot. */
         unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
         unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
         struct brw_image_param *param =
            &genx->shaders[stage].image_param[img];

         assert(offset < sizeof(struct brw_image_param));
         value = ((uint32_t *) param)[offset];
#endif
      } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
         value = 0;
      } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
         int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
         int comp  = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
         value = fui(ice->state.clip_planes.ucp[plane][comp]);
      } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
         if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.vertices_per_patch;
         } else {
            assert(stage == MESA_SHADER_TESS_EVAL);
            /* TES sees the TCS output vertex count when a TCS is bound;
             * otherwise it falls back to the input patch size. */
            const struct shader_info *tcs_info =
               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
            if (tcs_info)
               value = tcs_info->tess.tcs_vertices_out;
            else
               value = ice->state.vertices_per_patch;
         }
      } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
                 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
         unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
         value = fui(ice->state.default_outer_level[i]);
      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
         value = fui(ice->state.default_inner_level[0]);
      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
         value = fui(ice->state.default_inner_level[1]);
      } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
                 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
         unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
         value = ice->state.last_block[i];
      } else {
         assert(!"unhandled system value");
      }

      *map++ = value;
   }

   cbuf->buffer_size = upload_size;
   shs->sysvals_need_upload = false;
}
3598 
3599 /**
3600  * The pipe->set_shader_buffers() driver hook.
3601  *
3602  * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
3603  * SURFACE_STATE here, as the buffer offset may change each time.
3604  */
3605 static void
crocus_set_shader_buffers(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start_slot,unsigned count,const struct pipe_shader_buffer * buffers,unsigned writable_bitmask)3606 crocus_set_shader_buffers(struct pipe_context *ctx,
3607                           enum pipe_shader_type p_stage,
3608                           unsigned start_slot, unsigned count,
3609                           const struct pipe_shader_buffer *buffers,
3610                           unsigned writable_bitmask)
3611 {
3612    struct crocus_context *ice = (struct crocus_context *) ctx;
3613    gl_shader_stage stage = stage_from_pipe(p_stage);
3614    struct crocus_shader_state *shs = &ice->state.shaders[stage];
3615 
3616    unsigned modified_bits = u_bit_consecutive(start_slot, count);
3617 
3618    shs->bound_ssbos &= ~modified_bits;
3619    shs->writable_ssbos &= ~modified_bits;
3620    shs->writable_ssbos |= writable_bitmask << start_slot;
3621 
3622    for (unsigned i = 0; i < count; i++) {
3623       if (buffers && buffers[i].buffer) {
3624          struct crocus_resource *res = (void *) buffers[i].buffer;
3625          struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3626          pipe_resource_reference(&ssbo->buffer, &res->base.b);
3627          ssbo->buffer_offset = buffers[i].buffer_offset;
3628          ssbo->buffer_size =
3629             MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3630 
3631          shs->bound_ssbos |= 1 << (start_slot + i);
3632 
3633          res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3634          res->bind_stages |= 1 << stage;
3635 
3636          util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3637                         ssbo->buffer_offset + ssbo->buffer_size);
3638       } else {
3639          pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3640       }
3641    }
3642 
3643    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3644 }
3645 
/**
 * Generic pipe->delete_*_state() hook for CSOs that hold no GPU
 * resources: the CSO is plain malloc'd memory, so freeing it suffices.
 */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}
3651 
3652 /**
3653  * The pipe->set_vertex_buffers() driver hook.
3654  *
3655  * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3656  */
3657 static void
crocus_set_vertex_buffers(struct pipe_context * ctx,unsigned start_slot,unsigned count,unsigned unbind_num_trailing_slots,bool take_ownership,const struct pipe_vertex_buffer * buffers)3658 crocus_set_vertex_buffers(struct pipe_context *ctx,
3659                           unsigned start_slot, unsigned count,
3660                           unsigned unbind_num_trailing_slots,
3661                           bool take_ownership,
3662                           const struct pipe_vertex_buffer *buffers)
3663 {
3664    struct crocus_context *ice = (struct crocus_context *) ctx;
3665    struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
3666    const unsigned padding =
3667       (GFX_VERx10 < 75 && !screen->devinfo.is_baytrail) * 2;
3668    ice->state.bound_vertex_buffers &=
3669       ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3670 
3671    util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
3672                                 buffers, start_slot, count, unbind_num_trailing_slots,
3673                                 take_ownership);
3674 
3675    for (unsigned i = 0; i < count; i++) {
3676       struct pipe_vertex_buffer *state =
3677          &ice->state.vertex_buffers[start_slot + i];
3678 
3679       if (!state->is_user_buffer && state->buffer.resource) {
3680          struct crocus_resource *res = (void *)state->buffer.resource;
3681          res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
3682       }
3683 
3684       uint32_t end = 0;
3685       if (state->buffer.resource)
3686          end = state->buffer.resource->width0 + padding;
3687       ice->state.vb_end[start_slot + i] = end;
3688    }
3689    ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
3690 }
3691 
3692 #if GFX_VERx10 < 75
get_wa_flags(enum isl_format format)3693 static uint8_t get_wa_flags(enum isl_format format)
3694 {
3695    uint8_t wa_flags = 0;
3696 
3697    switch (format) {
3698    case ISL_FORMAT_R10G10B10A2_USCALED:
3699       wa_flags = BRW_ATTRIB_WA_SCALE;
3700       break;
3701    case ISL_FORMAT_R10G10B10A2_SSCALED:
3702       wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
3703       break;
3704    case ISL_FORMAT_R10G10B10A2_UNORM:
3705       wa_flags = BRW_ATTRIB_WA_NORMALIZE;
3706       break;
3707    case ISL_FORMAT_R10G10B10A2_SNORM:
3708       wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
3709       break;
3710    case ISL_FORMAT_R10G10B10A2_SINT:
3711       wa_flags = BRW_ATTRIB_WA_SIGN;
3712       break;
3713    case ISL_FORMAT_B10G10R10A2_USCALED:
3714       wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3715       break;
3716    case ISL_FORMAT_B10G10R10A2_SSCALED:
3717       wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3718       break;
3719    case ISL_FORMAT_B10G10R10A2_UNORM:
3720       wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3721       break;
3722    case ISL_FORMAT_B10G10R10A2_SNORM:
3723       wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3724       break;
3725    case ISL_FORMAT_B10G10R10A2_SINT:
3726       wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
3727       break;
3728    case ISL_FORMAT_B10G10R10A2_UINT:
3729       wa_flags = BRW_ATTRIB_WA_BGRA;
3730       break;
3731    default:
3732       break;
3733    }
3734    return wa_flags;
3735 }
3736 #endif
3737 
3738 /**
3739  * Gallium CSO for vertex elements.
3740  */
3741 struct crocus_vertex_element_state {
3742    uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
3743 #if GFX_VER == 8
3744    uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
3745 #endif
3746    uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
3747 #if GFX_VER == 8
3748    uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
3749 #endif
3750    uint32_t step_rate[16];
3751    uint8_t wa_flags[33];
3752    unsigned count;
3753 };
3754 
3755 /**
3756  * The pipe->create_vertex_elements() driver hook.
3757  *
3758  * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3759  * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3760  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3761  * needed. In these cases we will need information available at draw time.
3762  * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3763  * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3764  * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3765  */
static void *
crocus_create_vertex_elements(struct pipe_context *ctx,
                              unsigned count,
                              const struct pipe_vertex_element *state)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_vertex_element_state *cso =
      malloc(sizeof(struct crocus_vertex_element_state));

   cso->count = count;

   /* Pack the 3DSTATE_VERTEX_ELEMENTS header.  MAX2(count, 1) accounts for
    * the dummy element emitted below when count == 0.
    */
   crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
      ve.DWordLength =
         1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
   }

   uint32_t *ve_pack_dest = &cso->vertex_elements[1];
#if GFX_VER == 8
   uint32_t *vfi_pack_dest = cso->vf_instancing;
#endif

   /* With no user elements, emit a dummy element sourcing (0, 0, 0, 1). */
   if (count == 0) {
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
         ve.Valid = true;
         ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
         ve.Component0Control = VFCOMP_STORE_0;
         ve.Component1Control = VFCOMP_STORE_0;
         ve.Component2Control = VFCOMP_STORE_0;
         ve.Component3Control = VFCOMP_STORE_1_FP;
      }
#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
      }
#endif
   }

   for (int i = 0; i < count; i++) {
      const struct crocus_format_info fmt =
         crocus_format_for_usage(devinfo, state[i].src_format, 0);
      unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
                           VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
      enum isl_format actual_fmt = fmt.fmt;

#if GFX_VERx10 < 75
      /* Pre-Haswell can't fetch some formats natively: record per-element
       * shader workaround flags and substitute a fetchable format.
       * NOTE(review): wa_flags is left uninitialized (malloc'd) when
       * GFX_VERx10 >= 75 — presumably unread on those gens; confirm at
       * the draw-time consumer.
       */
      cso->wa_flags[i] = get_wa_flags(fmt.fmt);

      if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
         actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
      if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
         actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
      if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
         actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
      if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
         actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
      if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
         actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
#endif

      /* NOTE(review): step_rate[] has 16 slots — assumes
       * vertex_buffer_index < 16; confirm against driver VB limits.
       */
      cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;

      /* Formats with fewer than 4 channels: pad missing components with
       * zeros, and W with 1 (integer or float flavor to match the format).
       * The cases intentionally fall through.
       */
      switch (isl_format_get_num_channels(fmt.fmt)) {
      case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
      case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
      case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
      case 3:
         comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
            : VFCOMP_STORE_1_FP;
         break;
      }
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
#if GFX_VER >= 6
         ve.EdgeFlagEnable = false;
#endif
         ve.VertexBufferIndex = state[i].vertex_buffer_index;
         ve.Valid = true;
         ve.SourceElementOffset = state[i].src_offset;
         ve.SourceElementFormat = actual_fmt;
         ve.Component0Control = comp[0];
         ve.Component1Control = comp[1];
         ve.Component2Control = comp[2];
         ve.Component3Control = comp[3];
#if GFX_VER < 5
         ve.DestinationElementOffset = i * 4;
#endif
      }

#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
         vi.VertexElementIndex = i;
         vi.InstancingEnable = state[i].instance_divisor > 0;
         vi.InstanceDataStepRate = state[i].instance_divisor;
      }
#endif
      ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
#if GFX_VER == 8
      vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
#endif
   }

   /* An alternative version of the last VE and VFI is stored so it
    * can be used at draw time in case Vertex Shader uses EdgeFlag
    */
   if (count) {
      const unsigned edgeflag_index = count - 1;
      const struct crocus_format_info fmt =
         crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
#if GFX_VER >= 6
         ve.EdgeFlagEnable = true;
#endif
         ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
         ve.Valid = true;
         ve.SourceElementOffset = state[edgeflag_index].src_offset;
         ve.SourceElementFormat = fmt.fmt;
         /* The edge flag lives in component X alone; Y/Z/W read zero. */
         ve.Component0Control = VFCOMP_STORE_SRC;
         ve.Component1Control = VFCOMP_STORE_0;
         ve.Component2Control = VFCOMP_STORE_0;
         ve.Component3Control = VFCOMP_STORE_0;
      }
#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
         /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
          * at draw time, as it should change if SGVs are emitted.
          */
         vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
         vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
      }
#endif
   }

   return cso;
}
3909 
3910 /**
3911  * The pipe->bind_vertex_elements_state() driver hook.
3912  */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* A change in element count dirties 3DSTATE_VF_SGVS on Gen8 —
    * presumably because SGV element placement depends on the count.
    * (cso_changed compares the field between old_cso and new_cso.)
    */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   /* Bind the new CSO and flag everything that consumes it. */
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
3928 
3929 #if GFX_VER >= 6
/**
 * Software primitive counter for the Gen6 streamout path.
 */
struct crocus_streamout_counter {
   /* Byte range within the target's snapshot buffer holding pairs of
    * 64-bit SO_NUM_PRIMS_WRITTEN snapshots not yet folded into \c accum
    * (see aggregate_stream_counter).
    */
   uint32_t offset_start;
   uint32_t offset_end;

   /* Primitives written, accumulated from completed snapshot pairs. */
   uint64_t accum;
};
3936 
3937 /**
3938  * Gallium CSO for stream output (transform feedback) targets.
3939  */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /** Scratch buffer (and byte offset within it): on Gen7+ it saves the
    * SO write offset across batches; on Gen6 it holds the primitive-count
    * snapshots (see crocus_stream_store_prims_written).
    */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /** CPU mapping of offset_res, read as an array of uint64_t snapshots. */
   void *prim_map;
   /** Snapshot of \c count taken when this target was last unbound,
    * used to resume (see the Gen6 crocus_get_so_offset).
    */
   struct crocus_streamout_counter prev_count;
   /** Counter for the current streamout operation. */
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
   bool zero_offset;
#endif
};
3962 
3963 #if GFX_VER >= 7
3964 static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target * so)3965 crocus_get_so_offset(struct pipe_stream_output_target *so)
3966 {
3967    struct crocus_stream_output_target *tgt = (void *)so;
3968    struct pipe_transfer *transfer;
3969    struct pipe_box box;
3970    uint32_t result;
3971    u_box_1d(tgt->offset_offset, 4, &box);
3972    void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
3973                                        0, PIPE_MAP_DIRECTLY,
3974                                        &box, &transfer);
3975    assert(val);
3976    result = *(uint32_t *)val;
3977    so->context->buffer_unmap(so->context, transfer);
3978 
3979    return result / tgt->stride;
3980 }
3981 #endif
3982 
3983 #if GFX_VER == 6
/* Forward declaration: defined below with the Gen6 counter helpers. */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *count,
                                uint64_t *svbi);

/**
 * Gen6 variant: there is no hardware-saved SO write offset, so derive the
 * vertex count from the software primitive counters accumulated at the
 * target's last unbind (prev_count).
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct crocus_context *ice = (void *)so->context;

   uint64_t vert_written;
   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
   return vert_written;
}
4000 #endif
4001 
4002 /**
4003  * The pipe->create_stream_output_target() driver hook.
4004  *
4005  * "Target" here refers to a destination buffer.  We translate this into
4006  * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4007  * know which buffer this represents, or whether we ought to zero the
4008  * write-offsets, or append.  Those are handled in the set() hook.
4009  */
static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context *ctx,
                                   struct pipe_resource *p_res,
                                   unsigned buffer_offset,
                                   unsigned buffer_size)
{
   struct crocus_resource *res = (void *) p_res;
   struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
   if (!cso)
      return NULL;

   /* Remember this buffer was a streamout destination, for later
    * flush/dirty tracking (see crocus_flush_bits_for_history users).
    */
   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;

   pipe_reference_init(&cso->base.reference, 1);
   pipe_resource_reference(&cso->base.buffer, p_res);
   cso->base.buffer_offset = buffer_offset;
   cso->base.buffer_size = buffer_size;
   cso->base.context = ctx;

   /* Mark the written range valid up front. */
   util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
#if GFX_VER >= 7
   /* Allocate a 4-byte slot to save the SO write offset across batches
    * (read back in crocus_get_so_offset).  The CPU map pointer returned
    * in \c temp is not needed here and is discarded.
    */
   struct crocus_context *ice = (struct crocus_context *) ctx;
   void *temp;
   u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
                  &cso->offset_offset,
                  (struct pipe_resource **)&cso->offset_res,
                  &temp);
#endif

   return &cso->base;
}
4042 
4043 static void
crocus_stream_output_target_destroy(struct pipe_context * ctx,struct pipe_stream_output_target * state)4044 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4045                                     struct pipe_stream_output_target *state)
4046 {
4047    struct crocus_stream_output_target *cso = (void *) state;
4048 
4049    pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4050    pipe_resource_reference(&cso->base.buffer, NULL);
4051 
4052    free(cso);
4053 }
4054 
4055 #define GEN6_SO_NUM_PRIMS_WRITTEN       0x2288
4056 #define GEN7_SO_WRITE_OFFSET(n)         (0x5280 + (n) * 4)
4057 
4058 #if GFX_VER == 6
/**
 * Fold the pending snapshot pairs in [offset_start, offset_end) into
 * counter->accum, waiting for the GPU if those snapshots may still be
 * in flight.  Each pair is (begin, end) of SO_NUM_PRIMS_WRITTEN, so the
 * delta is primitives written during that interval.
 *
 * Also resets tgt->count's range to zero — note this touches tgt->count
 * even when \p counter is tgt->prev_count.
 */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   /* If the batch still references the snapshot BO, its writes may not
    * have landed: flush and wait on a fence before reading on the CPU.
    */
   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   /* Accumulate (end - begin) for each 16-byte snapshot pair. */
   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4077 
/**
 * Snapshot the GEN6_SO_NUM_PRIMS_WRITTEN register into the target's
 * scratch buffer so primitive counts can later be computed on the CPU
 * (software streamout counting for Gen6).
 */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   /* Lazily allocate a 4 KiB CPU-mappable snapshot buffer. */
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* If another 16-byte snapshot pair would overflow the buffer, fold the
    * pending snapshots into the accumulators and restart at offset 0.
    */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush so the register reflects all prior rendering, then store it. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4101 
/**
 * Compute the number of vertices streamed out so far for \p tgt, by
 * aggregating the pending primitive-count snapshots and scaling by the
 * vertices-per-primitive of the last transform feedback draw.
 *
 * \param svbi  Receives the streamed vertex count (used as the SVBI).
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
4113 #endif
4114 /**
4115  * The pipe->set_stream_output_targets() driver hook.
4116  *
4117  * At this point, we know which targets are bound to a particular index,
4118  * and also whether we want to append or start over.  We can finish the
4119  * 3DSTATE_SO_BUFFER packets we started earlier.
4120  */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      /* Pre-Gen7 streams out through the (fixed-function) GS program. */
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command.  If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now.  (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Turning streamout off: flush so the written data is visible to
          * any subsequent use of those buffers, and dirty their bindings.
          */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new targets, holding the old ones in old_tgt[] so the
    * per-gen code below can still save their final offsets/counts.
    */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

#if GFX_VER == 6
   /* Gen6 counts primitives in software: manage the per-target snapshot
    * counters and compute the SVBI resume point.
    */
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: recompute how many vertices were already written. */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         /* One snapshot covers all targets; store it only once. */
         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            /* Remember the final counts so a later rebind can resume. */
            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   /* Gen7+: program the SO write-offset registers — zero to restart a
    * buffer, or reload the offset saved at the last unbind to append.
    */
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         /* Unbinding: save the final write offset for a future append. */
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
4253 
4254 #endif
4255 
4256 #if GFX_VER >= 7
4257 /**
4258  * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4259  * 3DSTATE_STREAMOUT packets.
4260  *
4261  * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4262  * hardware to record.  We can create it entirely based on the shader, with
4263  * no dynamic state dependencies.
4264  *
4265  * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4266  * state-based settings.  We capture the shader-related ones here, and merge
4267  * the rest in at draw time.
4268  */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct brw_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      /* The real varying: record which VUE slot feeds which components. */
      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* Allocate space for 3DSTATE_STREAMOUT followed by the SO_DECL_LIST
    * packet (3 header dwords + one dword pair per decl entry).
    */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   /* Interleave the per-stream decls: entry i carries decl i of each
    * stream (shorter streams contribute zeroed decls).
    */
   for (int i = 0; i < max_decls; i++) {
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
4392 #endif
4393 
4394 #if GFX_VER == 6
/**
 * Emit 3DSTATE_GS_SVB_INDEX packets for the Gen6 streamout path.
 *
 * Index 0 gets the resume point (ice->state.svbi) and a maximum based on
 * the smallest bound target; indices 1-3 get full-range defaults so they
 * never limit writes.
 */
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];

   /* Capacity (in vertices) of the smallest bound target.
    * NOTE(review): divides by tgt->stride — assumes bound targets always
    * have a non-zero stride; confirm where stride is assigned.
    */
   unsigned max_vertex = 0xffffffff;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];
      if (tgt)
         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
   }

   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
      svbi.IndexNumber = 0;
      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
      svbi.MaximumIndex = max_vertex;
   }

   /* initialize the rest of the SVBI's to reasonable values so that we don't
    * run out of room writing the regular data.
    */
   for (int i = 1; i < 4; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
         svbi.IndexNumber = i;
         svbi.StreamedVertexBufferIndex = 0;
         svbi.MaximumIndex = 0xffffffff;
      }
   }
}
4425 
4426 #endif
4427 
4428 
4429 #if GFX_VER >= 6
4430 static bool
crocus_is_drawing_points(const struct crocus_context * ice)4431 crocus_is_drawing_points(const struct crocus_context *ice)
4432 {
4433    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4434 
4435    if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4436        cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
4437       return true;
4438 
4439    if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4440       const struct brw_gs_prog_data *gs_prog_data =
4441          (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4442       return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4443    } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4444       const struct brw_tes_prog_data *tes_data =
4445          (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4446       return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
4447    } else {
4448       return ice->state.prim_mode == PIPE_PRIM_POINTS;
4449    }
4450 }
4451 #endif
4452 
4453 #if GFX_VER >= 6
/**
 * Fill in one SF_OUTPUT_ATTRIBUTE_DETAIL entry, mapping FS input
 * \p fs_attr to its source VUE slot (or a constant/primitive-ID
 * override when no slot exists).
 *
 * \param urb_entry_read_offset  SF read offset in 256-bit units.
 * \param two_side_color         Whether two-sided color swizzling applies.
 * \param max_source_attr        In/out: raised to the highest source
 *                               attribute the SF will need to read.
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct brw_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it.  This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined.  Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot.  In every other case, the
       * attribute override we supply doesn't matter.  So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4547 
4548 static void
calculate_attr_overrides(const struct crocus_context * ice,struct GENX (SF_OUTPUT_ATTRIBUTE_DETAIL)* attr_overrides,uint32_t * point_sprite_enables,uint32_t * urb_entry_read_length,uint32_t * urb_entry_read_offset)4549 calculate_attr_overrides(
4550    const struct crocus_context *ice,
4551    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
4552    uint32_t *point_sprite_enables,
4553    uint32_t *urb_entry_read_length,
4554    uint32_t *urb_entry_read_offset)
4555 {
4556    const struct brw_wm_prog_data *wm_prog_data = (void *)
4557       ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4558    const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
4559    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4560    uint32_t max_source_attr = 0;
4561    const struct shader_info *fs_info =
4562       crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
4563 
4564    int first_slot =
4565       brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
4566 
4567    /* Each URB offset packs two varying slots */
4568    assert(first_slot % 2 == 0);
4569    *urb_entry_read_offset = first_slot / 2;
4570    *point_sprite_enables = 0;
4571 
4572    for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
4573       const int input_index = wm_prog_data->urb_setup[fs_attr];
4574 
4575       if (input_index < 0)
4576          continue;
4577 
4578       bool point_sprite = false;
4579       if (crocus_is_drawing_points(ice)) {
4580          if (fs_attr >= VARYING_SLOT_TEX0 &&
4581              fs_attr <= VARYING_SLOT_TEX7 &&
4582              cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
4583             point_sprite = true;
4584 
4585          if (fs_attr == VARYING_SLOT_PNTC)
4586             point_sprite = true;
4587 
4588          if (point_sprite)
4589             *point_sprite_enables |= 1U << input_index;
4590       }
4591 
4592       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
4593       if (!point_sprite) {
4594          get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
4595                            cso_rast->cso.light_twoside, &max_source_attr);
4596       }
4597 
4598       /* The hardware can only do the overrides on 16 overrides at a
4599        * time, and the other up to 16 have to be lined up so that the
4600        * input index = the output index.  We'll need to do some
4601        * tweaking to make sure that's the case.
4602        */
4603       if (input_index < 16)
4604          attr_overrides[input_index] = attribute;
4605       else
4606          assert(attribute.SourceAttribute == input_index);
4607    }
4608 
4609    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4610     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4611     *
4612     * "This field should be set to the minimum length required to read the
4613     *  maximum source attribute.  The maximum source attribute is indicated
4614     *  by the maximum value of the enabled Attribute # Source Attribute if
4615     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4616     *  enable is not set.
4617     *  read_length = ceiling((max_source_attr + 1) / 2)
4618     *
4619     *  [errata] Corruption/Hang possible if length programmed larger than
4620     *  recommended"
4621     *
4622     * Similar text exists for Ivy Bridge.
4623     */
4624    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
4625 }
4626 #endif
4627 
4628 #if GFX_VER >= 7
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
   /* Gen8+ programs the attribute overrides via a separate
    * 3DSTATE_SBE_SWIZ packet, so gather them in a local array first.
    */
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
   /* Gen7 carries the overrides inline in 3DSTATE_SBE; alias the name so
    * calculate_attr_overrides() writes straight into the packet field.
    */
#define attr_overrides sbe.Attribute
#endif

   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;

      /* Fills the overrides and the URB read offset/length below. */
      calculate_attr_overrides(ice,
                               attr_overrides,
                               &point_sprite_enables,
                               &urb_entry_read_length,
                               &urb_entry_read_offset);
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#endif
   }
#if GFX_VER >= 8
   /* Emit the overrides collected above. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
   }
#endif
}
4671 #endif
4672 
4673 /* ------------------------------------------------------------------- */
4674 
4675 /**
4676  * Populate VS program key fields based on the current state.
4677  */
4678 static void
crocus_populate_vs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct brw_vs_prog_key * key)4679 crocus_populate_vs_key(const struct crocus_context *ice,
4680                        const struct shader_info *info,
4681                        gl_shader_stage last_stage,
4682                        struct brw_vs_prog_key *key)
4683 {
4684    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4685 
4686    if (info->clip_distance_array_size == 0 &&
4687        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4688        last_stage == MESA_SHADER_VERTEX)
4689       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4690 
4691 #if GFX_VER <= 5
4692    key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4693                          cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4694    key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4695 #endif
4696 
4697    key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
4698 
4699 #if GFX_VERx10 < 75
4700    uint64_t inputs_read = info->inputs_read;
4701    int ve_idx = 0;
4702    while (inputs_read) {
4703       int i = u_bit_scan64(&inputs_read);
4704       key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4705       ve_idx++;
4706    }
4707 #endif
4708 }
4709 
4710 /**
4711  * Populate TCS program key fields based on the current state.
4712  */
static void
crocus_populate_tcs_key(const struct crocus_context *ice,
                        struct crocus_tcs_prog_key *key)
{
   /* No state-dependent TCS key fields; the key stays at its defaults. */
}
4718 
4719 /**
4720  * Populate TES program key fields based on the current state.
4721  */
4722 static void
crocus_populate_tes_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct brw_tes_prog_key * key)4723 crocus_populate_tes_key(const struct crocus_context *ice,
4724                         const struct shader_info *info,
4725                         gl_shader_stage last_stage,
4726                         struct brw_tes_prog_key *key)
4727 {
4728    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4729 
4730    if (info->clip_distance_array_size == 0 &&
4731        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4732        last_stage == MESA_SHADER_TESS_EVAL)
4733       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4734 }
4735 
4736 /**
4737  * Populate GS program key fields based on the current state.
4738  */
4739 static void
crocus_populate_gs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct brw_gs_prog_key * key)4740 crocus_populate_gs_key(const struct crocus_context *ice,
4741                        const struct shader_info *info,
4742                        gl_shader_stage last_stage,
4743                        struct brw_gs_prog_key *key)
4744 {
4745    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4746 
4747    if (info->clip_distance_array_size == 0 &&
4748        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4749        last_stage == MESA_SHADER_GEOMETRY)
4750       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4751 }
4752 
4753 static inline GLenum
compare_func_to_gl(enum pipe_compare_func pipe_func)4754 compare_func_to_gl(enum pipe_compare_func pipe_func)
4755 {
4756    static const unsigned map[] = {
4757       [PIPE_FUNC_NEVER]    = GL_NEVER,
4758       [PIPE_FUNC_LESS]     = GL_LESS,
4759       [PIPE_FUNC_EQUAL]    = GL_EQUAL,
4760       [PIPE_FUNC_LEQUAL]   = GL_LEQUAL,
4761       [PIPE_FUNC_GREATER]  = GL_GREATER,
4762       [PIPE_FUNC_NOTEQUAL] = GL_NOTEQUAL,
4763       [PIPE_FUNC_GEQUAL]   = GL_GEQUAL,
4764       [PIPE_FUNC_ALWAYS]   = GL_ALWAYS,
4765    };
4766    return map[pipe_func];
4767 }
4768 
4769 /**
4770  * Populate FS program key fields based on the current state.
4771  */
static void
crocus_populate_fs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       struct brw_wm_prog_key *key)
{
   struct crocus_screen *screen = (void *) ice->ctx.screen;
   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
   const struct crocus_blend_state *blend = ice->state.cso_blend;

#if GFX_VER < 6
   /* Gen4-5: build the "iz" lookup bitfield describing the current
    * depth/stencil/alpha/discard state, used to select the WM variant.
    */
   uint32_t lookup = 0;

   if (info->fs.uses_discard || zsa->cso.alpha_enabled)
      lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;

   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;

   if (fb->zsbuf && zsa->cso.depth_enabled) {
      lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;

      if (zsa->cso.depth_writemask)
         lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;

   }
   if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
      lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
      if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
         lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
   }
   key->iz_lookup = lookup;
   key->stats_wm = ice->state.stats_wm;
#endif

   /* Classify antialiased-line usage: never, always, or "sometimes"
    * (line-mode triangles where it depends on facing/culling at draw
    * time).
    */
   uint32_t line_aa = BRW_WM_AA_NEVER;
   if (rast->cso.line_smooth) {
      int reduced_prim = ice->state.reduced_prim_mode;
      if (reduced_prim == PIPE_PRIM_LINES)
         line_aa = BRW_WM_AA_ALWAYS;
      else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
         if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
            line_aa = BRW_WM_AA_SOMETIMES;

            if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
                rast->cso.cull_face == PIPE_FACE_BACK)
               line_aa = BRW_WM_AA_ALWAYS;
         } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
            line_aa = BRW_WM_AA_SOMETIMES;

            if (rast->cso.cull_face == PIPE_FACE_FRONT)
               line_aa = BRW_WM_AA_ALWAYS;
         }
      }
   }
   key->line_aa = line_aa;

   key->nr_color_regions = fb->nr_cbufs;

   key->clamp_fragment_color = rast->cso.clamp_fragment_color;

   key->alpha_to_coverage = blend->cso.alpha_to_coverage;

   /* With multiple render targets and alpha test on, RT0's alpha must be
    * replicated so the test sees the right value.
    */
   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;

   key->flat_shade = rast->cso.flatshade &&
      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));

   key->persample_interp = rast->cso.force_persample_interp;
   key->multisample_fbo = rast->cso.multisample && fb->samples > 1;

   key->ignore_sample_mask_out = !key->multisample_fbo;
   key->coherent_fb_fetch = false; // TODO: needed?

   key->force_dual_color_blend =
      screen->driconf.dual_color_blend_by_location &&
      (blend->blend_enables & 1) && blend->dual_color_blending;

   /* TODO: Respect glHint for key->high_quality_derivatives */

#if GFX_VER <= 5
   /* Gen4-5 perform the (replicated-alpha) test in the shader; pass the
    * GL compare func and reference value through the key.
    */
   if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
      key->alpha_test_func = compare_func_to_gl(zsa->cso.alpha_func);
      key->alpha_test_ref = zsa->cso.alpha_ref_value;
   }
#endif
}
4860 
/* No state-dependent CS key fields; the key stays at its defaults. */
static void
crocus_populate_cs_key(const struct crocus_context *ice,
                       struct brw_cs_prog_key *key)
{
}
4866 
#if GFX_VER == 4
/* Gen4 Kernel Start Pointers are relocations into the shader cache BO. */
#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
#elif GFX_VER >= 5
/* Gen5+ program kernel pointers as plain offsets into the shader cache. */
static uint64_t
KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
{
   return shader->offset;
}
#endif
4876 
4877 /* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
4878  * prefetching of binding tables in A0 and B0 steppings.  XXX: Revisit
4879  * this WA on C0 stepping.
4880  *
4881  * TODO: Fill out SamplerCount for prefetching?
4882  */
4883 
/* Common setup for the fixed-function stage packets (VS/HS/DS/GS): kernel
 * start pointer, binding table size, URB read layout, and per-thread
 * scratch space when the compiled shader spills registers.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                 \
   pkt.KernelStartPointer = KSP(ice, shader);                           \
   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;              \
   pkt.FloatingPointMode = prog_data->use_alt_mode;                     \
                                                                        \
   pkt.DispatchGRFStartRegisterForURBData =                             \
      prog_data->dispatch_grf_start_reg;                                \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
   pkt.prefix##URBEntryReadOffset = 0;                                  \
                                                                        \
   pkt.StatisticsEnable = true;                                         \
   pkt.Enable           = true;                                         \
                                                                        \
   if (prog_data->total_scratch) {                                      \
      struct crocus_bo *bo =                                            \
         crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;   \
      pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                       \
   }
4903 
4904 /* ------------------------------------------------------------------- */
#if GFX_VER >= 6
/* 3DSTATE_CONSTANT_* sub-opcode for each stage's push-constant packet;
 * compute push constants use a different mechanism (hence 0).
 */
static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX]    = 21,
   [MESA_SHADER_TESS_CTRL] = 25, /* HS */
   [MESA_SHADER_TESS_EVAL] = 26, /* DS */
   [MESA_SHADER_GEOMETRY]  = 22,
   [MESA_SHADER_FRAGMENT]  = 23,
   [MESA_SHADER_COMPUTE]   = 0,
};
#endif
4915 
4916 static void
emit_sized_null_surface(struct crocus_batch * batch,unsigned width,unsigned height,unsigned layers,unsigned levels,unsigned minimum_array_element,uint32_t * out_offset)4917 emit_sized_null_surface(struct crocus_batch *batch,
4918                         unsigned width, unsigned height,
4919                         unsigned layers, unsigned levels,
4920                         unsigned minimum_array_element,
4921                         uint32_t *out_offset)
4922 {
4923    struct isl_device *isl_dev = &batch->screen->isl_dev;
4924    uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4925                                  isl_dev->ss.align,
4926                                  out_offset);
4927    //TODO gen 6 multisample crash
4928    isl_null_fill_state(isl_dev, surf,
4929                        .size = isl_extent3d(width, height, layers),
4930                        .levels = levels,
4931                        .minimum_array_element = minimum_array_element);
4932 }
/* Stream out a minimal 1x1 single-layer null surface. */
static void
emit_null_surface(struct crocus_batch *batch,
                  uint32_t *out_offset)
{
   emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
}
4939 
4940 static void
emit_null_fb_surface(struct crocus_batch * batch,struct crocus_context * ice,uint32_t * out_offset)4941 emit_null_fb_surface(struct crocus_batch *batch,
4942                      struct crocus_context *ice,
4943                      uint32_t *out_offset)
4944 {
4945    uint32_t width, height, layers, level, layer;
4946    /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4947    if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4948       emit_null_surface(batch, out_offset);
4949       return;
4950    }
4951 
4952    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4953    width = MAX2(cso->width, 1);
4954    height = MAX2(cso->height, 1);
4955    layers = cso->layers ? cso->layers : 1;
4956    level = 0;
4957    layer = 0;
4958 
4959    if (cso->nr_cbufs == 0 && cso->zsbuf) {
4960       width = cso->zsbuf->width;
4961       height = cso->zsbuf->height;
4962       level = cso->zsbuf->u.tex.level;
4963       layer = cso->zsbuf->u.tex.first_layer;
4964    }
4965    emit_sized_null_surface(batch, width, height,
4966                            layers, level, layer,
4967                            out_offset);
4968 }
4969 
/* Fill a SURFACE_STATE at surf_state (located at addr_offset in the
 * batch's state buffer), emitting the main and auxiliary address
 * relocations.  blend_enable/write_disables are only used on gen <= 5.
 */
static void
emit_surface_state(struct crocus_batch *batch,
                   struct crocus_resource *res,
                   const struct isl_surf *in_surf,
                   bool adjust_surf,
                   struct isl_view *in_view,
                   bool writeable,
                   enum isl_aux_usage aux_usage,
                   bool blend_enable,
                   uint32_t write_disables,
                   uint32_t *surf_state,
                   uint32_t addr_offset)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   /* Surface addresses here are 32-bit relocs; add the write flag so the
    * kernel tracks the write domain for writeable surfaces.
    */
   uint32_t reloc = RELOC_32BIT;
   uint64_t offset_B = res->offset;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (writeable)
      reloc |= RELOC_WRITE;

   /* Work on local copies; they may be rebased below. */
   struct isl_surf surf = *in_surf;
   struct isl_view view = *in_view;
   if (adjust_surf) {
      /* Rebase to a single-image surface where the hardware can't address
       * the view directly (single-layer 3D views, cube maps on gen4),
       * folding the base level/layer into offset_B and intratile offsets.
       */
      if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, 0,
                                 view.base_array_layer,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) {
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, view.base_array_layer,
                                 0,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
         surf.dim = ISL_SURF_DIM_2D;
   }

   /* Pick up the aux surface and clear color only when aux is in use. */
   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
   struct crocus_bo *aux_bo = NULL;
   uint32_t aux_offset = 0;
   struct isl_surf *aux_surf = NULL;
   if (aux_usage != ISL_AUX_USAGE_NONE) {
      aux_surf = &res->aux.surf;
      aux_offset = res->aux.offset;
      aux_bo = res->aux.bo;

      clear_color = crocus_resource_get_clear_color(res);
   }

   isl_surf_fill_state(isl_dev, surf_state,
                       .surf = &surf,
                       .view = &view,
                       .address = crocus_state_reloc(batch,
                                                     addr_offset + isl_dev->ss.addr_offset,
                                                     res->bo, offset_B, reloc),
                       .aux_surf = aux_surf,
                       .aux_usage = aux_usage,
                       .aux_address = aux_offset,
                       .mocs = crocus_mocs(res->bo, isl_dev),
                       .clear_color = clear_color,
                       .use_clear_address = false,
                       .clear_address = 0,
                       .x_offset_sa = tile_x_sa,
                       .y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
                       .blend_enable = blend_enable,
                       .write_disables = write_disables,
#endif
      );

   if (aux_surf) {
      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
       * contain other control information.  Since buffer addresses are always
       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
       * an ordinary reloc to do the necessary address translation.
       *
       * FIXME: move to the point of assignment.
       */
      if (devinfo->ver == 8) {
         /* Gen8 uses a full 64-bit aux address field. */
         uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      } else {
         uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      }
   }

}
5073 
5074 static uint32_t
emit_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables)5075 emit_surface(struct crocus_batch *batch,
5076              struct crocus_surface *surf,
5077              enum isl_aux_usage aux_usage,
5078              bool blend_enable,
5079              uint32_t write_disables)
5080 {
5081    const struct intel_device_info *devinfo = &batch->screen->devinfo;
5082    struct isl_device *isl_dev = &batch->screen->isl_dev;
5083    struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5084    struct isl_view *view = &surf->view;
5085    uint32_t offset = 0;
5086    enum pipe_texture_target target = res->base.b.target;
5087    bool adjust_surf = false;
5088 
5089    if (devinfo->ver == 4 && target == PIPE_TEXTURE_CUBE)
5090       adjust_surf = true;
5091 
5092    if (surf->align_res)
5093       res = (struct crocus_resource *)surf->align_res;
5094 
5095    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5096 
5097    emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5098                       aux_usage, blend_enable,
5099                       write_disables,
5100                       surf_state, offset);
5101    return offset;
5102 }
5103 
5104 static uint32_t
emit_rt_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage)5105 emit_rt_surface(struct crocus_batch *batch,
5106                 struct crocus_surface *surf,
5107                 enum isl_aux_usage aux_usage)
5108 {
5109    struct isl_device *isl_dev = &batch->screen->isl_dev;
5110    struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5111    struct isl_view *view = &surf->read_view;
5112    uint32_t offset = 0;
5113    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5114 
5115    emit_surface_state(batch, res, &surf->surf, true, view, false,
5116                       aux_usage, 0, false,
5117                       surf_state, offset);
5118    return offset;
5119 }
5120 
5121 static uint32_t
emit_grid(struct crocus_context * ice,struct crocus_batch * batch)5122 emit_grid(struct crocus_context *ice,
5123           struct crocus_batch *batch)
5124 {
5125    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5126    uint32_t offset = 0;
5127    struct crocus_state_ref *grid_ref = &ice->state.grid_size;
5128    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5129                                        isl_dev->ss.align, &offset);
5130    isl_buffer_fill_state(isl_dev, surf_state,
5131                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5132                                                        crocus_resource_bo(grid_ref->res),
5133                                                        grid_ref->offset,
5134                                                        RELOC_32BIT),
5135                          .size_B = 12,
5136                          .format = ISL_FORMAT_RAW,
5137                          .stride_B = 1,
5138                          .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
5139    return offset;
5140 }
5141 
5142 static uint32_t
emit_ubo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_constant_buffer * buffer)5143 emit_ubo_buffer(struct crocus_context *ice,
5144                 struct crocus_batch *batch,
5145                 struct pipe_constant_buffer *buffer)
5146 {
5147    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5148    uint32_t offset = 0;
5149 
5150    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5151                                        isl_dev->ss.align, &offset);
5152    isl_buffer_fill_state(isl_dev, surf_state,
5153                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5154                                                        crocus_resource_bo(buffer->buffer),
5155                                                        buffer->buffer_offset,
5156                                                        RELOC_32BIT),
5157                          .size_B = buffer->buffer_size,
5158                          .format = 0,
5159                          .swizzle = ISL_SWIZZLE_IDENTITY,
5160                          .stride_B = 1,
5161                          .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5162 
5163    return offset;
5164 }
5165 
5166 static uint32_t
emit_ssbo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_shader_buffer * buffer,bool writeable)5167 emit_ssbo_buffer(struct crocus_context *ice,
5168                  struct crocus_batch *batch,
5169                  struct pipe_shader_buffer *buffer, bool writeable)
5170 {
5171    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5172    uint32_t offset = 0;
5173    uint32_t reloc = RELOC_32BIT;
5174 
5175    if (writeable)
5176       reloc |= RELOC_WRITE;
5177    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5178                                        isl_dev->ss.align, &offset);
5179    isl_buffer_fill_state(isl_dev, surf_state,
5180                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5181                                                        crocus_resource_bo(buffer->buffer),
5182                                                        buffer->buffer_offset,
5183                                                        reloc),
5184                          .size_B = buffer->buffer_size,
5185                          .format = ISL_FORMAT_RAW,
5186                          .swizzle = ISL_SWIZZLE_IDENTITY,
5187                          .stride_B = 1,
5188                          .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5189 
5190    return offset;
5191 }
5192 
static uint32_t
emit_sampler_view(struct crocus_context *ice,
                  struct crocus_batch *batch,
                  bool for_gather,
                  struct crocus_sampler_view *isv)
{
   /* Stream a SURFACE_STATE for a sampler view into the batch's state
    * buffer and return its offset there.
    *
    * Buffer textures (PIPE_BUFFER) are filled directly with
    * isl_buffer_fill_state(); all other targets go through
    * emit_surface_state().  When for_gather is set, the gather-specific
    * view (isv->gather_view) is used instead of the regular one.
    */
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);

   if (isv->base.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
      /* Bytes per element; RAW buffers are addressed byte-wise. */
      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the requested size, to what actually fits in the BO past
       * the resource's offset, and to the hardware texture-buffer limit.
       */
      unsigned final_size =
         MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          isv->res->bo,
                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
                            .size_B = final_size,
                            .format = isv->view.format,
                            .swizzle = isv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
         );
   } else {
      enum isl_aux_usage aux_usage =
         crocus_resource_texture_aux_usage(isv->res);

      emit_surface_state(batch, isv->res, &isv->res->surf, false,
                         for_gather ? &isv->gather_view : &isv->view,
                         false, aux_usage, false,
                         0, surf_state, offset);
   }
   return offset;
}
5232 
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   /* Stream a SURFACE_STATE for a shader storage image and return its
    * offset in the batch's state buffer.  Writable images get a
    * RELOC_WRITE relocation so the kernel knows the BO may be modified.
    */
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      /* Bytes per element; RAW buffers are addressed byte-wise. */
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the requested size, the bytes remaining in the BO past
       * the view's start, and the hardware texture-buffer limit.
       */
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         /* A RAW view of a non-buffer resource is set up as a raw buffer
          * surface covering the BO from the resource's offset onward.
          */
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );


      } else {
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}
5286 
#if GFX_VER == 6
/* Build a SURFACE_STATE for one transform-feedback (stream output)
 * binding on Gen6, where the GS writes SOL data through the binding
 * table.  Returns the surface's offset in the batch's state buffer, or
 * 0 when the output slot is unused or streamout is inactive.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   /* All of the following are in dwords: the SO target's stride, the
    * starting write position, and the end of the writable range.
    */
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   //   assert((size_dwords - offset_dwords) / stride_dwords
   //          <= BRW_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer.  We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow.  But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* SOL writes are raw 32-bit float vectors; pick the format matching
    * the component count.
    */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   /* The GPU writes SOL data here, hence RELOC_WRITE. */
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif
5366 
/* Iterate over the binding-table slots of one surface group, skipping
 * unused entries.  `index` names the loop variable; a local
 * `struct crocus_binding_table *bt` must be in scope at the use site.
 */
#define foreach_surface_used(index, group)                      \
   for (int index = 0; index < bt->sizes[group]; index++)       \
      if (crocus_group_index_to_bti(bt, group, index) !=        \
          CROCUS_SURFACE_NOT_USED)
5371 
/* Emit the surface states for every binding-table slot a shader uses
 * and record their state-buffer offsets in shader->surf_offset.
 *
 * The groups are walked in a fixed order (render targets, RT reads,
 * CS work groups, Gen6 SOL, textures, gather textures, images, UBOs,
 * SSBOs) — this order must match how the binding table was laid out
 * at compile time.  `ff_gs` selects the fixed-function GS program,
 * which has no per-stage shader state (shs == NULL).
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;  /* running binding-table slot index */
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            /* NOTE(review): this inner 'shader' shadows the outer one;
             * here it is always the fragment program.
             */
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            /* Gen4/5 bake the colormask into the surface as per-channel
             * write disables (bit 3 = alpha ... bit 0 = blue).
             */
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers bound: still emit one null surface. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* SOL info lives on the GS if one is bound, otherwise on the VS. */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Separate gather variants of the texture surfaces (for_gather). */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5504 /* ------------------------------------------------------------------- */
5505 static uint32_t
crocus_upload_binding_table(struct crocus_context * ice,struct crocus_batch * batch,uint32_t * table,uint32_t size)5506 crocus_upload_binding_table(struct crocus_context *ice,
5507                             struct crocus_batch *batch,
5508                             uint32_t *table,
5509                             uint32_t size)
5510 
5511 {
5512    if (size == 0)
5513       return 0;
5514    return emit_state(batch, table, size, 32);
5515 }
5516 
5517 /**
5518  * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5519  */
5520 
static void
crocus_update_surface_base_address(struct crocus_batch *batch)
{
   /* Only emit once per batch; stream_state() offsets are all relative
    * to the surface state base set here.
    */
   if (batch->state_base_address_emitted)
      return;
#if GFX_VER >= 6
   uint32_t mocs = batch->screen->isl_dev.mocs.internal;
#endif
   /* SBA changes require surrounding flushes on these platforms. */
   flush_before_state_base_change(batch);

   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {

      /* Both surface state and dynamic state live in batch->state.bo. */
      sba.SurfaceStateBaseAddressModifyEnable = true;
      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);

#if GFX_VER >= 5
      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
#endif

      sba.GeneralStateBaseAddressModifyEnable   = true;
      sba.IndirectObjectBaseAddressModifyEnable = true;
#if GFX_VER >= 5
      sba.InstructionBaseAddressModifyEnable    = true;
#endif

#if GFX_VER < 8
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER >= 5 && GFX_VER < 8
      sba.IndirectObjectAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER <= 5
      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#if GFX_VER >= 6
      /* The hardware appears to pay attention to the MOCS fields even
       * if you don't set the "Address Modify Enable" bit for the base.
       */
      sba.GeneralStateMOCS            = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
#if GFX_VER == 8
      sba.DynamicStateMOCS            = mocs;
      sba.IndirectObjectMOCS          = mocs;
      sba.InstructionMOCS             = mocs;
      sba.SurfaceStateMOCS            = mocs;
      sba.GeneralStateBufferSize   = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.InstructionBufferSize    = 0xfffff;
      sba.DynamicStateBufferSize   = MAX_STATE_SIZE;

      sba.GeneralStateBufferSizeModifyEnable    = true;
      sba.DynamicStateBufferSizeModifyEnable    = true;
      sba.IndirectObjectBufferSizeModifyEnable  = true;
      sba.InstructionBuffersizeModifyEnable     = true;
#endif

      sba.DynamicStateBaseAddressModifyEnable   = true;

      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);

      /* Dynamic state upper bound.  Although the documentation says that
       * programming it to zero will cause it to be ignored, that is a lie.
       * If this isn't programmed to a real bound, the sampler border color
       * pointer is rejected, causing border color to mysteriously fail.
       */
#if GFX_VER < 8
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif

#endif
   }

   flush_after_state_base_change(batch);

   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
    * 3DSTATE_PIPELINE_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
    * 1 part 1 says that the folowing packets must be reissued:
    *
    * 3DSTATE_CC_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * 3DSTATE_SAMPLER_STATE_POINTERS
    * 3DSTATE_VIEWPORT_STATE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * Those are always reissued following SBA updates anyway (new
    * batch time), except in the case of the program cache BO
    * changing.  Having a separate state flag makes the sequence more
    * obvious.
    */
#if GFX_VER <= 5
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#elif GFX_VER == 6
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
#endif
   batch->state_base_address_emitted = true;
}
5625 
5626 static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state * vp,bool halfz,bool window_space_position,float * zmin,float * zmax)5627 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5628                           bool window_space_position, float *zmin, float *zmax)
5629 {
5630    if (window_space_position) {
5631       *zmin = 0.f;
5632       *zmax = 1.f;
5633       return;
5634    }
5635    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
5636 }
5637 
/* Push-constant ranges gathered by setup_constant_buffers() and
 * consumed by emit_push_constant_packets().
 */
struct push_bos {
   struct {
      struct crocus_address addr; /* GPU address of the range's start */
      uint32_t length;            /* read length, in 32-byte units (from brw_ubo_range) */
   } buffers[4];
   int buffer_count;              /* valid entries in buffers[] (<= 4) */
   uint32_t max_length;           /* largest single range length seen */
};
5646 
#if GFX_VER >= 6
/* Gather the shader's push-constant UBO ranges into push_bos.
 *
 * Each brw_ubo_range names a binding-table index and a start/length in
 * 32-byte units; map it back to the bound pipe_constant_buffer and
 * record the resulting GPU address.  Unbound buffers fall back to the
 * context's workaround BO so the hardware reads valid memory.
 */
static void
setup_constant_buffers(struct crocus_context *ice,
                       struct crocus_batch *batch,
                       int stage,
                       struct push_bos *push_bos)
{
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;

   uint32_t push_range_sum = 0;

   int n = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];

      if (range->length == 0)
         continue;

      push_range_sum += range->length;

      if (range->length > push_bos->max_length)
         push_bos->max_length = range->length;

      /* Range block is a binding table index, map back to UBO index. */
      unsigned block_index = crocus_bti_to_group_index(
         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
      assert(block_index != CROCUS_SURFACE_NOT_USED);

      struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
      struct crocus_resource *res = (void *) cbuf->buffer;

      assert(cbuf->buffer_offset % 32 == 0);

      push_bos->buffers[n].length = range->length;
      push_bos->buffers[n].addr =
         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
         : ro_bo(batch->ice->workaround_bo,
                 batch->ice->workaround_offset);
      n++;
   }

   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
    *
    *    "The sum of all four read length fields must be less than or
    *    equal to the size of 64."
    */
   assert(push_range_sum <= 64);

   push_bos->buffer_count = n;
}
5699 
#if GFX_VER == 7
/* Gen7 workaround: emit a pipe control that writes an immediate to the
 * context's workaround BO with a depth stall.  The caller issues this
 * before VS push-constant packets (see emit_push_constant_packets).
 */
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   assert(devinfo->ver == 7);
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif
5715 
/* Emit the 3DSTATE_CONSTANT_* packet for a stage from the ranges
 * collected in push_bos.  The packet is built with the VS template and
 * the stage-specific sub-opcode patched in, so one code path serves
 * every stage.
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;

#if GFX_VER == 7
   /* IVB (not Haswell, not Baytrail) needs a flush before VS constants. */
   if (stage == MESA_SHADER_VERTEX) {
      if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6 has a single constant buffer slot in this packet. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}
5772 
5773 #endif
5774 
/* The depth/stencil fields live in a different packet or state
 * structure per generation; DEPTH_STENCIL_GENXML abstracts over that
 * so set_depth_stencil_bits() can be written once.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
5782 
5783 static inline void
5784 set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
5785 {
5786    struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5787    ds->DepthTestEnable = cso->cso.depth_enabled;
5788    ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
5789    ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
5790 
5791    ds->StencilFailOp = cso->cso.stencil[0].fail_op;
5792    ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
5793    ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
5794    ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
5795 
5796    ds->StencilTestMask = cso->cso.stencil[0].valuemask;
5797    ds->StencilWriteMask = cso->cso.stencil[0].writemask;
5798 
5799    ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
5800    ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
5801    ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
5802    ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
5803 
5804    ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
5805    ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
5806    ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
5807    ds->StencilTestEnable = cso->cso.stencil[0].enabled;
5808    ds->StencilBufferWriteEnable =
5809       cso->cso.stencil[0].writemask != 0 ||
5810       (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
5811 }
5812 
/* Pack one VERTEX_BUFFER_STATE into *map and advance *map past it.
 *
 * start_offset/end_offset bound the usable range within `bo`;
 * step_rate != 0 marks per-instance data on pre-Gen8 (and sets the
 * instance step rate).  end_offset feeds BufferSize on Gen8+ and the
 * inclusive EndAddress on Gen5-7.
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      /* EndAddress is inclusive, hence the -1. */
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5847 
#if GFX_VER >= 6
/* Compute the effective 3DSTATE_SAMPLE_MASK value: the API sample mask
 * trimmed to the framebuffer's sample count, or 0x1 when rendering
 * single-sampled (only sample 0 exists).
 */
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
   const uint32_t samples = ice->state.framebuffer.samples;

   if (samples > 1) {
      const uint32_t coverage_mask = (1u << samples) - 1;
      return ice->state.sample_mask & coverage_mask;
   }
   return 1;
}
#endif
5861 
5862 static void
crocus_upload_dirty_render_state(struct crocus_context * ice,struct crocus_batch * batch,const struct pipe_draw_info * draw)5863 crocus_upload_dirty_render_state(struct crocus_context *ice,
5864                                struct crocus_batch *batch,
5865                                const struct pipe_draw_info *draw)
5866 {
5867    uint64_t dirty = ice->state.dirty;
5868    uint64_t stage_dirty = ice->state.stage_dirty;
5869 
5870    if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5871        !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5872       return;
5873 
5874    if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5875       crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5876          vf.StatisticsEnable = true;
5877       }
5878    }
5879 
5880 #if GFX_VER <= 5
5881    if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5882                       CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5883       bool ret = calculate_curbe_offsets(batch);
5884       if (ret) {
5885          dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5886          stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5887       }
5888    }
5889 
5890    if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5891        stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5892      bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5893                                            brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5894                                            ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5895      if (ret) {
5896 	dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5897 	stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5898      }
5899    }
5900 #endif
5901    if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5902       const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5903       uint32_t cc_vp_address;
5904 
5905       /* XXX: could avoid streaming for depth_clip [0,1] case. */
5906       uint32_t *cc_vp_map =
5907          stream_state(batch,
5908                       4 * ice->state.num_viewports *
5909                       GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5910       for (int i = 0; i < ice->state.num_viewports; i++) {
5911          float zmin, zmax;
5912          crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5913                                  ice->state.window_space_position,
5914                                  &zmin, &zmax);
5915          if (cso_rast->cso.depth_clip_near)
5916             zmin = 0.0;
5917          if (cso_rast->cso.depth_clip_far)
5918             zmax = 1.0;
5919 
5920          crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5921             ccv.MinimumDepth = zmin;
5922             ccv.MaximumDepth = zmax;
5923          }
5924 
5925          cc_vp_map += GENX(CC_VIEWPORT_length);
5926       }
5927 
5928 #if GFX_VER >= 7
5929       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5930          ptr.CCViewportPointer = cc_vp_address;
5931       }
5932 #elif GFX_VER == 6
5933       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5934          vp.CCViewportStateChange = 1;
5935          vp.PointertoCC_VIEWPORT = cc_vp_address;
5936       }
5937 #else
5938       ice->state.cc_vp_address = cc_vp_address;
5939       dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5940 #endif
5941    }
5942 
   /* SF/CLIP viewports: on Gen7+ these are a single combined
    * SF_CLIP_VIEWPORT structure; on older generations SF_VIEWPORT and
    * CLIP_VIEWPORT are separate arrays streamed side by side.
    */
   if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
#if GFX_VER >= 7
      uint32_t sf_cl_vp_address;
      uint32_t *vp_map =
         stream_state(batch,
                      4 * ice->state.num_viewports *
                      GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
#else
      uint32_t *vp_map =
         stream_state(batch,
                      4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
                      32, &ice->state.sf_vp_address);
      uint32_t *clip_map =
         stream_state(batch,
                      4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
                      32, &ice->state.clip_vp_address);
#endif

      for (unsigned i = 0; i < ice->state.num_viewports; i++) {
         const struct pipe_viewport_state *state = &ice->state.viewports[i];
         float gb_xmin, gb_xmax, gb_ymin, gb_ymax;

#if GFX_VER == 8
         /* Gen8 adds per-viewport screen-space extents; compute the
          * viewport's corners from its scale/translate.
          */
         float vp_xmin = viewport_extent(state, 0, -1.0f);
         float vp_xmax = viewport_extent(state, 0,  1.0f);
         float vp_ymin = viewport_extent(state, 1, -1.0f);
         float vp_ymax = viewport_extent(state, 1,  1.0f);
#endif
         intel_calculate_guardband_size(cso_fb->width, cso_fb->height,
                                        state->scale[0], state->scale[1],
                                        state->translate[0], state->translate[1],
                                        &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
#if GFX_VER >= 7
         crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
#else
         crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
#endif
         {
            /* Viewport transform matrix: diagonal scale plus translate row. */
            vp.ViewportMatrixElementm00 = state->scale[0];
            vp.ViewportMatrixElementm11 = state->scale[1];
            vp.ViewportMatrixElementm22 = state->scale[2];
            vp.ViewportMatrixElementm30 = state->translate[0];
            vp.ViewportMatrixElementm31 = state->translate[1];
            vp.ViewportMatrixElementm32 = state->translate[2];
#if GFX_VER < 6
            /* Gen4/5: the scissor rectangle is embedded in SF_VIEWPORT.
             * NOTE(review): always fills from scissor rect 0 regardless of
             * the viewport index i — presumably these gens only support a
             * single viewport/scissor; confirm against the rest of the file.
             */
            struct pipe_scissor_state scissor;
            crocus_fill_scissor_rect(ice, 0, &scissor);
            vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
            vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
            vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
            vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
#endif

#if GFX_VER >= 7
            /* Gen7+: guardband lives in the combined structure. */
            vp.XMinClipGuardband = gb_xmin;
            vp.XMaxClipGuardband = gb_xmax;
            vp.YMinClipGuardband = gb_ymin;
            vp.YMaxClipGuardband = gb_ymax;
#endif
#if GFX_VER == 8
            /* Clamp viewport extents to the framebuffer; Max fields are
             * inclusive, hence the - 1.
             */
            vp.XMinViewPort = MAX2(vp_xmin, 0);
            vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
            vp.YMinViewPort = MAX2(vp_ymin, 0);
            vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
#endif
         }
#if GFX_VER < 7
         /* Pre-Gen7: guardband goes in the separate CLIP_VIEWPORT. */
         crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
            clip.XMinClipGuardband = gb_xmin;
            clip.XMaxClipGuardband = gb_xmax;
            clip.YMinClipGuardband = gb_ymin;
            clip.YMaxClipGuardband = gb_ymax;
         }
#endif
#if GFX_VER >= 7
         vp_map += GENX(SF_CLIP_VIEWPORT_length);
#else
         vp_map += GENX(SF_VIEWPORT_length);
         clip_map += GENX(CLIP_VIEWPORT_length);
#endif
      }
#if GFX_VER >= 7
      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
         ptr.SFClipViewportPointer = sf_cl_vp_address;
      }
#elif GFX_VER == 6
      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
         vp.SFViewportStateChange = 1;
         vp.CLIPViewportStateChange = 1;
         vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
         vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
      }
#endif
      /* Gen4/5: no pointer packet here; the SF/CLIP state addresses saved
       * above are consumed elsewhere (pipelined pointers).
       */
   }
6038 
6039 #if GFX_VER >= 6
   /* URB configuration: partition the Unified Return Buffer between the
    * active geometry stages.  Gen6 has a simple VS/GS split; Gen7+ programs
    * per-stage 3DSTATE_URB_* packets from intel_get_urb_config().
    */
   if (dirty & CROCUS_DIRTY_GEN6_URB) {
#if GFX_VER == 6
      /* A fixed-function GS program (ff_gs_prog) counts as a GS user too. */
      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
         || ice->shaders.ff_gs_prog;

      struct brw_vue_prog_data *vue_prog_data =
         (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
      const unsigned vs_size = vue_prog_data->urb_entry_size;
      /* Default the GS entry size to the VS size when no GS is bound. */
      unsigned gs_size = vs_size;
      if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
         struct brw_vue_prog_data *gs_vue_prog_data =
            (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
         gs_size = gs_vue_prog_data->urb_entry_size;
      }

      genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
#endif
#if GFX_VER >= 7
      const struct intel_device_info *devinfo = &batch->screen->devinfo;
      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
      bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
      unsigned entry_size[4];

      /* Gather VUE entry sizes for VS/HS/DS/GS; unbound stages get a
       * minimal size of 1.
       */
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
         if (!ice->shaders.prog[i]) {
            entry_size[i] = 1;
         } else {
            struct brw_vue_prog_data *vue_prog_data =
               (void *) ice->shaders.prog[i]->prog_data;
            entry_size[i] = vue_prog_data->urb_entry_size;
         }
         assert(entry_size[i] != 0);
      }

      /* If we're just switching between programs with the same URB requirements,
       * skip the rest of the logic.
       */
      bool no_change = false;
      if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
          ice->urb.gs_present == gs_present &&
          ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
          ice->urb.tess_present == tess_present &&
          ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
          ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
         no_change = true;
      }

      if (!no_change) {
         /* Cache the new configuration for the comparison above. */
         ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
         ice->urb.gs_present = gs_present;
         ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
         ice->urb.tess_present = tess_present;
         ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
         ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];

         unsigned entries[4];
         unsigned start[4];
         bool constrained;
         intel_get_urb_config(devinfo,
                              batch->screen->l3_config_3d,
                              tess_present,
                              gs_present,
                              entry_size,
                              entries, start, NULL, &constrained);

#if GFX_VER == 7
         /* IVB (not HSW, not Baytrail) needs a VS workaround flush before
          * the URB is reprogrammed.
          */
         if (GFX_VERx10 < 75 && !devinfo->is_baytrail)
            gen7_emit_vs_workaround_flush(batch);
#endif
         /* 3DSTATE_URB_VS/HS/DS/GS share a layout; bump the sub-opcode to
          * emit each stage's packet from the VS template.
          */
         for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
            crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
               urb._3DCommandSubOpcode += i;
               urb.VSURBStartingAddress     = start[i];
               urb.VSURBEntryAllocationSize = entry_size[i] - 1;
               urb.VSNumberofURBEntries     = entries[i];
            }
         }
      }
#endif
   }
6120 
   /* BLEND_STATE: stream per-render-target blend entries.  On Gen8+ a
    * BLEND_STATE header precedes the entries and holds the shared bits;
    * on Gen6/7 there is no header, so the shared bits are written into
    * every entry instead — the `#define be entry` alias below lets the
    * same field assignments serve both layouts.
    */
   if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
      struct crocus_blend_state *cso_blend = ice->state.cso_blend;
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;

      STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
      int rt_dwords =
         MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
#if GFX_VER >= 8
      rt_dwords += GENX(BLEND_STATE_length);
#endif
      uint32_t blend_offset;
      uint32_t *blend_map =
         stream_state(batch,
                      4 * rt_dwords, 64, &blend_offset);

#if GFX_VER >= 8
   struct GENX(BLEND_STATE) be = { 0 };
   {
#else
   /* Pre-Gen8: loop over all RTs here; `be` aliases the per-RT entry. */
   for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#define be entry
#endif

      /* Shared (non-per-RT) blend bits. */
      be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
      be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
      be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
      be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
      be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
      be.ColorDitherEnable = cso_blend->cso.dither;

#if GFX_VER >= 8
      /* Gen8+: the per-RT loop lives here, after the shared bits. */
      for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#else
      {
#endif
         /* Without independent blend, every RT uses rt[0]'s state. */
         const struct pipe_rt_blend_state *rt =
            &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];

         /* Accumulate: any RT with distinct alpha blending sets the flag. */
         be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
            be.IndependentAlphaBlendEnable;

         if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
            entry.LogicOpEnable = cso_blend->cso.logicop_enable;
            entry.LogicOpFunction = cso_blend->cso.logicop_func;
         }

         entry.ColorClampRange = COLORCLAMP_RTFORMAT;
         entry.PreBlendColorClampEnable = true;
         entry.PostBlendColorClampEnable = true;

         entry.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
         entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
         entry.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
         entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);

#if GFX_VER >= 8
         /* Entries follow the one-dword... rather, the BLEND_STATE header;
          * offset 1 skips it (header packed below, after the loop).
          */
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
#else
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
#endif
      }
   }
#if GFX_VER >= 8
   GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
#endif
#if GFX_VER < 7
      /* Gen6: blend pointer rides the combined CC state pointers packet. */
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.PointertoBLEND_STATE = blend_offset;
         ptr.BLEND_STATEChange = true;
      }
#else
      crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
         ptr.BlendStatePointer = blend_offset;
#if GFX_VER >= 8
         ptr.BlendStatePointerValid = true;
#endif
      }
#endif
   }
6203 #endif
6204 
6205    if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6206       struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6207       UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6208       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6209       uint32_t cc_offset;
6210       void *cc_map =
6211          stream_state(batch,
6212                       sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6213                       64, &cc_offset);
6214 #if GFX_VER <= 5
6215       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6216 #endif
6217       _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6218          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6219          cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6220 
6221 #if GFX_VER <= 5
6222 
6223          set_depth_stencil_bits(ice, &cc);
6224 
6225          if (cso_blend->cso.logicop_enable) {
6226             if (can_emit_logic_op(ice)) {
6227                cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6228                cc.LogicOpFunction = cso_blend->cso.logicop_func;
6229             }
6230          }
6231          cc.ColorDitherEnable = cso_blend->cso.dither;
6232 
6233          cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6234 
6235          if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6236             cc.AlphaTestEnable = cso->cso.alpha_enabled;
6237             cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6238          }
6239          cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6240          cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6241 #else
6242          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6243          cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6244 
6245          cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
6246          cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6247          cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
6248          cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6249 #endif
6250          cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6251          cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6252       }
6253       ice->shaders.cc_offset = cc_offset;
6254 #if GFX_VER >= 6
6255       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6256          ptr.ColorCalcStatePointer = cc_offset;
6257 #if GFX_VER != 7
6258          ptr.ColorCalcStatePointerValid = true;
6259 #endif
6260       }
6261 #endif
6262    }
#if GFX_VER <= 5
   /* Gen4/5: blend constant color has its own non-pipelined packet. */
   if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
      crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
         blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
         blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
         blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
         blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
      }
   }
#endif
   /* Upload push constants for each stage whose constants are dirty.
    * Gen7+ emits 3DSTATE_CONSTANT_* packets directly; Gen4/5 instead fold
    * constants into the CURBE, so only the dirty flag is raised here.
    */
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
         continue;

      struct crocus_shader_state *shs = &ice->state.shaders[stage];
      struct crocus_compiled_shader *shader = ice->shaders.prog[stage];

      if (!shader)
         continue;

      /* Refresh system-value constants (if any) before building buffers. */
      if (shs->sysvals_need_upload)
         upload_sysvals(ice, stage);

#if GFX_VER <= 5
      dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
#if GFX_VER >= 7
      struct push_bos push_bos = {};
      setup_constant_buffers(ice, batch, stage, &push_bos);

      emit_push_constant_packets(ice, batch, stage, &push_bos);
#endif
   }
6296 
   /* Rebuild and upload binding tables for stages with dirty bindings.
    * Gen7+ points at each table with a per-stage packet (sub-opcode bumped
    * from the VS variant); Gen4-6 defer to the combined pointers packet via
    * the GEN5_BINDING_TABLE_POINTERS dirty bit.
    */
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
         if (ice->shaders.prog[stage]) {
#if GFX_VER <= 6
            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#endif
            crocus_populate_binding_table(ice, batch, stage, false);
            ice->shaders.prog[stage]->bind_bo_offset =
               crocus_upload_binding_table(ice, batch,
                                           ice->shaders.prog[stage]->surf_offset,
                                           ice->shaders.prog[stage]->bt.size_bytes);

#if GFX_VER >= 7
            crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
               /* 38 is the VS sub-opcode; + stage selects HS/DS/GS/PS. */
               ptr._3DCommandSubOpcode = 38 + stage;
               ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
            }
#endif
#if GFX_VER == 6
         /* Gen6 fixed-function GS (no user GS program) needs a table too. */
         } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
            crocus_populate_binding_table(ice, batch, stage, true);
            ice->shaders.ff_gs_prog->bind_bo_offset =
               crocus_upload_binding_table(ice, batch,
                                           ice->shaders.ff_gs_prog->surf_offset,
                                           ice->shaders.ff_gs_prog->bt.size_bytes);
#endif
         }
      }
   }
#if GFX_VER <= 6
   /* Gen4-6: one combined packet carries the VS/GS/PS binding table
    * pointers.  A fixed-function GS program substitutes for a missing
    * user GS.
    */
   if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
      struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
      if (gs == NULL)
         gs = ice->shaders.ff_gs_prog;
      crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
         ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
         ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
#if GFX_VER == 6
         ptr.VSBindingTableChange = true;
         ptr.PSBindingTableChange = true;
         ptr.GSBindingTableChange = gs ? true : false;
         ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
#endif
      }
   }
#endif
6344 
   /* Upload sampler state tables for stages with dirty samplers.  Gen7+
    * points at each table with a per-stage packet; Gen6 collects all the
    * stage pointers into one combined packet emitted afterwards (hence the
    * sampler_updates accumulator, which is also forced by the global
    * GEN6_SAMPLER_STATE_POINTERS dirty bit).
    */
   bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
          !ice->shaders.prog[stage])
         continue;

      crocus_upload_sampler_states(ice, batch, stage);

      sampler_updates = true;

#if GFX_VER >= 7
      struct crocus_shader_state *shs = &ice->state.shaders[stage];

      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
         /* 43 is the VS sub-opcode; + stage selects HS/DS/GS/PS. */
         ptr._3DCommandSubOpcode = 43 + stage;
         ptr.PointertoVSSamplerState = shs->sampler_offset;
      }
#endif
   }

   if (sampler_updates) {
#if GFX_VER == 6
      struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
      struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
      struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
         /* Only flag the change bit for stages that are actually bound and
          * whose samplers (or the global pointer state) are dirty.
          */
         if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
            ptr.VSSamplerStateChange = true;
            ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
         }
         if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
            ptr.GSSamplerStateChange = true;
            ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
         }
         if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
            ptr.PSSamplerStateChange = true;
            ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
         }
      }
#endif
   }
6392 
#if GFX_VER >= 6
   /* Multisample state: pixel location, sample count, and the standard
    * sample positions for the current sample count.
    */
   if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
      crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
         ms.PixelLocation =
            ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
         /* Field is log2(samples); ffs(samples) - 1 computes it. */
         if (ice->state.framebuffer.samples > 0)
            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
#if GFX_VER == 6
         /* Gen6 only supports 4x; positions are fixed. */
         INTEL_SAMPLE_POS_4X(ms.Sample);
#elif GFX_VER == 7
         switch (ice->state.framebuffer.samples) {
         case 1:
            INTEL_SAMPLE_POS_1X(ms.Sample);
            break;
         case 2:
            INTEL_SAMPLE_POS_2X(ms.Sample);
            break;
         case 4:
            INTEL_SAMPLE_POS_4X(ms.Sample);
            break;
         case 8:
            INTEL_SAMPLE_POS_8X(ms.Sample);
            break;
         default:
            break;
         }
#endif
      }
   }

   if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
         ms.SampleMask = determine_sample_mask(ice);
      }
   }
#endif
6429 
#if GFX_VER >= 7
   /* Fragment shader state: 3DSTATE_PS programs the kernel start pointers
    * and GRF start registers for the SIMD8/16/32 dispatch variants, plus
    * scratch space.  Gen8 splits the non-dispatch bits into 3DSTATE_PS_EXTRA.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
      struct brw_stage_prog_data *prog_data = shader->prog_data;
      struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;

      crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {

         /* Initialize the execution mask with VMask.  Otherwise, derivatives are
          * incorrect for subspans where some of the pixels are unlit.  We believe
          * the bit just didn't take effect in previous generations.
          */
         ps.VectorMaskEnable = GFX_VER >= 8;

         ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
         ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
         ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;

         /* One GRF start register and kernel pointer per dispatch slot. */
         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

         ps.KernelStartPointer0 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
         ps.KernelStartPointer1 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
         ps.KernelStartPointer2 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);

#if GFX_VERx10 == 75
         /* Haswell: sample mask lives in 3DSTATE_PS. */
         ps.SampleMask = determine_sample_mask(ice);
#endif
         // XXX: WABTPPrefetchDisable, see above, drop at C0
         ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
         ps.FloatingPointMode = prog_data->use_alt_mode;
#if GFX_VER >= 8
         ps.MaximumNumberofThreadsPerPSD = 64 - 2;
#else
         ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
#endif

         ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;

#if GFX_VER < 8
         /* On Gen8 these bits move to 3DSTATE_PS_EXTRA (below). */
         ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
         ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
         ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
#endif
         /* From the documentation for this packet:
          * "If the PS kernel does not need the Position XY Offsets to
          *  compute a Position Value, then this field should be programmed
          *  to POSOFFSET_NONE."
          *
          * "SW Recommendation: If the PS kernel needs the Position Offsets
          *  to compute a Position XY value, this field should match Position
          *  ZW Interpolation Mode to ensure a consistent position.xyzw
          *  computation."
          *
          * We only require XY sample offsets. So, this recommendation doesn't
          * look useful at the moment.  We might need this in future.
          */
         ps.PositionXYOffsetSelect =
            wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;

         if (wm_prog_data->base.total_scratch) {
            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
            /* Field encodes power-of-two scratch per thread, biased by 2KB. */
            ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
            ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }
      }
#if GFX_VER == 8
      const struct shader_info *fs_info =
         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
      crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
         psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
         psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
         psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
         psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
         psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;

         /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
         if (wm_prog_data->uses_sample_mask)
            psx.PixelShaderUsesInputCoverageMask = true;

         psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;

         /* The stricter cross-primitive coherency guarantees that the hardware
          * gives us with the "Accesses UAV" bit set for at least one shader stage
          * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
          * are redundant within the current image, atomic counter and SSBO GL
          * APIs, which all have very loose ordering and coherency requirements
          * and generally rely on the application to insert explicit barriers when
          * a shader invocation is expected to see the memory writes performed by
          * the invocations of some previous primitive.  Regardless of the value
          * of "UAV coherency required", the "Accesses UAV" bits will implicitly
          * cause an in most cases useless DC flush when the lowermost stage with
          * the bit set finishes execution.
          *
          * It would be nice to disable it, but in some cases we can't because on
          * Gfx8+ it also has an influence on rasterization via the PS UAV-only
          * signal (which could be set independently from the coherency mechanism
          * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
          * determine whether the hardware skips execution of the fragment shader
          * or not via the ThreadDispatchEnable signal.  However if we know that
          * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
          * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
          * difference so we may just disable it here.
          *
          * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
          * take into account KillPixels when no depth or stencil writes are
          * enabled.  In order for occlusion queries to work correctly with no
          * attachments, we need to force-enable here.
          *
          */
         if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
             !(has_writeable_rt(ice->state.cso_blend, fs_info)))
            psx.PixelShaderHasUAV = true;
      }
#endif
   }
#endif
6556 
#if GFX_VER >= 7
   /* Transform feedback (streamout).  When active: program the four SO
    * buffer slots, re-emit the SO declaration list, and merge the dynamic
    * 3DSTATE_STREAMOUT bits into the precomputed packet.  When inactive:
    * emit a zeroed 3DSTATE_STREAMOUT to disable the unit.
    */
   if (ice->state.streamout_active) {
      if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
         for (int i = 0; i < 4; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];

            /* Unbound slot: emit a packet with just the index to clear it. */
            if (!tgt) {
               crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
                  sob.SOBufferIndex = i;
               }
               continue;
            }
            struct crocus_resource *res = (void *) tgt->base.buffer;
            uint32_t start = tgt->base.buffer_offset;
#if GFX_VER < 8
            uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
#endif
            crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
               sob.SOBufferIndex = i;

               sob.SurfaceBaseAddress = rw_bo(res->bo, start);
#if GFX_VER < 8
               sob.SurfacePitch = tgt->stride;
               sob.SurfaceEndAddress = rw_bo(res->bo, end);
#else
               /* Gen8: the hardware tracks the write offset in a side
                * buffer (offset_res).
                */
               sob.SOBufferEnable = true;
               sob.StreamOffsetWriteEnable = true;
               sob.StreamOutputBufferOffsetAddressEnable = true;
               sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);

               sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
               sob.StreamOutputBufferOffsetAddress =
                  rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
               /* 0xFFFFFFFF appears to tell the HW to keep the saved
                * offset rather than resetting it; 0 restarts the buffer.
                */
               if (tgt->zero_offset) {
                  sob.StreamOffset = 0;
                  tgt->zero_offset = false;
               } else
                  sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
#endif
            }
         }
      }

      if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
         /* The decl list follows the STREAMOUT packet in the saved blob;
          * its own dword 0 low byte encodes its length.
          */
         uint32_t *decl_list =
            ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
         crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
      }

      if (dirty & CROCUS_DIRTY_STREAMOUT) {
         const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

         /* Pack only the rasterizer-dependent bits here, then OR them into
          * the precomputed packet with crocus_emit_merge().
          */
         uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
         crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
            sol.SOFunctionEnable = true;
            sol.SOStatisticsEnable = true;

            /* Keep rendering if a primitives-generated query needs it. */
            sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
                                   !ice->state.prims_generated_query_active;
            sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
         }

         assert(ice->state.streamout);

         crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
                         GENX(3DSTATE_STREAMOUT_length));
      }
   } else {
      if (dirty & CROCUS_DIRTY_STREAMOUT) {
         crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
      }
   }
#endif
#if GFX_VER == 6
   /* Gen6 streamout uses SVBI (streamed vertex buffer index) state instead
    * of the Gen7+ SO buffer packets.
    */
   if (ice->state.streamout_active) {
      if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
         crocus_emit_so_svbi(ice);
      }
   }
#endif
6638 
6639    if (dirty & CROCUS_DIRTY_CLIP) {
6640 #if GFX_VER < 6
6641       const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6642       struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6643 
6644       uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6645       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6646       _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6647          clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6648          clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6649          clip.SingleProgramFlow = true;
6650          clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6651 
6652          clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6653          clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6654 
6655          clip.DispatchGRFStartRegisterForURBData = 1;
6656          clip.VertexURBEntryReadOffset = 0;
6657          clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6658 
6659          clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6660          clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6661 
6662          if (batch->ice->urb.nr_clip_entries >= 10) {
6663             /* Half of the URB entries go to each thread, and it has to be an
6664              * even number.
6665              */
6666             assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6667 
6668             /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6669              * only 2 threads can output VUEs at a time.
6670              */
6671             clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6672          } else {
6673             assert(batch->ice->urb.nr_clip_entries >= 5);
6674             clip.MaximumNumberofThreads = 1 - 1;
6675          }
6676          clip.VertexPositionSpace = VPOS_NDCSPACE;
6677          clip.UserClipFlagsMustClipEnable = true;
6678          clip.GuardbandClipTestEnable = true;
6679 
6680          clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6681          clip.ScreenSpaceViewportXMin = -1.0;
6682          clip.ScreenSpaceViewportXMax = 1.0;
6683          clip.ScreenSpaceViewportYMin = -1.0;
6684          clip.ScreenSpaceViewportYMax = 1.0;
6685          clip.ViewportXYClipTestEnable = true;
6686          clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6687 
6688 #if GFX_VER == 5 || GFX_VERx10 == 45
6689          clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6690 #else
6691          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6692           * workaround.
6693           */
6694          clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6695 #endif
6696 
6697          clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6698          clip.GuardbandClipTestEnable = true;
6699 
6700          clip.ClipMode = clip_prog_data->clip_mode;
6701 #if GFX_VERx10 == 45
6702          clip.NegativeWClipTestEnable = true;
6703 #endif
6704       }
6705 
6706 #else //if GFX_VER >= 6
6707       struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6708       const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
6709       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6710       bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6711                        ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6712       bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6713          (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6714                     : ice->state.prim_is_points_or_lines);
6715       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6716       crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6717          cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6718          if (cso_rast->cso.rasterizer_discard)
6719             cl.ClipMode = CLIPMODE_REJECT_ALL;
6720          else if (ice->state.window_space_position)
6721             cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6722          else
6723             cl.ClipMode = CLIPMODE_NORMAL;
6724 
6725          cl.PerspectiveDivideDisable = ice->state.window_space_position;
6726          cl.ViewportXYClipTestEnable = !points_or_lines;
6727 
6728          cl.UserClipDistanceCullTestEnableBitmask =
6729             brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6730 
6731          if (wm_prog_data->barycentric_interp_modes &
6732              BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
6733             cl.NonPerspectiveBarycentricEnable = true;
6734 
6735          cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6736          cl.MaximumVPIndex = ice->state.num_viewports - 1;
6737       }
6738       crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6739                       ARRAY_SIZE(cso_rast->clip));
6740 #endif
6741    }
6742 
6743    if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6744       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6745       const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6746       const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
6747 #if GFX_VER == 7
6748       if (batch->screen->devinfo.is_ivybridge)
6749          gen7_emit_vs_workaround_flush(batch);
6750 #endif
6751 
6752 
6753 #if GFX_VER == 6
6754       struct push_bos push_bos = {};
6755       setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6756 
6757       emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6758 #endif
6759 #if GFX_VER >= 6
6760       crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6761 #else
6762       uint32_t *vs_ptr = stream_state(batch,
6763                                       GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6764       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6765       _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6766 #endif
6767       {
6768          INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6769 
6770          vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6771 
6772 #if GFX_VER < 6
6773          vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6774          vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6775          vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6776 
6777          vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6778          vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6779 
6780          vs.MaximumNumberofThreads =
6781             CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6782          vs.StatisticsEnable = false;
6783          vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6784 #endif
6785 #if GFX_VER == 5
6786          /* Force single program flow on Ironlake.  We cannot reliably get
6787           * all applications working without it.  See:
6788           * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6789           *
6790           * The most notable and reliably failing application is the Humus
6791           * demo "CelShading"
6792           */
6793          vs.SingleProgramFlow = true;
6794          vs.SamplerCount = 0; /* hardware requirement */
6795 
6796 #endif
6797 #if GFX_VER >= 8
6798          vs.SIMD8DispatchEnable =
6799             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6800 
6801          vs.UserClipDistanceCullTestEnableBitmask =
6802             vue_prog_data->cull_distance_mask;
6803 #endif
6804       }
6805 
6806 #if GFX_VER == 6
6807       crocus_emit_pipe_control_flush(batch,
6808                                      "post VS const",
6809                                      PIPE_CONTROL_DEPTH_STALL |
6810                                      PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6811                                      PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6812 #endif
6813    }
6814 
6815    if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6816       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6817       bool active = GFX_VER >= 6 && shader;
6818 #if GFX_VER == 6
6819       struct push_bos push_bos = {};
6820       if (shader)
6821          setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6822 
6823       emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6824 #endif
6825 #if GFX_VERx10 == 70
6826    /**
6827     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6828     * Geometry > Geometry Shader > State:
6829     *
6830     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
6831     *     whole fixed function pipeline when the GS enable changes value in
6832     *     the 3DSTATE_GS."
6833     *
6834     * The hardware architects have clarified that in this context "flush the
6835     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6836     * Stall" bit set.
6837     */
6838    if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6839       gen7_emit_cs_stall_flush(batch);
6840 #endif
6841 #if GFX_VER >= 6
6842       crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6843 #else
6844       uint32_t *gs_ptr = stream_state(batch,
6845                                       GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6846       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6847       _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6848 #endif
6849      {
6850 #if GFX_VER >= 6
6851          if (active) {
6852             const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
6853             const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6854             const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;
6855 
6856             INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6857 #if GFX_VER >= 7
6858             gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6859             gs.OutputTopology = gs_prog_data->output_topology;
6860             gs.ControlDataHeaderSize =
6861                gs_prog_data->control_data_header_size_hwords;
6862 
6863             gs.InstanceControl = gs_prog_data->invocations - 1;
6864             gs.DispatchMode = vue_prog_data->dispatch_mode;
6865 
6866             gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6867 
6868             gs.ControlDataFormat = gs_prog_data->control_data_format;
6869 #endif
6870 
6871             /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6872              * Ivy Bridge and Haswell.
6873              *
6874              * On Ivy Bridge, setting this bit causes the vertices of a triangle
6875              * strip to be delivered to the geometry shader in an order that does
6876              * not strictly follow the OpenGL spec, but preserves triangle
6877              * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
6878              * the geometry shader sees triangles:
6879              *
6880              * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6881              *
6882              * (Clearing the bit is even worse, because it fails to preserve
6883              * orientation).
6884              *
             * Triangle strips with adjacency are always ordered in a way that
             * preserves triangle orientation but does not strictly follow the
             * OpenGL spec, regardless of the setting of this bit.
6888              *
6889              * On Haswell, both triangle strips and triangle strips with adjacency
6890              * are always ordered in a way that preserves triangle orientation.
6891              * Setting this bit causes the ordering to strictly follow the OpenGL
6892              * spec.
6893              *
6894              * So in either case we want to set the bit.  Unfortunately on Ivy
6895              * Bridge this will get the order close to correct but not perfect.
6896              */
6897             gs.ReorderMode = TRAILING;
6898             gs.MaximumNumberofThreads =
6899                GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6900                (batch->screen->devinfo.max_gs_threads - 1);
6901 #if GFX_VER < 7
6902             gs.SOStatisticsEnable = true;
6903             if (gs_prog_data->num_transform_feedback_bindings)
6904                gs.SVBIPayloadEnable = ice->state.streamout_active;
6905 
6906             /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
6907              * was previously done for gen6.
6908              *
6909              * TODO: test with both disabled to see if the HW is behaving
6910              * as expected, like in gen7.
6911              */
6912             gs.SingleProgramFlow = true;
6913             gs.VectorMaskEnable = true;
6914 #endif
6915 #if GFX_VER >= 8
6916             gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6917 
6918             if (gs_prog_data->static_vertex_count != -1) {
6919                gs.StaticOutput = true;
6920                gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6921             }
6922             gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6923 
6924             gs.UserClipDistanceCullTestEnableBitmask =
6925                vue_prog_data->cull_distance_mask;
6926 
6927             const int urb_entry_write_offset = 1;
6928             const uint32_t urb_entry_output_length =
6929                DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6930                urb_entry_write_offset;
6931 
6932             gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6933             gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6934 #endif
6935          }
6936 #endif
6937 #if GFX_VER <= 6
6938          if (!active && ice->shaders.ff_gs_prog) {
6939             const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6940             /* In gen6, transform feedback for the VS stage is done with an
6941              * ad-hoc GS program. This function provides the needed 3DSTATE_GS
6942              * for this.
6943              */
6944             gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6945             gs.SingleProgramFlow = true;
6946             gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6947             gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6948 
6949 #if GFX_VER <= 5
6950             gs.GRFRegisterCount =
6951                DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6952             /* BRW_NEW_URB_FENCE */
6953             gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6954             gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6955             gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6956             gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6957 #else
6958             gs.Enable = true;
6959             gs.VectorMaskEnable = true;
6960             gs.SVBIPayloadEnable = true;
6961             gs.SVBIPostIncrementEnable = true;
6962             gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6963             gs.SOStatisticsEnable = true;
6964             gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6965 #endif
6966          }
6967 #endif
6968          if (!active && !ice->shaders.ff_gs_prog) {
6969 #if GFX_VER < 8
6970             gs.DispatchGRFStartRegisterForURBData = 1;
6971 #if GFX_VER >= 7
6972             gs.IncludeVertexHandles = true;
6973 #endif
6974 #endif
6975          }
6976 #if GFX_VER >= 6
6977          gs.StatisticsEnable = true;
6978 #endif
6979 #if GFX_VER == 5 || GFX_VER == 6
6980          gs.RenderingEnabled = true;
6981 #endif
6982 #if GFX_VER <= 5
6983          gs.MaximumVPIndex = ice->state.num_viewports - 1;
6984 #endif
6985       }
6986       ice->state.gs_enabled = active;
6987    }
6988 
6989 #if GFX_VER >= 7
6990    if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6991       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
6992 
6993       if (shader) {
6994          const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
6995          const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6996          const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;
6997 
6998          crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
6999             INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
7000             hs.InstanceCount = tcs_prog_data->instances - 1;
7001             hs.IncludeVertexHandles = true;
7002             hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7003          }
7004       } else {
7005          crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7006       }
7007 
7008    }
7009 
7010    if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7011       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7012       if (shader) {
7013          const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
7014          const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
7015          const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;
7016 
7017          crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7018             te.Partitioning = tes_prog_data->partitioning;
7019             te.OutputTopology = tes_prog_data->output_topology;
7020             te.TEDomain = tes_prog_data->domain;
7021             te.TEEnable = true;
7022             te.MaximumTessellationFactorOdd = 63.0;
7023             te.MaximumTessellationFactorNotOdd = 64.0;
7024          };
7025          crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7026             INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7027 
7028             ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7029             ds.ComputeWCoordinateEnable =
7030                tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
7031 
7032 #if GFX_VER >= 8
7033             if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7034                ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7035             ds.UserClipDistanceCullTestEnableBitmask =
7036                vue_prog_data->cull_distance_mask;
7037 #endif
7038          };
7039       } else {
7040          crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7041          crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7042       }
7043    }
7044 #endif
7045    if (dirty & CROCUS_DIRTY_RASTER) {
7046 
7047 #if GFX_VER < 6
7048       const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7049       struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7050       uint32_t *sf_ptr = stream_state(batch,
7051                                       GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7052       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7053       _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7054          sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7055          sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7056          sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7057          sf.DispatchGRFStartRegisterForURBData = 3;
7058          sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
7059          sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7060          sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7061          sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7062          sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7063 
7064          sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7065 
7066          sf.MaximumNumberofThreads =
7067             MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7068 
7069          sf.SpritePointEnable = cso_state->point_quad_rasterization;
7070          sf.DestinationOriginHorizontalBias = 0.5;
7071          sf.DestinationOriginVerticalBias = 0.5;
7072 
7073 	 sf.LineEndCapAntialiasingRegionWidth =
7074             cso_state->line_smooth ? _10pixels : _05pixels;
7075          sf.LastPixelEnable = cso_state->line_last_pixel;
7076          sf.AntialiasingEnable = cso_state->line_smooth;
7077 
7078          sf.LineWidth = get_line_width(cso_state);
7079          sf.PointWidth = cso_state->point_size;
7080          sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7081 #if GFX_VERx10 >= 45
7082          sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7083 #endif
7084          sf.ViewportTransformEnable = true;
7085          sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7086          sf.ScissorRectangleEnable = true;
7087          sf.CullMode = translate_cull_mode(cso_state->cull_face);
7088 
7089          if (cso_state->flatshade_first) {
7090             sf.TriangleFanProvokingVertexSelect = 1;
7091          } else {
7092             sf.TriangleStripListProvokingVertexSelect = 2;
7093             sf.TriangleFanProvokingVertexSelect = 2;
7094             sf.LineStripListProvokingVertexSelect = 1;
7095          }
7096       }
7097 #else
7098       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7099       uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7100       crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7101          sf.ViewportTransformEnable = !ice->state.window_space_position;
7102 
7103 #if GFX_VER == 6
7104          const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7105          uint32_t urb_entry_read_length;
7106          uint32_t urb_entry_read_offset;
7107          uint32_t point_sprite_enables;
7108          calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7109                                   &urb_entry_read_length,
7110                                   &urb_entry_read_offset);
7111          sf.VertexURBEntryReadLength = urb_entry_read_length;
7112          sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7113          sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7114          sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7115          sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7116 #endif
7117 
7118 #if GFX_VER >= 6 && GFX_VER < 8
7119          if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7120             sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7121 #endif
7122 #if GFX_VER == 7
7123          if (ice->state.framebuffer.zsbuf) {
7124             struct crocus_resource *zres, *sres;
7125                crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7126                                                   ice->state.framebuffer.zsbuf->texture,
7127                                                   &zres, &sres);
7128             /* ANV thinks that the stencil-ness doesn't matter, this is just
7129              * about handling polygon offset scaling.
7130              */
7131             sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7132          }
7133 #endif
7134       }
7135       crocus_emit_merge(batch, cso->sf, dynamic_sf,
7136                       ARRAY_SIZE(dynamic_sf));
7137 #if GFX_VER == 8
7138       crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7139 #endif
7140 #endif
7141    }
7142 
7143    if (dirty & CROCUS_DIRTY_WM) {
7144       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7145       const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7146       UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
7147       UNUSED const struct shader_info *fs_info =
7148          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7149 
7150 #if GFX_VER == 6
7151       struct push_bos push_bos = {};
7152       setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7153 
7154       emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7155 #endif
7156 #if GFX_VER >= 6
7157       crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7158 #else
7159       uint32_t *wm_ptr = stream_state(batch,
7160                                       GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7161 
7162       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7163 
7164       _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7165 #endif
7166      {
7167 #if GFX_VER <= 6
7168          wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7169          wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7170          wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7171 #endif
7172 #if GFX_VER == 4
7173       /* On gen4, we only have one shader kernel */
7174          if (brw_wm_state_has_ksp(wm, 0)) {
7175             wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7176             wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7177             wm.DispatchGRFStartRegisterForConstantSetupData0 =
7178                wm_prog_data->base.dispatch_grf_start_reg;
7179          }
7180 #elif GFX_VER == 5
7181          wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7182             brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7183          wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7184             brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7185          wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7186             brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7187 
7188          wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7189          wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7190          wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7191 
7192          wm.DispatchGRFStartRegisterForConstantSetupData0 =
7193             wm_prog_data->base.dispatch_grf_start_reg;
7194 #elif GFX_VER == 6
7195          wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7196             brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7197          wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7198             brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7199          wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7200             brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7201 
7202          wm.DispatchGRFStartRegisterForConstantSetupData0 =
7203            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7204          wm.DispatchGRFStartRegisterForConstantSetupData1 =
7205            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7206          wm.DispatchGRFStartRegisterForConstantSetupData2 =
7207            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7208 #endif
7209 #if GFX_VER <= 5
7210          wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7211          wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7212          wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7213          wm.SetupURBEntryReadOffset = 0;
7214          wm.EarlyDepthTestEnable = true;
7215          wm.LineAntialiasingRegionWidth = _05pixels;
7216          wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7217          wm.DepthCoefficientURBReadOffset = 1;
7218 
7219          if (cso->cso.offset_tri) {
7220             wm.GlobalDepthOffsetEnable = true;
7221 
7222          /* Something weird going on with legacy_global_depth_bias,
7223           * offset_constant, scaling and MRD.  This value passes glean
          * but gives some odd results elsewhere (e.g. the
7225           * quad-offset-units test).
7226           */
7227             wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7228             wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7229          }
7230          wm.SamplerStatePointer = ro_bo(batch->state.bo,
7231                                         ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7232 #endif
7233 
7234          wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7235             ice->state.statistics_counters_enabled : 0;
7236 
7237 #if GFX_VER >= 6
7238          wm.LineAntialiasingRegionWidth = _10pixels;
7239          wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7240 
7241          wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7242          wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7243 #endif
7244 #if GFX_VER == 6
7245       wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7246          ice->state.cso_blend->dual_color_blending;
7247       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7248       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7249 
7250       /* From the SNB PRM, volume 2 part 1, page 281:
7251        * "If the PS kernel does not need the Position XY Offsets
7252        * to compute a Position XY value, then this field should be
7253        * programmed to POSOFFSET_NONE."
7254        *
7255        * "SW Recommendation: If the PS kernel needs the Position Offsets
7256        * to compute a Position XY value, this field should match Position
7257        * ZW Interpolation Mode to ensure a consistent position.xyzw
7258        * computation."
7259        * We only require XY sample offsets. So, this recommendation doesn't
7260        * look useful at the moment. We might need this in future.
7261        */
7262       if (wm_prog_data->uses_pos_offset)
7263          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7264       else
7265          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7266 #endif
7267          wm.LineStippleEnable = cso->cso.line_stipple_enable;
7268          wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7269 
7270 #if GFX_VER < 7
7271          if (wm_prog_data->base.use_alt_mode)
7272             wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7273          wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7274          wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7275 #endif
7276 
7277 #if GFX_VER < 8
7278 #if GFX_VER >= 6
7279          wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7280 
7281          struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7282          if (fb->samples > 1) {
7283             if (cso->cso.multisample)
7284                wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7285             else
7286                wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7287 
7288             if (wm_prog_data->persample_dispatch)
7289                wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7290             else
7291                wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7292          } else {
7293             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7294             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7295          }
7296 #endif
7297 
7298          wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7299 
7300          if (wm_prog_data->uses_kill ||
7301              ice->state.cso_zsa->cso.alpha_enabled ||
7302              ice->state.cso_blend->cso.alpha_to_coverage ||
7303              (GFX_VER >= 6 && wm_prog_data->uses_omask))
7304             wm.PixelShaderKillsPixel = true;
7305 
7306          if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7307              writes_depth || wm.PixelShaderKillsPixel ||
7308              (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7309             wm.ThreadDispatchEnable = true;
7310 
7311 #if GFX_VER >= 7
7312          wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7313          wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7314 #else
7315          if (wm_prog_data->base.total_scratch) {
7316             struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7317                                                             MESA_SHADER_FRAGMENT);
7318             wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7319             wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7320          }
7321 
7322          wm.PixelShaderComputedDepth = writes_depth;
7323 
7324 #endif
7325          /* The "UAV access enable" bits are unnecessary on HSW because they only
7326           * seem to have an effect on the HW-assisted coherency mechanism which we
7327           * don't need, and the rasterization-related UAV_ONLY flag and the
7328           * DISPATCH_ENABLE bit can be set independently from it.
7329           * C.f. gen8_upload_ps_extra().
7330           *
7331           * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7332           * _NEW_COLOR
7333           */
7334 #if GFX_VERx10 == 75
7335          if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7336              wm_prog_data->has_side_effects)
7337             wm.PSUAVonly = ON;
7338 #endif
7339 #endif
7340 #if GFX_VER >= 7
7341       /* BRW_NEW_FS_PROG_DATA */
7342          if (wm_prog_data->early_fragment_tests)
7343            wm.EarlyDepthStencilControl = EDSC_PREPS;
7344          else if (wm_prog_data->has_side_effects)
7345            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7346 #endif
7347 #if GFX_VER == 8
7348          /* We could skip this bit if color writes are enabled. */
7349          if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7350             wm.ForceThreadDispatchEnable = ForceON;
7351 #endif
7352       };
7353 
7354 #if GFX_VER <= 5
7355       if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7356          crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7357             clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7358          }
7359          ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7360       }
7361 #endif
7362    }
7363 
7364 #if GFX_VER >= 7
7365    if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7366       crocus_emit_sbe(batch, ice);
7367    }
7368 #endif
7369 
7370 #if GFX_VER >= 8
7371    if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7372       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7373       struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7374       struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7375       struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7376       const struct shader_info *fs_info =
7377          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7378       uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7379       crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7380          pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7381          pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7382          pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7383             (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7384       }
7385       crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7386                         ARRAY_SIZE(cso_blend->ps_blend));
7387    }
7388 #endif
7389 
7390 #if GFX_VER >= 6
7391    if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7392 
7393 #if GFX_VER >= 8
7394       crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7395          set_depth_stencil_bits(ice, &wmds);
7396       }
7397 #else
7398       uint32_t ds_offset;
7399       void *ds_map = stream_state(batch,
7400                                   sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7401                                   64, &ds_offset);
7402       _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7403          set_depth_stencil_bits(ice, &ds);
7404       }
7405 
7406 #if GFX_VER == 6
7407       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7408          ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7409          ptr.DEPTH_STENCIL_STATEChange = true;
7410       }
7411 #else
7412       crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7413          ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7414       }
7415 #endif
7416 #endif
7417    }
7418 
7419    if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7420       /* Align to 64-byte boundary as per anv. */
7421       uint32_t scissor_offset;
7422       struct pipe_scissor_state *scissor_map = (void *)
7423          stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7424                       64, &scissor_offset);
7425       for (int i = 0; i < ice->state.num_viewports; i++) {
7426          struct pipe_scissor_state scissor;
7427          crocus_fill_scissor_rect(ice, i, &scissor);
7428          scissor_map[i] = scissor;
7429       }
7430 
7431       crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7432          ptr.ScissorRectPointer = scissor_offset;
7433       }
7434    }
7435 #endif
7436 
7437    if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7438       struct isl_device *isl_dev = &batch->screen->isl_dev;
7439 #if GFX_VER >= 6
7440       crocus_emit_depth_stall_flushes(batch);
7441 #endif
7442       void *batch_ptr;
7443       struct crocus_resource *zres, *sres;
7444       struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7445       batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7446 
7447       struct isl_view view = {
7448                               .base_level = 0,
7449                               .levels = 1,
7450                               .base_array_layer = 0,
7451                               .array_len = 1,
7452                               .swizzle = ISL_SWIZZLE_IDENTITY,
7453       };
7454       struct isl_depth_stencil_hiz_emit_info info = { .view = &view };
7455 
7456       if (cso->zsbuf) {
7457          crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7458          struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7459          if (zsbuf->align_res) {
7460             zres = (struct crocus_resource *)zsbuf->align_res;
7461          }
7462          view.base_level = cso->zsbuf->u.tex.level;
7463          view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7464          view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7465 
7466          if (zres) {
7467             view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7468 
7469             info.depth_surf = &zres->surf;
7470             info.depth_address = crocus_command_reloc(batch,
7471                                                       (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7472                                                       zres->bo, 0, RELOC_32BIT);
7473 
7474             info.mocs = crocus_mocs(zres->bo, isl_dev);
7475             view.format = zres->surf.format;
7476 
7477             if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7478                info.hiz_usage = zres->aux.usage;
7479                info.hiz_surf = &zres->aux.surf;
7480                uint64_t hiz_offset = 0;
7481 
7482 #if GFX_VER == 6
7483                /* HiZ surfaces on Sandy Bridge technically don't support
7484                 * mip-mapping.  However, we can fake it by offsetting to the
7485                 * first slice of LOD0 in the HiZ surface.
7486                 */
7487                isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7488                                                    view.base_level, 0, 0,
7489                                                    &hiz_offset, NULL, NULL);
7490 #endif
7491                info.hiz_address = crocus_command_reloc(batch,
7492                                                        (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7493                                                        zres->aux.bo, zres->aux.offset + hiz_offset,
7494                                                        RELOC_32BIT);
7495                info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7496             }
7497          }
7498 
7499 #if GFX_VER >= 6
7500          if (sres) {
7501             view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7502             info.stencil_aux_usage = sres->aux.usage;
7503             info.stencil_surf = &sres->surf;
7504 
7505             uint64_t stencil_offset = 0;
7506 #if GFX_VER == 6
7507             /* Stencil surfaces on Sandy Bridge technically don't support
7508              * mip-mapping.  However, we can fake it by offsetting to the
7509              * first slice of LOD0 in the stencil surface.
7510              */
7511             isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7512                                                 view.base_level, 0, 0,
7513                                                 &stencil_offset, NULL, NULL);
7514 #endif
7515 
7516             info.stencil_address = crocus_command_reloc(batch,
7517                                                         (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7518                                                         sres->bo, stencil_offset, RELOC_32BIT);
7519             if (!zres) {
7520                view.format = sres->surf.format;
7521                info.mocs = crocus_mocs(sres->bo, isl_dev);
7522             }
7523          }
7524 #endif
7525       }
7526       isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7527    }
7528 
7529    /* TODO: Disable emitting this until something uses a stipple. */
7530    if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7531       crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7532          for (int i = 0; i < 32; i++) {
7533             poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7534          }
7535       }
7536    }
7537 
7538    if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7539       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7540       crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7541    }
7542 
7543 #if GFX_VER >= 8
7544    if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7545       crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7546          topo.PrimitiveTopologyType =
7547             translate_prim_type(draw->mode, ice->state.patch_vertices);
7548       }
7549    }
7550 #endif
7551 
7552 #if GFX_VER <= 5
7553    if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7554       upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7555                                       ice->shaders.vs_offset, ice->shaders.sf_offset,
7556                                       ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7557       crocus_upload_urb_fence(batch);
7558 
7559       crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7560         cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7561         cs.URBEntryAllocationSize = ice->urb.csize - 1;
7562       }
7563       dirty |= CROCUS_DIRTY_GEN4_CURBE;
7564    }
7565 #endif
7566    if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7567       struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7568       if (fb->width && fb->height) {
7569          crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7570             rect.ClippedDrawingRectangleXMax = fb->width - 1;
7571             rect.ClippedDrawingRectangleYMax = fb->height - 1;
7572          }
7573       }
7574    }
7575 
7576    if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7577       const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7578       const uint32_t count = user_count +
7579          ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7580       uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7581 
7582       if (count) {
7583          const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7584 
7585          uint32_t *map =
7586             crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7587          _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7588             vb.DWordLength = (vb_dwords * count + 1) - 2;
7589          }
7590          map += 1;
7591 
7592          uint32_t bound = dynamic_bound;
7593          int i;
7594          while (bound) {
7595             i = u_bit_scan(&bound);
7596             struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7597             struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7598             uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7599 
7600             emit_vertex_buffer_state(batch, i, bo,
7601                                      buf->buffer_offset,
7602                                      ice->state.vb_end[i],
7603                                      buf->stride,
7604                                      step_rate,
7605                                      &map);
7606          }
7607          i = user_count;
7608          if (ice->state.vs_uses_draw_params) {
7609             struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7610             emit_vertex_buffer_state(batch, i++,
7611                                      res->bo,
7612                                      ice->draw.draw_params.offset,
7613                                      ice->draw.draw_params.res->width0,
7614                                      0, 0, &map);
7615          }
7616          if (ice->state.vs_uses_derived_draw_params) {
7617             struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7618             emit_vertex_buffer_state(batch, i++,
7619                                      res->bo,
7620                                      ice->draw.derived_draw_params.offset,
7621                                      ice->draw.derived_draw_params.res->width0,
7622                                      0, 0, &map);
7623          }
7624       }
7625    }
7626 
7627    if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7628       struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7629       const unsigned entries = MAX2(cso->count, 1);
7630       if (!(ice->state.vs_needs_sgvs_element ||
7631             ice->state.vs_uses_derived_draw_params ||
7632             ice->state.vs_needs_edge_flag)) {
7633          crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7634                          (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7635       } else {
7636          uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7637          const unsigned dyn_count = cso->count +
7638             ice->state.vs_needs_sgvs_element +
7639             ice->state.vs_uses_derived_draw_params;
7640 
7641          crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7642                            &dynamic_ves, ve) {
7643             ve.DWordLength =
7644                1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7645          }
7646          memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7647                 (cso->count - ice->state.vs_needs_edge_flag) *
7648                 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7649          uint32_t *ve_pack_dest =
7650             &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7651                          GENX(VERTEX_ELEMENT_STATE_length)];
7652 
7653          if (ice->state.vs_needs_sgvs_element) {
7654             uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7655                                  VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7656             crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7657                ve.Valid = true;
7658                ve.VertexBufferIndex =
7659                   util_bitcount64(ice->state.bound_vertex_buffers);
7660                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7661                ve.Component0Control = base_ctrl;
7662                ve.Component1Control = base_ctrl;
7663 #if GFX_VER < 8
7664                ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7665                ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7666 #else
7667                ve.Component2Control = VFCOMP_STORE_0;
7668                ve.Component3Control = VFCOMP_STORE_0;
7669 #endif
7670 #if GFX_VER < 5
7671                ve.DestinationElementOffset = cso->count * 4;
7672 #endif
7673             }
7674             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7675          }
7676          if (ice->state.vs_uses_derived_draw_params) {
7677             crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7678                ve.Valid = true;
7679                ve.VertexBufferIndex =
7680                   util_bitcount64(ice->state.bound_vertex_buffers) +
7681                   ice->state.vs_uses_draw_params;
7682                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7683                ve.Component0Control = VFCOMP_STORE_SRC;
7684                ve.Component1Control = VFCOMP_STORE_SRC;
7685                ve.Component2Control = VFCOMP_STORE_0;
7686                ve.Component3Control = VFCOMP_STORE_0;
7687 #if GFX_VER < 5
7688                ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7689 #endif
7690             }
7691             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7692          }
7693          if (ice->state.vs_needs_edge_flag) {
7694             for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
7695                ve_pack_dest[i] = cso->edgeflag_ve[i];
7696          }
7697 
7698          crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7699                          (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7700       }
7701 
7702 #if GFX_VER == 8
7703       if (!ice->state.vs_needs_edge_flag) {
7704          crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7705                          entries * GENX(3DSTATE_VF_INSTANCING_length));
7706       } else {
7707          assert(cso->count > 0);
7708          const unsigned edgeflag_index = cso->count - 1;
7709          uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7710          memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7711                 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7712 
7713          uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7714             edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7715          crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7716             vi.VertexElementIndex = edgeflag_index +
7717                ice->state.vs_needs_sgvs_element +
7718                ice->state.vs_uses_derived_draw_params;
7719          }
7720          for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
7721             vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7722 
7723          crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7724                          entries * GENX(3DSTATE_VF_INSTANCING_length));
7725       }
7726 #endif
7727    }
7728 
7729 #if GFX_VER == 8
7730    if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7731       const struct brw_vs_prog_data *vs_prog_data = (void *)
7732          ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7733       struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7734 
7735       crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7736          if (vs_prog_data->uses_vertexid) {
7737             sgv.VertexIDEnable = true;
7738             sgv.VertexIDComponentNumber = 2;
7739             sgv.VertexIDElementOffset =
7740                cso->count - ice->state.vs_needs_edge_flag;
7741          }
7742 
7743          if (vs_prog_data->uses_instanceid) {
7744             sgv.InstanceIDEnable = true;
7745             sgv.InstanceIDComponentNumber = 3;
7746             sgv.InstanceIDElementOffset =
7747                cso->count - ice->state.vs_needs_edge_flag;
7748          }
7749       }
7750    }
7751 #endif
7752 #if GFX_VERx10 >= 75
7753    if (dirty & CROCUS_DIRTY_GEN75_VF) {
7754       crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7755          if (draw->primitive_restart) {
7756             vf.IndexedDrawCutIndexEnable = true;
7757             vf.CutIndex = draw->restart_index;
7758          }
7759       }
7760    }
7761 #endif
7762 
7763 #if GFX_VER == 8
7764    if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7765       bool enable = want_pma_fix(ice);
7766       genX(crocus_update_pma_fix)(ice, batch, enable);
7767    }
7768 #endif
7769 
7770 #if GFX_VER <= 5
7771    if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7772       gen4_upload_curbe(batch);
7773    }
7774 #endif
7775 }
7776 
/**
 * Emit all state required for a single draw and the 3DPRIMITIVE command
 * itself.
 *
 * This uploads any dirty render state, (re-)emits 3DSTATE_INDEX_BUFFER when
 * the index buffer situation changed, programs the 3DPRIM_* MMIO registers
 * for indirect draws (GFX 7+ only), and finally emits 3DPRIMITIVE.
 *
 * \param drawid_offset  Index of this draw within a multi-draw; for indirect
 *                       multi-draws it is compared against the GPU-side draw
 *                       count to predicate away excess draws.
 * \param indirect       Non-NULL for indirect draws; parameters then come
 *                       from a buffer or a stream-output target.
 * \param sc             Direct-draw start/count/index-bias parameters.
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   /* Conditional-render predication is only wired up on GFX 7+, where
    * 3DPRIMITIVE has a PredicateEnable bit.  May also be forced on below
    * for indirect multi-draw (indirect_draw_count).
    */
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* Forbid batch wrapping while emitting draw state so the commands below
    * land in a single batch buffer.
    */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      /* Track whether 3DSTATE_INDEX_BUFFER actually needs re-emitting. */
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* Client-memory indices: copy just the [start, start+count) range
          * into the stream uploader's GPU buffer.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         /* Bias the buffer start back by start_offset so that indexing with
          * sc->start in 3DPRIMITIVE addresses the uploaded data correctly.
          */
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even with the same buffer, re-emit if the size, index format, or
       * (pre-HSW, where it lives in 3DSTATE_INDEX_BUFFER rather than
       * 3DSTATE_VF) the primitive-restart enable changed.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
	   )
	  )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* index_size is 1/2/4 bytes; >> 1 maps that to the HW
             * INDEX_BYTE/INDEX_WORD/INDEX_DWORD encoding (0/1/2).
             */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
            ib.BufferSize = bo->size - offset;
#else
            /* Pre-GFX8 takes an inclusive ending address instead of a size. */
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
         }
         /* Cache what we emitted so the checks above can skip redundant
          * re-emits on the next draw.
          */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

/* MMIO registers that feed 3DPRIMITIVE when IndirectParameterEnable is set. */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw indirect with a GPU-side draw count: predicate each
          * draw on (drawid_offset < draw_count) so draws past the count
          * become no-ops.
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
#if GFX_VERx10 >= 75
            /* Conditional rendering is also active: AND the draw-count test
             * with the conditional-render predicate stashed in GPR15.
             * (Requires MI math, hence HSW+.)
             */
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                         mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            /* HSW has no direct MI_PREDICATE_RESULT write; derive it by
             * comparing pred against 0 with LOADINV (result = pred != 0).
             */
            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
               MI_PREDICATE_COMBINEOP_SET |
               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               /* First draw: predicate = !(draw_count == 0). */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                  MI_PREDICATE_COMBINEOP_SET |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                  MI_PREDICATE_COMBINEOP_XOR |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      /* Load the indirect draw parameters from the app-supplied buffer into
       * the 3DPRIM_* registers.  The buffer layout is the GL/Vulkan indirect
       * struct: count, instance_count, first, (base_vertex,) base_instance.
       */
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         /* Non-indexed indirect structs have no base_vertex field, so
          * start_instance sits at offset 12 and base_vertex must be zeroed.
          */
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* Transform-feedback draw: compute the vertex count on the GPU from
       * the stream-output target's written-byte counter divided by the
       * vertex stride.
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   /* Pre-GFX7 hardware has no indirect draw support. */
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         /* Parameters come from the 3DPRIM_* registers loaded above. */
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8035 
8036 #if GFX_VER >= 7
8037 
/**
 * Upload dirty compute state and emit a GPGPU_WALKER to dispatch the grid.
 *
 * Flushes sysvals/bindings/samplers as indicated by ice->state.stage_dirty,
 * (re)emits MEDIA_VFE_STATE and the CURBE push-constant data, loads the
 * INTERFACE_DESCRIPTOR_DATA, programs the dispatch dimensions (directly or
 * from an indirect buffer), and finally walks the thread groups.
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct brw_stage_prog_data *prog_data = shader->prog_data;
   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
   /* SIMD width / thread count for this dispatch, given the block size. */
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);

   /* Re-upload system values (e.g. variable group size) when constants
    * changed and the shader actually consumes sysvals.
    */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   /* MEDIA_VFE_STATE must be re-emitted when the CS program changes, or on
    * every dispatch for variable-group-size shaders (local_size[0] == 0)
    * since the dispatch parameters may differ each time.
    */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         /* -1: the field is "maximum thread index", not a count. */
         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         vfe.BypassGatewayControl = true;
#endif
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         /* CURBE size in 256-bit (2-register) units, rounded up. */
         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      uint32_t curbe_data_offset = 0;
      /* We only support pushing the subgroup ID here (one per-thread dword,
       * nothing cross-thread); anything else would need a different layout.
       */
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* 0x5a poison makes stale/unwritten push data easy to spot. */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   /* Re-load the interface descriptor if anything it points at changed. */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      /* Kernel start pointer, offset for the SIMD variant being dispatched. */
      const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         /* The field saturates at 31; larger tables still work, the count is
          * only a prefetch hint.
          */
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                                     prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

/* MMIO registers the GPGPU_WALKER reads its dimensions from when
 * IndirectParameterEnable is set.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      /* Load the X/Y/Z dispatch dimensions from the indirect buffer. */
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
      }

#if GFX_VER == 7
      /* Build a predicate that skips the walker when any dimension is zero:
       * OR together (dim == 0) for X, Y, Z, then invert.
       */

      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);

      /* Load compute_dispatch_indirect_x_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);

      /* predicate = (compute_dispatch_indirect_x_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_y_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);

      /* predicate = (compute_dispatch_indirect_y_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_z_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);

      /* predicate = (compute_dispatch_indirect_z_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* predicate = !predicate; */
#define COMPARE_FALSE                           1
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_FALSE;
      }
#endif
   }

   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable    = grid->indirect != NULL;
      ggw.PredicateEnable            = GFX_VER <= 7 && grid->indirect != NULL;
      ggw.SIMDSize                   = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum  = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension    = grid->grid[0];
      ggw.ThreadGroupIDYDimension    = grid->grid[1];
      ggw.ThreadGroupIDZDimension    = grid->grid[2];
      ggw.RightExecutionMask         = dispatch.right_mask;
      ggw.BottomExecutionMask        = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   batch->contains_draw = true;
}
8263 
8264 #endif /* GFX_VER >= 7 */
8265 
8266 /**
8267  * State module teardown.
8268  */
8269 static void
8270 crocus_destroy_state(struct crocus_context *ice)
8271 {
8272    pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8273    pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8274 
8275    free(ice->state.genx);
8276 
8277    for (int i = 0; i < 4; i++) {
8278       pipe_so_target_reference(&ice->state.so_target[i], NULL);
8279    }
8280 
8281    for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
8282       pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
8283    }
8284    pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
8285 
8286    for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8287       struct crocus_shader_state *shs = &ice->state.shaders[stage];
8288       for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8289          pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8290       }
8291       for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8292          pipe_resource_reference(&shs->image[i].base.resource, NULL);
8293       }
8294       for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8295          pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8296       }
8297       for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8298          pipe_sampler_view_reference((struct pipe_sampler_view **)
8299                                      &shs->textures[i], NULL);
8300       }
8301    }
8302 
8303    for (int i = 0; i < 16; i++)
8304       pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8305    pipe_resource_reference(&ice->state.grid_size.res, NULL);
8306 
8307    pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8308 }
8309 
8310 /* ------------------------------------------------------------------- */
8311 
/**
 * A buffer resource got a new backing BO (e.g. after an invalidate/realloc);
 * walk every place the context state might still reference the old storage
 * and either flag the relevant dirty bits or rebind the resource.
 *
 * bind_history/bind_stages tell us which binding points and shader stages
 * ever saw this resource, so we only scan what could actually be affected.
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      /* Re-emit vertex buffer packets if this buffer is currently bound. */
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      /* Drop the cached index buffer so it gets re-uploaded on next draw. */
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
            /* Gen6 streams out via the GS binding table. */
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   /* Per-stage bindings; the dirty bit is shifted by stage index off the
    * _VS bit in each case below.
    */
   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Rebind through the normal set path so surface state and
                * dirty tracking are refreshed, preserving writability.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}
8430 
8431 /* ------------------------------------------------------------------- */
8432 
8433 static unsigned
8434 flags_to_post_sync_op(uint32_t flags)
8435 {
8436    if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8437       return WriteImmediateData;
8438 
8439    if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8440       return WritePSDepthCount;
8441 
8442    if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8443       return WriteTimestamp;
8444 
8445    return 0;
8446 }
8447 
8448 /*
8449  * Do the given flags have a Post Sync or LRI Post Sync operation?
8450  */
8451 static enum pipe_control_flags
8452 get_post_sync_flags(enum pipe_control_flags flags)
8453 {
8454    flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8455             PIPE_CONTROL_WRITE_DEPTH_COUNT |
8456             PIPE_CONTROL_WRITE_TIMESTAMP |
8457             PIPE_CONTROL_LRI_POST_SYNC_OP;
8458 
8459    /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8460     * "LRI Post Sync Operation".  So more than one bit set would be illegal.
8461     */
8462    assert(util_bitcount(flags) <= 1);
8463 
8464    return flags;
8465 }
8466 
8467 #define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
8468 
8469 /**
8470  * Emit a series of PIPE_CONTROL commands, taking into account any
8471  * workarounds necessary to actually accomplish the caller's request.
8472  *
8473  * Unless otherwise noted, spec quotations in this function come from:
8474  *
8475  * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8476  * Restrictions for PIPE_CONTROL.
8477  *
8478  * You should not use this function directly.  Use the helpers in
8479  * crocus_pipe_control.c instead, which may split the pipe control further.
8480  */
8481 static void
8482 crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8483                              const char *reason,
8484                              uint32_t flags,
8485                              struct crocus_bo *bo,
8486                              uint32_t offset,
8487                              uint64_t imm)
8488 {
8489    UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8490    enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8491    UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8492       post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8493 
8494    /* Recursive PIPE_CONTROL workarounds --------------------------------
8495     * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8496     *
8497     * We do these first because we want to look at the original operation,
8498     * rather than any workarounds we set.
8499     */
8500 
8501    /* "Flush Types" workarounds ---------------------------------------------
8502     * We do these now because they may add post-sync operations or CS stalls.
8503     */
8504 
8505    if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8506       /* Hardware workaround: SNB B-Spec says:
8507        *
8508        *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8509        *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8510        *     required."
8511        */
8512       crocus_emit_post_sync_nonzero_flush(batch);
8513    }
8514 
8515 #if GFX_VER == 8
8516    if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8517       /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8518        *
8519        * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8520        *  'Write PS Depth Count' or 'Write Timestamp'."
8521        */
8522       if (!bo) {
8523          flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8524          post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8525          non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8526          bo = batch->ice->workaround_bo;
8527          offset = batch->ice->workaround_offset;
8528       }
8529    }
8530 #endif
8531 
8532 #if GFX_VERx10 < 75
8533    if (flags & PIPE_CONTROL_DEPTH_STALL) {
8534       /* Project: PRE-HSW / Argument: Depth Stall
8535        *
8536        * "The following bits must be clear:
8537        *  - Render Target Cache Flush Enable ([12] of DW1)
8538        *  - Depth Cache Flush Enable ([0] of DW1)"
8539        */
8540       assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8541                         PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8542    }
8543 #endif
8544    if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8545       /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8546        *
8547        *    "This bit must be DISABLED for operations other than writing
8548        *     PS_DEPTH_COUNT."
8549        *
8550        * This seems like nonsense.  An Ivybridge workaround requires us to
8551        * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8552        * operation.  Gen8+ requires us to emit depth stalls and depth cache
8553        * flushes together.  So, it's hard to imagine this means anything other
8554        * than "we originally intended this to be used for PS_DEPTH_COUNT".
8555        *
8556        * We ignore the supposed restriction and do nothing.
8557        */
8558    }
8559 
8560    if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8561       /* Project: PRE-HSW / Argument: Depth Cache Flush
8562        *
8563        * "Depth Stall must be clear ([13] of DW1)."
8564        */
8565       assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8566    }
8567 
8568    if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8569                 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8570       /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8571        *
8572        *    "This bit must be DISABLED for End-of-pipe (Read) fences,
8573        *     PS_DEPTH_COUNT or TIMESTAMP queries."
8574        *
8575        * TODO: Implement end-of-pipe checking.
8576        */
8577       assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8578                                   PIPE_CONTROL_WRITE_TIMESTAMP)));
8579    }
8580 
8581    if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8582       /* From the PIPE_CONTROL instruction table, bit 1:
8583        *
8584        *    "This bit is ignored if Depth Stall Enable is set.
8585        *     Further, the render cache is not flushed even if Write Cache
8586        *     Flush Enable bit is set."
8587        *
8588        * We assert that the caller doesn't do this combination, to try and
8589        * prevent mistakes.  It shouldn't hurt the GPU, though.
8590        *
8591        * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
8592        * and "Render Target Flush" combo is explicitly required for BTI
8593        * update workarounds.
8594        */
8595       assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8596                         PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8597    }
8598 
8599    /* PIPE_CONTROL page workarounds ------------------------------------- */
8600 
8601    if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8602       /* From the PIPE_CONTROL page itself:
8603        *
8604        *    "IVB, HSW, BDW
8605        *     Restriction: Pipe_control with CS-stall bit set must be issued
8606        *     before a pipe-control command that has the State Cache
8607        *     Invalidate bit set."
8608        */
8609       flags |= PIPE_CONTROL_CS_STALL;
8610    }
8611 
8612    if ((GFX_VERx10 == 75)) {
8613       /* From the PIPE_CONTROL page itself:
8614        *
8615        *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8616        *     Prior to programming a PIPECONTROL command with any of the RO
8617        *     cache invalidation bit set, program a PIPECONTROL flush command
8618        *     with “CS stall” bit and “HDC Flush” bit set."
8619        *
8620        * TODO: Actually implement this.  What's an HDC Flush?
8621        */
8622    }
8623 
8624    if (flags & PIPE_CONTROL_FLUSH_LLC) {
8625       /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8626        *
8627        *    "Project: ALL
8628        *     SW must always program Post-Sync Operation to "Write Immediate
8629        *     Data" when Flush LLC is set."
8630        *
8631        * For now, we just require the caller to do it.
8632        */
8633       assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8634    }
8635 
8636    /* "Post-Sync Operation" workarounds -------------------------------- */
8637 
8638    /* Project: All / Argument: Global Snapshot Count Reset [19]
8639     *
8640     * "This bit must not be exercised on any product.
8641     *  Requires stall bit ([20] of DW1) set."
8642     *
8643     * We don't use this, so we just assert that it isn't used.  The
8644     * PIPE_CONTROL instruction page indicates that they intended this
8645     * as a debug feature and don't think it is useful in production,
8646     * but it may actually be usable, should we ever want to.
8647     */
8648    assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8649 
8650    if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8651                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8652       /* Project: All / Arguments:
8653        *
8654        * - Generic Media State Clear [16]
8655        * - Indirect State Pointers Disable [16]
8656        *
8657        *    "Requires stall bit ([20] of DW1) set."
8658        *
8659        * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8660        * State Clear) says:
8661        *
8662        *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
8663        *     programmed prior to programming a PIPECONTROL command with "Media
8664        *     State Clear" set in GPGPU mode of operation"
8665        *
8666        * This is a subset of the earlier rule, so there's nothing to do.
8667        */
8668       flags |= PIPE_CONTROL_CS_STALL;
8669    }
8670 
8671    if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8672       /* Project: All / Argument: Store Data Index
8673        *
8674        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8675        *  than '0'."
8676        *
8677        * For now, we just assert that the caller does this.  We might want to
8678        * automatically add a write to the workaround BO...
8679        */
8680       assert(non_lri_post_sync_flags != 0);
8681    }
8682 
8683    if (flags & PIPE_CONTROL_SYNC_GFDT) {
8684       /* Project: All / Argument: Sync GFDT
8685        *
8686        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8687        *  than '0' or 0x2520[13] must be set."
8688        *
8689        * For now, we just assert that the caller does this.
8690        */
8691       assert(non_lri_post_sync_flags != 0);
8692    }
8693 
8694    if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8695       /* Project: SNB, IVB, HSW / Argument: TLB inv
8696        *
8697        * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8698        *  must be set to something other than '0'."
8699        *
8700        * For now, we just assert that the caller does this.
8701        */
8702       assert(non_lri_post_sync_flags != 0);
8703    }
8704 
8705    if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8706       /* Project: IVB+ / Argument: TLB inv
8707        *
8708        *    "Requires stall bit ([20] of DW1) set."
8709        *
8710        * Also, from the PIPE_CONTROL instruction table:
8711        *
8712        *    "Project: SKL+
8713        *     Post Sync Operation or CS stall must be set to ensure a TLB
8714        *     invalidation occurs.  Otherwise no cycle will occur to the TLB
8715        *     cache to invalidate."
8716        *
8717        * This is not a subset of the earlier rule, so there's nothing to do.
8718        */
8719       flags |= PIPE_CONTROL_CS_STALL;
8720    }
8721 #if GFX_VER == 8
8722    if (IS_COMPUTE_PIPELINE(batch)) {
8723       if (post_sync_flags ||
8724           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8725                     PIPE_CONTROL_DEPTH_STALL |
8726                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
8727                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8728                     PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8729          /* Project: BDW / Arguments:
8730           *
8731           * - LRI Post Sync Operation   [23]
8732           * - Post Sync Op              [15:14]
8733           * - Notify En                 [8]
8734           * - Depth Stall               [13]
8735           * - Render Target Cache Flush [12]
8736           * - Depth Cache Flush         [0]
8737           * - DC Flush Enable           [5]
8738           *
8739           *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
8740           *     Workloads."
8741           *
8742           * (The docs have separate table rows for each bit, with essentially
8743           * the same workaround text.  We've combined them here.)
8744           */
8745          flags |= PIPE_CONTROL_CS_STALL;
8746 
8747          /* Also, from the PIPE_CONTROL instruction table, bit 20:
8748           *
8749           *    "Project: BDW
8750           *     This bit must be always set when PIPE_CONTROL command is
8751           *     programmed by GPGPU and MEDIA workloads, except for the cases
8752           *     when only Read Only Cache Invalidation bits are set (State
8753           *     Cache Invalidation Enable, Instruction cache Invalidation
8754           *     Enable, Texture Cache Invalidation Enable, Constant Cache
8755           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
8756           *     need not implemented when FF_DOP_CG is disable via "Fixed
8757           *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8758           *
8759           * It sounds like we could avoid CS stalls in some cases, but we
8760           * don't currently bother.  This list isn't exactly the list above,
8761           * either...
8762           */
8763       }
8764    }
8765 #endif
8766    /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8767     *
8768     * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8769     *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8770     *
8771     * Note that the kernel does CS stalls between batches, so we only need
8772     * to count them within a batch.  We currently naively count every 4, and
8773     * don't skip the ones with only read-cache-invalidate bits set.  This
8774     * may or may not be a problem...
8775     */
8776    if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8777       if (flags & PIPE_CONTROL_CS_STALL) {
8778          /* If we're doing a CS stall, reset the counter and carry on. */
8779          batch->pipe_controls_since_last_cs_stall = 0;
8780       }
8781 
8782       /* If this is the fourth pipe control without a CS stall, do one now. */
8783       if (++batch->pipe_controls_since_last_cs_stall == 4) {
8784          batch->pipe_controls_since_last_cs_stall = 0;
8785          flags |= PIPE_CONTROL_CS_STALL;
8786       }
8787    }
8788 
8789    /* "Stall" workarounds ----------------------------------------------
8790     * These have to come after the earlier ones because we may have added
8791     * some additional CS stalls above.
8792     */
8793 
8794    if (flags & PIPE_CONTROL_CS_STALL) {
8795       /* Project: PRE-SKL, VLV, CHV
8796        *
8797        * "[All Stepping][All SKUs]:
8798        *
8799        *  One of the following must also be set:
8800        *
8801        *  - Render Target Cache Flush Enable ([12] of DW1)
8802        *  - Depth Cache Flush Enable ([0] of DW1)
8803        *  - Stall at Pixel Scoreboard ([1] of DW1)
8804        *  - Depth Stall ([13] of DW1)
8805        *  - Post-Sync Operation ([13] of DW1)
8806        *  - DC Flush Enable ([5] of DW1)"
8807        *
8808        * If we don't already have one of those bits set, we choose to add
8809        * "Stall at Pixel Scoreboard".  Some of the other bits require a
8810        * CS stall as a workaround (see above), which would send us into
8811        * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
8812        * appears to be safe, so we choose that.
8813        */
8814       const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8815                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8816                                PIPE_CONTROL_WRITE_IMMEDIATE |
8817                                PIPE_CONTROL_WRITE_DEPTH_COUNT |
8818                                PIPE_CONTROL_WRITE_TIMESTAMP |
8819                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
8820                                PIPE_CONTROL_DEPTH_STALL |
8821                                PIPE_CONTROL_DATA_CACHE_FLUSH;
8822       if (!(flags & wa_bits))
8823          flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8824    }
8825 
8826    /* Emit --------------------------------------------------------------- */
8827 
8828    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8829       fprintf(stderr,
8830               "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8831               (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8832               (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8833               (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8834               (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8835               (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8836               (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8837               (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8838               (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8839               (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8840               (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8841               (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8842               (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8843               (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8844               (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8845               (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8846               (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8847               "SnapRes" : "",
8848               (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8849               "ISPDis" : "",
8850               (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8851               (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8852               (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8853               imm, reason);
8854    }
8855 
8856    crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8857 #if GFX_VER >= 7
8858       pc.LRIPostSyncOperation = NoLRIOperation;
8859       pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8860       pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8861 #endif
8862 #if GFX_VER >= 6
8863       pc.StoreDataIndex = 0;
8864       pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8865       pc.GlobalSnapshotCountReset =
8866          flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8867       pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8868       pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8869       pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8870       pc.RenderTargetCacheFlushEnable =
8871          flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8872       pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8873       pc.StateCacheInvalidationEnable =
8874          flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8875       pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8876       pc.ConstantCacheInvalidationEnable =
8877          flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8878 #else
8879       pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8880 #endif
8881       pc.PostSyncOperation = flags_to_post_sync_op(flags);
8882       pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8883       pc.InstructionCacheInvalidateEnable =
8884          flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8885       pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8886 #if GFX_VER >= 5 || GFX_VERx10 == 45
8887       pc.IndirectStatePointersDisable =
8888          flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8889 #endif
8890 #if GFX_VER >= 6
8891       pc.TextureCacheInvalidationEnable =
8892          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8893 #elif GFX_VER == 5 || GFX_VERx10 == 45
8894       pc.TextureCacheFlushEnable =
8895          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8896 #endif
8897       pc.Address = ggtt_bo(bo, offset);
8898       if (GFX_VER < 7 && bo)
8899          pc.DestinationAddressType = DAT_GGTT;
8900       pc.ImmediateData = imm;
8901    }
8902 }
8903 
#if GFX_VER == 6
/**
 * Partition the Gfx6 URB between the VS and GS and emit 3DSTATE_URB.
 *
 * \param vs_size     VS URB entry size, in 128-byte units (1..5).
 * \param gs_present  Whether a geometry shader is active.
 * \param gs_size     GS URB entry size, in 128-byte units (1..5).
 */
void
genX(crocus_upload_urb)(struct crocus_batch *batch,
                        unsigned vs_size,
                        bool gs_present,
                        unsigned gs_size)
{
   struct crocus_context *ice = batch->ice;
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   const int urb_bytes = ice->urb.size * 1024; /* URB size is stored in KB */
   int vs_entries, gs_entries;

   /* Split the URB half/half when a GS is active; otherwise the VS gets
    * the whole thing.  Entry sizes are in 128-byte units.
    */
   if (gs_present) {
      vs_entries = (urb_bytes / 2) / (vs_size * 128);
      gs_entries = (urb_bytes / 2) / (gs_size * 128);
   } else {
      vs_entries = urb_bytes / (vs_size * 128);
      gs_entries = 0;
   }

   /* Respect the per-stage hardware maximums... */
   if (vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
      vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];

   if (gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
      gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];

   /* ...and round down to a multiple of 4, as 3DSTATE_URB requires
    * (see 3DSTATE_URB in the PRM).
    */
   ice->urb.nr_vs_entries = ROUND_DOWN_TO(vs_entries, 4);
   ice->urb.nr_gs_entries = ROUND_DOWN_TO(gs_entries, 4);

   assert(ice->urb.nr_vs_entries >=
          devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
   assert(ice->urb.nr_vs_entries % 4 == 0);
   assert(ice->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
      urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
      urb.VSURBEntryAllocationSize = vs_size - 1;

      urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
      urb.GSURBEntryAllocationSize = gs_size - 1;
   };

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit’s
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
    *   a dummy DRAW call before any case where VS will be taking over GS URB
    *   space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (ice->urb.gs_present && !gs_present)
      crocus_emit_mi_flush(batch);
   ice->urb.gs_present = gs_present;
}
#endif
8967 
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
   /* Intentionally empty: crocus keeps no generation-specific context state
    * that needs special handling when a batch's state is lost.  This stub
    * exists to satisfy the screen vtbl hook (genX(crocus_init_screen_state)).
    */
}
8972 
/**
 * Emit MI_REPORT_PERF_COUNT, asking the hardware to write a performance
 * counter report tagged with \p report_id to \p bo at \p offset_in_bytes.
 *
 * Only Gfx7+ emits the command here; on earlier generations this is a no-op.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}
8986 
8987 /**
8988  * From the PRM, Volume 2a:
8989  *
8990  *    "Indirect State Pointers Disable
8991  *
8992  *    At the completion of the post-sync operation associated with this pipe
8993  *    control packet, the indirect state pointers in the hardware are
8994  *    considered invalid; the indirect pointers are not saved in the context.
8995  *    If any new indirect state commands are executed in the command stream
8996  *    while the pipe control is pending, the new indirect state commands are
8997  *    preserved.
8998  *
8999  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9000  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9001  *    commands are only considered as Indirect State Pointers. Once ISP is
9002  *    issued in a context, SW must initialize by programming push constant
9003  *    commands for all the shaders (at least to zero length) before attempting
9004  *    any rendering operation for the same context."
9005  *
9006  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9007  * even though they point to a BO that has been already unreferenced at
9008  * the end of the previous batch buffer. This has been fine so far since
9009  * we are protected by these scratch page (every address not covered by
9010  * a BO should be pointing to the scratch page). But on CNL, it is
9011  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9012  * instruction.
9013  *
9014  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9015  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9016  * context restore, so the mentioned hang doesn't happen. However,
9017  * software must program push constant commands for all stages prior to
9018  * rendering anything, so we flag them as dirty.
9019  *
9020  * Finally, we also make sure to stall at pixel scoreboard to make sure the
9021  * constants have been loaded into the EUs prior to disable the push constants
9022  * so that it doesn't hang a previous 3DPRIMITIVE.
9023  */
9024 #if GFX_VER >= 7
9025 static void
9026 gen7_emit_isp_disable(struct crocus_batch *batch)
9027 {
9028    crocus_emit_raw_pipe_control(batch, "isp disable",
9029                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9030                                 PIPE_CONTROL_CS_STALL,
9031                                 NULL, 0, 0);
9032    crocus_emit_raw_pipe_control(batch, "isp disable",
9033                                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
9034                                 PIPE_CONTROL_CS_STALL,
9035                                 NULL, 0, 0);
9036 
9037    struct crocus_context *ice = batch->ice;
9038    ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
9039                               CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
9040                               CROCUS_STAGE_DIRTY_CONSTANTS_TES |
9041                               CROCUS_STAGE_DIRTY_CONSTANTS_GS |
9042                               CROCUS_STAGE_DIRTY_CONSTANTS_FS);
9043 }
9044 #endif
9045 
#if GFX_VER >= 7
/**
 * Emit end-of-batch state: a Haswell-specific workaround sequence for the
 * render batch, followed by the indirect state pointers disable
 * (see gen7_emit_isp_disable above).
 */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   if (batch->name == CROCUS_BATCH_RENDER) {
      /* HSW workaround: flush, re-emit the color calc state pointer, then
       * flush the render target cache with a CS stall before the batch ends.
       */
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   gen7_emit_isp_disable(batch);
}
#endif
9064 
9065 static void
9066 crocus_batch_reset_dirty(struct crocus_batch *batch)
9067 {
9068    /* unreference any index buffer so it get reemitted. */
9069    pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9070 
9071    /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9072     * as the old state batch won't still be available.
9073     */
9074    batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9075       CROCUS_DIRTY_COLOR_CALC_STATE;
9076 
9077    batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9078 
9079    batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9080    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9081    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9082    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9083    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9084    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9085    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9086 
9087    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9088    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9089    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9090    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9091    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9092    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9093 
9094    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9095    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9096    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9097    batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9098 
9099 #if GFX_VER >= 6
9100    /* SCISSOR_STATE */
9101    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9102    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9103    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9104 
9105 #endif
9106 #if GFX_VER <= 5
9107    /* dirty the SF state on gen4/5 */
9108    batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9109    batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9110    batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9111    batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9112 #endif
9113 #if GFX_VER >= 7
9114    /* Streamout dirty */
9115    batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9116    batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9117    batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9118 #endif
9119 }
9120 
#if GFX_VERx10 == 75
/**
 * Return the pipe-level rasterizer state currently bound to the context.
 * Non-static (Haswell-only) so code outside this file can query it.
 */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
#endif
9127 
#if GFX_VER >= 6
/**
 * Record the stream output stride, in bytes, for each bound SO target.
 * \param strides  per-buffer strides in DWords (converted to bytes here).
 */
static void update_so_strides(struct crocus_context *ice,
                              uint16_t *strides)
{
   for (unsigned buf = 0; buf < PIPE_MAX_SO_BUFFERS; buf++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[buf];
      if (!tgt)
         continue;
      tgt->stride = strides[buf] * sizeof(uint32_t);
   }
}
#endif
9139 
9140 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9141                                    int s,
9142                                    uint32_t *clamp_mask)
9143 {
9144 #if GFX_VER < 8
9145    if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9146        samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9147       if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9148          clamp_mask[0] |= (1 << s);
9149       if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9150          clamp_mask[1] |= (1 << s);
9151       if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9152          clamp_mask[2] |= (1 << s);
9153    }
9154 #endif
9155 }
9156 
9157 static void
9158 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9159 {
9160    struct crocus_context *ice = (struct crocus_context *) ctx;
9161 
9162    if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9163       ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9164       ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9165    }
9166 
9167    if (ice->batch_count == 1)
9168       return;
9169 
9170    if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9171       ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9172       ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9173    }
9174 }
9175 
/**
 * Fill in the generation-specific entries of the screen vtable.
 * Compiled once per generation; each entry points at this generation's
 * implementation.  Hooks guarded by GFX_VER checks are left NULL on
 * generations that lack the feature.
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   /* Make sure this genxml build matches the device we're initializing. */
   assert(screen->devinfo.verx10 == GFX_VERx10);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   /* Compute (GPGPU) support starts at Gfx7. */
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   /* MI math / register load-store helpers (Haswell+). */
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   /* Per-stage shader key population. */
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   /* Gen4/5 use URB fences rather than 3DSTATE_URB. */
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   /* Stream output (transform feedback) support starts at Gfx6. */
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9233 
/**
 * Install the generation-specific pipe_context hooks and initialize the
 * context's default state values.
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   /* CSO create/bind/delete hooks. */
   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   /* Streamed (set_*) state hooks. */
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream output (transform feedback) hooks, Gfx6+ only. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Everything starts dirty so the first batch emits all state. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* PIPE_PRIM_MAX marks "no primitive seen yet". */
   ice->state.prim_mode = PIPE_PRIM_MAX;
   ice->state.reduced_prim_mode = PIPE_PRIM_MAX;
   /* NOTE(review): calloc result is not checked; a failure here would leave
    * ice->state.genx NULL — presumably tolerated or fatal elsewhere, confirm.
    */
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9303