1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_context.h"
25 #include "brw_defines.h"
26 #include "brw_state.h"
27 #include "brw_batch.h"
28 #include "brw_fbo.h"
29 
30 /**
31  * Emit a PIPE_CONTROL with various flushing flags.
32  *
33  * The caller is responsible for deciding what flags are appropriate for the
34  * given generation.
35  */
36 void
brw_emit_pipe_control_flush(struct brw_context * brw,uint32_t flags)37 brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
38 {
39    const struct intel_device_info *devinfo = &brw->screen->devinfo;
40 
41    if (devinfo->ver >= 6 &&
42        (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
43        (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
44       /* A pipe control command with flush and invalidate bits set
45        * simultaneously is an inherently racy operation on Gfx6+ if the
46        * contents of the flushed caches were intended to become visible from
47        * any of the invalidated caches.  Split it in two PIPE_CONTROLs, the
48        * first one should stall the pipeline to make sure that the flushed R/W
49        * caches are coherent with memory once the specified R/O caches are
50        * invalidated.  On pre-Gfx6 hardware the (implicit) R/O cache
51        * invalidation seems to happen at the bottom of the pipeline together
52        * with any write cache flush, so this shouldn't be a concern.  In order
53        * to ensure a full stall, we do an end-of-pipe sync.
54        */
55       brw_emit_end_of_pipe_sync(brw, (flags & PIPE_CONTROL_CACHE_FLUSH_BITS));
56       flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
57    }
58 
59    brw->vtbl.emit_raw_pipe_control(brw, flags, NULL, 0, 0);
60 }
61 
62 /**
63  * Emit a PIPE_CONTROL that writes to a buffer object.
64  *
65  * \p flags should contain one of the following items:
66  *  - PIPE_CONTROL_WRITE_IMMEDIATE
67  *  - PIPE_CONTROL_WRITE_TIMESTAMP
68  *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
69  */
70 void
brw_emit_pipe_control_write(struct brw_context * brw,uint32_t flags,struct brw_bo * bo,uint32_t offset,uint64_t imm)71 brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
72                             struct brw_bo *bo, uint32_t offset,
73                             uint64_t imm)
74 {
75    brw->vtbl.emit_raw_pipe_control(brw, flags, bo, offset, imm);
76 }
77 
78 /**
79  * Restriction [DevSNB, DevIVB]:
80  *
81  * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
82  * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
83  * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
84  * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
85  * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
86  * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
87  * unless SW can otherwise guarantee that the pipeline from WM onwards is
88  * already flushed (e.g., via a preceding MI_FLUSH).
89  */
90 void
brw_emit_depth_stall_flushes(struct brw_context * brw)91 brw_emit_depth_stall_flushes(struct brw_context *brw)
92 {
93    const struct intel_device_info *devinfo = &brw->screen->devinfo;
94 
95    assert(devinfo->ver >= 6);
96 
97    /* Starting on BDW, these pipe controls are unnecessary.
98     *
99     *   WM HW will internally manage the draining pipe and flushing of the caches
100     *   when this command is issued. The PIPE_CONTROL restrictions are removed.
101     */
102    if (devinfo->ver >= 8)
103       return;
104 
105    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
106    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
107    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
108 }
109 
110 /**
111  * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
112  * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
113  *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
114  *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
115  *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
116  *  to be sent before any combination of VS associated 3DSTATE."
117  */
118 void
gfx7_emit_vs_workaround_flush(struct brw_context * brw)119 gfx7_emit_vs_workaround_flush(struct brw_context *brw)
120 {
121    ASSERTED const struct intel_device_info *devinfo = &brw->screen->devinfo;
122 
123    assert(devinfo->ver == 7);
124    brw_emit_pipe_control_write(brw,
125                                PIPE_CONTROL_WRITE_IMMEDIATE
126                                | PIPE_CONTROL_DEPTH_STALL,
127                                brw->workaround_bo,
128                                brw->workaround_bo_offset, 0);
129 }
130 
131 /**
132  * From the PRM, Volume 2a:
133  *
134  *    "Indirect State Pointers Disable
135  *
136  *    At the completion of the post-sync operation associated with this pipe
137  *    control packet, the indirect state pointers in the hardware are
138  *    considered invalid; the indirect pointers are not saved in the context.
139  *    If any new indirect state commands are executed in the command stream
140  *    while the pipe control is pending, the new indirect state commands are
141  *    preserved.
142  *
143  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
144  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
145  *    commands are only considered as Indirect State Pointers. Once ISP is
146  *    issued in a context, SW must initialize by programming push constant
147  *    commands for all the shaders (at least to zero length) before attempting
148  *    any rendering operation for the same context."
149  *
 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
 * even though they point to a BO that has been already unreferenced at
 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
 * a BO should be pointing to the scratch page). But on CNL, it is
 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
 * instruction.
157  *
158  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
159  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
160  * context restore, so the mentioned hang doesn't happen. However,
161  * software must program push constant commands for all stages prior to
162  * rendering anything, so we flag them as dirty.
163  *
 * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disabling the push
 * constants, so that it doesn't hang a previous 3DPRIMITIVE.
167  */
168 void
gfx7_emit_isp_disable(struct brw_context * brw)169 gfx7_emit_isp_disable(struct brw_context *brw)
170 {
171    brw->vtbl.emit_raw_pipe_control(brw,
172                                    PIPE_CONTROL_STALL_AT_SCOREBOARD |
173                                    PIPE_CONTROL_CS_STALL,
174                                    NULL, 0, 0);
175    brw->vtbl.emit_raw_pipe_control(brw,
176                                    PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
177                                    PIPE_CONTROL_CS_STALL,
178                                    NULL, 0, 0);
179 
180    brw->vs.base.push_constants_dirty = true;
181    brw->tcs.base.push_constants_dirty = true;
182    brw->tes.base.push_constants_dirty = true;
183    brw->gs.base.push_constants_dirty = true;
184    brw->wm.base.push_constants_dirty = true;
185 }
186 
187 /**
188  * Emit a PIPE_CONTROL command for gfx7 with the CS Stall bit set.
189  */
190 void
gfx7_emit_cs_stall_flush(struct brw_context * brw)191 gfx7_emit_cs_stall_flush(struct brw_context *brw)
192 {
193    brw_emit_pipe_control_write(brw,
194                                PIPE_CONTROL_CS_STALL
195                                | PIPE_CONTROL_WRITE_IMMEDIATE,
196                                brw->workaround_bo,
197                                brw->workaround_bo_offset, 0);
198 }
199 
200 /**
201  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
202  * implementing two workarounds on gfx6.  From section 1.4.7.1
203  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
204  *
205  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
206  * produced by non-pipelined state commands), software needs to first
207  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
208  * 0.
209  *
210  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
211  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
212  *
213  * And the workaround for these two requires this workaround first:
214  *
215  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
216  * BEFORE the pipe-control with a post-sync op and no write-cache
217  * flushes.
218  *
219  * And this last workaround is tricky because of the requirements on
220  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
221  * volume 2 part 1:
222  *
223  *     "1 of the following must also be set:
224  *      - Render Target Cache Flush Enable ([12] of DW1)
225  *      - Depth Cache Flush Enable ([0] of DW1)
226  *      - Stall at Pixel Scoreboard ([1] of DW1)
227  *      - Depth Stall ([13] of DW1)
228  *      - Post-Sync Operation ([13] of DW1)
229  *      - Notify Enable ([8] of DW1)"
230  *
231  * The cache flushes require the workaround flush that triggered this
232  * one, so we can't use it.  Depth stall would trigger the same.
233  * Post-sync nonzero is what triggered this second workaround, so we
234  * can't use that one either.  Notify enable is IRQs, which aren't
235  * really our business.  That leaves only stall at scoreboard.
236  */
237 void
brw_emit_post_sync_nonzero_flush(struct brw_context * brw)238 brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
239 {
240    brw_emit_pipe_control_flush(brw,
241                                PIPE_CONTROL_CS_STALL |
242                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
243 
244    brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
245                                brw->workaround_bo,
246                                brw->workaround_bo_offset, 0);
247 }
248 
249 /*
250  * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
251  *
252  *  Write synchronization is a special case of end-of-pipe
253  *  synchronization that requires that the render cache and/or depth
254  *  related caches are flushed to memory, where the data will become
255  *  globally visible. This type of synchronization is required prior to
256  *  SW (CPU) actually reading the result data from memory, or initiating
257  *  an operation that will use as a read surface (such as a texture
258  *  surface) a previous render target and/or depth/stencil buffer
259  *
260  *
261  * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
262  *
263  *  Exercising the write cache flush bits (Render Target Cache Flush
264  *  Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
265  *  ensures the write caches are flushed and doesn't guarantee the data
266  *  is globally visible.
267  *
268  *  SW can track the completion of the end-of-pipe-synchronization by
269  *  using "Notify Enable" and "PostSync Operation - Write Immediate
270  *  Data" in the PIPE_CONTROL command.
271  */
272 void
brw_emit_end_of_pipe_sync(struct brw_context * brw,uint32_t flags)273 brw_emit_end_of_pipe_sync(struct brw_context *brw, uint32_t flags)
274 {
275    const struct intel_device_info *devinfo = &brw->screen->devinfo;
276 
277    if (devinfo->ver >= 6) {
278       /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
279        *
280        *    "The most common action to perform upon reaching a synchronization
281        *    point is to write a value out to memory. An immediate value
282        *    (included with the synchronization command) may be written."
283        *
284        *
285        * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
286        *
287        *    "In case the data flushed out by the render engine is to be read
288        *    back in to the render engine in coherent manner, then the render
289        *    engine has to wait for the fence completion before accessing the
290        *    flushed data. This can be achieved by following means on various
291        *    products: PIPE_CONTROL command with CS Stall and the required
292        *    write caches flushed with Post-Sync-Operation as Write Immediate
293        *    Data.
294        *
295        *    Example:
296        *       - Workload-1 (3D/GPGPU/MEDIA)
297        *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
298        *         Data, Required Write Cache Flush bits set)
299        *       - Workload-2 (Can use the data produce or output by Workload-1)
300        */
301       brw_emit_pipe_control_write(brw,
302                                   flags | PIPE_CONTROL_CS_STALL |
303                                   PIPE_CONTROL_WRITE_IMMEDIATE,
304                                   brw->workaround_bo,
305                                   brw->workaround_bo_offset, 0);
306 
307       if (devinfo->is_haswell) {
308          /* Haswell needs addition work-arounds:
309           *
310           * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
311           *
312           *    Option 1:
313           *    PIPE_CONTROL command with the CS Stall and the required write
314           *    caches flushed with Post-SyncOperation as Write Immediate Data
315           *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
316           *    spce) commands.
317           *
318           *    Example:
319           *       - Workload-1
320           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
321           *         Immediate Data, Required Write Cache Flush bits set)
322           *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
323           *       - Workload-2 (Can use the data produce or output by
324           *         Workload-1)
325           *
326           * Unfortunately, both the PRMs and the internal docs are a bit
327           * out-of-date in this regard.  What the windows driver does (and
328           * this appears to actually work) is to emit a register read from the
329           * memory address written by the pipe control above.
330           *
331           * What register we load into doesn't matter.  We choose an indirect
332           * rendering register because we know it always exists and it's one
333           * of the first registers the command parser allows us to write.  If
334           * you don't have command parser support in your kernel (pre-4.2),
335           * this will get turned into MI_NOOP and you won't get the
336           * workaround.  Unfortunately, there's just not much we can do in
337           * that case.  This register is perfectly safe to write since we
338           * always re-load all of the indirect draw registers right before
339           * 3DPRIMITIVE when needed anyway.
340           */
341          brw_load_register_mem(brw, GFX7_3DPRIM_START_INSTANCE,
342                                brw->workaround_bo, brw->workaround_bo_offset);
343       }
344    } else {
345       /* On gfx4-5, a regular pipe control seems to suffice. */
346       brw_emit_pipe_control_flush(brw, flags);
347    }
348 }
349 
350 /* Emit a pipelined flush to either flush render and texture cache for
351  * reading from a FBO-drawn texture, or flush so that frontbuffer
352  * render appears on the screen in DRI1.
353  *
354  * This is also used for the always_flush_cache driconf debug option.
355  */
356 void
brw_emit_mi_flush(struct brw_context * brw)357 brw_emit_mi_flush(struct brw_context *brw)
358 {
359    const struct intel_device_info *devinfo = &brw->screen->devinfo;
360 
361    int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH;
362    if (devinfo->ver >= 6) {
363       flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
364                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
365                PIPE_CONTROL_DATA_CACHE_FLUSH |
366                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
367                PIPE_CONTROL_VF_CACHE_INVALIDATE |
368                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
369                PIPE_CONTROL_CS_STALL;
370    }
371    brw_emit_pipe_control_flush(brw, flags);
372 }
373 
374 static bool
init_identifier_bo(struct brw_context * brw)375 init_identifier_bo(struct brw_context *brw)
376 {
377    void *bo_map;
378 
379    if (!can_do_exec_capture(brw->screen))
380       return true;
381 
382    bo_map = brw_bo_map(NULL, brw->workaround_bo, MAP_READ | MAP_WRITE);
383    if (!bo_map)
384       return false;
385 
386    brw->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE;
387    brw->workaround_bo_offset =
388       ALIGN(intel_debug_write_identifiers(bo_map, 4096, "i965") + 8, 8);
389 
390    brw_bo_unmap(brw->workaround_bo);
391 
392    return true;
393 }
394 
395 int
brw_init_pipe_control(struct brw_context * brw,const struct intel_device_info * devinfo)396 brw_init_pipe_control(struct brw_context *brw,
397                       const struct intel_device_info *devinfo)
398 {
399    switch (devinfo->ver) {
400    case 11:
401       brw->vtbl.emit_raw_pipe_control = gfx11_emit_raw_pipe_control;
402       break;
403    case 9:
404       brw->vtbl.emit_raw_pipe_control = gfx9_emit_raw_pipe_control;
405       break;
406    case 8:
407       brw->vtbl.emit_raw_pipe_control = gfx8_emit_raw_pipe_control;
408       break;
409    case 7:
410       brw->vtbl.emit_raw_pipe_control =
411          devinfo->is_haswell ? gfx75_emit_raw_pipe_control
412                              : gfx7_emit_raw_pipe_control;
413       break;
414    case 6:
415       brw->vtbl.emit_raw_pipe_control = gfx6_emit_raw_pipe_control;
416       break;
417    case 5:
418       brw->vtbl.emit_raw_pipe_control = gfx5_emit_raw_pipe_control;
419       break;
420    case 4:
421       brw->vtbl.emit_raw_pipe_control =
422          devinfo->is_g4x ? gfx45_emit_raw_pipe_control
423                          : gfx4_emit_raw_pipe_control;
424       break;
425    default:
426       unreachable("Unhandled Gen.");
427    }
428 
429    if (devinfo->ver < 6)
430       return 0;
431 
432    /* We can't just use brw_state_batch to get a chunk of space for
433     * the gfx6 workaround because it involves actually writing to
434     * the buffer, and the kernel doesn't let us write to the batch.
435     */
436    brw->workaround_bo = brw_bo_alloc(brw->bufmgr, "workaround", 4096,
437                                      BRW_MEMZONE_OTHER);
438    if (brw->workaround_bo == NULL)
439       return -ENOMEM;
440 
441    if (!init_identifier_bo(brw))
442       return -ENOMEM; /* Couldn't map workaround_bo?? */
443 
444    brw->workaround_bo_offset = 0;
445    brw->pipe_controls_since_last_cs_stall = 0;
446 
447    return 0;
448 }
449 
450 void
brw_fini_pipe_control(struct brw_context * brw)451 brw_fini_pipe_control(struct brw_context *brw)
452 {
453    brw_bo_unreference(brw->workaround_bo);
454 }
455