/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>

#include "intel_batchbuffer.h"
#include "intel_mipmap_tree.h"
#include "intel_fbo.h"

#include "brw_context.h"
#include "brw_state.h"

#include "blorp/blorp_genX_exec.h"

#if GEN_GEN <= 5
#include "gen4_blorp_exec.h"
#endif

#include "brw_blorp.h"

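/* Reserve space for @n dwords in the batch buffer and return a CPU pointer
 * to the reserved region; the caller writes the dwords directly.
 */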
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

   intel_batchbuffer_begin(brw, n);
   uint32_t *map = brw->batch.map_next;
   brw->batch.map_next += n;
   intel_batchbuffer_advance(brw);
   return map;
}

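/* Record a relocation at @location and return the presumed GPU address to
 * write there.  On Gen4-5, indirect state can also contain relocations, so
 * @location may live in either the batch or the state buffer.
 */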
static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location, struct blorp_address address, uint32_t delta)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;
   uint32_t offset;

   if (GEN_GEN < 6 && brw_ptr_in_state_buffer(&brw->batch, location)) {
      offset = (char *)location - (char *)brw->batch.state.map;
      return brw_state_reloc(&brw->batch, offset,
                             address.buffer, address.offset + delta,
                             address.reloc_flags);
   }

   assert(!brw_ptr_in_state_buffer(&brw->batch, location));

   offset = (char *)location - (char *)brw->batch.batch.map;
   return brw_batch_reloc(&brw->batch, offset,
                          address.buffer, address.offset + delta,
                          address.reloc_flags);
}

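/* Record a relocation for a surface state at @ss_offset within the state
 * buffer and patch the presumed address into the surface state directly.
 */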
static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                    struct blorp_address address, uint32_t delta)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;
   struct brw_bo *bo = address.buffer;

   uint64_t reloc_val =
      brw_state_reloc(&brw->batch, ss_offset, bo, address.offset + delta,
                      address.reloc_flags);

   void *reloc_ptr = (void *)brw->batch.state.map + ss_offset;
#if GEN_GEN >= 8
   *(uint64_t *)reloc_ptr = reloc_val;
#else
   *(uint32_t *)reloc_ptr = reloc_val;
#endif
}

static uint64_t
blorp_get_surface_address(struct blorp_batch *blorp_batch,
                          struct blorp_address address)
{
   /* We'll let blorp_surface_reloc write the address. */
   return 0ull;
}

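/* Surface states are allocated out of the batch's state buffer (see
 * blorp_alloc_binding_table below), so that BO serves as the base address
 * for anything blorp locates relative to a surface state.
 */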
#if GEN_GEN >= 7 && GEN_GEN < 10
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;
   return (struct blorp_address) {
      .buffer = brw->batch.state.bo,
      .offset = 0,
   };
}
#endif

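/* Allocate dynamic (indirect) state from the batch's state buffer.
 * brw_state_batch() returns a CPU map and stores the state-buffer offset
 * in *offset.
 */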
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

   return brw_state_batch(brw, size, alignment, offset);
}

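/* Allocate a binding table plus @num_entries surface states from the state
 * buffer, filling each binding table slot with the offset of the matching
 * surface state.
 */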
static void
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                          unsigned state_size, unsigned state_alignment,
                          uint32_t *bt_offset, uint32_t *surface_offsets,
                          void **surface_maps)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

   uint32_t *bt_map = brw_state_batch(brw,
                                      num_entries * sizeof(uint32_t), 32,
                                      bt_offset);

   for (unsigned i = 0; i < num_entries; i++) {
      surface_maps[i] = brw_state_batch(brw,
                                        state_size, state_alignment,
                                        &surface_offsets[i]);
      bt_map[i] = surface_offsets[i];
   }
}

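/* Allocate vertex data from the state buffer, handing back both a CPU map
 * for filling it and a relocatable GPU address for 3DSTATE_VERTEX_BUFFERS.
 */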
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                          struct blorp_address *addr)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

   /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
    *
    *    "The VF cache needs to be invalidated before binding and then using
    *    Vertex Buffers that overlap with any previously bound Vertex Buffer
    *    (at a 64B granularity) since the last invalidation.  A VF cache
    *    invalidate is performed by setting the "VF Cache Invalidation Enable"
    *    bit in PIPE_CONTROL."
    *
    * This restriction first appears in the Skylake PRM but the internal docs
    * also list it as being an issue on Broadwell.  In order to avoid this
    * problem, we align all vertex buffer allocations to 64 bytes.
    */
   uint32_t offset;
   void *data = brw_state_batch(brw, size, 64, &offset);

   *addr = (struct blorp_address) {
      .buffer = brw->batch.state.bo,
      .offset = offset,

      /* The VF cache designers apparently cut corners, and made the cache
       * only consider the bottom 32 bits of memory addresses.  If you happen
       * to have two vertex buffers which get placed exactly 4 GiB apart and
       * use them in back-to-back draw calls, you can get collisions.  To work
       * around this problem, we restrict vertex buffers to the low 32 bits of
       * the address space.
       */
      .reloc_flags = RELOC_32BIT,

#if GEN_GEN == 11
      .mocs = ICL_MOCS_WB,
#elif GEN_GEN == 10
      .mocs = CNL_MOCS_WB,
#elif GEN_GEN == 9
      .mocs = SKL_MOCS_WB,
#elif GEN_GEN == 8
      .mocs = BDW_MOCS_WB,
#elif GEN_GEN == 7
      .mocs = GEN7_MOCS_L3,
#elif GEN_GEN > 6
#error "Missing MOCS setting!"
#endif
   };

   return data;
}

/**
 * See vf_invalidate_for_vb_48b_transitions in genX_state_upload.c.
 */
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                           const struct blorp_address *addrs,
                                           UNUSED uint32_t *sizes,
                                           unsigned num_vbs)
{
#if GEN_GEN >= 8 && GEN_GEN < 11
   struct brw_context *brw = batch->driver_batch;
   bool need_invalidate = false;

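   /* The VF cache tags entries with only the low 32 address bits (see the
    * comment in blorp_alloc_vertex_buffer), so a change in the upper bits of
    * a 48-bit vertex buffer address can alias stale cache entries.
    */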
   for (unsigned i = 0; i < num_vbs; i++) {
      struct brw_bo *bo = addrs[i].buffer;
      uint16_t high_bits =
         bo && (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32u : 0;

      if (high_bits != brw->vb.last_bo_high_bits[i]) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[i] = high_bits;
      }
   }

   if (need_invalidate) {
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CS_STALL);
   }
#endif
}

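/* Scratch address blorp may use for workaround writes, such as the
 * post-sync operation of a workaround PIPE_CONTROL.
 */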
UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

   return (struct blorp_address) {
      .buffer = brw->workaround_bo,
      .offset = brw->workaround_bo_offset,
   };
}

static void
blorp_flush_range(UNUSED struct blorp_batch *batch, UNUSED void *start,
                  UNUSED size_t size)
{
   /* All allocated states come from the batch which we will flush before we
    * submit it.  There's nothing for us to do here.
    */
}

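/* On Gen7+, blorp derives its URB configuration from the L3 config we hand
 * it here; on older hardware it calls back into us to program the URB
 * directly.
 */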
#if GEN_GEN >= 7
static const struct gen_l3_config *
blorp_get_l3_config(struct blorp_batch *batch)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

   return brw->l3.config;
}
#else /* GEN_GEN < 7 */
static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      unsigned vs_entry_size,
                      UNUSED unsigned sf_entry_size)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;

#if GEN_GEN == 6
   gen6_upload_urb(brw, vs_entry_size, false, 0);
#else
   /* We calculate it now and emit later. */
   brw_calculate_urb_fence(brw, 0, vs_entry_size, sf_entry_size);
#endif
}
#endif

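/* Top-level driver hook: wrap a blorp operation in the i965 batch and state
 * bookkeeping it needs, execute it, then record which GL state blorp
 * clobbered.
 */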
void
genX(blorp_exec)(struct blorp_batch *batch,
                 const struct blorp_params *params)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;
   struct gl_context *ctx = &brw->ctx;
   bool check_aperture_failed_once = false;

#if GEN_GEN >= 11
   /* The PIPE_CONTROL command description says:
    *
    * "Whenever a Binding Table Index (BTI) used by a Render Target Message
    *  points to a different RENDER_SURFACE_STATE, SW must issue a Render
    *  Target Cache Flush by enabling this bit. When render target flush
    *  is set due to new association of BTI, PS Scoreboard Stall bit must
    *  be set in this packet."
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
#endif

   /* Flush the sampler and render caches.  We definitely need to flush the
    * sampler cache so that we get updated contents from the render cache for
    * the glBlitFramebuffer() source.  Also, we are sometimes warned in the
    * docs to flush the cache between reinterpretations of the same surface
    * data with different formats, which blorp does for stencil and depth
    * data.
    */
   if (params->src.enabled)
      brw_cache_flush_for_read(brw, params->src.addr.buffer);
   if (params->dst.enabled) {
      brw_cache_flush_for_render(brw, params->dst.addr.buffer,
                                 params->dst.view.format,
                                 params->dst.aux_usage);
   }
   if (params->depth.enabled)
      brw_cache_flush_for_depth(brw, params->depth.addr.buffer);
   if (params->stencil.enabled)
      brw_cache_flush_for_depth(brw, params->stencil.addr.buffer);

   brw_select_pipeline(brw, BRW_RENDER_PIPELINE);
   brw_emit_l3_state(brw);

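   /* Reserve worst-case batch and state space up front, and disable batch
    * wrapping, so the blorp operation is never split across two batches.
    */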
retry:
   intel_batchbuffer_require_space(brw, 1400);
   brw_require_statebuffer_space(brw, 600);
   intel_batchbuffer_save_state(brw);
   check_aperture_failed_once |= intel_batchbuffer_saved_state_is_empty(brw);
   brw->batch.no_wrap = true;

#if GEN_GEN == 6
   /* Emit workaround flushes when we switch from drawing to blorping. */
   brw_emit_post_sync_nonzero_flush(brw);
#endif

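   /* Make sure STATE_BASE_ADDRESS is current so that the surface and
    * dynamic state allocated above is addressed relative to the state
    * buffer.
    */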
   brw_upload_state_base_address(brw);

#if GEN_GEN >= 8
   gen7_l3_state.emit(brw);
#endif

#if GEN_GEN >= 6
   brw_emit_depth_stall_flushes(brw);
#endif

#if GEN_GEN == 8
   gen8_write_pma_stall_bits(brw, 0);
#endif

   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
   if (brw->current_hash_scale != scale) {
      brw_emit_hashing_mode(brw, params->x1 - params->x0,
                            params->y1 - params->y0, scale);
   }

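   /* 3DSTATE_DRAWING_RECTANGLE is left to the driver, so emit a rectangle
    * covering the blorp operation before handing off to blorp_exec().
    */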
   blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
      rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
   }

   blorp_exec(batch, params);

   brw->batch.no_wrap = false;

   /* Check if the blorp op we just did would make our batch likely to fail to
    * map all the BOs into the GPU at batch exec time later.  If so, flush the
    * batch and try again with nothing else in the batch.
    */
   if (!brw_batch_has_aperture_space(brw, 0)) {
      if (!check_aperture_failed_once) {
         check_aperture_failed_once = true;
         intel_batchbuffer_reset_to_saved(brw);
         intel_batchbuffer_flush(brw);
         goto retry;
      } else {
         int ret = intel_batchbuffer_flush(brw);
         WARN_ONCE(ret == -ENOSPC,
                   "i965: blorp emit exceeded available aperture space\n");
      }
   }

   if (unlikely(brw->always_flush_batch))
      intel_batchbuffer_flush(brw);

   /* We've smashed all state compared to what the normal 3D pipeline
    * rendering tracks for GL.
    */
   brw->ctx.NewDriverState |= BRW_NEW_BLORP;
   brw->no_depth_or_stencil = !params->depth.enabled &&
                              !params->stencil.enabled;
   brw->ib.index_size = -1;
   brw->urb.vsize = 0;
   brw->urb.gs_present = false;
   brw->urb.gsize = 0;
   brw->urb.tess_present = false;
   brw->urb.hsize = 0;
   brw->urb.dsize = 0;

   if (params->dst.enabled) {
      brw_render_cache_add_bo(brw, params->dst.addr.buffer,
                              params->dst.view.format,
                              params->dst.aux_usage);
   }
   if (params->depth.enabled)
      brw_depth_cache_add_bo(brw, params->depth.addr.buffer);
   if (params->stencil.enabled)
      brw_depth_cache_add_bo(brw, params->stencil.addr.buffer);
}