/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
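 *
 * (Rough per-engine sketch only, using the names this file itself
 * dereferences through its "ce" pointers rather than the authoritative
 * structure definition: each i915_gem_context carries, per engine, an
 * intel_context slot with
 *
 *	ce->state         - backing object holding the context image
 *	ce->ring          - the per-engine ringbuffer
 *	ce->lrc_reg_state - mapping of the register state page
 *	ce->lrc_desc      - cached context descriptor
 *
 * plus pin-count and initialisation bookkeeping.)
 *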
 * The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
This 127 * request will then be resubmitted along with a new request for a different context, 128 * which will cause the hardware to continue executing the second request and queue 129 * the new request (the GPU detects the condition of a context getting preempted 130 * with the same context and optimizes the context switch flow by not doing 131 * preemption, but just sampling the new tail pointer). 132 * 133 */ 134 #include <linux/interrupt.h> 135 136 #include <drm/drmP.h> 137 #include <drm/i915_drm.h> 138 #include "i915_drv.h" 139 #include "intel_mocs.h" 140 141 #define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) 142 #define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE) 143 #define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE) 144 145 #define RING_EXECLIST_QFULL (1 << 0x2) 146 #define RING_EXECLIST1_VALID (1 << 0x3) 147 #define RING_EXECLIST0_VALID (1 << 0x4) 148 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE) 149 #define RING_EXECLIST1_ACTIVE (1 << 0x11) 150 #define RING_EXECLIST0_ACTIVE (1 << 0x12) 151 152 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0) 153 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1) 154 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2) 155 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) 156 #define GEN8_CTX_STATUS_COMPLETE (1 << 4) 157 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) 158 159 #define GEN8_CTX_STATUS_COMPLETED_MASK \ 160 (GEN8_CTX_STATUS_ACTIVE_IDLE | \ 161 GEN8_CTX_STATUS_PREEMPTED | \ 162 GEN8_CTX_STATUS_ELEMENT_SWITCH) 163 164 #define CTX_LRI_HEADER_0 0x01 165 #define CTX_CONTEXT_CONTROL 0x02 166 #define CTX_RING_HEAD 0x04 167 #define CTX_RING_TAIL 0x06 168 #define CTX_RING_BUFFER_START 0x08 169 #define CTX_RING_BUFFER_CONTROL 0x0a 170 #define CTX_BB_HEAD_U 0x0c 171 #define CTX_BB_HEAD_L 0x0e 172 #define CTX_BB_STATE 0x10 173 #define CTX_SECOND_BB_HEAD_U 0x12 174 #define CTX_SECOND_BB_HEAD_L 0x14 175 #define CTX_SECOND_BB_STATE 0x16 176 #define CTX_BB_PER_CTX_PTR 0x18 177 #define CTX_RCS_INDIRECT_CTX 0x1a 178 #define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c 179 #define CTX_LRI_HEADER_1 0x21 180 #define CTX_CTX_TIMESTAMP 0x22 181 #define CTX_PDP3_UDW 0x24 182 #define CTX_PDP3_LDW 0x26 183 #define CTX_PDP2_UDW 0x28 184 #define CTX_PDP2_LDW 0x2a 185 #define CTX_PDP1_UDW 0x2c 186 #define CTX_PDP1_LDW 0x2e 187 #define CTX_PDP0_UDW 0x30 188 #define CTX_PDP0_LDW 0x32 189 #define CTX_LRI_HEADER_2 0x41 190 #define CTX_R_PWR_CLK_STATE 0x42 191 #define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 192 193 #define CTX_REG(reg_state, pos, reg, val) do { \ 194 (reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \ 195 (reg_state)[(pos)+1] = (val); \ 196 } while (0) 197 198 #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \ 199 const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \ 200 reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \ 201 reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \ 202 } while (0) 203 204 #define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \ 205 reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \ 206 reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \ 207 } while (0) 208 209 #define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 210 #define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x26 211 212 /* Typical size of the average request (2 pipecontrols and a MI_BB) */ 213 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */ 214 215 #define WA_TAIL_DWORDS 2 216 217 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, 218 struct intel_engine_cs *engine); 219 static void execlists_init_reg_state(u32 *reg_state, 220 struct 
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);

/**
 * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
 * @dev_priv: i915 device private
 * @enable_execlists: value of i915.enable_execlists module parameter.
 *
 * Only certain platforms support Execlists (the prerequisites being
 * support for Logical Ring Contexts and Aliasing PPGTT or better).
 *
 * Return: 1 if Execlists is supported and has to be enabled.
 */
int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enable_execlists)
{
	/* On platforms with execlist available, vGPU will only
	 * support execlist mode, no ring buffer mode.
	 */
	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) && intel_vgpu_active(dev_priv))
		return 1;

	if (INTEL_GEN(dev_priv) >= 9)
		return 1;

	if (enable_execlists == 0)
		return 0;

	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) &&
	    USES_PPGTT(dev_priv) &&
	    i915.use_mmio_flip >= 0)
		return 1;

	return 0;
}

/**
 * intel_lr_context_descriptor_update() - calculate & cache the descriptor
 * for a pinned context
 * @ctx: Context to work on
 * @engine: Engine the descriptor will be used with
 *
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 */
static void
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
				   struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];
	u64 desc;

	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (1<<GEN8_CTX_ID_WIDTH));

	desc = ctx->desc_template;			/* bits  0-11 */
	desc |= i915_ggtt_offset(ce->state) + LRC_PPHWSP_PN * PAGE_SIZE;
							/* bits 12-31 */
	desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */

	ce->lrc_desc = desc;
}

uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine)
{
	return ctx->engine[engine->id].lrc_desc;
}

static inline void
execlists_context_status_change(struct drm_i915_gem_request *rq,
				unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
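	 *
	 * (For orientation: in this file the chain fires with
	 * INTEL_CONTEXT_SCHEDULE_IN from execlists_submit_ports() when a
	 * request is written to the ELSP, and with
	 * INTEL_CONTEXT_SCHEDULE_OUT from the context-switch interrupt
	 * handler once the hardware reports the context as completed.)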
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

static u64 execlists_update_context(struct drm_i915_gem_request *rq)
{
	struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
	struct i915_hw_ppgtt *ppgtt =
		rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
	u32 *reg_state = ce->lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);

	/* True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
	if (ppgtt && !i915_vm_is_48bit(&ppgtt->base))
		execlists_update_context_pdps(ppgtt, reg_state);

	return ce->lrc_desc;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct execlist_port *port = engine->execlist_port;
	u32 __iomem *elsp =
		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
	u64 desc[2];

	GEM_BUG_ON(port[0].count > 1);
	if (!port[0].count)
		execlists_context_status_change(port[0].request,
						INTEL_CONTEXT_SCHEDULE_IN);
	desc[0] = execlists_update_context(port[0].request);
	GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
	port[0].count++;

	if (port[1].request) {
		GEM_BUG_ON(port[1].count);
		execlists_context_status_change(port[1].request,
						INTEL_CONTEXT_SCHEDULE_IN);
		desc[1] = execlists_update_context(port[1].request);
		GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
		port[1].count = 1;
	} else {
		desc[1] = 0;
	}
	GEM_BUG_ON(desc[0] == desc[1]);

	/* You must always write both descriptors in the order below. */
	writel(upper_32_bits(desc[1]), elsp);
	writel(lower_32_bits(desc[1]), elsp);

	writel(upper_32_bits(desc[0]), elsp);
	/* The context is automatically loaded after the following */
	writel(lower_32_bits(desc[0]), elsp);
}

static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ctx));
}

static bool can_merge_ctx(const struct i915_gem_context *prev,
			  const struct i915_gem_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_request *last;
	struct execlist_port *port = engine->execlist_port;
	struct rb_node *rb;
	bool submit = false;

	last = port->request;
	if (last)
		/* WaIdleLiteRestore:bdw,skl
		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
		 * as we resubmit the request. See gen8_emit_breadcrumb()
		 * for where we prepare the padding after the end of the
		 * request.
		 */
		last->tail = last->wa_tail;

	GEM_BUG_ON(port[1].request);

	/* Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple.
	 * RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	spin_lock_irq(&engine->timeline->lock);
	rb = engine->execlist_first;
	while (rb) {
		struct drm_i915_gem_request *cursor =
			rb_entry(rb, typeof(*cursor), priotree.node);

		/* Can we combine this request with the current port? It has to
		 * be the same context/ringbuffer and not have any exceptions
		 * (e.g. GVT saying never to combine contexts).
		 *
		 * If we can combine the requests, we can execute both by
		 * updating the RING_TAIL to point to the end of the second
		 * request, and so we never need to tell the hardware about
		 * the first.
		 */
		if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
			/* If we are on the second port and cannot combine
			 * this request with the last, then we are done.
			 */
			if (port != engine->execlist_port)
				break;

			/* If GVT overrides us we only ever submit port[0],
			 * leaving port[1] empty. Note that we also have
			 * to be careful that we don't queue the same
			 * context (even though a different request) to
			 * the second port.
			 */
			if (ctx_single_port_submission(last->ctx) ||
			    ctx_single_port_submission(cursor->ctx))
				break;

			GEM_BUG_ON(last->ctx == cursor->ctx);

			i915_gem_request_assign(&port->request, last);
			port++;
		}

		rb = rb_next(rb);
		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
		RB_CLEAR_NODE(&cursor->priotree.node);
		cursor->priotree.priority = INT_MAX;

		__i915_gem_request_submit(cursor);
		trace_i915_gem_request_in(cursor, port - engine->execlist_port);
		last = cursor;
		submit = true;
	}
	if (submit) {
		i915_gem_request_assign(&port->request, last);
		engine->execlist_first = rb;
	}
	spin_unlock_irq(&engine->timeline->lock);

	if (submit)
		execlists_submit_ports(engine);
}

static bool execlists_elsp_idle(struct intel_engine_cs *engine)
{
	return !engine->execlist_port[0].request;
}

static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
{
	const struct execlist_port *port = engine->execlist_port;

	return port[0].count + port[1].count < 2;
}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
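 *
 * (Worked example, assuming the usual six-entry CSB with the read pointer in
 * bits 10:8 and the write pointer in bits 2:0 of RING_CONTEXT_STATUS_PTR: a
 * status-pointer value of 0x0204 decodes to read == 2 and write == 4, so the
 * loop below consumes entries 3 and 4 and then writes 4 back as the new read
 * pointer.)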
 */
static void intel_lrc_irq_handler(unsigned long data)
{
	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
	struct execlist_port *port = engine->execlist_port;
	struct drm_i915_private *dev_priv = engine->i915;

	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);

	/* Prefer doing test_and_clear_bit() as a two stage operation to avoid
	 * imposing the cost of a locked atomic transaction when submitting a
	 * new request (outside of the context-switch interrupt).
	 */
	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
		u32 __iomem *csb_mmio =
			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
		u32 __iomem *buf =
			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
		unsigned int head, tail;

		/* The write will be ordered by the uncached read (itself
		 * a memory barrier), so we do not need another in the form
		 * of a locked instruction. The race between the interrupt
		 * handler and the split test/clear is harmless as we order
		 * our clear before the CSB read. If the interrupt arrived
		 * first between the test and the clear, we read the updated
		 * CSB and clear the bit. If the interrupt arrives as we read
		 * the CSB or later (i.e. after we had cleared the bit) the bit
		 * is set and we do a new loop.
		 */
		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
		head = readl(csb_mmio);
		tail = GEN8_CSB_WRITE_PTR(head);
		head = GEN8_CSB_READ_PTR(head);
		while (head != tail) {
			unsigned int status;

			if (++head == GEN8_CSB_ENTRIES)
				head = 0;

			/* We are flying near dragons again.
			 *
			 * We hold a reference to the request in execlist_port[]
			 * but no more than that. We are operating in softirq
			 * context and so cannot hold any mutex or sleep. That
			 * prevents us stopping the requests we are processing
			 * in port[] from being retired simultaneously (the
			 * breadcrumb will be complete before we see the
			 * context-switch). As we only hold the reference to the
			 * request, any pointer chasing underneath the request
			 * is subject to a potential use-after-free. Thus we
			 * store all of the bookkeeping within port[] as
			 * required, and avoid using unguarded pointers beneath
			 * request itself. The same applies to the atomic
			 * status notifier.
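			 *
			 * (Concretely, the bookkeeping kept in port[] and used
			 * below is the request reference itself, the per-port
			 * submission count and, for GEM_DEBUG builds, the
			 * context_id snapshot taken at ELSP write time.)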
			 */

			status = readl(buf + 2 * head);
			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
				continue;

			/* Check the context/desc id for this event matches */
			GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
					 port[0].context_id);

			GEM_BUG_ON(port[0].count == 0);
			if (--port[0].count == 0) {
				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
				GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
				execlists_context_status_change(port[0].request,
								INTEL_CONTEXT_SCHEDULE_OUT);

				trace_i915_gem_request_out(port[0].request);
				i915_gem_request_put(port[0].request);
				port[0] = port[1];
				memset(&port[1], 0, sizeof(port[1]));
			}

			GEM_BUG_ON(port[0].count == 0 &&
				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
		}

		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
		       csb_mmio);
	}

	if (execlists_elsp_ready(engine))
		execlists_dequeue(engine);

	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
}

static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
{
	struct rb_node **p, *rb;
	bool first = true;

	/* most positive priority is scheduled first, equal priorities fifo */
	rb = NULL;
	p = &root->rb_node;
	while (*p) {
		struct i915_priotree *pos;

		rb = *p;
		pos = rb_entry(rb, typeof(*pos), node);
		if (pt->priority > pos->priority) {
			p = &rb->rb_left;
		} else {
			p = &rb->rb_right;
			first = false;
		}
	}
	rb_link_node(&pt->node, rb, p);
	rb_insert_color(&pt->node, root);

	return first;
}

static void execlists_submit_request(struct drm_i915_gem_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->timeline->lock, flags);

	if (insert_request(&request->priotree, &engine->execlist_queue)) {
		engine->execlist_first = &request->priotree.node;
		if (execlists_elsp_ready(engine))
			tasklet_hi_schedule(&engine->irq_tasklet);
	}

	spin_unlock_irqrestore(&engine->timeline->lock, flags);
}

static struct intel_engine_cs *
pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
{
	struct intel_engine_cs *engine =
		container_of(pt, struct drm_i915_gem_request, priotree)->engine;

	GEM_BUG_ON(!locked);

	if (engine != locked) {
		lockmgr(&locked->timeline->lock, LK_RELEASE);
		lockmgr(&engine->timeline->lock, LK_EXCLUSIVE);
	}

	return engine;
}

static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
{
	struct intel_engine_cs *engine;
	struct i915_dependency *dep, *p;
	struct i915_dependency stack;
	LINUX_LIST_HEAD(dfs);

	if (prio <= READ_ONCE(request->priotree.priority))
		return;

	/* Need BKL in order to use the temporary link inside i915_dependency */
	lockdep_assert_held(&request->i915->drm.struct_mutex);

	stack.signaler = &request->priotree;
	list_add(&stack.dfs_link, &dfs);

	/* Recursively bump all dependent priorities to match the new request.
	 *
	 * A naive approach would be to use recursion:
	 * static void update_priorities(struct i915_priotree *pt, prio) {
	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
	 *		update_priorities(dep->signal, prio)
	 *	insert_request(pt);
	 * }
	 * but that may have unlimited recursion depth and so runs a very
	 * real risk of overrunning the kernel stack. Instead, we build
	 * a flat list of all dependencies starting with the current request.
	 * As we walk the list of dependencies, we add all of its dependencies
	 * to the end of the list (this may include an already visited
	 * request) and continue to walk onwards onto the new dependencies. The
	 * end result is a topological list of requests in reverse order, the
	 * last element in the list is the request we must execute first.
	 */
	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
		struct i915_priotree *pt = dep->signaler;

		/* Within an engine, there can be no cycle, but we may
		 * refer to the same dependency chain multiple times
		 * (redundant dependencies are not eliminated) and across
		 * engines.
		 */
		list_for_each_entry(p, &pt->signalers_list, signal_link) {
			GEM_BUG_ON(p->signaler->priority < pt->priority);
			if (prio > READ_ONCE(p->signaler->priority))
				list_move_tail(&p->dfs_link, &dfs);
		}

		list_safe_reset_next(dep, p, dfs_link);
	}

	engine = request->engine;
	spin_lock_irq(&engine->timeline->lock);

	/* Fifo and depth-first replacement ensure our deps execute before us */
	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
		struct i915_priotree *pt = dep->signaler;

		INIT_LIST_HEAD(&dep->dfs_link);

		engine = pt_lock_engine(pt, engine);

		if (prio <= pt->priority)
			continue;

		pt->priority = prio;
		if (!RB_EMPTY_NODE(&pt->node)) {
			rb_erase(&pt->node, &engine->execlist_queue);
			if (insert_request(pt, &engine->execlist_queue))
				engine->execlist_first = &pt->node;
		}
	}

	spin_unlock_irq(&engine->timeline->lock);

	/* XXX Do we need to preempt to make room for us and our deps? */
}

static int execlists_context_pin(struct intel_engine_cs *engine,
				 struct i915_gem_context *ctx)
{
	struct intel_context *ce = &ctx->engine[engine->id];
	unsigned int flags;
	void *vaddr;
	int ret;

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (ce->pin_count++)
		return 0;
	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */

	if (!ce->state) {
		ret = execlists_context_deferred_alloc(ctx, engine);
		if (ret)
			goto err;
	}
	GEM_BUG_ON(!ce->state);

	flags = PIN_GLOBAL | PIN_HIGH;
	if (ctx->ggtt_offset_bias)
		flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;

	ret = i915_vma_pin(ce->state, 0, GEN8_LR_CONTEXT_ALIGN, flags);
	if (ret)
		goto err;

	vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		goto unpin_vma;
	}

	ret = intel_ring_pin(ce->ring, ctx->ggtt_offset_bias);
	if (ret)
		goto unpin_map;

	intel_lr_context_descriptor_update(ctx, engine);

	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
		i915_ggtt_offset(ce->ring->vma);

	ce->state->obj->mm.dirty = true;

	i915_gem_context_get(ctx);
	return 0;

unpin_map:
	i915_gem_object_unpin_map(ce->state->obj);
unpin_vma:
	__i915_vma_unpin(ce->state);
err:
	ce->pin_count = 0;
	return ret;
}

static void execlists_context_unpin(struct intel_engine_cs *engine,
				    struct i915_gem_context *ctx)
{
	struct intel_context *ce = &ctx->engine[engine->id];

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
	GEM_BUG_ON(ce->pin_count == 0);

	if (--ce->pin_count)
		return;

	intel_ring_unpin(ce->ring);

	i915_gem_object_unpin_map(ce->state->obj);
	i915_vma_unpin(ce->state);

	i915_gem_context_put(ctx);
}

static int execlists_request_alloc(struct drm_i915_gem_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	struct intel_context *ce = &request->ctx->engine[engine->id];
	u32 *cs;
	int ret;

	GEM_BUG_ON(!ce->pin_count);

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += EXECLISTS_REQUEST_SIZE;

	GEM_BUG_ON(!ce->ring);
	request->ring = ce->ring;

	if (i915.enable_guc_submission) {
		/*
		 * Check that the GuC has space for the request before
		 * going any further, as the i915_add_request() call
		 * later on mustn't fail ...
		 */
		ret = i915_guc_wq_reserve(request);
		if (ret)
			goto err;
	}

	cs = intel_ring_begin(request, 0);
	if (IS_ERR(cs)) {
		ret = PTR_ERR(cs);
		goto err_unreserve;
	}

	if (!ce->initialised) {
		ret = engine->init_context(request);
		if (ret)
			goto err_unreserve;

		ce->initialised = true;
	}

	/* Note that after this point, we have committed to using
	 * this request as it is being used to both track the
	 * state of engine initialisation and liveness of the
	 * golden renderstate above. Think twice before you try
	 * to cancel/unwind this request now.
	 */

	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
	return 0;

err_unreserve:
	if (i915.enable_guc_submission)
		i915_guc_wq_unreserve(request);
err:
	return ret;
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction.
 * This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in WA batch where the
 * values are only initialized once so we cannot take register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest but it makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = i915_ggtt_offset(engine->scratch) + 256;
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = i915_ggtt_offset(engine->scratch) + 256;
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
 * makes a complete batch buffer.
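 *
 * (Worked example, assuming the usual 64-byte CACHELINE_BYTES: one cacheline
 * holds 16 DWORDs, so if a workaround function had emitted, say, 13 DWORDs,
 * the padding loop appends three MI_NOOPs to reach the next cacheline
 * boundary. The figure 13 is purely illustrative, not a count taken from the
 * code below.)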
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_GLOBAL_GTT_IVB |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       i915_ggtt_offset(engine->scratch) +
				       2 * CACHELINE_BYTES);

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

/*
 * This batch is started immediately after indirect_ctx batch. Since we ensure
 * that indirect_ctx ends on a cacheline this batch is aligned automatically.
 *
 * The number of DWORDs written is returned using this field.
 *
 * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
 * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
 */
static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	*batch++ = MI_BATCH_BUFFER_END;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2);
	*batch++ = _MASKED_BIT_DISABLE(
			GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE);
	*batch++ = MI_NOOP;

	/* WaClearSlmSpaceAtContextSwitch:kbl */
	/* Actual scratch location is at 128 bytes offset */
	if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
		batch = gen8_emit_pipe_control(batch,
					       PIPE_CONTROL_FLUSH_L3 |
					       PIPE_CONTROL_GLOBAL_GTT_IVB |
					       PIPE_CONTROL_CS_STALL |
					       PIPE_CONTROL_QW_WRITE,
					       i915_ggtt_offset(engine->scratch)
					       + 2 * CACHELINE_BYTES);
	}

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	*batch++ = MI_BATCH_BUFFER_END;

	return batch;
}

#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)

static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
	if (err)
		goto err;

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

static int intel_init_workaround_bb(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
					    &wa_ctx->per_ctx };
	wa_bb_func_t wa_bb_fn[2];
	struct page *page;
	void *batch, *batch_ptr;
	unsigned int i;
	int ret;

	if (WARN_ON(engine->id != RCS || !engine->scratch))
		return -EINVAL;

	switch (INTEL_GEN(engine->i915)) {
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = gen9_init_perctx_bb;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = gen8_init_perctx_bb;
		break;
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		return 0;
	}

	ret = lrc_setup_wa_ctx(engine);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
		return ret;
	}

	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
	batch = batch_ptr = kmap_atomic(page);

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, CACHELINE_BYTES))) {
			ret = -EINVAL;
			break;
		}
		batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}

	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);

	kunmap_atomic(batch);
	if (ret)
		lrc_destroy_wa_ctx(engine);

	return ret;
}

static u32 port_seqno(struct execlist_port *port)
{
	return port->request ?
		port->request->global_seqno : 0;
}

static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = intel_mocs_init_engine(engine);
	if (ret)
		return ret;

	intel_engine_reset_breadcrumbs(engine);
	intel_engine_init_hangcheck(engine);

	I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
	I915_WRITE(RING_MODE_GEN7(engine),
		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
		   engine->status_page.ggtt_offset);
	POSTING_READ(RING_HWS_PGA(engine->mmio_base));

	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);

	/* After a GPU reset, we may have requests to replay */
	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
		DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n",
				 engine->name,
				 port_seqno(&engine->execlist_port[0]),
				 port_seqno(&engine->execlist_port[1]));
		engine->execlist_port[0].count = 0;
		engine->execlist_port[1].count = 0;
		execlists_submit_ports(engine);
	}

	return 0;
}

static int gen8_init_render_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_common_ring(engine);
	if (ret)
		return ret;

	/* We need to disable the AsyncFlip performance optimisations in order
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
	 * programmed to '1' on all products.
	 *
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
	 */
	I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));

	I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));

	return init_workarounds_ring(engine);
}

static int gen9_init_render_ring(struct intel_engine_cs *engine)
{
	int ret;

	ret = gen8_init_common_ring(engine);
	if (ret)
		return ret;

	return init_workarounds_ring(engine);
}

static void reset_common_ring(struct intel_engine_cs *engine,
			      struct drm_i915_gem_request *request)
{
	struct execlist_port *port = engine->execlist_port;
	struct intel_context *ce;

	/* If the request was innocent, we leave the request in the ELSP
	 * and will try to replay it on restarting. The context image may
	 * have been corrupted by the reset, in which case we may have
	 * to service a new GPU hang, but more likely we can continue on
	 * without impact.
	 *
	 * If the request was guilty, we presume the context is corrupt
	 * and have to at least restore the RING register in the context
	 * image back to the expected values to skip over the guilty request.
	 */
	if (!request || request->fence.error != -EIO)
		return;

	/* We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
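	 *
	 * (Note: at the end of this function the saved tail is also rewound
	 * by WA_TAIL_DWORDS * sizeof(u32), i.e. 8 bytes with the current
	 * WA_TAIL_DWORDS of 2, landing just before the two padding MI_NOOPs
	 * emitted by gen8_emit_wa_tail(), matching the WaIdleLiteRestore
	 * handling in execlists_dequeue().)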
	 */
	ce = &request->ctx->engine[engine->id];
	execlists_init_reg_state(ce->lrc_reg_state,
				 request->ctx, engine, ce->ring);

	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
		i915_ggtt_offset(ce->ring->vma);
	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;

	request->ring->head = request->postfix;
	intel_ring_update_space(request->ring);

	/* Catch up with any missed context-switch interrupts */
	if (request->ctx != port[0].request->ctx) {
		i915_gem_request_put(port[0].request);
		port[0] = port[1];
		memset(&port[1], 0, sizeof(port[1]));
	}

	GEM_BUG_ON(request->ctx != port[0].request->ctx);

	/* Reset WaIdleLiteRestore:bdw,skl as well */
	request->tail =
		intel_ring_wrap(request->ring,
				request->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
	assert_ring_tail_valid(request->ring, request->tail);
}

static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
{
	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
	struct intel_engine_cs *engine = req->engine;
	const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
	u32 *cs;
	int i;

	cs = intel_ring_begin(req, num_lri_cmds * 2 + 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
	for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);

		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
		*cs++ = upper_32_bits(pd_daddr);
		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
		*cs++ = lower_32_bits(pd_daddr);
	}

	*cs++ = MI_NOOP;
	intel_ring_advance(req, cs);

	return 0;
}

static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
			      const unsigned int flags)
{
	u32 *cs;
	int ret;

	/* Don't rely on hw updating PDPs, especially in lite-restore.
	 * Ideally, we should set Force PD Restore in ctx descriptor,
	 * but we can't. Force Restore would be a second option, but
	 * it is unsafe in case of lite-restore (because the ctx is
	 * not idle). PML4 is allocated during ppgtt init so this is
	 * not needed in 48-bit.
	 */
	if (req->ctx->ppgtt &&
	    (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings) &&
	    !i915_vm_is_48bit(&req->ctx->ppgtt->base) &&
	    !intel_vgpu_active(req->i915)) {
		ret = intel_logical_ring_emit_pdps(req);
		if (ret)
			return ret;

		req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
	}

	cs = intel_ring_begin(req, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* FIXME(BDW): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
		(flags & I915_DISPATCH_RS ?
		 MI_BATCH_RESOURCE_STREAMER : 0);
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);
	*cs++ = MI_NOOP;
	intel_ring_advance(req, cs);

	return 0;
}

static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask | engine->irq_keep_mask));
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
}

static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
}

static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(request, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (request->engine->id == VCS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */
	intel_ring_advance(request, cs);

	return 0;
}

static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
				  u32 mode)
{
	struct intel_engine_cs *engine = request->engine;
	u32 scratch_addr =
		i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 *cs, flags = 0;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (IS_GEN9(request->i915))
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	cs = intel_ring_begin(request, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (vf_flush_wa)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
					    0);

	cs = gen8_emit_pipe_control(cs, flags, scratch_addr);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(request, cs);

	return 0;
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs)
{
	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;
	request->wa_tail = intel_ring_offset(request, cs);
}

static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
{
	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));

	*cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
	*cs++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = request->global_seqno;
	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;
	request->tail = intel_ring_offset(request, cs);
	assert_ring_tail_valid(request->ring, request->tail);

	gen8_emit_wa_tail(request, cs);
}

static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;

static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
					u32 *cs)
{
	/* We're using qword write, seqno should be aligned to 8 bytes. */
	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);

	/* w/a for post sync ops following a GPGPU operation we
	 * need a prior CS_STALL, which is emitted by the flush
	 * following the batch.
	 */
	*cs++ = GFX_OP_PIPE_CONTROL(6);
	*cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
		PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_hws_seqno_address(request->engine);
	*cs++ = 0;
	*cs++ = request->global_seqno;
	/* We're thrashing one dword of HWS. */
	*cs++ = 0;
	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;
	request->tail = intel_ring_offset(request, cs);
	assert_ring_tail_valid(request->ring, request->tail);

	gen8_emit_wa_tail(request, cs);
}

static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;

static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
{
	int ret;

	ret = intel_ring_workarounds_emit(req);
	if (ret)
		return ret;

	ret = intel_rcs_context_init_mocs(req);
	/*
	 * Failing to program the MOCS is non-fatal. The system will not
	 * run at peak performance. So generate an error and carry on.
	 */
	if (ret)
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return i915_gem_render_state_emit(req);
}

/**
 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
 * @engine: Engine Command Streamer.
 */
void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv;

	/*
	 * Tasklet cannot be active at this point due to intel_mark_active/idle
	 * so this is just for documentation.
	 */
	if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->irq_tasklet.state)))
		tasklet_kill(&engine->irq_tasklet);

	dev_priv = engine->i915;

	if (engine->buffer) {
		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
	}

	if (engine->cleanup)
		engine->cleanup(engine);

	if (engine->status_page.vma) {
		i915_gem_object_unpin_map(engine->status_page.vma->obj);
		engine->status_page.vma = NULL;
	}

	intel_engine_cleanup_common(engine);

	lrc_destroy_wa_ctx(engine);
	engine->i915 = NULL;
	dev_priv->engine[engine->id] = NULL;
	kfree(engine);
}

static void execlists_set_default_submission(struct intel_engine_cs *engine)
{
	engine->submit_request = execlists_submit_request;
	engine->schedule = execlists_schedule;
	engine->irq_tasklet.func = intel_lrc_irq_handler;
}

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */
	engine->init_hw = gen8_init_common_ring;
	engine->reset_hw = reset_common_ring;

	engine->context_pin = execlists_context_pin;
	engine->context_unpin = execlists_context_unpin;

	engine->request_alloc = execlists_request_alloc;

	engine->emit_flush = gen8_emit_flush;
	engine->emit_breadcrumb = gen8_emit_breadcrumb;
	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;

	engine->set_default_submission = execlists_set_default_submission;

	engine->irq_enable = gen8_logical_ring_enable_irq;
	engine->irq_disable = gen8_logical_ring_disable_irq;
	engine->emit_bb_start = gen8_emit_bb_start;
}

static inline void
logical_ring_default_irqs(struct intel_engine_cs *engine)
{
	unsigned shift = engine->irq_shift;
	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
}

static int
lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma)
{
	const int hws_offset = LRC_PPHWSP_PN * PAGE_SIZE;
	void *hws;

	/* The HWSP is part of the default context object in LRC mode. */
	hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	engine->status_page.page_addr = hws + hws_offset;
	engine->status_page.ggtt_offset = i915_ggtt_offset(vma) + hws_offset;
	engine->status_page.vma = vma;

	return 0;
}

static void
logical_ring_setup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	enum forcewake_domains fw_domains;

	intel_engine_setup_common(engine);

	/* Intentionally left blank.
	 */
	engine->buffer = NULL;

	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
						    RING_ELSP(engine),
						    FW_REG_WRITE);

	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
						     RING_CONTEXT_STATUS_PTR(engine),
						     FW_REG_READ | FW_REG_WRITE);

	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
						     RING_CONTEXT_STATUS_BUF_BASE(engine),
						     FW_REG_READ);

	engine->fw_domains = fw_domains;

	tasklet_init(&engine->irq_tasklet,
		     intel_lrc_irq_handler, (unsigned long)engine);

	logical_ring_default_vfuncs(engine);
	logical_ring_default_irqs(engine);
}

static int
logical_ring_init(struct intel_engine_cs *engine)
{
	struct i915_gem_context *dctx = engine->i915->kernel_context;
	int ret;

	ret = intel_engine_init_common(engine);
	if (ret)
		goto error;

	/* And setup the hardware status page. */
	ret = lrc_setup_hws(engine, dctx->engine[engine->id].state);
	if (ret) {
		DRM_ERROR("Failed to set up hws %s: %d\n", engine->name, ret);
		goto error;
	}

	return 0;

error:
	intel_logical_ring_cleanup(engine);
	return ret;
}

int logical_render_ring_init(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	logical_ring_setup(engine);

	if (HAS_L3_DPF(dev_priv))
		engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	/* Override some for render ring. */
	if (INTEL_GEN(dev_priv) >= 9)
		engine->init_hw = gen9_init_render_ring;
	else
		engine->init_hw = gen8_init_render_ring;
	engine->init_context = gen8_init_rcs_context;
	engine->emit_flush = gen8_emit_flush_render;
	engine->emit_breadcrumb = gen8_emit_breadcrumb_render;
	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_render_sz;

	ret = intel_engine_create_scratch(engine, PAGE_SIZE);
	if (ret)
		return ret;

	ret = intel_init_workaround_bb(engine);
	if (ret) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		DRM_ERROR("WA batch buffer initialization failed: %d\n",
			  ret);
	}

	return logical_ring_init(engine);
}

int logical_xcs_ring_init(struct intel_engine_cs *engine)
{
	logical_ring_setup(engine);

	return logical_ring_init(engine);
}

static u32
make_rpcs(struct drm_i915_private *dev_priv)
{
	u32 rpcs = 0;

	/*
	 * No explicit RPCS request is needed to ensure full
	 * slice/subslice/EU enablement prior to Gen9.
	 */
	if (INTEL_GEN(dev_priv) < 9)
		return 0;

	/*
	 * Starting in Gen9, render power gating can leave
	 * slice/subslice/EU in a partially enabled state. We
	 * must make an explicit request through RPCS for full
	 * enablement.
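	 *
	 * (Illustrative only: for a hypothetical part with 1 slice and 3
	 * subslices enabled and both power-gating flags set, the code below
	 * would OR in GEN8_RPCS_ENABLE together with a slice count of 1 and
	 * a subslice count of 3 in their count fields; the exact bit
	 * positions are given by the GEN8_RPCS_* shift definitions.)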
	if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
		rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask) <<
			GEN8_RPCS_S_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev_priv)->sseu.has_subslice_pg) {
		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
		rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask) <<
			GEN8_RPCS_SS_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
		rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
			GEN8_RPCS_EU_MIN_SHIFT;
		rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
			GEN8_RPCS_EU_MAX_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	return rpcs;
}

static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
{
	u32 indirect_ctx_offset;

	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		/* fall through */
	case 9:
		indirect_ctx_offset =
			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 8:
		indirect_ctx_offset =
			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	}

	return indirect_ctx_offset;
}

static void execlists_init_reg_state(u32 *regs,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
	u32 base = engine->mmio_base;
	bool rcs = engine->id == RCS;

	/* A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 */
	regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
				 MI_LRI_FORCE_POSTED;

	CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
				   CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				   (HAS_RESOURCE_STREAMER(dev_priv) ?
				    CTX_CTRL_RS_CTX_ENABLE : 0)));
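	/*
	 * Note: the CTX_REG() helper used throughout is defined elsewhere in
	 * the driver; its semantics are assumed here from its usage below to
	 * be "store the register offset at regs[pos] and the value at
	 * regs[pos + 1]", which is why the workaround-batch code further down
	 * patches regs[... + 1] directly after the descriptors have been
	 * written.
	 */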
	CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
	CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
	CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
	CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
		RING_CTL_SIZE(ring->size) | RING_VALID);
	CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
	CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
	CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
	CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
	CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
	CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
	if (rcs) {
		CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
		CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
		CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
			RING_INDIRECT_CTX_OFFSET(base), 0);

		if (engine->wa_ctx.vma) {
			struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
			u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

			regs[CTX_RCS_INDIRECT_CTX + 1] =
				(ggtt_offset + wa_ctx->indirect_ctx.offset) |
				(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);

			regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
				intel_lr_indirect_ctx_offset(engine) << 6;

			regs[CTX_BB_PER_CTX_PTR + 1] =
				(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
		}
	}

	regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;

	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
	/* PDP values will be assigned later if needed */
	CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
	CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
	CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
	CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
	CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
	CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
	CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
	CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);

	if (ppgtt && i915_vm_is_48bit(&ppgtt->base)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address of PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	}

	if (rcs) {
		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
			make_rpcs(dev_priv));
	}
}

static int
populate_lr_context(struct i915_gem_context *ctx,
		    struct drm_i915_gem_object *ctx_obj,
		    struct intel_engine_cs *engine,
		    struct intel_ring *ring)
{
	void *vaddr;
	int ret;

	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
		return ret;
	}

	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
		return ret;
	}
	ctx_obj->mm.dirty = true;

	/* The second page of the context object contains some fields which must
	 * be set up prior to the first execution.
	 */
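	/*
	 * Rough sketch of where things live in the backing object, expressed
	 * only in terms of the page-number constants used in this file (their
	 * numeric values are defined elsewhere):
	 *
	 *   page LRC_PPHWSP_PN : per-process HWSP (mapped by lrc_setup_hws)
	 *   page LRC_STATE_PN  : register state, initialized just below
	 *   remaining pages    : the rest of the engine context image
	 *
	 * GuC submission additionally expects shared data ahead of the PPHWSP;
	 * see the extra page(s) added in execlists_context_deferred_alloc().
	 */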
	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
				 ctx, engine, ring);

	i915_gem_object_unpin_map(ctx_obj);

	return 0;
}

/**
 * intel_lr_context_size() - return the size of the context for an engine
 * @engine: which engine to find the context size for
 *
 * Each engine may require a different amount of space for a context image,
 * so when allocating (or copying) an image, this function can be used to
 * find the right size for the specific engine.
 *
 * Return: size (in bytes) of an engine-specific context image
 *
 * Note: this size includes the HWSP, which is part of the context image
 * in LRC mode, but does not include the "shared data page" used with
 * GuC submission. The caller should account for this if using the GuC.
 */
uint32_t intel_lr_context_size(struct intel_engine_cs *engine)
{
	int ret = 0;

	WARN_ON(INTEL_GEN(engine->i915) < 8);

	switch (engine->id) {
	case RCS:
		if (INTEL_GEN(engine->i915) >= 9)
			ret = GEN9_LR_CONTEXT_RENDER_SIZE;
		else
			ret = GEN8_LR_CONTEXT_RENDER_SIZE;
		break;
	case VCS:
	case BCS:
	case VECS:
	case VCS2:
		ret = GEN8_LR_CONTEXT_OTHER_SIZE;
		break;
	}

	return ret;
}

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *ctx_obj;
	struct intel_context *ce = &ctx->engine[engine->id];
	struct i915_vma *vma;
	uint32_t context_size;
	struct intel_ring *ring;
	int ret;

	WARN_ON(ce->state);

	context_size = round_up(intel_lr_context_size(engine),
				I915_GTT_PAGE_SIZE);

	/* One extra page for the data shared between the driver and GuC */
	context_size += PAGE_SIZE * LRC_PPHWSP_PN;

	ctx_obj = i915_gem_object_create(ctx->i915, context_size);
	if (IS_ERR(ctx_obj)) {
		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
		return PTR_ERR(ctx_obj);
	}

	vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.base, NULL);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto error_deref_obj;
	}

	ring = intel_engine_create_ring(engine, ctx->ring_size);
	if (IS_ERR(ring)) {
		ret = PTR_ERR(ring);
		goto error_deref_obj;
	}

	ret = populate_lr_context(ctx, ctx_obj, engine, ring);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
		goto error_ring_free;
	}

	ce->ring = ring;
	ce->state = vma;
	ce->initialised |= engine->init_context == NULL;

	return 0;

error_ring_free:
	intel_ring_free(ring);
error_deref_obj:
	i915_gem_object_put(ctx_obj);
	return ret;
}

void intel_lr_context_resume(struct drm_i915_private *dev_priv)
{
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;

	/* Because we emit WA_TAIL_DWORDS there may be a disparity
	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
	 * that stored in the context. As we only write new commands from
	 * ce->ring->tail onwards, everything before that is junk. If the GPU
	 * starts reading from the RING_HEAD stored in the context, it may try
	 * to execute that junk and die.
	 *
	 * So to avoid that we reset the context images upon resume. For
	 * simplicity, we just zero everything out.
	 */
	list_for_each_entry(ctx, &dev_priv->context_list, link) {
		for_each_engine(engine, dev_priv, id) {
			struct intel_context *ce = &ctx->engine[engine->id];
			u32 *reg;

			if (!ce->state)
				continue;

			reg = i915_gem_object_pin_map(ce->state->obj,
						      I915_MAP_WB);
			if (WARN_ON(IS_ERR(reg)))
				continue;

			reg += LRC_STATE_PN * PAGE_SIZE / sizeof(*reg);
			reg[CTX_RING_HEAD + 1] = 0;
			reg[CTX_RING_TAIL + 1] = 0;

			ce->state->obj->mm.dirty = true;
			i915_gem_object_unpin_map(ce->state->obj);

			intel_ring_reset(ce->ring, 0);
		}
	}
}