/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we just need one set of those per engine command streamer? This
 * is where the name "Logical Rings" starts to make sense: by virtualizing
 * the rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object
 * and so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they
 * are only allowed with the render ring, we can allocate & populate them
 * right away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based,
 * method). This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is
 * not updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request
 * queue for the appropriate engine: this structure contains a copy of the
 * context's tail after the request was written to the ring buffer and a
 * pointer to the context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed
 * during a context switch interrupt. In any case, elements on the queue will
 * get sent (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short)
 * with a globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch
 * interrupt. During the interrupt handling, the driver examines the events
 * in the buffer: for each context complete event, if the announced ID
 * matches that on the head of the request queue, then that request is
 * retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front
 * of the queue are next to be submitted but since a context may not occur
 * twice in an execution list, if subsequent requests have the same ID as the
 * first then the two requests must be combined. This is done simply by
 * discarding requests at the head of the queue until either only one request
 * is left (in which case we use a NULL second context) or the first two
 * requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single
 * context completes but a second context is still executing, the request for
 * this second context will be at the head of the queue when we remove the
 * first one.
 * This request will then be resubmitted along with a new request for a
 * different context, which will cause the hardware to continue executing the
 * second request and queue the new request (the GPU detects the condition of
 * a context getting preempted with the same context and optimizes the
 * context switch flow by not doing preemption, but just sampling the new
 * tail pointer).
 *
 */
#include <linux/interrupt.h>

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "intel_mocs.h"

#define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE)

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define CTX_LRI_HEADER_0		0x01
#define CTX_CONTEXT_CONTROL		0x02
#define CTX_RING_HEAD			0x04
#define CTX_RING_TAIL			0x06
#define CTX_RING_BUFFER_START		0x08
#define CTX_RING_BUFFER_CONTROL		0x0a
#define CTX_BB_HEAD_U			0x0c
#define CTX_BB_HEAD_L			0x0e
#define CTX_BB_STATE			0x10
#define CTX_SECOND_BB_HEAD_U		0x12
#define CTX_SECOND_BB_HEAD_L		0x14
#define CTX_SECOND_BB_STATE		0x16
#define CTX_BB_PER_CTX_PTR		0x18
#define CTX_RCS_INDIRECT_CTX		0x1a
#define CTX_RCS_INDIRECT_CTX_OFFSET	0x1c
#define CTX_LRI_HEADER_1		0x21
#define CTX_CTX_TIMESTAMP		0x22
#define CTX_PDP3_UDW			0x24
#define CTX_PDP3_LDW			0x26
#define CTX_PDP2_UDW			0x28
#define CTX_PDP2_LDW			0x2a
#define CTX_PDP1_UDW			0x2c
#define CTX_PDP1_LDW			0x2e
#define CTX_PDP0_UDW			0x30
#define CTX_PDP0_LDW			0x32
#define CTX_LRI_HEADER_2		0x41
#define CTX_R_PWR_CLK_STATE		0x42
#define CTX_GPGPU_CSR_BASE_ADDRESS	0x44

#define GEN8_CTX_VALID			(1<<0)
#define GEN8_CTX_FORCE_PD_RESTORE	(1<<1)
#define GEN8_CTX_FORCE_RESTORE		(1<<2)
#define GEN8_CTX_L3LLC_COHERENT		(1<<5)
#define GEN8_CTX_PRIVILEGE		(1<<8)

#define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \
	(reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \
	(reg_state)[(pos)+1] = (val); \
} while (0)

#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \
	const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \
	reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \
	reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \
} while (0)

#define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \
	reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \
	reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \
} while (0)

enum {
	FAULT_AND_HANG = 0,
	FAULT_AND_HALT, /* Debug only */
	FAULT_AND_STREAM,
	FAULT_AND_CONTINUE /* Unsupported */
};
#define GEN8_CTX_ID_SHIFT 32
#define GEN8_CTX_ID_WIDTH 21
#define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT	0x17
#define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT	0x26
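
/*
 * Note on the (reg, value) layout: the CTX_* offsets above index in dwords
 * into the logical ring context image, where each register is stored as an
 * (offset, value) pair. That is why ASSIGN_CTX_REG() writes two consecutive
 * dwords, and why e.g. ASSIGN_CTX_PDP(ppgtt, reg_state, 3) only patches the
 * value slots (CTX_PDP3_UDW + 1 and CTX_PDP3_LDW + 1): the register offsets
 * themselves are written once, when the context image is first populated.
 */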

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine);
static int intel_lr_context_pin(struct i915_gem_context *ctx,
				struct intel_engine_cs *engine);

/**
 * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
 * @dev_priv: i915 device private
 * @enable_execlists: value of i915.enable_execlists module parameter.
 *
 * Only certain platforms support Execlists (the prerequisites being
 * support for Logical Ring Contexts and Aliasing PPGTT or better).
 *
 * Return: 1 if Execlists is supported and has to be enabled.
 */
int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enable_execlists)
{
	/* On platforms with execlist available, vGPU will only
	 * support execlist mode, no ring buffer mode.
	 */
	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) && intel_vgpu_active(dev_priv))
		return 1;

	if (INTEL_GEN(dev_priv) >= 9)
		return 1;

	if (enable_execlists == 0)
		return 0;

	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) &&
	    USES_PPGTT(dev_priv) &&
	    i915.use_mmio_flip >= 0)
		return 1;

	return 0;
}
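
/*
 * Illustrative call pattern only (a sketch, not code used in this file):
 * the intent is that the driver load path writes the sanitized value back,
 * e.g.
 *
 *	i915.enable_execlists =
 *		intel_sanitize_enable_execlists(dev_priv,
 *						i915.enable_execlists);
 *
 * so that every later check can simply test i915.enable_execlists.
 */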

static void
logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
		engine->idle_lite_restore_wa = ~0;

	engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
					IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
					(engine->id == VCS || engine->id == VCS2);

	engine->ctx_desc_template = GEN8_CTX_VALID;
	if (IS_GEN8(dev_priv))
		engine->ctx_desc_template |= GEN8_CTX_L3LLC_COHERENT;
	engine->ctx_desc_template |= GEN8_CTX_PRIVILEGE;

	/* TODO: WaDisableLiteRestore when we start using semaphore
	 * signalling between Command Streamers */
	/* ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; */

	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
	if (engine->disable_lite_restore_wa)
		engine->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE;
}

/**
 * intel_lr_context_descriptor_update() - calculate & cache the descriptor
 *					  for a pinned context
 * @ctx: Context to work on
 * @engine: Engine the descriptor will be used with
 *
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *	bits  0-11: flags, GEN8_CTX_* (cached in ctx_desc_template)
 *	bits 12-31: LRCA, GTT address of (the HWSP of) this context
 *	bits 32-52: ctx ID, a globally unique tag
 *	bits 53-54: mbz, reserved for use by hardware
 *	bits 55-63: group ID, currently unused and set to 0
 */
static void
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
				   struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];
	u64 desc;

	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (1<<GEN8_CTX_ID_WIDTH));

	desc = ctx->desc_template;			/* bits  3-4  */
	desc |= engine->ctx_desc_template;		/* bits  0-11 */
	desc |= ce->lrc_vma->node.start + LRC_PPHWSP_PN * PAGE_SIZE;
							/* bits 12-31 */
	desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */

	ce->lrc_desc = desc;
}

uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine)
{
	return ctx->engine[engine->id].lrc_desc;
}

static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
				 struct drm_i915_gem_request *rq1)
{
	struct intel_engine_cs *engine = rq0->engine;
	struct drm_i915_private *dev_priv = rq0->i915;
	uint64_t desc[2];

	if (rq1) {
		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
		rq1->elsp_submitted++;
	} else {
		desc[1] = 0;
	}

	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
	rq0->elsp_submitted++;

	/* You must always write both descriptors in the order below. */
	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));

	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
	/* The context is automatically loaded after the following */
	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));

	/* ELSP is a write-only register, so use another nearby reg for posting */
	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
}

static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}
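
/*
 * Bring a request's context image up to date before it is handed to the
 * ELSP: write the new ring tail and, for 32b PPGTT, refresh the PDP
 * entries in case page directories were (re)allocated since the image was
 * last populated.
 */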

static void execlists_update_context(struct drm_i915_gem_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
	uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);

	/* True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
		execlists_update_context_pdps(ppgtt, reg_state);
}

static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
					   struct drm_i915_gem_request *rq1)
{
	struct drm_i915_private *dev_priv = rq0->i915;
	unsigned int fw_domains = rq0->engine->fw_domains;

	execlists_update_context(rq0);

	if (rq1)
		execlists_update_context(rq1);

	spin_lock_irq(&dev_priv->uncore.lock);
	intel_uncore_forcewake_get__locked(dev_priv, fw_domains);

	execlists_elsp_write(rq0, rq1);

	intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
	spin_unlock_irq(&dev_priv->uncore.lock);
}

static inline void execlists_context_status_change(
		struct drm_i915_gem_request *rq,
		unsigned long status)
{
	/*
	 * This is currently only used when GVT-g is enabled. When GVT-g is
	 * disabled, the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
}
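
/*
 * Fill the two ELSP ports from the head of the execlist queue: consecutive
 * requests belonging to the same context are coalesced into the first port,
 * since a context may not appear twice in an execution list.
 */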

static void execlists_unqueue(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
	struct drm_i915_gem_request *cursor, *tmp;

	assert_spin_locked(&engine->execlist_lock);

	/*
	 * If irqs are not active generate a warning as batches that finish
	 * without the irqs may get lost and a GPU Hang may occur.
	 */
	WARN_ON(!intel_irqs_enabled(engine->i915));

	/* Try to read in pairs */
	list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
				 execlist_link) {
		if (!req0) {
			req0 = cursor;
		} else if (req0->ctx == cursor->ctx) {
			/* Same ctx: ignore first request, as second request
			 * will update tail past first request's workload */
			cursor->elsp_submitted = req0->elsp_submitted;
			list_del(&req0->execlist_link);
			i915_gem_request_put(req0);
			req0 = cursor;
		} else {
			if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
				/*
				 * req0 (after merged) ctx requires single
				 * submission, stop picking
				 */
				if (req0->ctx->execlists_force_single_submission)
					break;
				/*
				 * req0 ctx doesn't require single submission,
				 * but next req ctx requires, stop picking
				 */
				if (cursor->ctx->execlists_force_single_submission)
					break;
			}
			req1 = cursor;
			WARN_ON(req1->elsp_submitted);
			break;
		}
	}

	if (unlikely(!req0))
		return;

	execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);

	if (req1)
		execlists_context_status_change(req1,
						INTEL_CONTEXT_SCHEDULE_IN);

	if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
		/*
		 * WaIdleLiteRestore: make sure we never cause a lite restore
		 * with HEAD==TAIL.
		 *
		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL as we
		 * resubmit the request. See gen8_emit_request() for where we
		 * prepare the padding after the end of the request.
		 */
		req0->tail += 8;
		req0->tail &= req0->ring->size - 1;
	}

	execlists_elsp_submit_contexts(req0, req1);
}

static unsigned int
execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
{
	struct drm_i915_gem_request *head_req;

	assert_spin_locked(&engine->execlist_lock);

	head_req = list_first_entry_or_null(&engine->execlist_queue,
					    struct drm_i915_gem_request,
					    execlist_link);

	if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
		return 0;

	WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");

	if (--head_req->elsp_submitted > 0)
		return 0;

	execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);

	list_del(&head_req->execlist_link);
	i915_gem_request_put(head_req);

	return 1;
}

static u32
get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
		   u32 *context_id)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u32 status;

	read_pointer %= GEN8_CSB_ENTRIES;

	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));

	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
		return 0;

	*context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
							      read_pointer));

	return status;
}
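
/*
 * A worked example of the CSB wrap handling below, assuming the usual
 * GEN8_CSB_ENTRIES value of 6: with a driver read pointer of 5 and a
 * hardware write pointer of 1, write_pointer becomes 1 + 6 = 7 and the
 * loop consumes entries 6 and 7, which get_context_status() reduces
 * modulo 6 to the physical slots 0 and 1.
 */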

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void intel_lrc_irq_handler(unsigned long data)
{
	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
	struct drm_i915_private *dev_priv = engine->i915;
	u32 status_pointer;
	unsigned int read_pointer, write_pointer;
	u32 csb[GEN8_CSB_ENTRIES][2];
	unsigned int csb_read = 0, i;
	unsigned int submit_contexts = 0;

	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);

	status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));

	read_pointer = engine->next_context_status_buffer;
	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
	if (read_pointer > write_pointer)
		write_pointer += GEN8_CSB_ENTRIES;

	while (read_pointer < write_pointer) {
		if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
			break;
		csb[csb_read][0] = get_context_status(engine, ++read_pointer,
						      &csb[csb_read][1]);
		csb_read++;
	}

	engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;

	/* Update the read pointer to the old write pointer. Manual ringbuffer
	 * management ftw </sarcasm> */
	I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
		      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
				    engine->next_context_status_buffer << 8));

	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);

	lockmgr(&engine->execlist_lock, LK_EXCLUSIVE);

	for (i = 0; i < csb_read; i++) {
		if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
			if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
				if (execlists_check_remove_request(engine, csb[i][1]))
					WARN(1, "Lite Restored request removed from queue\n");
			} else
				WARN(1, "Preemption without Lite Restore\n");
		}

		if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
		    GEN8_CTX_STATUS_ELEMENT_SWITCH))
			submit_contexts +=
				execlists_check_remove_request(engine, csb[i][1]);
	}

	if (submit_contexts) {
		if (!engine->disable_lite_restore_wa ||
		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
			execlists_unqueue(engine);
	}

	lockmgr(&engine->execlist_lock, LK_RELEASE);

	if (unlikely(submit_contexts > 2))
		DRM_ERROR("More than two context complete events?\n");
}

static void execlists_submit_request(struct drm_i915_gem_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	struct drm_i915_gem_request *cursor;
	int num_elements = 0;

	spin_lock_bh(&engine->execlist_lock);

	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
		if (++num_elements > 2)
			break;

	if (num_elements > 2) {
		struct drm_i915_gem_request *tail_req;

		tail_req = list_last_entry(&engine->execlist_queue,
					   struct drm_i915_gem_request,
					   execlist_link);

		if (request->ctx == tail_req->ctx) {
			WARN(tail_req->elsp_submitted != 0,
			     "More than 2 already-submitted reqs queued\n");
			list_del(&tail_req->execlist_link);
			i915_gem_request_put(tail_req);
		}
	}

	i915_gem_request_get(request);
	list_add_tail(&request->execlist_link, &engine->execlist_queue);
	request->ctx_hw_id = request->ctx->hw_id;
	if (num_elements == 0)
		execlists_unqueue(engine);

	spin_unlock_bh(&engine->execlist_lock);
}
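
/*
 * Per-request setup for execlists submission: allocate and pin the logical
 * ring context on first use, reserve ring space for the request epilogue
 * and, when GuC submission is enabled, check for workqueue space up front.
 */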

int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	struct intel_context *ce = &request->ctx->engine[engine->id];
	int ret;

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += EXECLISTS_REQUEST_SIZE;

	if (!ce->state) {
		ret = execlists_context_deferred_alloc(request->ctx, engine);
		if (ret)
			return ret;
	}

	request->ring = ce->ring;

	if (i915.enable_guc_submission) {
		/*
		 * Check that the GuC has space for the request before
		 * going any further, as the i915_add_request() call
		 * later on mustn't fail ...
		 */
		ret = i915_guc_wq_check_space(request);
		if (ret)
			return ret;
	}

	ret = intel_lr_context_pin(request->ctx, engine);
	if (ret)
		return ret;

	ret = intel_ring_begin(request, 0);
	if (ret)
		goto err_unpin;

	if (!ce->initialised) {
		ret = engine->init_context(request);
		if (ret)
			goto err_unpin;

		ce->initialised = true;
	}

	/* Note that after this point, we have committed to using
	 * this request as it is being used to both track the
	 * state of engine initialisation and liveness of the
	 * golden renderstate above. Think twice before you try
	 * to cancel/unwind this request now.
	 */

	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
	return 0;

err_unpin:
	intel_lr_context_unpin(request->ctx, engine);
	return ret;
}
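
/*
 * Note the pairing with EXECLISTS_REQUEST_SIZE above: reserved_space is
 * bumped while the request is being constructed and dropped again once we
 * have committed to it, the intent being that the epilogue emitted at
 * add-request time cannot then fail for lack of ring space.
 */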

/*
 * intel_logical_ring_advance() - advance the tail and prepare for submission
 * @request: Request to advance the logical ringbuffer of.
 *
 * The tail is updated in our logical ringbuffer struct, not in the actual
 * context. What really happens during submission is that the context and
 * current tail will be placed on a queue waiting for the ELSP to be ready to
 * accept a new context submission. At that point, the tail *inside* the
 * context is updated and the ELSP written to.
 */
static int
intel_logical_ring_advance(struct drm_i915_gem_request *request)
{
	struct intel_ring *ring = request->ring;
	struct intel_engine_cs *engine = request->engine;

	intel_ring_advance(ring);
	request->tail = ring->tail;

	/*
	 * Here we add two extra NOOPs as padding to avoid
	 * lite restore of a context with HEAD==TAIL.
	 *
	 * Caller must reserve WA_TAIL_DWORDS for us!
	 */
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	/* We keep the previous context alive until we retire the following
	 * request. This ensures that the context object is still pinned for
	 * any residual writes the HW makes into it on the context switch
	 * into the next object following the breadcrumb. Otherwise, we may
	 * retire the context too early.
	 */
	request->previous_context = engine->last_context;
	engine->last_context = request->ctx;
	return 0;
}

void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_request *req, *tmp;
	LINUX_LIST_HEAD(cancel_list);

	WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));

	spin_lock_bh(&engine->execlist_lock);
	list_replace_init(&engine->execlist_queue, &cancel_list);
	spin_unlock_bh(&engine->execlist_lock);

	list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
		list_del(&req->execlist_link);
		i915_gem_request_put(req);
	}
}

static int intel_lr_context_pin(struct i915_gem_context *ctx,
				struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = ctx->i915;
	struct intel_context *ce = &ctx->engine[engine->id];
	void *vaddr;
	u32 *lrc_reg_state;
	int ret;

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (ce->pin_count++)
		return 0;

	ret = i915_gem_object_ggtt_pin(ce->state, NULL,
				       0, GEN8_LR_CONTEXT_ALIGN,
				       PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
	if (ret)
		goto err;

	vaddr = i915_gem_object_pin_map(ce->state);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		goto unpin_ctx_obj;
	}

	lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;

	ret = intel_ring_pin(ce->ring);
	if (ret)
		goto unpin_map;

	ce->lrc_vma = i915_gem_obj_to_ggtt(ce->state);
	intel_lr_context_descriptor_update(ctx, engine);

	lrc_reg_state[CTX_RING_BUFFER_START+1] = ce->ring->vma->node.start;
	ce->lrc_reg_state = lrc_reg_state;
	ce->state->dirty = true;

	/* Invalidate GuC TLB. */
	if (i915.enable_guc_submission)
		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);

	i915_gem_context_get(ctx);
	return 0;

unpin_map:
	i915_gem_object_unpin_map(ce->state);
unpin_ctx_obj:
	i915_gem_object_ggtt_unpin(ce->state);
err:
	ce->pin_count = 0;
	return ret;
}

void intel_lr_context_unpin(struct i915_gem_context *ctx,
			    struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
	GEM_BUG_ON(ce->pin_count == 0);

	if (--ce->pin_count)
		return;

	intel_ring_unpin(ce->ring);

	i915_gem_object_unpin_map(ce->state);
	i915_gem_object_ggtt_unpin(ce->state);

	ce->lrc_vma = NULL;
	ce->lrc_desc = 0;
	ce->lrc_reg_state = NULL;

	i915_gem_context_put(ctx);
}

static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
{
	int ret, i;
	struct intel_ring *ring = req->ring;
	struct i915_workarounds *w = &req->i915->workarounds;

	if (w->count == 0)
		return 0;

	ret = req->engine->emit_flush(req, EMIT_BARRIER);
	if (ret)
		return ret;

	ret = intel_ring_begin(req, w->count * 2 + 2);
	if (ret)
		return ret;

	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
	for (i = 0; i < w->count; i++) {
		intel_ring_emit_reg(ring, w->reg[i].addr);
		intel_ring_emit(ring, w->reg[i].value);
	}
	intel_ring_emit(ring, MI_NOOP);

	intel_ring_advance(ring);

	ret = req->engine->emit_flush(req, EMIT_BARRIER);
	if (ret)
		return ret;

	return 0;
}
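
/*
 * Helpers for building the per-engine workaround (WA) batch buffers:
 * wa_ctx_emit() stores one dword into the batch page, bailing out of the
 * calling function with -ENOSPC if the batch would overflow the page.
 */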

#define wa_ctx_emit(batch, index, cmd)					\
	do {								\
		int __index = (index)++;				\
		if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
			return -ENOSPC;					\
		}							\
		batch[__index] = (cmd);					\
	} while (0)

#define wa_ctx_emit_reg(batch, index, reg) \
	wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg))

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication as this is applied in a WA
 * batch where the values are only initialized once, so we cannot take the
 * register value at the beginning and reuse it further; hence we save its
 * value to memory, upload a constant value with bit21 set and then restore
 * it back with the saved value. To simplify the WA, a constant value is
 * formed by using the default value of this register. This shouldn't be a
 * problem because we are only modifying it for a short period and this
 * batch is non-preemptible. We could of course use additional instructions
 * that read the actual value of the register at that time and set our bit
 * of interest, but that makes the WA more complicated.
 *
 * This WA is also required for Gen9 so extracting it as a function avoids
 * code duplication.
 */
static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine,
						uint32_t *batch,
						uint32_t index)
{
	uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);

	/*
	 * WaDisableLSQCROPERFforOCL:skl,kbl
	 * This WA is implemented in skl_init_clock_gating() but since
	 * this batch updates GEN8_L3SQCREG4 with default value we need to
	 * set this bit here to retain the WA during flush.
	 */
	if (IS_SKL_REVID(engine->i915, 0, SKL_REVID_E0) ||
	    IS_KBL_REVID(engine->i915, 0, KBL_REVID_E0))
		l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS;

	wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 |
				   MI_SRM_LRM_GLOBAL_GTT));
	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
	wa_ctx_emit(batch, index, engine->scratch.gtt_offset + 256);
	wa_ctx_emit(batch, index, 0);

	wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
	wa_ctx_emit(batch, index, l3sqc4_flush);

	wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
	wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL |
				   PIPE_CONTROL_DC_FLUSH_ENABLE));
	wa_ctx_emit(batch, index, 0);
	wa_ctx_emit(batch, index, 0);
	wa_ctx_emit(batch, index, 0);
	wa_ctx_emit(batch, index, 0);

	wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 |
				   MI_SRM_LRM_GLOBAL_GTT));
	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
	wa_ctx_emit(batch, index, engine->scratch.gtt_offset + 256);
	wa_ctx_emit(batch, index, 0);

	return index;
}

static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
				    uint32_t offset,
				    uint32_t start_alignment)
{
	return wa_ctx->offset = ALIGN(offset, start_alignment);
}

static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
			     uint32_t offset,
			     uint32_t size_alignment)
{
	wa_ctx->size = offset - wa_ctx->offset;

	WARN(wa_ctx->size % size_alignment,
	     "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n",
	     wa_ctx->size, size_alignment);
	return 0;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them
 * based on a criterion. At the moment this batch always starts at the
 * beginning of the page and at this point we don't have multiple wa_ctx
 * batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
 */
static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine,
				    struct i915_wa_ctx_bb *wa_ctx,
				    uint32_t *batch,
				    uint32_t *offset)
{
	uint32_t scratch_addr;
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);

	/* WaDisableCtxRestoreArbitration:bdw,chv */
	wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915)) {
		int rc = gen8_emit_flush_coherentl3_wa(engine, batch, index);
		if (rc < 0)
			return rc;
		index = rc;
	}

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	scratch_addr = engine->scratch.gtt_offset + 2*CACHELINE_BYTES;

	wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
	wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 |
				   PIPE_CONTROL_GLOBAL_GTT_IVB |
				   PIPE_CONTROL_CS_STALL |
				   PIPE_CONTROL_QW_WRITE));
	wa_ctx_emit(batch, index, scratch_addr);
	wa_ctx_emit(batch, index, 0);
	wa_ctx_emit(batch, index, 0);
	wa_ctx_emit(batch, index, 0);

	/* Pad to end of cacheline */
	while (index % CACHELINE_DWORDS)
		wa_ctx_emit(batch, index, MI_NOOP);

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
}

/*
 * This batch is started immediately after indirect_ctx batch. Since we ensure
 * that indirect_ctx ends on a cacheline this batch is aligned automatically.
 *
 * The number of DWORDs written are returned using this field.
 *
 * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add
 * padding to align it with cacheline, as padding after MI_BATCH_BUFFER_END
 * is redundant.
 */
static int gen8_init_perctx_bb(struct intel_engine_cs *engine,
			       struct i915_wa_ctx_bb *wa_ctx,
			       uint32_t *batch,
			       uint32_t *offset)
{
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);

	/* WaDisableCtxRestoreArbitration:bdw,chv */
	wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);

	wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);

	return wa_ctx_end(wa_ctx, *offset = index, 1);
}
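
/* Gen9 flavour of the indirect context WA batch buffer. */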

static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine,
				    struct i915_wa_ctx_bb *wa_ctx,
				    uint32_t *batch,
				    uint32_t *offset)
{
	int ret;
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);

	/* WaDisableCtxRestoreArbitration:skl,bxt */
	if (IS_SKL_REVID(engine->i915, 0, SKL_REVID_D0) ||
	    IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1))
		wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */
	ret = gen8_emit_flush_coherentl3_wa(engine, batch, index);
	if (ret < 0)
		return ret;
	index = ret;

	/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl */
	wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
	wa_ctx_emit_reg(batch, index, COMMON_SLICE_CHICKEN2);
	wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(
			    GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE));
	wa_ctx_emit(batch, index, MI_NOOP);

	/* WaClearSlmSpaceAtContextSwitch:kbl */
	/* Actual scratch location is at 128 bytes offset */
	if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
		uint32_t scratch_addr
			= engine->scratch.gtt_offset + 2*CACHELINE_BYTES;

		wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
		wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 |
					   PIPE_CONTROL_GLOBAL_GTT_IVB |
					   PIPE_CONTROL_CS_STALL |
					   PIPE_CONTROL_QW_WRITE));
		wa_ctx_emit(batch, index, scratch_addr);
		wa_ctx_emit(batch, index, 0);
		wa_ctx_emit(batch, index, 0);
		wa_ctx_emit(batch, index, 0);
	}

	/* WaMediaPoolStateCmdInWABB:bxt */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		u32 eu_pool_config = 0x00777000;
		wa_ctx_emit(batch, index, GEN9_MEDIA_POOL_STATE);
		wa_ctx_emit(batch, index, GEN9_MEDIA_POOL_ENABLE);
		wa_ctx_emit(batch, index, eu_pool_config);
		wa_ctx_emit(batch, index, 0);
		wa_ctx_emit(batch, index, 0);
		wa_ctx_emit(batch, index, 0);
	}

	/* Pad to end of cacheline */
	while (index % CACHELINE_DWORDS)
		wa_ctx_emit(batch, index, MI_NOOP);

	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
}

static int gen9_init_perctx_bb(struct intel_engine_cs *engine,
			       struct i915_wa_ctx_bb *wa_ctx,
			       uint32_t *batch,
			       uint32_t *offset)
{
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);

	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
	if (IS_SKL_REVID(engine->i915, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1)) {
		wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
		wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0);
		wa_ctx_emit(batch, index,
			    _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING));
		wa_ctx_emit(batch, index, MI_NOOP);
	}

	/* WaClearTdlStateAckDirtyBits:bxt */
	if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_B0)) {
		wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(4));

		wa_ctx_emit_reg(batch, index, GEN8_STATE_ACK);
		wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS));

		wa_ctx_emit_reg(batch, index, GEN9_STATE_ACK_SLICE1);
		wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS));

		wa_ctx_emit_reg(batch, index, GEN9_STATE_ACK_SLICE2);
		wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS));

		wa_ctx_emit_reg(batch, index, GEN7_ROW_CHICKEN2);
		/* dummy write to CS, mask bits are 0 to ensure the register is not modified */
		wa_ctx_emit(batch, index, 0x0);
		wa_ctx_emit(batch, index, MI_NOOP);
	}

	/* WaDisableCtxRestoreArbitration:skl,bxt */
	if (IS_SKL_REVID(engine->i915, 0, SKL_REVID_D0) ||
	    IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1))
		wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);

	wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);

	return wa_ctx_end(wa_ctx, *offset = index, 1);
}

static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *engine, u32 size)
{
	int ret;

	engine->wa_ctx.obj = i915_gem_object_create(&engine->i915->drm,
						    PAGE_ALIGN(size));
	if (IS_ERR(engine->wa_ctx.obj)) {
		DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
		ret = PTR_ERR(engine->wa_ctx.obj);
		engine->wa_ctx.obj = NULL;
		return ret;
	}

	ret = i915_gem_object_ggtt_pin(engine->wa_ctx.obj, NULL,
				       0, PAGE_SIZE, 0);
	if (ret) {
		DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
				 ret);
		i915_gem_object_put(engine->wa_ctx.obj);
		return ret;
	}

	return 0;
}

static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *engine)
{
	if (engine->wa_ctx.obj) {
		i915_gem_object_ggtt_unpin(engine->wa_ctx.obj);
		i915_gem_object_put(engine->wa_ctx.obj);
		engine->wa_ctx.obj = NULL;
	}
}
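
/*
 * Allocate and populate the per-engine WA context page: the indirect_ctx
 * batch is executed as part of the context restore, and the per_ctx batch
 * right after it. Only the render engine on gen8/9 uses this.
 */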

static int intel_init_workaround_bb(struct intel_engine_cs *engine)
{
	int ret;
	uint32_t *batch;
	uint32_t offset;
	struct page *page;
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;

	WARN_ON(engine->id != RCS);

	/* update this when WA for higher Gen are added */
	if (INTEL_GEN(engine->i915) > 9) {
		DRM_ERROR("WA batch buffer is not initialized for Gen%d\n",
			  INTEL_GEN(engine->i915));
		return 0;
	}

	/* some WA perform writes to scratch page, ensure it is valid */
	if (engine->scratch.obj == NULL) {
		DRM_ERROR("scratch page not allocated for %s\n", engine->name);
		return -EINVAL;
	}

	ret = lrc_setup_wa_ctx_obj(engine, PAGE_SIZE);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
		return ret;
	}

	page = i915_gem_object_get_dirty_page(wa_ctx->obj, 0);
	batch = kmap_atomic(page);
	offset = 0;

	if (IS_GEN8(engine->i915)) {
		ret = gen8_init_indirectctx_bb(engine,
					       &wa_ctx->indirect_ctx,
					       batch,
					       &offset);
		if (ret)
			goto out;

		ret = gen8_init_perctx_bb(engine,
					  &wa_ctx->per_ctx,
					  batch,
					  &offset);
		if (ret)
			goto out;
	} else if (IS_GEN9(engine->i915)) {
		ret = gen9_init_indirectctx_bb(engine,
					       &wa_ctx->indirect_ctx,
					       batch,
					       &offset);
		if (ret)
			goto out;

		ret = gen9_init_perctx_bb(engine,
					  &wa_ctx->per_ctx,
					  batch,
					  &offset);
		if (ret)
			goto out;
	}

out:
	kunmap_atomic(batch);
	if (ret)
		lrc_destroy_wa_ctx_obj(engine);

	return ret;
}

static void lrc_init_hws(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
		   (u32)engine->status_page.gfx_addr);
	POSTING_READ(RING_HWS_PGA(engine->mmio_base));
}
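
/*
 * Common engine (re)initialisation for execlists: restore the HWSP address,
 * unmask interrupts, enable execlist mode and resynchronise our CSB read
 * pointer with whatever the hardware reports.
 */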

static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned int next_context_status_buffer_hw;

	lrc_init_hws(engine);

	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask | engine->irq_keep_mask));
	I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);

	I915_WRITE(RING_MODE_GEN7(engine),
		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
	POSTING_READ(RING_MODE_GEN7(engine));

	/*
	 * Instead of resetting the Context Status Buffer (CSB) read pointer to
	 * zero, we need to read the write pointer from hardware and use its
	 * value because "this register is power context save restored".
	 * Effectively, these states have been observed:
	 *
	 *      | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
	 * BDW  | CSB regs not reset       | CSB regs reset       |
	 * CHT  | CSB regs not reset       | CSB regs not reset   |
	 * SKL  |         ?                |         ?            |
	 * BXT  |         ?                |         ?            |
	 */
	next_context_status_buffer_hw =
		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));

	/*
	 * When the CSB registers are reset (also after power-up / gpu reset),
	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
	 * this special case, so the first element read is CSB[0].
	 */
	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);

	engine->next_context_status_buffer = next_context_status_buffer_hw;
	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);

	intel_engine_init_hangcheck(engine);

	return intel_mocs_init_engine(engine);
}

static int gen8_init_render_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_common_ring(engine);
	if (ret)
		return ret;

	/* We need to disable the AsyncFlip performance optimisations in order
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
	 * programmed to '1' on all products.
	 *
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
	 */
	I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));

	I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));

	return init_workarounds_ring(engine);
}

static int gen9_init_render_ring(struct intel_engine_cs *engine)
{
	int ret;

	ret = gen8_init_common_ring(engine);
	if (ret)
		return ret;

	return init_workarounds_ring(engine);
}

static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
{
	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
	struct intel_ring *ring = req->ring;
	struct intel_engine_cs *engine = req->engine;
	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
	int i, ret;

	ret = intel_ring_begin(req, num_lri_cmds * 2 + 2);
	if (ret)
		return ret;

	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(num_lri_cmds));
	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);

		intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, i));
		intel_ring_emit(ring, upper_32_bits(pd_daddr));
		intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, i));
		intel_ring_emit(ring, lower_32_bits(pd_daddr));
	}

	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	return 0;
}
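
/*
 * Emit the MI_BATCH_BUFFER_START for a request, reloading the PDP registers
 * first via LRI when the context's page directories have changed and the
 * hardware cannot be relied upon to reload them itself.
 */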

static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
			      unsigned int dispatch_flags)
{
	struct intel_ring *ring = req->ring;
	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
	int ret;

	/* Don't rely on hw updating PDPs, especially in lite-restore.
	 * Ideally, we should set Force PD Restore in ctx descriptor,
	 * but we can't. Force Restore would be a second option, but
	 * it is unsafe in case of lite-restore (because the ctx is
	 * not idle). PML4 is allocated during ppgtt init so this is
	 * not needed in 48-bit.
	 */
	if (req->ctx->ppgtt &&
	    (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings)) {
		if (!USES_FULL_48BIT_PPGTT(req->i915) &&
		    !intel_vgpu_active(req->i915)) {
			ret = intel_logical_ring_emit_pdps(req);
			if (ret)
				return ret;
		}

		req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	/* FIXME(BDW): Address space and security selectors. */
	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 |
			(ppgtt<<8) |
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
	intel_ring_emit(ring, lower_32_bits(offset));
	intel_ring_emit(ring, upper_32_bits(offset));
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	return 0;
}

static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask | engine->irq_keep_mask));
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
}

static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
}

static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
{
	struct intel_ring *ring = request->ring;
	u32 cmd;
	int ret;

	ret = intel_ring_begin(request, 4);
	if (ret)
		return ret;

	cmd = MI_FLUSH_DW + 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (request->engine->id == VCS)
			cmd |= MI_INVALIDATE_BSD;
	}

	intel_ring_emit(ring, cmd);
	intel_ring_emit(ring,
			I915_GEM_HWS_SCRATCH_ADDR |
			MI_FLUSH_DW_USE_GTT);
	intel_ring_emit(ring, 0); /* upper addr */
	intel_ring_emit(ring, 0); /* value */
	intel_ring_advance(ring);

	return 0;
}
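
/*
 * Render engine flush via PIPE_CONTROL. Some platforms need preparatory
 * workarounds (a null PIPE_CONTROL before VF cache invalidation on gen9,
 * a DC flush/CS stall dance on early KBL), hence the variable length.
 */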

static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
				  u32 mode)
{
	struct intel_ring *ring = request->ring;
	struct intel_engine_cs *engine = request->engine;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 flags = 0;
	int ret;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (IS_GEN9(request->i915))
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	ret = intel_ring_begin(request, len);
	if (ret)
		return ret;

	if (vf_flush_wa) {
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
	}

	if (dc_flush_wa) {
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(ring, PIPE_CONTROL_DC_FLUSH_ENABLE);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
	}

	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(ring, flags);
	intel_ring_emit(ring, scratch_addr);
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, 0);

	if (dc_flush_wa) {
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(ring, PIPE_CONTROL_CS_STALL);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
	}

	intel_ring_advance(ring);

	return 0;
}

static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
{
	/*
	 * On BXT A steppings there is a HW coherency issue whereby the
	 * MI_STORE_DATA_IMM storing the completed request's seqno
	 * occasionally doesn't invalidate the CPU cache. Work around this by
	 * clflushing the corresponding cacheline whenever the caller wants
	 * the coherency to be guaranteed. Note that this cacheline is known
	 * to be clean at this point, since we only write it in
	 * bxt_a_set_seqno(), where we also do a clflush after the write. So
	 * this clflush in practice becomes an invalidate operation.
	 */
	intel_flush_status_page(engine, I915_GEM_HWS_INDEX);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
#define WA_TAIL_DWORDS 2

static int gen8_emit_request(struct drm_i915_gem_request *request)
{
	struct intel_ring *ring = request->ring;
	int ret;

	ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
	if (ret)
		return ret;

	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));

	intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
	intel_ring_emit(ring,
			intel_hws_seqno_address(request->engine) |
			MI_FLUSH_DW_USE_GTT);
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, request->fence.seqno);
	intel_ring_emit(ring, MI_USER_INTERRUPT);
	intel_ring_emit(ring, MI_NOOP);
	return intel_logical_ring_advance(request);
}
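
/*
 * Render variant of the request epilogue: the seqno is written with a
 * PIPE_CONTROL QW write instead of MI_FLUSH_DW, hence the 8-byte alignment
 * requirement checked below.
 */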

static int gen8_emit_request_render(struct drm_i915_gem_request *request)
{
	struct intel_ring *ring = request->ring;
	int ret;

	ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
	if (ret)
		return ret;

	/* We're using qword write, seqno should be aligned to 8 bytes. */
	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);

	/* w/a for post sync ops following a GPGPU operation we
	 * need a prior CS_STALL, which is emitted by the flush
	 * following the batch.
	 */
	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(ring,
			(PIPE_CONTROL_GLOBAL_GTT_IVB |
			 PIPE_CONTROL_CS_STALL |
			 PIPE_CONTROL_QW_WRITE));
	intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, i915_gem_request_get_seqno(request));
	/* We're thrashing one dword of HWS. */
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, MI_USER_INTERRUPT);
	intel_ring_emit(ring, MI_NOOP);
	return intel_logical_ring_advance(request);
}

static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
{
	int ret;

	ret = intel_logical_ring_workarounds_emit(req);
	if (ret)
		return ret;

	ret = intel_rcs_context_init_mocs(req);
	/*
	 * Failing to program the MOCS is non-fatal. The system will not
	 * run at peak performance. So generate an error and carry on.
	 */
	if (ret)
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return i915_gem_render_state_init(req);
}

/**
 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
 * @engine: Engine Command Streamer.
 */
void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv;

	if (!intel_engine_initialized(engine))
		return;

	/*
	 * Tasklet cannot be active at this point due to intel_mark_active/idle
	 * so this is just for documentation.
	 */
	if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->irq_tasklet.state)))
		tasklet_kill(&engine->irq_tasklet);

	dev_priv = engine->i915;

	if (engine->buffer) {
		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
	}

	if (engine->cleanup)
		engine->cleanup(engine);

	intel_engine_cleanup_common(engine);

	if (engine->status_page.obj) {
		i915_gem_object_unpin_map(engine->status_page.obj);
		engine->status_page.obj = NULL;
	}
	intel_lr_context_unpin(dev_priv->kernel_context, engine);

	engine->idle_lite_restore_wa = 0;
	engine->disable_lite_restore_wa = false;
	engine->ctx_desc_template = 0;

	lrc_destroy_wa_ctx_obj(engine);
	engine->i915 = NULL;
}

void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
{
	struct intel_engine_cs *engine;

	for_each_engine(engine, dev_priv)
		engine->submit_request = execlists_submit_request;
}
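
/*
 * Engine construction installs the defaults below first and then
 * specialises them; see logical_render_ring_init() further down, which
 * e.g. replaces emit_flush with gen8_emit_flush_render for the RCS.
 */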
static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */
	engine->init_hw = gen8_init_common_ring;
	engine->emit_flush = gen8_emit_flush;
	engine->emit_request = gen8_emit_request;
	engine->submit_request = execlists_submit_request;

	engine->irq_enable = gen8_logical_ring_enable_irq;
	engine->irq_disable = gen8_logical_ring_disable_irq;
	engine->emit_bb_start = gen8_emit_bb_start;
	if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1))
		engine->irq_seqno_barrier = bxt_a_seqno_barrier;
}

static inline void
logical_ring_default_irqs(struct intel_engine_cs *engine)
{
	unsigned shift = engine->irq_shift;

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
}

static int
lrc_setup_hws(struct intel_engine_cs *engine,
	      struct drm_i915_gem_object *dctx_obj)
{
	void *hws;

	/* The HWSP is part of the default context object in LRC mode. */
	engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(dctx_obj) +
				       LRC_PPHWSP_PN * PAGE_SIZE;
	hws = i915_gem_object_pin_map(dctx_obj);
	if (IS_ERR(hws))
		return PTR_ERR(hws);
	engine->status_page.page_addr = hws + LRC_PPHWSP_PN * PAGE_SIZE;
	engine->status_page.obj = dctx_obj;

	return 0;
}

static void
logical_ring_setup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	enum forcewake_domains fw_domains;

	intel_engine_setup_common(engine);

	/* Intentionally left blank. */
	engine->buffer = NULL;

	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
						    RING_ELSP(engine),
						    FW_REG_WRITE);

	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
						     RING_CONTEXT_STATUS_PTR(engine),
						     FW_REG_READ | FW_REG_WRITE);

	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
						     RING_CONTEXT_STATUS_BUF_BASE(engine),
						     FW_REG_READ);

	engine->fw_domains = fw_domains;

	tasklet_init(&engine->irq_tasklet,
		     intel_lrc_irq_handler, (unsigned long)engine);

	logical_ring_init_platform_invariants(engine);
	logical_ring_default_vfuncs(engine);
	logical_ring_default_irqs(engine);
}

static int
logical_ring_init(struct intel_engine_cs *engine)
{
	struct i915_gem_context *dctx = engine->i915->kernel_context;
	int ret;

	ret = intel_engine_init_common(engine);
	if (ret)
		goto error;

	ret = execlists_context_deferred_alloc(dctx, engine);
	if (ret)
		goto error;

	/* As this is the default context, always pin it. */
	ret = intel_lr_context_pin(dctx, engine);
	if (ret) {
		DRM_ERROR("Failed to pin context for %s: %d\n",
			  engine->name, ret);
		goto error;
	}

	/* And setup the hardware status page. */
	ret = lrc_setup_hws(engine, dctx->engine[engine->id].state);
	if (ret) {
		DRM_ERROR("Failed to set up hws %s: %d\n", engine->name, ret);
		goto error;
	}

	return 0;

error:
	intel_logical_ring_cleanup(engine);
	return ret;
}
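/*
 * Illustrative only: engine bring-up is a two-phase affair. A minimal
 * sketch, assuming an already-populated engine structure:
 *
 *	logical_ring_setup(engine);	 // vfuncs, irqs, forcewake domains
 *	ret = logical_ring_init(engine); // common init, default ctx, HWS
 *	// on failure, logical_ring_init() has already called
 *	// intel_logical_ring_cleanup() for us
 *
 * logical_render_ring_init() below follows this pattern, inserting its
 * render-specific overrides between the two phases.
 */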
int logical_render_ring_init(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	logical_ring_setup(engine);

	if (HAS_L3_DPF(dev_priv))
		engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	/* Override some for render ring. */
	if (INTEL_GEN(dev_priv) >= 9)
		engine->init_hw = gen9_init_render_ring;
	else
		engine->init_hw = gen8_init_render_ring;
	engine->init_context = gen8_init_rcs_context;
	engine->cleanup = intel_fini_pipe_control;
	engine->emit_flush = gen8_emit_flush_render;
	engine->emit_request = gen8_emit_request_render;

	ret = intel_init_pipe_control(engine, 4096);
	if (ret)
		return ret;

	ret = intel_init_workaround_bb(engine);
	if (ret) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing critical
		 * enough to prevent us from using the GPU.
		 */
		DRM_ERROR("WA batch buffer initialization failed: %d\n",
			  ret);
	}

	ret = logical_ring_init(engine);
	if (ret)
		lrc_destroy_wa_ctx_obj(engine);

	return ret;
}

int logical_xcs_ring_init(struct intel_engine_cs *engine)
{
	logical_ring_setup(engine);

	return logical_ring_init(engine);
}

static u32
make_rpcs(struct drm_i915_private *dev_priv)
{
	u32 rpcs = 0;

	/*
	 * No explicit RPCS request is needed to ensure full
	 * slice/subslice/EU enablement prior to Gen9.
	 */
	if (INTEL_GEN(dev_priv) < 9)
		return 0;

	/*
	 * Starting in Gen9, render power gating can leave
	 * slice/subslice/EU in a partially enabled state. We
	 * must make an explicit request through RPCS for full
	 * enablement.
	 */
	if (INTEL_INFO(dev_priv)->has_slice_pg) {
		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
		rpcs |= INTEL_INFO(dev_priv)->slice_total <<
			GEN8_RPCS_S_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev_priv)->has_subslice_pg) {
		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
		rpcs |= INTEL_INFO(dev_priv)->subslice_per_slice <<
			GEN8_RPCS_SS_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev_priv)->has_eu_pg) {
		rpcs |= INTEL_INFO(dev_priv)->eu_per_subslice <<
			GEN8_RPCS_EU_MIN_SHIFT;
		rpcs |= INTEL_INFO(dev_priv)->eu_per_subslice <<
			GEN8_RPCS_EU_MAX_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	return rpcs;
}

static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
{
	u32 indirect_ctx_offset;

	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		/* fall through */
	case 9:
		indirect_ctx_offset =
			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 8:
		indirect_ctx_offset =
			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	}

	return indirect_ctx_offset;
}
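/*
 * Worked example (illustrative, with made-up topology numbers): on a
 * hypothetical Gen9 part with slice and EU power gating, 3 slices and
 * 8 EUs per subslice, make_rpcs() above would fold the counts in as
 *
 *	rpcs = GEN8_RPCS_ENABLE |
 *	       GEN8_RPCS_S_CNT_ENABLE | (3 << GEN8_RPCS_S_CNT_SHIFT) |
 *	       (8 << GEN8_RPCS_EU_MIN_SHIFT) | (8 << GEN8_RPCS_EU_MAX_SHIFT);
 *
 * i.e. each power-gateable resource is requested at its full count so the
 * context never runs with a partially enabled slice/subslice/EU set.
 */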
static int
populate_lr_context(struct i915_gem_context *ctx,
		    struct drm_i915_gem_object *ctx_obj,
		    struct intel_engine_cs *engine,
		    struct intel_ring *ring)
{
	struct drm_i915_private *dev_priv = ctx->i915;
	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
	void *vaddr;
	u32 *reg_state;
	int ret;

	if (!ppgtt)
		ppgtt = dev_priv->mm.aliasing_ppgtt;

	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
		return ret;
	}

	vaddr = i915_gem_object_pin_map(ctx_obj);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
		return ret;
	}
	ctx_obj->dirty = true;

	/*
	 * The second page of the context object contains some fields which
	 * must be set up prior to the first execution.
	 */
	reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;

	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with
	 * new values (including all the missing MI_LOAD_REGISTER_IMM
	 * commands that we are not initializing here).
	 */
	reg_state[CTX_LRI_HEADER_0] =
		MI_LOAD_REGISTER_IMM(engine->id == RCS ? 14 : 11) |
		MI_LRI_FORCE_POSTED;
	ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL,
		       RING_CONTEXT_CONTROL(engine),
		       _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
					  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
					  (HAS_RESOURCE_STREAMER(dev_priv) ?
					   CTX_CTRL_RS_CTX_ENABLE : 0)));
	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base),
		       0);
	/*
	 * The ring buffer start address is not known until the buffer is
	 * pinned. It is written to the context image in
	 * execlists_update_context().
	 */
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START,
		       RING_START(engine->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
		       RING_CTL(engine->mmio_base),
		       ((ring->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U,
		       RING_BBADDR_UDW(engine->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L,
		       RING_BBADDR(engine->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_BB_STATE,
		       RING_BBSTATE(engine->mmio_base),
		       RING_BB_PPGTT);
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U,
		       RING_SBBADDR_UDW(engine->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L,
		       RING_SBBADDR(engine->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE,
		       RING_SBBSTATE(engine->mmio_base), 0);
	if (engine->id == RCS) {
		ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR,
			       RING_BB_PER_CTX_PTR(engine->mmio_base), 0);
		ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX,
			       RING_INDIRECT_CTX(engine->mmio_base), 0);
		ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET,
			       RING_INDIRECT_CTX_OFFSET(engine->mmio_base), 0);
		if (engine->wa_ctx.obj) {
			struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
			uint32_t ggtt_offset =
				i915_gem_obj_ggtt_offset(wa_ctx->obj);

			reg_state[CTX_RCS_INDIRECT_CTX+1] =
				(ggtt_offset +
				 wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
				(wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);

			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
				intel_lr_indirect_ctx_offset(engine) << 6;

			reg_state[CTX_BB_PER_CTX_PTR+1] =
				(ggtt_offset +
				 wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
				0x01;
		}
	}
	reg_state[CTX_LRI_HEADER_1] =
		MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
	ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP,
		       RING_CTX_TIMESTAMP(engine->mmio_base), 0);
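	/*
	 * For illustration only: ASSIGN_CTX_REG() (defined elsewhere in the
	 * driver) writes one (reg, value) pair into the image; the sketch
	 * below shows the intended effect, assuming the macro takes the
	 * dword index of the pair:
	 *
	 *	reg_state[pos] = i915_mmio_reg_offset(reg);
	 *	reg_state[pos + 1] = val;
	 *
	 * The PDP registers that follow are therefore only placeholders:
	 * their value dwords (reg_state[CTX_PDPn_*DW + 1]) are rewritten
	 * with real page-directory addresses before submission.
	 */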
	/* PDP values will be assigned later if needed. */
	ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0),
		       0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0),
		       0);

	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
		/*
		 * 64b PPGTT (48bit canonical):
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, reg_state);
	} else {
		/*
		 * 32b PPGTT:
		 * PDP*_DESCRIPTOR contains the base address of the space
		 * supported. With dynamic page allocation, PDPs may not be
		 * allocated at this point. Point the unallocated PDPs to the
		 * scratch page.
		 */
		execlists_update_context_pdps(ppgtt, reg_state);
	}

	if (engine->id == RCS) {
		reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
		ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE,
			       GEN8_R_PWR_CLK_STATE,
			       make_rpcs(dev_priv));
	}

	i915_gem_object_unpin_map(ctx_obj);

	return 0;
}
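/*
 * Usage sketch (illustrative only): callers allocating a context image
 * combine intel_lr_context_size() with the per-process HWSP pages, as
 * execlists_context_deferred_alloc() does further below:
 *
 *	context_size = round_up(intel_lr_context_size(engine), 4096);
 *	context_size += PAGE_SIZE * LRC_PPHWSP_PN; // driver/GuC shared page
 *	ctx_obj = i915_gem_object_create(&ctx->i915->drm, context_size);
 */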
/**
 * intel_lr_context_size() - return the size of the context for an engine
 * @engine: which engine to find the context size for
 *
 * Each engine may require a different amount of space for a context image,
 * so when allocating (or copying) an image, this function can be used to
 * find the right size for the specific engine.
 *
 * Return: size (in bytes) of an engine-specific context image
 *
 * Note: this size includes the HWSP, which is part of the context image
 * in LRC mode, but does not include the "shared data page" used with
 * GuC submission. The caller should account for this if using the GuC.
 */
uint32_t intel_lr_context_size(struct intel_engine_cs *engine)
{
	int ret = 0;

	WARN_ON(INTEL_GEN(engine->i915) < 8);

	switch (engine->id) {
	case RCS:
		if (INTEL_GEN(engine->i915) >= 9)
			ret = GEN9_LR_CONTEXT_RENDER_SIZE;
		else
			ret = GEN8_LR_CONTEXT_RENDER_SIZE;
		break;
	case VCS:
	case BCS:
	case VECS:
	case VCS2:
		ret = GEN8_LR_CONTEXT_OTHER_SIZE;
		break;
	}

	return ret;
}

static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *ctx_obj;
	struct intel_context *ce = &ctx->engine[engine->id];
	uint32_t context_size;
	struct intel_ring *ring;
	int ret;

	WARN_ON(ce->state);

	context_size = round_up(intel_lr_context_size(engine), 4096);

	/* One extra page is used as the shared data page between driver and GuC. */
	context_size += PAGE_SIZE * LRC_PPHWSP_PN;

	ctx_obj = i915_gem_object_create(&ctx->i915->drm, context_size);
	if (IS_ERR(ctx_obj)) {
		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
		return PTR_ERR(ctx_obj);
	}

	ring = intel_engine_create_ring(engine, ctx->ring_size);
	if (IS_ERR(ring)) {
		ret = PTR_ERR(ring);
		goto error_deref_obj;
	}

	ret = populate_lr_context(ctx, ctx_obj, engine, ring);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
		goto error_ring_free;
	}

	ce->ring = ring;
	ce->state = ctx_obj;
	ce->initialised = engine->init_context == NULL;

	return 0;

error_ring_free:
	intel_ring_free(ring);
error_deref_obj:
	i915_gem_object_put(ctx_obj);
	ce->ring = NULL;
	ce->state = NULL;
	return ret;
}

void intel_lr_context_reset(struct drm_i915_private *dev_priv,
			    struct i915_gem_context *ctx)
{
	struct intel_engine_cs *engine;

	for_each_engine(engine, dev_priv) {
		struct intel_context *ce = &ctx->engine[engine->id];
		struct drm_i915_gem_object *ctx_obj = ce->state;
		void *vaddr;
		uint32_t *reg_state;

		if (!ctx_obj)
			continue;

		vaddr = i915_gem_object_pin_map(ctx_obj);
		if (WARN_ON(IS_ERR(vaddr)))
			continue;

		reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
		ctx_obj->dirty = true;

		reg_state[CTX_RING_HEAD+1] = 0;
		reg_state[CTX_RING_TAIL+1] = 0;

		i915_gem_object_unpin_map(ctx_obj);

		ce->ring->head = 0;
		ce->ring->tail = 0;
	}
}
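/*
 * Illustrative only: after a GPU reset the saved ring HEAD/TAIL in every
 * context image are stale, so reset paths are expected to walk the contexts
 * and scrub them, e.g. (hypothetical caller, assuming a context list on
 * dev_priv):
 *
 *	list_for_each_entry(ctx, &dev_priv->context_list, link)
 *		intel_lr_context_reset(dev_priv, ctx);
 *
 * This matches intel_lr_context_reset() above: both the in-image ring
 * registers and the kernel's intel_ring bookkeeping are zeroed so that no
 * stale commands are replayed after the reset.
 */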