1 /* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Ben Widawsky <ben@bwidawsk.net> 25 * Michel Thierry <michel.thierry@intel.com> 26 * Thomas Daniel <thomas.daniel@intel.com> 27 * Oscar Mateo <oscar.mateo@intel.com> 28 * 29 */ 30 31 /** 32 * DOC: Logical Rings, Logical Ring Contexts and Execlists 33 * 34 * Motivation: 35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts". 36 * These expanded contexts enable a number of new abilities, especially 37 * "Execlists" (also implemented in this file). 38 * 39 * One of the main differences with the legacy HW contexts is that logical 40 * ring contexts incorporate many more things into the context's state, like 41 * PDPs or ringbuffer control registers: 42 * 43 * The reason why PDPs are included in the context is straightforward: as 44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs 45 * contained there means you don't need to do a ppgtt->switch_mm yourself; 46 * instead, the GPU will do it for you on the context switch. 47 * 48 * But, what about the ringbuffer control registers (head, tail, etc..)? 49 * Shouldn't we just need one set of those per engine command streamer? This is 50 * where the name "Logical Rings" starts to make sense: by virtualizing the 51 * rings, the engine cs shifts to a new "ring buffer" with every context 52 * switch. When you want to submit a workload to the GPU you: A) choose your 53 * context, B) find its appropriate virtualized ring, C) write commands to it 54 * and then, finally, D) tell the GPU to switch to that context. 55 * 56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch 57 * to a context is via a context execution list, ergo "Execlists". 58 * 59 * LRC implementation: 60 * Regarding the creation of contexts, we have: 61 * 62 * - One global default context. 63 * - One local default context for each opened fd. 64 * - One local extra context for each context create ioctl call. 65 * 66 * Now that ringbuffers belong per-context (and not per-engine, like before) 67 * and that contexts are uniquely tied to a given engine (and not reusable, 68 * like before) we need: 69 * 70 * - One ringbuffer per-engine inside each context. 71 * - One backing object per-engine inside each context. 72 * 73 * The global default context starts its life with these new objects fully 74 * allocated and populated.
The local default context for each opened fd is 75 * more complex, because we don't know at creation time which engine is going 76 * to use it. To handle this, we have implemented a deferred creation of LR 77 * contexts: 78 * 79 * The local context starts its life as a hollow or blank holder that only 80 * gets populated for a given engine once we receive an execbuffer. If later 81 * on we receive another execbuffer ioctl for the same context but a different 82 * engine, we allocate/populate a new ringbuffer and context backing object and 83 * so on. 84 * 85 * Finally, regarding local contexts created using the ioctl call: as they are 86 * only allowed with the render ring, we can allocate & populate them right 87 * away (no need to defer anything, at least for now). 88 * 89 * Execlists implementation: 90 * Execlists are the new method by which, on gen8+ hardware, workloads are 91 * submitted for execution (as opposed to the legacy, ringbuffer-based method). 92 * This method works as follows: 93 * 94 * When a request is committed, its commands (the BB start and any leading or 95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer 96 * for the appropriate context. The tail pointer in the hardware context is not 97 * updated at this time, but is instead kept by the driver in the ringbuffer 98 * structure. A structure representing this request is added to a request queue 99 * for the appropriate engine: this structure contains a copy of the context's 100 * tail after the request was written to the ring buffer and a pointer to the 101 * context itself. 102 * 103 * If the engine's request queue was empty before the request was added, the 104 * queue is processed immediately. Otherwise the queue will be processed during 105 * a context switch interrupt. In any case, elements on the queue will get sent 106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a 107 * globally unique 20-bit submission ID. 108 * 109 * When execution of a request completes, the GPU updates the context status 110 * buffer with a context complete event and generates a context switch interrupt. 111 * During the interrupt handling, the driver examines the events in the buffer: 112 * for each context complete event, if the announced ID matches that on the head 113 * of the request queue, then that request is retired and removed from the queue. 114 * 115 * After processing, if any requests were retired and the queue is not empty 116 * then a new execution list can be submitted. The two requests at the front of 117 * the queue are next to be submitted but since a context may not occur twice in 118 * an execution list, if subsequent requests have the same ID as the first then 119 * the two requests must be combined. This is done simply by discarding requests 120 * at the head of the queue until either only one request is left (in which case 121 * we use a NULL second context) or the first two requests have unique IDs. 122 * 123 * By always executing the first two requests in the queue the driver ensures 124 * that the GPU is kept as busy as possible. In the case where a single context 125 * completes but a second context is still executing, the request for this second 126 * context will be at the head of the queue when we remove the first one.
This 127 * request will then be resubmitted along with a new request for a different context, 128 * which will cause the hardware to continue executing the second request and queue 129 * the new request (the GPU detects the condition of a context getting preempted 130 * with the same context and optimizes the context switch flow by not doing 131 * preemption, but just sampling the new tail pointer). 132 * 133 */ 134 135 #include <drm/drmP.h> 136 #include <drm/i915_drm.h> 137 #include "i915_drv.h" 138 #include "intel_drv.h" 139 #include "intel_mocs.h" 140 141 #define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) 142 #define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE) 143 #define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE) 144 145 #define RING_EXECLIST_QFULL (1 << 0x2) 146 #define RING_EXECLIST1_VALID (1 << 0x3) 147 #define RING_EXECLIST0_VALID (1 << 0x4) 148 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE) 149 #define RING_EXECLIST1_ACTIVE (1 << 0x11) 150 #define RING_EXECLIST0_ACTIVE (1 << 0x12) 151 152 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0) 153 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1) 154 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2) 155 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) 156 #define GEN8_CTX_STATUS_COMPLETE (1 << 4) 157 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) 158 159 #define CTX_LRI_HEADER_0 0x01 160 #define CTX_CONTEXT_CONTROL 0x02 161 #define CTX_RING_HEAD 0x04 162 #define CTX_RING_TAIL 0x06 163 #define CTX_RING_BUFFER_START 0x08 164 #define CTX_RING_BUFFER_CONTROL 0x0a 165 #define CTX_BB_HEAD_U 0x0c 166 #define CTX_BB_HEAD_L 0x0e 167 #define CTX_BB_STATE 0x10 168 #define CTX_SECOND_BB_HEAD_U 0x12 169 #define CTX_SECOND_BB_HEAD_L 0x14 170 #define CTX_SECOND_BB_STATE 0x16 171 #define CTX_BB_PER_CTX_PTR 0x18 172 #define CTX_RCS_INDIRECT_CTX 0x1a 173 #define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c 174 #define CTX_LRI_HEADER_1 0x21 175 #define CTX_CTX_TIMESTAMP 0x22 176 #define CTX_PDP3_UDW 0x24 177 #define CTX_PDP3_LDW 0x26 178 #define CTX_PDP2_UDW 0x28 179 #define CTX_PDP2_LDW 0x2a 180 #define CTX_PDP1_UDW 0x2c 181 #define CTX_PDP1_LDW 0x2e 182 #define CTX_PDP0_UDW 0x30 183 #define CTX_PDP0_LDW 0x32 184 #define CTX_LRI_HEADER_2 0x41 185 #define CTX_R_PWR_CLK_STATE 0x42 186 #define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 187 188 #define GEN8_CTX_VALID (1<<0) 189 #define GEN8_CTX_FORCE_PD_RESTORE (1<<1) 190 #define GEN8_CTX_FORCE_RESTORE (1<<2) 191 #define GEN8_CTX_L3LLC_COHERENT (1<<5) 192 #define GEN8_CTX_PRIVILEGE (1<<8) 193 194 #define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \ 195 (reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \ 196 (reg_state)[(pos)+1] = (val); \ 197 } while (0) 198 199 #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \ 200 const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \ 201 reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \ 202 reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \ 203 } while (0) 204 205 #define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \ 206 reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \ 207 reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \ 208 } while (0) 209 210 enum { 211 ADVANCED_CONTEXT = 0, 212 LEGACY_32B_CONTEXT, 213 ADVANCED_AD_CONTEXT, 214 LEGACY_64B_CONTEXT 215 }; 216 #define GEN8_CTX_ADDRESSING_MODE_SHIFT 3 217 #define GEN8_CTX_ADDRESSING_MODE(dev) (USES_FULL_48BIT_PPGTT(dev) ?\ 218 LEGACY_64B_CONTEXT :\ 219 LEGACY_32B_CONTEXT) 220 enum { 221 FAULT_AND_HANG = 0, 222 FAULT_AND_HALT, /* Debug only */ 223 FAULT_AND_STREAM, 224 FAULT_AND_CONTINUE /* Unsupported 
*/ 225 }; 226 #define GEN8_CTX_ID_SHIFT 32 227 #define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 228 #define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x26 229 230 static int intel_lr_context_pin(struct intel_context *ctx, 231 struct intel_engine_cs *engine); 232 static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, 233 struct drm_i915_gem_object *default_ctx_obj); 234 235 236 /** 237 * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists 238 * @dev: DRM device. 239 * @enable_execlists: value of i915.enable_execlists module parameter. 240 * 241 * Only certain platforms support Execlists (the prerequisites being 242 * support for Logical Ring Contexts and Aliasing PPGTT or better). 243 * 244 * Return: 1 if Execlists is supported and has to be enabled. 245 */ 246 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists) 247 { 248 WARN_ON(i915.enable_ppgtt == -1); 249 250 /* On platforms with execlist available, vGPU will only 251 * support execlist mode, no ring buffer mode. 252 */ 253 if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev)) 254 return 1; 255 256 if (INTEL_INFO(dev)->gen >= 9) 257 return 1; 258 259 if (enable_execlists == 0) 260 return 0; 261 262 if (HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev) && 263 i915.use_mmio_flip >= 0) 264 return 1; 265 266 return 0; 267 } 268 269 static void 270 logical_ring_init_platform_invariants(struct intel_engine_cs *ring) 271 { 272 struct drm_device *dev = ring->dev; 273 274 ring->disable_lite_restore_wa = (IS_SKL_REVID(dev, 0, SKL_REVID_B0) || 275 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) && 276 (ring->id == VCS || ring->id == VCS2); 277 278 ring->ctx_desc_template = GEN8_CTX_VALID; 279 ring->ctx_desc_template |= GEN8_CTX_ADDRESSING_MODE(dev) << 280 GEN8_CTX_ADDRESSING_MODE_SHIFT; 281 if (IS_GEN8(dev)) 282 ring->ctx_desc_template |= GEN8_CTX_L3LLC_COHERENT; 283 ring->ctx_desc_template |= GEN8_CTX_PRIVILEGE; 284 285 /* TODO: WaDisableLiteRestore when we start using semaphore 286 * signalling between Command Streamers */ 287 /* ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; */ 288 289 /* WaEnableForceRestoreInCtxtDescForVCS:skl */ 290 /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ 291 if (ring->disable_lite_restore_wa) 292 ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; 293 } 294 295 /** 296 * intel_lr_context_descriptor_update() - calculate & cache the descriptor 297 * descriptor for a pinned context 298 * 299 * @ctx: Context to work on 300 * @ring: Engine the descriptor will be used with 301 * 302 * The context descriptor encodes various attributes of a context, 303 * including its GTT address and some flags. Because it's fairly 304 * expensive to calculate, we'll just do it once and cache the result, 305 * which remains valid until the context is unpinned. 306 * 307 * This is what a descriptor looks like, from LSB to MSB: 308 * bits 0-11: flags, GEN8_CTX_* (cached in ctx_desc_template) 309 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 310 * bits 32-51: ctx ID, a globally unique tag (the LRCA again!) 
311 * bits 52-63: reserved, may encode the engine ID (for GuC) 312 */ 313 static void 314 intel_lr_context_descriptor_update(struct intel_context *ctx, 315 struct intel_engine_cs *ring) 316 { 317 uint64_t lrca, desc; 318 319 lrca = ctx->engine[ring->id].lrc_vma->node.start + 320 LRC_PPHWSP_PN * PAGE_SIZE; 321 322 desc = ring->ctx_desc_template; /* bits 0-11 */ 323 desc |= lrca; /* bits 12-31 */ 324 desc |= (lrca >> PAGE_SHIFT) << GEN8_CTX_ID_SHIFT; /* bits 32-51 */ 325 326 ctx->engine[ring->id].lrc_desc = desc; 327 } 328 329 uint64_t intel_lr_context_descriptor(struct intel_context *ctx, 330 struct intel_engine_cs *ring) 331 { 332 return ctx->engine[ring->id].lrc_desc; 333 } 334 335 /** 336 * intel_execlists_ctx_id() - get the Execlists Context ID 337 * @ctx: Context to get the ID for 338 * @ring: Engine to get the ID for 339 * 340 * Do not confuse with ctx->id! Unfortunately we have a name overload 341 * here: the old context ID we pass to userspace as a handler so that 342 * they can refer to a context, and the new context ID we pass to the 343 * ELSP so that the GPU can inform us of the context status via 344 * interrupts. 345 * 346 * The context ID is a portion of the context descriptor, so we can 347 * just extract the required part from the cached descriptor. 348 * 349 * Return: 20-bits globally unique context ID. 350 */ 351 u32 intel_execlists_ctx_id(struct intel_context *ctx, 352 struct intel_engine_cs *ring) 353 { 354 return intel_lr_context_descriptor(ctx, ring) >> GEN8_CTX_ID_SHIFT; 355 } 356 357 static void execlists_elsp_write(struct drm_i915_gem_request *rq0, 358 struct drm_i915_gem_request *rq1) 359 { 360 361 struct intel_engine_cs *ring = rq0->ring; 362 struct drm_device *dev = ring->dev; 363 struct drm_i915_private *dev_priv = dev->dev_private; 364 uint64_t desc[2]; 365 366 if (rq1) { 367 desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->ring); 368 rq1->elsp_submitted++; 369 } else { 370 desc[1] = 0; 371 } 372 373 desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring); 374 rq0->elsp_submitted++; 375 376 /* You must always write both descriptors in the order below. */ 377 lockmgr(&dev_priv->uncore.lock, LK_EXCLUSIVE); 378 intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); 379 I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); 380 I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); 381 382 I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0])); 383 /* The context is automatically loaded after the following */ 384 I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0])); 385 386 /* ELSP is a wo register, use another nearby reg for posting */ 387 POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); 388 intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); 389 lockmgr(&dev_priv->uncore.lock, LK_RELEASE); 390 } 391 392 static int execlists_update_context(struct drm_i915_gem_request *rq) 393 { 394 struct intel_engine_cs *ring = rq->ring; 395 struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt; 396 uint32_t *reg_state = rq->ctx->engine[ring->id].lrc_reg_state; 397 398 reg_state[CTX_RING_TAIL+1] = rq->tail; 399 400 if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) { 401 /* True 32b PPGTT with dynamic page allocation: update PDP 402 * registers and point the unallocated PDPs to scratch page. 403 * PML4 is allocated during ppgtt init, so this is not needed 404 * in 48-bit mode. 
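 *
 * As a rough sketch of what each ASSIGN_CTX_PDP() invocation below expands
 * to (see the macro defined near the top of this file), e.g. for PDP3:
 *
 *	addr = i915_page_dir_dma_addr(ppgtt, 3);
 *	reg_state[CTX_PDP3_UDW + 1] = upper_32_bits(addr);
 *	reg_state[CTX_PDP3_LDW + 1] = lower_32_bits(addr);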
405 */ 406 ASSIGN_CTX_PDP(ppgtt, reg_state, 3); 407 ASSIGN_CTX_PDP(ppgtt, reg_state, 2); 408 ASSIGN_CTX_PDP(ppgtt, reg_state, 1); 409 ASSIGN_CTX_PDP(ppgtt, reg_state, 0); 410 } 411 412 return 0; 413 } 414 415 static void execlists_submit_requests(struct drm_i915_gem_request *rq0, 416 struct drm_i915_gem_request *rq1) 417 { 418 execlists_update_context(rq0); 419 420 if (rq1) 421 execlists_update_context(rq1); 422 423 execlists_elsp_write(rq0, rq1); 424 } 425 426 static void execlists_context_unqueue(struct intel_engine_cs *ring) 427 { 428 struct drm_i915_gem_request *req0 = NULL, *req1 = NULL; 429 struct drm_i915_gem_request *cursor = NULL, *tmp = NULL; 430 431 assert_spin_locked(&ring->execlist_lock); 432 433 /* 434 * If irqs are not active generate a warning as batches that finish 435 * without the irqs may get lost and a GPU Hang may occur. 436 */ 437 WARN_ON(!intel_irqs_enabled(ring->dev->dev_private)); 438 439 if (list_empty(&ring->execlist_queue)) 440 return; 441 442 /* Try to read in pairs */ 443 list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue, 444 execlist_link) { 445 if (!req0) { 446 req0 = cursor; 447 } else if (req0->ctx == cursor->ctx) { 448 /* Same ctx: ignore first request, as second request 449 * will update tail past first request's workload */ 450 cursor->elsp_submitted = req0->elsp_submitted; 451 list_move_tail(&req0->execlist_link, 452 &ring->execlist_retired_req_list); 453 req0 = cursor; 454 } else { 455 req1 = cursor; 456 break; 457 } 458 } 459 460 if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) { 461 /* 462 * WaIdleLiteRestore: make sure we never cause a lite 463 * restore with HEAD==TAIL 464 */ 465 if (req0->elsp_submitted) { 466 /* 467 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL 468 * as we resubmit the request. See gen8_emit_request() 469 * for where we prepare the padding after the end of the 470 * request. 471 */ 472 struct intel_ringbuffer *ringbuf; 473 474 ringbuf = req0->ctx->engine[ring->id].ringbuf; 475 req0->tail += 8; 476 req0->tail &= ringbuf->size - 1; 477 } 478 } 479 480 WARN_ON(req1 && req1->elsp_submitted); 481 482 execlists_submit_requests(req0, req1); 483 } 484 485 static bool execlists_check_remove_request(struct intel_engine_cs *ring, 486 u32 request_id) 487 { 488 struct drm_i915_gem_request *head_req; 489 490 assert_spin_locked(&ring->execlist_lock); 491 492 head_req = list_first_entry_or_null(&ring->execlist_queue, 493 struct drm_i915_gem_request, 494 execlist_link); 495 496 if (head_req != NULL) { 497 if (intel_execlists_ctx_id(head_req->ctx, ring) == request_id) { 498 WARN(head_req->elsp_submitted == 0, 499 "Never submitted head request\n"); 500 501 if (--head_req->elsp_submitted <= 0) { 502 list_move_tail(&head_req->execlist_link, 503 &ring->execlist_retired_req_list); 504 return true; 505 } 506 } 507 } 508 509 return false; 510 } 511 512 static void get_context_status(struct intel_engine_cs *ring, 513 u8 read_pointer, 514 u32 *status, u32 *context_id) 515 { 516 struct drm_i915_private *dev_priv = ring->dev->dev_private; 517 518 if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES)) 519 return; 520 521 *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer)); 522 *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); 523 } 524 525 /** 526 * intel_lrc_irq_handler() - handle Context Switch interrupts 527 * @ring: Engine Command Streamer to handle. 528 * 529 * Check the unread Context Status Buffers and manage the submission of new 530 * contexts to the ELSP accordingly. 
531 */ 532 void intel_lrc_irq_handler(struct intel_engine_cs *ring) 533 { 534 struct drm_i915_private *dev_priv = ring->dev->dev_private; 535 u32 status_pointer; 536 u8 read_pointer; 537 u8 write_pointer; 538 u32 status = 0; 539 u32 status_id = 0; 540 u32 submit_contexts = 0; 541 542 status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring)); 543 544 read_pointer = ring->next_context_status_buffer; 545 write_pointer = GEN8_CSB_WRITE_PTR(status_pointer); 546 if (read_pointer > write_pointer) 547 write_pointer += GEN8_CSB_ENTRIES; 548 549 lockmgr(&ring->execlist_lock, LK_EXCLUSIVE); 550 551 while (read_pointer < write_pointer) { 552 553 get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES, 554 &status, &status_id); 555 556 if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) 557 continue; 558 559 if (status & GEN8_CTX_STATUS_PREEMPTED) { 560 if (status & GEN8_CTX_STATUS_LITE_RESTORE) { 561 if (execlists_check_remove_request(ring, status_id)) 562 WARN(1, "Lite Restored request removed from queue\n"); 563 } else 564 WARN(1, "Preemption without Lite Restore\n"); 565 } 566 567 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) || 568 (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) { 569 if (execlists_check_remove_request(ring, status_id)) 570 submit_contexts++; 571 } 572 } 573 574 if (ring->disable_lite_restore_wa) { 575 /* Prevent a ctx to preempt itself */ 576 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) && 577 (submit_contexts != 0)) 578 execlists_context_unqueue(ring); 579 } else if (submit_contexts != 0) { 580 execlists_context_unqueue(ring); 581 } 582 583 lockmgr(&ring->execlist_lock, LK_RELEASE); 584 585 if (unlikely(submit_contexts > 2)) 586 DRM_ERROR("More than two context complete events?\n"); 587 588 ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES; 589 590 /* Update the read pointer to the old write pointer. 
Manual ringbuffer 591 * management ftw </sarcasm> */ 592 I915_WRITE(RING_CONTEXT_STATUS_PTR(ring), 593 _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, 594 ring->next_context_status_buffer << 8)); 595 } 596 597 static int execlists_context_queue(struct drm_i915_gem_request *request) 598 { 599 struct intel_engine_cs *ring = request->ring; 600 struct drm_i915_gem_request *cursor; 601 int num_elements = 0; 602 603 if (request->ctx != request->i915->kernel_context) 604 intel_lr_context_pin(request->ctx, ring); 605 606 i915_gem_request_reference(request); 607 608 spin_lock_irq(&ring->execlist_lock); 609 610 list_for_each_entry(cursor, &ring->execlist_queue, execlist_link) 611 if (++num_elements > 2) 612 break; 613 614 if (num_elements > 2) { 615 struct drm_i915_gem_request *tail_req; 616 617 tail_req = list_last_entry(&ring->execlist_queue, 618 struct drm_i915_gem_request, 619 execlist_link); 620 621 if (request->ctx == tail_req->ctx) { 622 WARN(tail_req->elsp_submitted != 0, 623 "More than 2 already-submitted reqs queued\n"); 624 list_move_tail(&tail_req->execlist_link, 625 &ring->execlist_retired_req_list); 626 } 627 } 628 629 list_add_tail(&request->execlist_link, &ring->execlist_queue); 630 if (num_elements == 0) 631 execlists_context_unqueue(ring); 632 633 spin_unlock_irq(&ring->execlist_lock); 634 635 return 0; 636 } 637 638 static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req) 639 { 640 struct intel_engine_cs *ring = req->ring; 641 uint32_t flush_domains; 642 int ret; 643 644 flush_domains = 0; 645 if (ring->gpu_caches_dirty) 646 flush_domains = I915_GEM_GPU_DOMAINS; 647 648 ret = ring->emit_flush(req, I915_GEM_GPU_DOMAINS, flush_domains); 649 if (ret) 650 return ret; 651 652 ring->gpu_caches_dirty = false; 653 return 0; 654 } 655 656 static int execlists_move_to_gpu(struct drm_i915_gem_request *req, 657 struct list_head *vmas) 658 { 659 const unsigned other_rings = ~intel_ring_flag(req->ring); 660 struct i915_vma *vma; 661 uint32_t flush_domains = 0; 662 bool flush_chipset = false; 663 int ret; 664 665 list_for_each_entry(vma, vmas, exec_list) { 666 struct drm_i915_gem_object *obj = vma->obj; 667 668 if (obj->active & other_rings) { 669 ret = i915_gem_object_sync(obj, req->ring, &req); 670 if (ret) 671 return ret; 672 } 673 674 if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) 675 flush_chipset |= i915_gem_clflush_object(obj, false); 676 677 flush_domains |= obj->base.write_domain; 678 } 679 680 if (flush_domains & I915_GEM_DOMAIN_GTT) 681 wmb(); 682 683 /* Unconditionally invalidate gpu caches and ensure that we do flush 684 * any residual writes from the previous batch. 685 */ 686 return logical_ring_invalidate_all_caches(req); 687 } 688 689 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request) 690 { 691 int ret = 0; 692 693 request->ringbuf = request->ctx->engine[request->ring->id].ringbuf; 694 695 if (i915.enable_guc_submission) { 696 /* 697 * Check that the GuC has space for the request before 698 * going any further, as the i915_add_request() call 699 * later on mustn't fail ... 
700 */ 701 struct intel_guc *guc = &request->i915->guc; 702 703 ret = i915_guc_wq_check_space(guc->execbuf_client); 704 if (ret) 705 return ret; 706 } 707 708 if (request->ctx != request->i915->kernel_context) 709 ret = intel_lr_context_pin(request->ctx, request->ring); 710 711 return ret; 712 } 713 714 static int logical_ring_wait_for_space(struct drm_i915_gem_request *req, 715 int bytes) 716 { 717 struct intel_ringbuffer *ringbuf = req->ringbuf; 718 struct intel_engine_cs *ring = req->ring; 719 struct drm_i915_gem_request *target; 720 unsigned space; 721 int ret; 722 723 if (intel_ring_space(ringbuf) >= bytes) 724 return 0; 725 726 /* The whole point of reserving space is to not wait! */ 727 WARN_ON(ringbuf->reserved_in_use); 728 729 list_for_each_entry(target, &ring->request_list, list) { 730 /* 731 * The request queue is per-engine, so can contain requests 732 * from multiple ringbuffers. Here, we must ignore any that 733 * aren't from the ringbuffer we're considering. 734 */ 735 if (target->ringbuf != ringbuf) 736 continue; 737 738 /* Would completion of this request free enough space? */ 739 space = __intel_ring_space(target->postfix, ringbuf->tail, 740 ringbuf->size); 741 if (space >= bytes) 742 break; 743 } 744 745 if (WARN_ON(&target->list == &ring->request_list)) 746 return -ENOSPC; 747 748 ret = i915_wait_request(target); 749 if (ret) 750 return ret; 751 752 ringbuf->space = space; 753 return 0; 754 } 755 756 /* 757 * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload 758 * @request: Request to advance the logical ringbuffer of. 759 * 760 * The tail is updated in our logical ringbuffer struct, not in the actual context. What 761 * really happens during submission is that the context and current tail will be placed 762 * on a queue waiting for the ELSP to be ready to accept a new context submission. At that 763 * point, the tail *inside* the context is updated and the ELSP written to. 764 */ 765 static int 766 intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request) 767 { 768 struct intel_ringbuffer *ringbuf = request->ringbuf; 769 struct drm_i915_private *dev_priv = request->i915; 770 struct intel_engine_cs *engine = request->ring; 771 772 intel_logical_ring_advance(ringbuf); 773 request->tail = ringbuf->tail; 774 775 /* 776 * Here we add two extra NOOPs as padding to avoid 777 * lite restore of a context with HEAD==TAIL. 778 * 779 * Caller must reserve WA_TAIL_DWORDS for us! 
780 */ 781 intel_logical_ring_emit(ringbuf, MI_NOOP); 782 intel_logical_ring_emit(ringbuf, MI_NOOP); 783 intel_logical_ring_advance(ringbuf); 784 785 if (intel_ring_stopped(engine)) 786 return 0; 787 788 if (engine->last_context != request->ctx) { 789 if (engine->last_context) 790 intel_lr_context_unpin(engine->last_context, engine); 791 if (request->ctx != request->i915->kernel_context) { 792 intel_lr_context_pin(request->ctx, engine); 793 engine->last_context = request->ctx; 794 } else { 795 engine->last_context = NULL; 796 } 797 } 798 799 if (dev_priv->guc.execbuf_client) 800 i915_guc_submit(dev_priv->guc.execbuf_client, request); 801 else 802 execlists_context_queue(request); 803 804 return 0; 805 } 806 807 static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf) 808 { 809 uint32_t __iomem *virt; 810 int rem = ringbuf->size - ringbuf->tail; 811 812 virt = (uint32_t *)(ringbuf->virtual_start + ringbuf->tail); 813 rem /= 4; 814 while (rem--) 815 iowrite32(MI_NOOP, virt++); 816 817 ringbuf->tail = 0; 818 intel_ring_update_space(ringbuf); 819 } 820 821 static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes) 822 { 823 struct intel_ringbuffer *ringbuf = req->ringbuf; 824 int remain_usable = ringbuf->effective_size - ringbuf->tail; 825 int remain_actual = ringbuf->size - ringbuf->tail; 826 int ret, total_bytes, wait_bytes = 0; 827 bool need_wrap = false; 828 829 if (ringbuf->reserved_in_use) 830 total_bytes = bytes; 831 else 832 total_bytes = bytes + ringbuf->reserved_size; 833 834 if (unlikely(bytes > remain_usable)) { 835 /* 836 * Not enough space for the basic request. So need to flush 837 * out the remainder and then wait for base + reserved. 838 */ 839 wait_bytes = remain_actual + total_bytes; 840 need_wrap = true; 841 } else { 842 if (unlikely(total_bytes > remain_usable)) { 843 /* 844 * The base request will fit but the reserved space 845 * falls off the end. So don't need an immediate wrap 846 * and only need to effectively wait for the reserved 847 * size space from the start of ringbuffer. 848 */ 849 wait_bytes = remain_actual + ringbuf->reserved_size; 850 } else if (total_bytes > ringbuf->space) { 851 /* No wrapping required, just waiting. */ 852 wait_bytes = total_bytes; 853 } 854 } 855 856 if (wait_bytes) { 857 ret = logical_ring_wait_for_space(req, wait_bytes); 858 if (unlikely(ret)) 859 return ret; 860 861 if (need_wrap) 862 __wrap_ring_buffer(ringbuf); 863 } 864 865 return 0; 866 } 867 868 /** 869 * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands 870 * 871 * @req: The request to start some new work for 872 * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. 873 * 874 * The ringbuffer might not be ready to accept the commands right away (maybe it needs to 875 * be wrapped, or wait a bit for the tail to be updated). This function takes care of that 876 * and also preallocates a request (every workload submission is still mediated through 877 * requests, same as it did with legacy ringbuffer submission). 878 * 879 * Return: non-zero if the ringbuffer is not ready to be written to. 
880 */ 881 int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords) 882 { 883 struct drm_i915_private *dev_priv; 884 int ret; 885 886 WARN_ON(req == NULL); 887 dev_priv = req->ring->dev->dev_private; 888 889 ret = i915_gem_check_wedge(&dev_priv->gpu_error, 890 dev_priv->mm.interruptible); 891 if (ret) 892 return ret; 893 894 ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t)); 895 if (ret) 896 return ret; 897 898 req->ringbuf->space -= num_dwords * sizeof(uint32_t); 899 return 0; 900 } 901 902 int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request) 903 { 904 /* 905 * The first call merely notes the reserve request and is common for 906 * all back ends. The subsequent localised _begin() call actually 907 * ensures that the reservation is available. Without the begin, if 908 * the request creator immediately submitted the request without 909 * adding any commands to it then there might not actually be 910 * sufficient room for the submission commands. 911 */ 912 intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST); 913 914 return intel_logical_ring_begin(request, 0); 915 } 916 917 /** 918 * execlists_submission() - submit a batchbuffer for execution, Execlists style 919 * @dev: DRM device. 920 * @file: DRM file. 921 * @ring: Engine Command Streamer to submit to. 922 * @ctx: Context to employ for this submission. 923 * @args: execbuffer call arguments. 924 * @vmas: list of vmas. 925 * @batch_obj: the batchbuffer to submit. 926 * @exec_start: batchbuffer start virtual address pointer. 927 * @dispatch_flags: translated execbuffer call flags. 928 * 929 * This is the evil twin version of i915_gem_ringbuffer_submission. It abstracts 930 * away the submission details of the execbuffer ioctl call. 931 * 932 * Return: non-zero if the submission fails. 
933 */ 934 int intel_execlists_submission(struct i915_execbuffer_params *params, 935 struct drm_i915_gem_execbuffer2 *args, 936 struct list_head *vmas) 937 { 938 struct drm_device *dev = params->dev; 939 struct intel_engine_cs *ring = params->ring; 940 struct drm_i915_private *dev_priv = dev->dev_private; 941 struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf; 942 u64 exec_start; 943 int instp_mode; 944 u32 instp_mask; 945 int ret; 946 947 instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK; 948 instp_mask = I915_EXEC_CONSTANTS_MASK; 949 switch (instp_mode) { 950 case I915_EXEC_CONSTANTS_REL_GENERAL: 951 case I915_EXEC_CONSTANTS_ABSOLUTE: 952 case I915_EXEC_CONSTANTS_REL_SURFACE: 953 if (instp_mode != 0 && ring != &dev_priv->ring[RCS]) { 954 DRM_DEBUG("non-0 rel constants mode on non-RCS\n"); 955 return -EINVAL; 956 } 957 958 if (instp_mode != dev_priv->relative_constants_mode) { 959 if (instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) { 960 DRM_DEBUG("rel surface constants mode invalid on gen5+\n"); 961 return -EINVAL; 962 } 963 964 /* The HW changed the meaning on this bit on gen6 */ 965 instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE; 966 } 967 break; 968 default: 969 DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode); 970 return -EINVAL; 971 } 972 973 if (args->flags & I915_EXEC_GEN7_SOL_RESET) { 974 DRM_DEBUG("sol reset is gen7 only\n"); 975 return -EINVAL; 976 } 977 978 ret = execlists_move_to_gpu(params->request, vmas); 979 if (ret) 980 return ret; 981 982 if (ring == &dev_priv->ring[RCS] && 983 instp_mode != dev_priv->relative_constants_mode) { 984 ret = intel_logical_ring_begin(params->request, 4); 985 if (ret) 986 return ret; 987 988 intel_logical_ring_emit(ringbuf, MI_NOOP); 989 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1)); 990 intel_logical_ring_emit_reg(ringbuf, INSTPM); 991 intel_logical_ring_emit(ringbuf, instp_mask << 16 | instp_mode); 992 intel_logical_ring_advance(ringbuf); 993 994 dev_priv->relative_constants_mode = instp_mode; 995 } 996 997 exec_start = params->batch_obj_vm_offset + 998 args->batch_start_offset; 999 1000 ret = ring->emit_bb_start(params->request, exec_start, params->dispatch_flags); 1001 if (ret) 1002 return ret; 1003 1004 trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); 1005 1006 i915_gem_execbuffer_move_to_active(vmas, params->request); 1007 i915_gem_execbuffer_retire_commands(params); 1008 1009 return 0; 1010 } 1011 1012 void intel_execlists_retire_requests(struct intel_engine_cs *ring) 1013 { 1014 struct drm_i915_gem_request *req, *tmp; 1015 struct list_head retired_list; 1016 1017 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 1018 if (list_empty(&ring->execlist_retired_req_list)) 1019 return; 1020 1021 INIT_LIST_HEAD(&retired_list); 1022 spin_lock_irq(&ring->execlist_lock); 1023 list_replace_init(&ring->execlist_retired_req_list, &retired_list); 1024 spin_unlock_irq(&ring->execlist_lock); 1025 1026 list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) { 1027 struct intel_context *ctx = req->ctx; 1028 struct drm_i915_gem_object *ctx_obj = 1029 ctx->engine[ring->id].state; 1030 1031 if (ctx_obj && (ctx != req->i915->kernel_context)) 1032 intel_lr_context_unpin(ctx, ring); 1033 1034 list_del(&req->execlist_link); 1035 i915_gem_request_unreference(req); 1036 } 1037 } 1038 1039 void intel_logical_ring_stop(struct intel_engine_cs *ring) 1040 { 1041 struct drm_i915_private *dev_priv = ring->dev->dev_private; 1042 int ret; 1043 1044 if (!intel_ring_initialized(ring)) 1045 
return; 1046 1047 ret = intel_ring_idle(ring); 1048 if (ret && !i915_reset_in_progress(&to_i915(ring->dev)->gpu_error)) 1049 DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n", 1050 ring->name, ret); 1051 1052 /* TODO: Is this correct with Execlists enabled? */ 1053 I915_WRITE_MODE(ring, _MASKED_BIT_ENABLE(STOP_RING)); 1054 if (wait_for_atomic((I915_READ_MODE(ring) & MODE_IDLE) != 0, 1000)) { 1055 DRM_ERROR("%s :timed out trying to stop ring\n", ring->name); 1056 return; 1057 } 1058 I915_WRITE_MODE(ring, _MASKED_BIT_DISABLE(STOP_RING)); 1059 } 1060 1061 int logical_ring_flush_all_caches(struct drm_i915_gem_request *req) 1062 { 1063 struct intel_engine_cs *ring = req->ring; 1064 int ret; 1065 1066 if (!ring->gpu_caches_dirty) 1067 return 0; 1068 1069 ret = ring->emit_flush(req, 0, I915_GEM_GPU_DOMAINS); 1070 if (ret) 1071 return ret; 1072 1073 ring->gpu_caches_dirty = false; 1074 return 0; 1075 } 1076 1077 static int intel_lr_context_do_pin(struct intel_context *ctx, 1078 struct intel_engine_cs *ring) 1079 { 1080 struct drm_device *dev = ring->dev; 1081 struct drm_i915_private *dev_priv = dev->dev_private; 1082 struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; 1083 struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; 1084 struct vm_page *lrc_state_page; 1085 uint32_t *lrc_reg_state; 1086 int ret; 1087 1088 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 1089 1090 ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 1091 PIN_OFFSET_BIAS | GUC_WOPCM_TOP); 1092 if (ret) 1093 return ret; 1094 1095 lrc_state_page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); 1096 if (WARN_ON(!lrc_state_page)) { 1097 ret = -ENODEV; 1098 goto unpin_ctx_obj; 1099 } 1100 1101 ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf); 1102 if (ret) 1103 goto unpin_ctx_obj; 1104 1105 ctx->engine[ring->id].lrc_vma = i915_gem_obj_to_ggtt(ctx_obj); 1106 intel_lr_context_descriptor_update(ctx, ring); 1107 lrc_reg_state = kmap(lrc_state_page); 1108 lrc_reg_state[CTX_RING_BUFFER_START+1] = ringbuf->vma->node.start; 1109 ctx->engine[ring->id].lrc_reg_state = lrc_reg_state; 1110 ctx_obj->dirty = true; 1111 1112 /* Invalidate GuC TLB. 
*/ 1113 if (i915.enable_guc_submission) 1114 I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); 1115 1116 return ret; 1117 1118 unpin_ctx_obj: 1119 i915_gem_object_ggtt_unpin(ctx_obj); 1120 1121 return ret; 1122 } 1123 1124 static int intel_lr_context_pin(struct intel_context *ctx, 1125 struct intel_engine_cs *engine) 1126 { 1127 int ret = 0; 1128 1129 if (ctx->engine[engine->id].pin_count++ == 0) { 1130 ret = intel_lr_context_do_pin(ctx, engine); 1131 if (ret) 1132 goto reset_pin_count; 1133 1134 i915_gem_context_reference(ctx); 1135 } 1136 return ret; 1137 1138 reset_pin_count: 1139 ctx->engine[engine->id].pin_count = 0; 1140 return ret; 1141 } 1142 1143 void intel_lr_context_unpin(struct intel_context *ctx, 1144 struct intel_engine_cs *engine) 1145 { 1146 struct drm_i915_gem_object *ctx_obj = ctx->engine[engine->id].state; 1147 1148 WARN_ON(!mutex_is_locked(&ctx->i915->dev->struct_mutex)); 1149 if (--ctx->engine[engine->id].pin_count == 0) { 1150 kunmap(kmap_to_page(ctx->engine[engine->id].lrc_reg_state)); 1151 intel_unpin_ringbuffer_obj(ctx->engine[engine->id].ringbuf); 1152 i915_gem_object_ggtt_unpin(ctx_obj); 1153 ctx->engine[engine->id].lrc_vma = NULL; 1154 ctx->engine[engine->id].lrc_desc = 0; 1155 ctx->engine[engine->id].lrc_reg_state = NULL; 1156 1157 i915_gem_context_unreference(ctx); 1158 } 1159 } 1160 1161 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req) 1162 { 1163 int ret, i; 1164 struct intel_engine_cs *ring = req->ring; 1165 struct intel_ringbuffer *ringbuf = req->ringbuf; 1166 struct drm_device *dev = ring->dev; 1167 struct drm_i915_private *dev_priv = dev->dev_private; 1168 struct i915_workarounds *w = &dev_priv->workarounds; 1169 1170 if (w->count == 0) 1171 return 0; 1172 1173 ring->gpu_caches_dirty = true; 1174 ret = logical_ring_flush_all_caches(req); 1175 if (ret) 1176 return ret; 1177 1178 ret = intel_logical_ring_begin(req, w->count * 2 + 2); 1179 if (ret) 1180 return ret; 1181 1182 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); 1183 for (i = 0; i < w->count; i++) { 1184 intel_logical_ring_emit_reg(ringbuf, w->reg[i].addr); 1185 intel_logical_ring_emit(ringbuf, w->reg[i].value); 1186 } 1187 intel_logical_ring_emit(ringbuf, MI_NOOP); 1188 1189 intel_logical_ring_advance(ringbuf); 1190 1191 ring->gpu_caches_dirty = true; 1192 ret = logical_ring_flush_all_caches(req); 1193 if (ret) 1194 return ret; 1195 1196 return 0; 1197 } 1198 1199 #define wa_ctx_emit(batch, index, cmd) \ 1200 do { \ 1201 int __index = (index)++; \ 1202 if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \ 1203 return -ENOSPC; \ 1204 } \ 1205 batch[__index] = (cmd); \ 1206 } while (0) 1207 1208 #define wa_ctx_emit_reg(batch, index, reg) \ 1209 wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg)) 1210 1211 /* 1212 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 1213 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 1214 * but there is a slight complication as this is applied in WA batch where the 1215 * values are only initialized once so we cannot take register value at the 1216 * beginning and reuse it further; hence we save its value to memory, upload a 1217 * constant value with bit21 set and then we restore it back with the saved value. 1218 * To simplify the WA, a constant value is formed by using the default value 1219 * of this register. This shouldn't be a problem because we are only modifying 1220 * it for a short period and this batch in non-premptible. 
We can ofcourse 1221 * use additional instructions that read the actual value of the register 1222 * at that time and set our bit of interest but it makes the WA complicated. 1223 * 1224 * This WA is also required for Gen9 so extracting as a function avoids 1225 * code duplication. 1226 */ 1227 static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring, 1228 uint32_t *const batch, 1229 uint32_t index) 1230 { 1231 uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES); 1232 1233 /* 1234 * WaDisableLSQCROPERFforOCL:skl 1235 * This WA is implemented in skl_init_clock_gating() but since 1236 * this batch updates GEN8_L3SQCREG4 with default value we need to 1237 * set this bit here to retain the WA during flush. 1238 */ 1239 if (IS_SKL_REVID(ring->dev, 0, SKL_REVID_E0)) 1240 l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS; 1241 1242 wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | 1243 MI_SRM_LRM_GLOBAL_GTT)); 1244 wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); 1245 wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); 1246 wa_ctx_emit(batch, index, 0); 1247 1248 wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); 1249 wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); 1250 wa_ctx_emit(batch, index, l3sqc4_flush); 1251 1252 wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); 1253 wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL | 1254 PIPE_CONTROL_DC_FLUSH_ENABLE)); 1255 wa_ctx_emit(batch, index, 0); 1256 wa_ctx_emit(batch, index, 0); 1257 wa_ctx_emit(batch, index, 0); 1258 wa_ctx_emit(batch, index, 0); 1259 1260 wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | 1261 MI_SRM_LRM_GLOBAL_GTT)); 1262 wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); 1263 wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); 1264 wa_ctx_emit(batch, index, 0); 1265 1266 return index; 1267 } 1268 1269 static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx, 1270 uint32_t offset, 1271 uint32_t start_alignment) 1272 { 1273 return wa_ctx->offset = ALIGN(offset, start_alignment); 1274 } 1275 1276 static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, 1277 uint32_t offset, 1278 uint32_t size_alignment) 1279 { 1280 wa_ctx->size = offset - wa_ctx->offset; 1281 1282 WARN(wa_ctx->size % size_alignment, 1283 "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n", 1284 wa_ctx->size, size_alignment); 1285 return 0; 1286 } 1287 1288 /** 1289 * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA 1290 * 1291 * @ring: only applicable for RCS 1292 * @wa_ctx: structure representing wa_ctx 1293 * offset: specifies start of the batch, should be cache-aligned. This is updated 1294 * with the offset value received as input. 1295 * size: size of the batch in DWORDS but HW expects in terms of cachelines 1296 * @batch: page in which WA are loaded 1297 * @offset: This field specifies the start of the batch, it should be 1298 * cache-aligned otherwise it is adjusted accordingly. 1299 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 1300 * initialized at the beginning and shared across all contexts but this field 1301 * helps us to have multiple batches at different offsets and select them based 1302 * on a criteria. At the moment this batch always start at the beginning of the page 1303 * and at this point we don't have multiple wa_ctx batch buffers. 1304 * 1305 * The number of WA applied are not known at the beginning; we use this field 1306 * to return the no of DWORDS written. 
1307 * 1308 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 1309 * so it adds NOOPs as padding to make it cacheline aligned. 1310 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1311 * makes a complete batch buffer. 1312 * 1313 * Return: non-zero if we exceed the PAGE_SIZE limit. 1314 */ 1315 1316 static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring, 1317 struct i915_wa_ctx_bb *wa_ctx, 1318 uint32_t *const batch, 1319 uint32_t *offset) 1320 { 1321 uint32_t scratch_addr; 1322 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1323 1324 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1325 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); 1326 1327 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 1328 if (IS_BROADWELL(ring->dev)) { 1329 int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index); 1330 if (rc < 0) 1331 return rc; 1332 index = rc; 1333 } 1334 1335 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 1336 /* Actual scratch location is at 128 bytes offset */ 1337 scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; 1338 1339 wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); 1340 wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | 1341 PIPE_CONTROL_GLOBAL_GTT_IVB | 1342 PIPE_CONTROL_CS_STALL | 1343 PIPE_CONTROL_QW_WRITE)); 1344 wa_ctx_emit(batch, index, scratch_addr); 1345 wa_ctx_emit(batch, index, 0); 1346 wa_ctx_emit(batch, index, 0); 1347 wa_ctx_emit(batch, index, 0); 1348 1349 /* Pad to end of cacheline */ 1350 while (index % CACHELINE_DWORDS) 1351 wa_ctx_emit(batch, index, MI_NOOP); 1352 1353 /* 1354 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 1355 * execution depends on the length specified in terms of cache lines 1356 * in the register CTX_RCS_INDIRECT_CTX 1357 */ 1358 1359 return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); 1360 } 1361 1362 /** 1363 * gen8_init_perctx_bb() - initialize per ctx batch with WA 1364 * 1365 * @ring: only applicable for RCS 1366 * @wa_ctx: structure representing wa_ctx 1367 * offset: specifies start of the batch, should be cache-aligned. 1368 * size: size of the batch in DWORDS but HW expects in terms of cachelines 1369 * @batch: page in which WA are loaded 1370 * @offset: This field specifies the start of this batch. 1371 * This batch is started immediately after indirect_ctx batch. Since we ensure 1372 * that indirect_ctx ends on a cacheline this batch is aligned automatically. 1373 * 1374 * The number of DWORDS written are returned using this field. 1375 * 1376 * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding 1377 * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. 
1378 */ 1379 static int gen8_init_perctx_bb(struct intel_engine_cs *ring, 1380 struct i915_wa_ctx_bb *wa_ctx, 1381 uint32_t *const batch, 1382 uint32_t *offset) 1383 { 1384 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1385 1386 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1387 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); 1388 1389 wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); 1390 1391 return wa_ctx_end(wa_ctx, *offset = index, 1); 1392 } 1393 1394 static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring, 1395 struct i915_wa_ctx_bb *wa_ctx, 1396 uint32_t *const batch, 1397 uint32_t *offset) 1398 { 1399 int ret; 1400 struct drm_device *dev = ring->dev; 1401 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1402 1403 /* WaDisableCtxRestoreArbitration:skl,bxt */ 1404 if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) || 1405 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) 1406 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); 1407 1408 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */ 1409 ret = gen8_emit_flush_coherentl3_wa(ring, batch, index); 1410 if (ret < 0) 1411 return ret; 1412 index = ret; 1413 1414 /* Pad to end of cacheline */ 1415 while (index % CACHELINE_DWORDS) 1416 wa_ctx_emit(batch, index, MI_NOOP); 1417 1418 return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); 1419 } 1420 1421 static int gen9_init_perctx_bb(struct intel_engine_cs *ring, 1422 struct i915_wa_ctx_bb *wa_ctx, 1423 uint32_t *const batch, 1424 uint32_t *offset) 1425 { 1426 struct drm_device *dev = ring->dev; 1427 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1428 1429 /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */ 1430 if (IS_SKL_REVID(dev, 0, SKL_REVID_B0) || 1431 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) { 1432 wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); 1433 wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0); 1434 wa_ctx_emit(batch, index, 1435 _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING)); 1436 wa_ctx_emit(batch, index, MI_NOOP); 1437 } 1438 1439 /* WaDisableCtxRestoreArbitration:skl,bxt */ 1440 if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) || 1441 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) 1442 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); 1443 1444 wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); 1445 1446 return wa_ctx_end(wa_ctx, *offset = index, 1); 1447 } 1448 1449 static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size) 1450 { 1451 int ret; 1452 1453 ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size)); 1454 if (!ring->wa_ctx.obj) { 1455 DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n"); 1456 return -ENOMEM; 1457 } 1458 1459 ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0); 1460 if (ret) { 1461 DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", 1462 ret); 1463 drm_gem_object_unreference(&ring->wa_ctx.obj->base); 1464 return ret; 1465 } 1466 1467 return 0; 1468 } 1469 1470 static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring) 1471 { 1472 if (ring->wa_ctx.obj) { 1473 i915_gem_object_ggtt_unpin(ring->wa_ctx.obj); 1474 drm_gem_object_unreference(&ring->wa_ctx.obj->base); 1475 ring->wa_ctx.obj = NULL; 1476 } 1477 } 1478 1479 static int intel_init_workaround_bb(struct intel_engine_cs *ring) 1480 { 1481 int ret; 1482 uint32_t *batch; 1483 uint32_t offset; 1484 struct vm_page *page; 1485 struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; 1486 1487 WARN_ON(ring->id != RCS); 1488 1489 /* update this when WA for higher Gen are 
added */ 1490 if (INTEL_INFO(ring->dev)->gen > 9) { 1491 DRM_ERROR("WA batch buffer is not initialized for Gen%d\n", 1492 INTEL_INFO(ring->dev)->gen); 1493 return 0; 1494 } 1495 1496 /* some WA perform writes to scratch page, ensure it is valid */ 1497 if (ring->scratch.obj == NULL) { 1498 DRM_ERROR("scratch page not allocated for %s\n", ring->name); 1499 return -EINVAL; 1500 } 1501 1502 ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE); 1503 if (ret) { 1504 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 1505 return ret; 1506 } 1507 1508 page = i915_gem_object_get_dirty_page(wa_ctx->obj, 0); 1509 batch = kmap_atomic(page); 1510 offset = 0; 1511 1512 if (INTEL_INFO(ring->dev)->gen == 8) { 1513 ret = gen8_init_indirectctx_bb(ring, 1514 &wa_ctx->indirect_ctx, 1515 batch, 1516 &offset); 1517 if (ret) 1518 goto out; 1519 1520 ret = gen8_init_perctx_bb(ring, 1521 &wa_ctx->per_ctx, 1522 batch, 1523 &offset); 1524 if (ret) 1525 goto out; 1526 } else if (INTEL_INFO(ring->dev)->gen == 9) { 1527 ret = gen9_init_indirectctx_bb(ring, 1528 &wa_ctx->indirect_ctx, 1529 batch, 1530 &offset); 1531 if (ret) 1532 goto out; 1533 1534 ret = gen9_init_perctx_bb(ring, 1535 &wa_ctx->per_ctx, 1536 batch, 1537 &offset); 1538 if (ret) 1539 goto out; 1540 } 1541 1542 out: 1543 kunmap_atomic(batch); 1544 if (ret) 1545 lrc_destroy_wa_ctx_obj(ring); 1546 1547 return ret; 1548 } 1549 1550 static int gen8_init_common_ring(struct intel_engine_cs *ring) 1551 { 1552 struct drm_device *dev = ring->dev; 1553 struct drm_i915_private *dev_priv = dev->dev_private; 1554 u8 next_context_status_buffer_hw; 1555 1556 lrc_setup_hardware_status_page(ring, 1557 dev_priv->kernel_context->engine[ring->id].state); 1558 1559 I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); 1560 I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff); 1561 1562 I915_WRITE(RING_MODE_GEN7(ring), 1563 _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) | 1564 _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)); 1565 POSTING_READ(RING_MODE_GEN7(ring)); 1566 1567 /* 1568 * Instead of resetting the Context Status Buffer (CSB) read pointer to 1569 * zero, we need to read the write pointer from hardware and use its 1570 * value because "this register is power context save restored". 1571 * Effectively, these states have been observed: 1572 * 1573 * | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) | 1574 * BDW | CSB regs not reset | CSB regs reset | 1575 * CHT | CSB regs not reset | CSB regs not reset | 1576 * SKL | ? | ? | 1577 * BXT | ? | ? | 1578 */ 1579 next_context_status_buffer_hw = 1580 GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(ring))); 1581 1582 /* 1583 * When the CSB registers are reset (also after power-up / gpu reset), 1584 * CSB write pointer is set to all 1's, which is not valid, use '5' in 1585 * this special case, so the first element read is CSB[0]. 
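 *
 * (Concretely: the '5' above is GEN8_CSB_ENTRIES - 1, and the interrupt
 * handler consumes entries as ++read_pointer % GEN8_CSB_ENTRIES, so priming
 * the cached pointer this way makes the first entry it reads CSB[0].)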
1586 */ 1587 if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK) 1588 next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1); 1589 1590 ring->next_context_status_buffer = next_context_status_buffer_hw; 1591 DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name); 1592 1593 memset(&ring->hangcheck, 0, sizeof(ring->hangcheck)); 1594 1595 return 0; 1596 } 1597 1598 static int gen8_init_render_ring(struct intel_engine_cs *ring) 1599 { 1600 struct drm_device *dev = ring->dev; 1601 struct drm_i915_private *dev_priv = dev->dev_private; 1602 int ret; 1603 1604 ret = gen8_init_common_ring(ring); 1605 if (ret) 1606 return ret; 1607 1608 /* We need to disable the AsyncFlip performance optimisations in order 1609 * to use MI_WAIT_FOR_EVENT within the CS. It should already be 1610 * programmed to '1' on all products. 1611 * 1612 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv 1613 */ 1614 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE)); 1615 1616 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); 1617 1618 return init_workarounds_ring(ring); 1619 } 1620 1621 static int gen9_init_render_ring(struct intel_engine_cs *ring) 1622 { 1623 int ret; 1624 1625 ret = gen8_init_common_ring(ring); 1626 if (ret) 1627 return ret; 1628 1629 return init_workarounds_ring(ring); 1630 } 1631 1632 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) 1633 { 1634 struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt; 1635 struct intel_engine_cs *ring = req->ring; 1636 struct intel_ringbuffer *ringbuf = req->ringbuf; 1637 const int num_lri_cmds = GEN8_LEGACY_PDPES * 2; 1638 int i, ret; 1639 1640 ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2); 1641 if (ret) 1642 return ret; 1643 1644 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds)); 1645 for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) { 1646 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 1647 1648 intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_UDW(ring, i)); 1649 intel_logical_ring_emit(ringbuf, upper_32_bits(pd_daddr)); 1650 intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_LDW(ring, i)); 1651 intel_logical_ring_emit(ringbuf, lower_32_bits(pd_daddr)); 1652 } 1653 1654 intel_logical_ring_emit(ringbuf, MI_NOOP); 1655 intel_logical_ring_advance(ringbuf); 1656 1657 return 0; 1658 } 1659 1660 static int gen8_emit_bb_start(struct drm_i915_gem_request *req, 1661 u64 offset, unsigned dispatch_flags) 1662 { 1663 struct intel_ringbuffer *ringbuf = req->ringbuf; 1664 bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE); 1665 int ret; 1666 1667 /* Don't rely in hw updating PDPs, specially in lite-restore. 1668 * Ideally, we should set Force PD Restore in ctx descriptor, 1669 * but we can't. Force Restore would be a second option, but 1670 * it is unsafe in case of lite-restore (because the ctx is 1671 * not idle). PML4 is allocated during ppgtt init so this is 1672 * not needed in 48-bit.*/ 1673 if (req->ctx->ppgtt && 1674 (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) { 1675 if (!USES_FULL_48BIT_PPGTT(req->i915) && 1676 !intel_vgpu_active(req->i915->dev)) { 1677 ret = intel_logical_ring_emit_pdps(req); 1678 if (ret) 1679 return ret; 1680 } 1681 1682 req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring); 1683 } 1684 1685 ret = intel_logical_ring_begin(req, 4); 1686 if (ret) 1687 return ret; 1688 1689 /* FIXME(BDW): Address space and security selectors. 
*/ 1690 intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 | 1691 (ppgtt<<8) | 1692 (dispatch_flags & I915_DISPATCH_RS ? 1693 MI_BATCH_RESOURCE_STREAMER : 0)); 1694 intel_logical_ring_emit(ringbuf, lower_32_bits(offset)); 1695 intel_logical_ring_emit(ringbuf, upper_32_bits(offset)); 1696 intel_logical_ring_emit(ringbuf, MI_NOOP); 1697 intel_logical_ring_advance(ringbuf); 1698 1699 return 0; 1700 } 1701 1702 static bool gen8_logical_ring_get_irq(struct intel_engine_cs *ring) 1703 { 1704 struct drm_device *dev = ring->dev; 1705 struct drm_i915_private *dev_priv = dev->dev_private; 1706 unsigned long flags; 1707 1708 if (WARN_ON(!intel_irqs_enabled(dev_priv))) 1709 return false; 1710 1711 spin_lock_irqsave(&dev_priv->irq_lock, flags); 1712 if (ring->irq_refcount++ == 0) { 1713 I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); 1714 POSTING_READ(RING_IMR(ring->mmio_base)); 1715 } 1716 spin_unlock_irqrestore(&dev_priv->irq_lock, flags); 1717 1718 return true; 1719 } 1720 1721 static void gen8_logical_ring_put_irq(struct intel_engine_cs *ring) 1722 { 1723 struct drm_device *dev = ring->dev; 1724 struct drm_i915_private *dev_priv = dev->dev_private; 1725 unsigned long flags; 1726 1727 spin_lock_irqsave(&dev_priv->irq_lock, flags); 1728 if (--ring->irq_refcount == 0) { 1729 I915_WRITE_IMR(ring, ~ring->irq_keep_mask); 1730 POSTING_READ(RING_IMR(ring->mmio_base)); 1731 } 1732 spin_unlock_irqrestore(&dev_priv->irq_lock, flags); 1733 } 1734 1735 static int gen8_emit_flush(struct drm_i915_gem_request *request, 1736 u32 invalidate_domains, 1737 u32 unused) 1738 { 1739 struct intel_ringbuffer *ringbuf = request->ringbuf; 1740 struct intel_engine_cs *ring = ringbuf->ring; 1741 struct drm_device *dev = ring->dev; 1742 struct drm_i915_private *dev_priv = dev->dev_private; 1743 uint32_t cmd; 1744 int ret; 1745 1746 ret = intel_logical_ring_begin(request, 4); 1747 if (ret) 1748 return ret; 1749 1750 cmd = MI_FLUSH_DW + 1; 1751 1752 /* We always require a command barrier so that subsequent 1753 * commands, such as breadcrumb interrupts, are strictly ordered 1754 * wrt the contents of the write cache being flushed to memory 1755 * (and thus being coherent from the CPU). 
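 *
 * The dwords emitted below therefore always carry a post-sync write:
 * MI_FLUSH_DW_STORE_INDEX points it at the HWS scratch slot
 * (I915_GEM_HWS_SCRATCH_ADDR, with MI_FLUSH_DW_USE_GTT) and stores a
 * dummy value of zero.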
1756 */ 1757 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 1758 1759 if (invalidate_domains & I915_GEM_GPU_DOMAINS) { 1760 cmd |= MI_INVALIDATE_TLB; 1761 if (ring == &dev_priv->ring[VCS]) 1762 cmd |= MI_INVALIDATE_BSD; 1763 } 1764 1765 intel_logical_ring_emit(ringbuf, cmd); 1766 intel_logical_ring_emit(ringbuf, 1767 I915_GEM_HWS_SCRATCH_ADDR | 1768 MI_FLUSH_DW_USE_GTT); 1769 intel_logical_ring_emit(ringbuf, 0); /* upper addr */ 1770 intel_logical_ring_emit(ringbuf, 0); /* value */ 1771 intel_logical_ring_advance(ringbuf); 1772 1773 return 0; 1774 } 1775 1776 static int gen8_emit_flush_render(struct drm_i915_gem_request *request, 1777 u32 invalidate_domains, 1778 u32 flush_domains) 1779 { 1780 struct intel_ringbuffer *ringbuf = request->ringbuf; 1781 struct intel_engine_cs *ring = ringbuf->ring; 1782 u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES; 1783 bool vf_flush_wa = false; 1784 u32 flags = 0; 1785 int ret; 1786 1787 flags |= PIPE_CONTROL_CS_STALL; 1788 1789 if (flush_domains) { 1790 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 1791 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 1792 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 1793 flags |= PIPE_CONTROL_FLUSH_ENABLE; 1794 } 1795 1796 if (invalidate_domains) { 1797 flags |= PIPE_CONTROL_TLB_INVALIDATE; 1798 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 1799 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 1800 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 1801 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 1802 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 1803 flags |= PIPE_CONTROL_QW_WRITE; 1804 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 1805 1806 /* 1807 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 1808 * pipe control. 1809 */ 1810 if (IS_GEN9(ring->dev)) 1811 vf_flush_wa = true; 1812 } 1813 1814 ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6); 1815 if (ret) 1816 return ret; 1817 1818 if (vf_flush_wa) { 1819 intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); 1820 intel_logical_ring_emit(ringbuf, 0); 1821 intel_logical_ring_emit(ringbuf, 0); 1822 intel_logical_ring_emit(ringbuf, 0); 1823 intel_logical_ring_emit(ringbuf, 0); 1824 intel_logical_ring_emit(ringbuf, 0); 1825 } 1826 1827 intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); 1828 intel_logical_ring_emit(ringbuf, flags); 1829 intel_logical_ring_emit(ringbuf, scratch_addr); 1830 intel_logical_ring_emit(ringbuf, 0); 1831 intel_logical_ring_emit(ringbuf, 0); 1832 intel_logical_ring_emit(ringbuf, 0); 1833 intel_logical_ring_advance(ringbuf); 1834 1835 return 0; 1836 } 1837 1838 static u32 gen8_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency) 1839 { 1840 return intel_read_status_page(ring, I915_GEM_HWS_INDEX); 1841 } 1842 1843 static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno) 1844 { 1845 intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno); 1846 } 1847 1848 static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency) 1849 { 1850 1851 /* 1852 * On BXT A steppings there is a HW coherency issue whereby the 1853 * MI_STORE_DATA_IMM storing the completed request's seqno 1854 * occasionally doesn't invalidate the CPU cache. Work around this by 1855 * clflushing the corresponding cacheline whenever the caller wants 1856 * the coherency to be guaranteed. Note that this cacheline is known 1857 * to be clean at this point, since we only write it in 1858 * bxt_a_set_seqno(), where we also do a clflush after the write. 
So 1859 * this clflush in practice becomes an invalidate operation. 1860 */ 1861 1862 if (!lazy_coherency) 1863 intel_flush_status_page(ring, I915_GEM_HWS_INDEX); 1864 1865 return intel_read_status_page(ring, I915_GEM_HWS_INDEX); 1866 } 1867 1868 static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno) 1869 { 1870 intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno); 1871 1872 /* See bxt_a_get_seqno() explaining the reason for the clflush. */ 1873 intel_flush_status_page(ring, I915_GEM_HWS_INDEX); 1874 } 1875 1876 /* 1877 * Reserve space for 2 NOOPs at the end of each request to be 1878 * used as a workaround for not being allowed to do lite 1879 * restore with HEAD==TAIL (WaIdleLiteRestore). 1880 */ 1881 #define WA_TAIL_DWORDS 2 1882 1883 static inline u32 hws_seqno_address(struct intel_engine_cs *engine) 1884 { 1885 return engine->status_page.gfx_addr + I915_GEM_HWS_INDEX_ADDR; 1886 } 1887 1888 static int gen8_emit_request(struct drm_i915_gem_request *request) 1889 { 1890 struct intel_ringbuffer *ringbuf = request->ringbuf; 1891 int ret; 1892 1893 ret = intel_logical_ring_begin(request, 6 + WA_TAIL_DWORDS); 1894 if (ret) 1895 return ret; 1896 1897 /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */ 1898 BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5)); 1899 1900 intel_logical_ring_emit(ringbuf, 1901 (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW); 1902 intel_logical_ring_emit(ringbuf, 1903 hws_seqno_address(request->ring) | 1904 MI_FLUSH_DW_USE_GTT); 1905 intel_logical_ring_emit(ringbuf, 0); 1906 intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request)); 1907 intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT); 1908 intel_logical_ring_emit(ringbuf, MI_NOOP); 1909 return intel_logical_ring_advance_and_submit(request); 1910 } 1911 1912 static int gen8_emit_request_render(struct drm_i915_gem_request *request) 1913 { 1914 struct intel_ringbuffer *ringbuf = request->ringbuf; 1915 int ret; 1916 1917 ret = intel_logical_ring_begin(request, 8 + WA_TAIL_DWORDS); 1918 if (ret) 1919 return ret; 1920 1921 /* We're using qword write, seqno should be aligned to 8 bytes. */ 1922 BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1); 1923 1924 /* w/a for post sync ops following a GPGPU operation we 1925 * need a prior CS_STALL, which is emitted by the flush 1926 * following the batch. 1927 */ 1928 intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); 1929 intel_logical_ring_emit(ringbuf, 1930 (PIPE_CONTROL_GLOBAL_GTT_IVB | 1931 PIPE_CONTROL_CS_STALL | 1932 PIPE_CONTROL_QW_WRITE)); 1933 intel_logical_ring_emit(ringbuf, hws_seqno_address(request->ring)); 1934 intel_logical_ring_emit(ringbuf, 0); 1935 intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request)); 1936 /* We're thrashing one dword of HWS. 
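 * The PIPE_CONTROL above requested a qword post-sync write: the seqno fills
 * the low dword and the zero emitted below supplies the high dword, so the
 * status-page dword following I915_GEM_HWS_INDEX gets overwritten as well.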
*/
1937 intel_logical_ring_emit(ringbuf, 0);
1938 intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
1939 intel_logical_ring_emit(ringbuf, MI_NOOP);
1940 return intel_logical_ring_advance_and_submit(request);
1941 }
1942
1943 static int intel_lr_context_render_state_init(struct drm_i915_gem_request *req)
1944 {
1945 struct render_state so;
1946 int ret;
1947
1948 ret = i915_gem_render_state_prepare(req->ring, &so);
1949 if (ret)
1950 return ret;
1951
1952 if (so.rodata == NULL)
1953 return 0;
1954
1955 ret = req->ring->emit_bb_start(req, so.ggtt_offset,
1956 I915_DISPATCH_SECURE);
1957 if (ret)
1958 goto out;
1959
1960 ret = req->ring->emit_bb_start(req,
1961 (so.ggtt_offset + so.aux_batch_offset),
1962 I915_DISPATCH_SECURE);
1963 if (ret)
1964 goto out;
1965
1966 i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);
1967
1968 out:
1969 i915_gem_render_state_fini(&so);
1970 return ret;
1971 }
1972
1973 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
1974 {
1975 int ret;
1976
1977 ret = intel_logical_ring_workarounds_emit(req);
1978 if (ret)
1979 return ret;
1980
1981 ret = intel_rcs_context_init_mocs(req);
1982 /*
1983 * Failing to program the MOCS is non-fatal. The system will not
1984 * run at peak performance, so generate an error and carry on.
1985 */
1986 if (ret)
1987 DRM_ERROR("MOCS failed to program: expect performance issues.\n");
1988
1989 return intel_lr_context_render_state_init(req);
1990 }
1991
1992 /**
1993 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
1994 *
1995 * @ring: Engine Command Streamer.
1996 *
1997 */
1998 void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
1999 {
2000 struct drm_i915_private *dev_priv;
2001
2002 if (!intel_ring_initialized(ring))
2003 return;
2004
2005 dev_priv = ring->dev->dev_private;
2006
2007 if (ring->buffer) {
2008 intel_logical_ring_stop(ring);
2009 WARN_ON((I915_READ_MODE(ring) & MODE_IDLE) == 0);
2010 }
2011
2012 if (ring->cleanup)
2013 ring->cleanup(ring);
2014
2015 i915_cmd_parser_fini_ring(ring);
2016 i915_gem_batch_pool_fini(&ring->batch_pool);
2017
2018 if (ring->status_page.obj) {
2019 kunmap(sg_page(ring->status_page.obj->pages->sgl));
2020 ring->status_page.obj = NULL;
2021 }
2022
2023 ring->disable_lite_restore_wa = false;
2024 ring->ctx_desc_template = 0;
2025
2026 lrc_destroy_wa_ctx_obj(ring);
2027 ring->dev = NULL;
2028 }
2029
2030 static void
2031 logical_ring_default_vfuncs(struct drm_device *dev,
2032 struct intel_engine_cs *ring)
2033 {
2034 /* Default vfuncs which can be overridden by each engine. */
2035 ring->init_hw = gen8_init_common_ring;
2036 ring->emit_request = gen8_emit_request;
2037 ring->emit_flush = gen8_emit_flush;
2038 ring->irq_get = gen8_logical_ring_get_irq;
2039 ring->irq_put = gen8_logical_ring_put_irq;
2040 ring->emit_bb_start = gen8_emit_bb_start;
2041 if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
2042 ring->get_seqno = bxt_a_get_seqno;
2043 ring->set_seqno = bxt_a_set_seqno;
2044 } else {
2045 ring->get_seqno = gen8_get_seqno;
2046 ring->set_seqno = gen8_set_seqno;
2047 }
2048 }
2049
2050 static inline void
2051 logical_ring_default_irqs(struct intel_engine_cs *ring, unsigned shift)
2052 {
2053 ring->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2054 ring->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
2055 }
2056
2057 static int
2058 logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
2059 {
2060 struct intel_context *dctx = to_i915(dev)->kernel_context;
2061 int ret;
2062
2063 /* Intentionally left blank.
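 * In Execlists mode each context brings its own ringbuffer (allocated in
 * intel_lr_context_deferred_alloc), so there is no engine-wide buffer to
 * set up at this point.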
*/ 2064 ring->buffer = NULL; 2065 2066 ring->dev = dev; 2067 INIT_LIST_HEAD(&ring->active_list); 2068 INIT_LIST_HEAD(&ring->request_list); 2069 i915_gem_batch_pool_init(dev, &ring->batch_pool); 2070 init_waitqueue_head(&ring->irq_queue); 2071 2072 INIT_LIST_HEAD(&ring->buffers); 2073 INIT_LIST_HEAD(&ring->execlist_queue); 2074 INIT_LIST_HEAD(&ring->execlist_retired_req_list); 2075 lockinit(&ring->execlist_lock, "i915el", 0, LK_CANRECURSE); 2076 2077 logical_ring_init_platform_invariants(ring); 2078 2079 ret = i915_cmd_parser_init_ring(ring); 2080 if (ret) 2081 goto error; 2082 2083 ret = intel_lr_context_deferred_alloc(dctx, ring); 2084 if (ret) 2085 goto error; 2086 2087 /* As this is the default context, always pin it */ 2088 ret = intel_lr_context_do_pin(dctx, ring); 2089 if (ret) { 2090 DRM_ERROR( 2091 "Failed to pin and map ringbuffer %s: %d\n", 2092 ring->name, ret); 2093 goto error; 2094 } 2095 2096 return 0; 2097 2098 error: 2099 intel_logical_ring_cleanup(ring); 2100 return ret; 2101 } 2102 2103 static int logical_render_ring_init(struct drm_device *dev) 2104 { 2105 struct drm_i915_private *dev_priv = dev->dev_private; 2106 struct intel_engine_cs *ring = &dev_priv->ring[RCS]; 2107 int ret; 2108 2109 ring->name = "render ring"; 2110 ring->id = RCS; 2111 ring->exec_id = I915_EXEC_RENDER; 2112 ring->guc_id = GUC_RENDER_ENGINE; 2113 ring->mmio_base = RENDER_RING_BASE; 2114 2115 logical_ring_default_irqs(ring, GEN8_RCS_IRQ_SHIFT); 2116 if (HAS_L3_DPF(dev)) 2117 ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT; 2118 2119 logical_ring_default_vfuncs(dev, ring); 2120 2121 /* Override some for render ring. */ 2122 if (INTEL_INFO(dev)->gen >= 9) 2123 ring->init_hw = gen9_init_render_ring; 2124 else 2125 ring->init_hw = gen8_init_render_ring; 2126 ring->init_context = gen8_init_rcs_context; 2127 ring->cleanup = intel_fini_pipe_control; 2128 ring->emit_flush = gen8_emit_flush_render; 2129 ring->emit_request = gen8_emit_request_render; 2130 2131 ring->dev = dev; 2132 2133 ret = intel_init_pipe_control(ring); 2134 if (ret) 2135 return ret; 2136 2137 ret = intel_init_workaround_bb(ring); 2138 if (ret) { 2139 /* 2140 * We continue even if we fail to initialize WA batch 2141 * because we only expect rare glitches but nothing 2142 * critical to prevent us from using GPU 2143 */ 2144 DRM_ERROR("WA batch buffer initialization failed: %d\n", 2145 ret); 2146 } 2147 2148 ret = logical_ring_init(dev, ring); 2149 if (ret) { 2150 lrc_destroy_wa_ctx_obj(ring); 2151 } 2152 2153 return ret; 2154 } 2155 2156 static int logical_bsd_ring_init(struct drm_device *dev) 2157 { 2158 struct drm_i915_private *dev_priv = dev->dev_private; 2159 struct intel_engine_cs *ring = &dev_priv->ring[VCS]; 2160 2161 ring->name = "bsd ring"; 2162 ring->id = VCS; 2163 ring->exec_id = I915_EXEC_BSD; 2164 ring->guc_id = GUC_VIDEO_ENGINE; 2165 ring->mmio_base = GEN6_BSD_RING_BASE; 2166 2167 logical_ring_default_irqs(ring, GEN8_VCS1_IRQ_SHIFT); 2168 logical_ring_default_vfuncs(dev, ring); 2169 2170 return logical_ring_init(dev, ring); 2171 } 2172 2173 static int logical_bsd2_ring_init(struct drm_device *dev) 2174 { 2175 struct drm_i915_private *dev_priv = dev->dev_private; 2176 struct intel_engine_cs *ring = &dev_priv->ring[VCS2]; 2177 2178 ring->name = "bsd2 ring"; 2179 ring->id = VCS2; 2180 ring->exec_id = I915_EXEC_BSD; 2181 ring->guc_id = GUC_VIDEO_ENGINE2; 2182 ring->mmio_base = GEN8_BSD2_RING_BASE; 2183 2184 logical_ring_default_irqs(ring, GEN8_VCS2_IRQ_SHIFT); 2185 logical_ring_default_vfuncs(dev, ring); 2186 2187 
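/* Note: both video engines advertise I915_EXEC_BSD; which instance a given
 * execbuffer runs on is decided by the BSD ring selection bits of the
 * execbuffer flags, handled in the execbuffer code rather than here.
 */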
return logical_ring_init(dev, ring); 2188 } 2189 2190 static int logical_blt_ring_init(struct drm_device *dev) 2191 { 2192 struct drm_i915_private *dev_priv = dev->dev_private; 2193 struct intel_engine_cs *ring = &dev_priv->ring[BCS]; 2194 2195 ring->name = "blitter ring"; 2196 ring->id = BCS; 2197 ring->exec_id = I915_EXEC_BLT; 2198 ring->guc_id = GUC_BLITTER_ENGINE; 2199 ring->mmio_base = BLT_RING_BASE; 2200 2201 logical_ring_default_irqs(ring, GEN8_BCS_IRQ_SHIFT); 2202 logical_ring_default_vfuncs(dev, ring); 2203 2204 return logical_ring_init(dev, ring); 2205 } 2206 2207 static int logical_vebox_ring_init(struct drm_device *dev) 2208 { 2209 struct drm_i915_private *dev_priv = dev->dev_private; 2210 struct intel_engine_cs *ring = &dev_priv->ring[VECS]; 2211 2212 ring->name = "video enhancement ring"; 2213 ring->id = VECS; 2214 ring->exec_id = I915_EXEC_VEBOX; 2215 ring->guc_id = GUC_VIDEOENHANCE_ENGINE; 2216 ring->mmio_base = VEBOX_RING_BASE; 2217 2218 logical_ring_default_irqs(ring, GEN8_VECS_IRQ_SHIFT); 2219 logical_ring_default_vfuncs(dev, ring); 2220 2221 return logical_ring_init(dev, ring); 2222 } 2223 2224 /** 2225 * intel_logical_rings_init() - allocate, populate and init the Engine Command Streamers 2226 * @dev: DRM device. 2227 * 2228 * This function inits the engines for an Execlists submission style (the equivalent in the 2229 * legacy ringbuffer submission world would be i915_gem_init_rings). It does it only for 2230 * those engines that are present in the hardware. 2231 * 2232 * Return: non-zero if the initialization failed. 2233 */ 2234 int intel_logical_rings_init(struct drm_device *dev) 2235 { 2236 struct drm_i915_private *dev_priv = dev->dev_private; 2237 int ret; 2238 2239 ret = logical_render_ring_init(dev); 2240 if (ret) 2241 return ret; 2242 2243 if (HAS_BSD(dev)) { 2244 ret = logical_bsd_ring_init(dev); 2245 if (ret) 2246 goto cleanup_render_ring; 2247 } 2248 2249 if (HAS_BLT(dev)) { 2250 ret = logical_blt_ring_init(dev); 2251 if (ret) 2252 goto cleanup_bsd_ring; 2253 } 2254 2255 if (HAS_VEBOX(dev)) { 2256 ret = logical_vebox_ring_init(dev); 2257 if (ret) 2258 goto cleanup_blt_ring; 2259 } 2260 2261 if (HAS_BSD2(dev)) { 2262 ret = logical_bsd2_ring_init(dev); 2263 if (ret) 2264 goto cleanup_vebox_ring; 2265 } 2266 2267 return 0; 2268 2269 cleanup_vebox_ring: 2270 intel_logical_ring_cleanup(&dev_priv->ring[VECS]); 2271 cleanup_blt_ring: 2272 intel_logical_ring_cleanup(&dev_priv->ring[BCS]); 2273 cleanup_bsd_ring: 2274 intel_logical_ring_cleanup(&dev_priv->ring[VCS]); 2275 cleanup_render_ring: 2276 intel_logical_ring_cleanup(&dev_priv->ring[RCS]); 2277 2278 return ret; 2279 } 2280 2281 static u32 2282 make_rpcs(struct drm_device *dev) 2283 { 2284 u32 rpcs = 0; 2285 2286 /* 2287 * No explicit RPCS request is needed to ensure full 2288 * slice/subslice/EU enablement prior to Gen9. 2289 */ 2290 if (INTEL_INFO(dev)->gen < 9) 2291 return 0; 2292 2293 /* 2294 * Starting in Gen9, render power gating can leave 2295 * slice/subslice/EU in a partially enabled state. We 2296 * must make an explicit request through RPCS for full 2297 * enablement. 
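 *
 * The value assembled below packs the slice count into the RPCS slice
 * field when slice power gating is present, the subslice-per-slice count
 * into the subslice field for subslice power gating, and eu_per_subslice
 * into both the EU min and max fields for EU power gating; GEN8_RPCS_ENABLE
 * is set whenever any of these explicit requests is made.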
2298 */ 2299 if (INTEL_INFO(dev)->has_slice_pg) { 2300 rpcs |= GEN8_RPCS_S_CNT_ENABLE; 2301 rpcs |= INTEL_INFO(dev)->slice_total << 2302 GEN8_RPCS_S_CNT_SHIFT; 2303 rpcs |= GEN8_RPCS_ENABLE; 2304 } 2305 2306 if (INTEL_INFO(dev)->has_subslice_pg) { 2307 rpcs |= GEN8_RPCS_SS_CNT_ENABLE; 2308 rpcs |= INTEL_INFO(dev)->subslice_per_slice << 2309 GEN8_RPCS_SS_CNT_SHIFT; 2310 rpcs |= GEN8_RPCS_ENABLE; 2311 } 2312 2313 if (INTEL_INFO(dev)->has_eu_pg) { 2314 rpcs |= INTEL_INFO(dev)->eu_per_subslice << 2315 GEN8_RPCS_EU_MIN_SHIFT; 2316 rpcs |= INTEL_INFO(dev)->eu_per_subslice << 2317 GEN8_RPCS_EU_MAX_SHIFT; 2318 rpcs |= GEN8_RPCS_ENABLE; 2319 } 2320 2321 return rpcs; 2322 } 2323 2324 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *ring) 2325 { 2326 u32 indirect_ctx_offset; 2327 2328 switch (INTEL_INFO(ring->dev)->gen) { 2329 default: 2330 MISSING_CASE(INTEL_INFO(ring->dev)->gen); 2331 /* fall through */ 2332 case 9: 2333 indirect_ctx_offset = 2334 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 2335 break; 2336 case 8: 2337 indirect_ctx_offset = 2338 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 2339 break; 2340 } 2341 2342 return indirect_ctx_offset; 2343 } 2344 2345 static int 2346 populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj, 2347 struct intel_engine_cs *ring, struct intel_ringbuffer *ringbuf) 2348 { 2349 struct drm_device *dev = ring->dev; 2350 struct drm_i915_private *dev_priv = dev->dev_private; 2351 struct i915_hw_ppgtt *ppgtt = ctx->ppgtt; 2352 struct vm_page *page; 2353 uint32_t *reg_state; 2354 int ret; 2355 2356 if (!ppgtt) 2357 ppgtt = dev_priv->mm.aliasing_ppgtt; 2358 2359 ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true); 2360 if (ret) { 2361 DRM_DEBUG_DRIVER("Could not set to CPU domain\n"); 2362 return ret; 2363 } 2364 2365 ret = i915_gem_object_get_pages(ctx_obj); 2366 if (ret) { 2367 DRM_DEBUG_DRIVER("Could not get object pages\n"); 2368 return ret; 2369 } 2370 2371 i915_gem_object_pin_pages(ctx_obj); 2372 2373 /* The second page of the context object contains some fields which must 2374 * be set up prior to the first execution. */ 2375 page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); 2376 reg_state = kmap_atomic(page); 2377 2378 /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM 2379 * commands followed by (reg, value) pairs. The values we are setting here are 2380 * only for the first context restore: on a subsequent save, the GPU will 2381 * recreate this batchbuffer with new values (including all the missing 2382 * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */ 2383 reg_state[CTX_LRI_HEADER_0] = 2384 MI_LOAD_REGISTER_IMM(ring->id == RCS ? 14 : 11) | MI_LRI_FORCE_POSTED; 2385 ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(ring), 2386 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | 2387 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT | 2388 (HAS_RESOURCE_STREAMER(dev) ? 2389 CTX_CTRL_RS_CTX_ENABLE : 0))); 2390 ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0); 2391 ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0); 2392 /* Ring buffer start address is not known until the buffer is pinned. 
2393 * It is written to the context image in execlists_update_context()
2394 */
2395 ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
2396 ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
2397 ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
2398 ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U, RING_BBADDR_UDW(ring->mmio_base), 0);
2399 ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L, RING_BBADDR(ring->mmio_base), 0);
2400 ASSIGN_CTX_REG(reg_state, CTX_BB_STATE, RING_BBSTATE(ring->mmio_base),
2401 RING_BB_PPGTT);
2402 ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(ring->mmio_base), 0);
2403 ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(ring->mmio_base), 0);
2404 ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE, RING_SBBSTATE(ring->mmio_base), 0);
2405 if (ring->id == RCS) {
2406 ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(ring->mmio_base), 0);
2407 ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(ring->mmio_base), 0);
2408 ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET, RING_INDIRECT_CTX_OFFSET(ring->mmio_base), 0);
2409 if (ring->wa_ctx.obj) {
2410 struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
2411 uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
2412
2413 reg_state[CTX_RCS_INDIRECT_CTX+1] =
2414 (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
2415 (wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);
2416
2417 reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
2418 intel_lr_indirect_ctx_offset(ring) << 6;
2419
2420 reg_state[CTX_BB_PER_CTX_PTR+1] =
2421 (ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
2422 0x01;
2423 }
2424 }
2425 reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2426 ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(ring->mmio_base), 0);
2427 /* PDP values will be assigned later if needed */
2428 ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(ring, 3), 0);
2429 ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(ring, 3), 0);
2430 ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(ring, 2), 0);
2431 ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(ring, 2), 0);
2432 ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(ring, 1), 0);
2433 ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(ring, 1), 0);
2434 ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(ring, 0), 0);
2435 ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(ring, 0), 0);
2436
2437 if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
2438 /* 64b PPGTT (48bit canonical)
2439 * PDP0_DESCRIPTOR contains the base address to PML4 and
2440 * other PDP Descriptors are ignored.
2441 */
2442 ASSIGN_CTX_PML4(ppgtt, reg_state);
2443 } else {
2444 /* 32b PPGTT
2445 * PDP*_DESCRIPTOR contains the base address of space supported.
2446 * With dynamic page allocation, PDPs may not be allocated at
2447 * this point.
Point the unallocated PDPs to the scratch page 2448 */ 2449 ASSIGN_CTX_PDP(ppgtt, reg_state, 3); 2450 ASSIGN_CTX_PDP(ppgtt, reg_state, 2); 2451 ASSIGN_CTX_PDP(ppgtt, reg_state, 1); 2452 ASSIGN_CTX_PDP(ppgtt, reg_state, 0); 2453 } 2454 2455 if (ring->id == RCS) { 2456 reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); 2457 ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 2458 make_rpcs(dev)); 2459 } 2460 2461 kunmap_atomic(reg_state); 2462 i915_gem_object_unpin_pages(ctx_obj); 2463 2464 return 0; 2465 } 2466 2467 /** 2468 * intel_lr_context_free() - free the LRC specific bits of a context 2469 * @ctx: the LR context to free. 2470 * 2471 * The real context freeing is done in i915_gem_context_free: this only 2472 * takes care of the bits that are LRC related: the per-engine backing 2473 * objects and the logical ringbuffer. 2474 */ 2475 void intel_lr_context_free(struct intel_context *ctx) 2476 { 2477 int i; 2478 2479 for (i = I915_NUM_RINGS; --i >= 0; ) { 2480 struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf; 2481 struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state; 2482 2483 if (!ctx_obj) 2484 continue; 2485 2486 if (ctx == ctx->i915->kernel_context) { 2487 intel_unpin_ringbuffer_obj(ringbuf); 2488 i915_gem_object_ggtt_unpin(ctx_obj); 2489 } 2490 2491 WARN_ON(ctx->engine[i].pin_count); 2492 intel_ringbuffer_free(ringbuf); 2493 drm_gem_object_unreference(&ctx_obj->base); 2494 } 2495 } 2496 2497 /** 2498 * intel_lr_context_size() - return the size of the context for an engine 2499 * @ring: which engine to find the context size for 2500 * 2501 * Each engine may require a different amount of space for a context image, 2502 * so when allocating (or copying) an image, this function can be used to 2503 * find the right size for the specific engine. 2504 * 2505 * Return: size (in bytes) of an engine-specific context image 2506 * 2507 * Note: this size includes the HWSP, which is part of the context image 2508 * in LRC mode, but does not include the "shared data page" used with 2509 * GuC submission. The caller should account for this if using the GuC. 2510 */ 2511 uint32_t intel_lr_context_size(struct intel_engine_cs *ring) 2512 { 2513 int ret = 0; 2514 2515 WARN_ON(INTEL_INFO(ring->dev)->gen < 8); 2516 2517 switch (ring->id) { 2518 case RCS: 2519 if (INTEL_INFO(ring->dev)->gen >= 9) 2520 ret = GEN9_LR_CONTEXT_RENDER_SIZE; 2521 else 2522 ret = GEN8_LR_CONTEXT_RENDER_SIZE; 2523 break; 2524 case VCS: 2525 case BCS: 2526 case VECS: 2527 case VCS2: 2528 ret = GEN8_LR_CONTEXT_OTHER_SIZE; 2529 break; 2530 } 2531 2532 return ret; 2533 } 2534 2535 static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, 2536 struct drm_i915_gem_object *default_ctx_obj) 2537 { 2538 struct drm_i915_private *dev_priv = ring->dev->dev_private; 2539 struct vm_page *page; 2540 2541 /* The HWSP is part of the default context object in LRC mode. */ 2542 ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj) 2543 + LRC_PPHWSP_PN * PAGE_SIZE; 2544 page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN); 2545 ring->status_page.page_addr = kmap(page); 2546 ring->status_page.obj = default_ctx_obj; 2547 2548 I915_WRITE(RING_HWS_PGA(ring->mmio_base), 2549 (u32)ring->status_page.gfx_addr); 2550 POSTING_READ(RING_HWS_PGA(ring->mmio_base)); 2551 } 2552 2553 /** 2554 * intel_lr_context_deferred_alloc() - create the LRC specific bits of a context 2555 * @ctx: LR context to create. 2556 * @ring: engine to be used with the context. 
2557 * 2558 * This function can be called more than once, with different engines, if we plan 2559 * to use the context with them. The context backing objects and the ringbuffers 2560 * (specially the ringbuffer backing objects) suck a lot of memory up, and that's why 2561 * the creation is a deferred call: it's better to make sure first that we need to use 2562 * a given ring with the context. 2563 * 2564 * Return: non-zero on error. 2565 */ 2566 2567 int intel_lr_context_deferred_alloc(struct intel_context *ctx, 2568 struct intel_engine_cs *ring) 2569 { 2570 struct drm_device *dev = ring->dev; 2571 struct drm_i915_gem_object *ctx_obj; 2572 uint32_t context_size; 2573 struct intel_ringbuffer *ringbuf; 2574 int ret; 2575 2576 WARN_ON(ctx->legacy_hw_ctx.rcs_state != NULL); 2577 WARN_ON(ctx->engine[ring->id].state); 2578 2579 context_size = round_up(intel_lr_context_size(ring), 4096); 2580 2581 /* One extra page as the sharing data between driver and GuC */ 2582 context_size += PAGE_SIZE * LRC_PPHWSP_PN; 2583 2584 ctx_obj = i915_gem_alloc_object(dev, context_size); 2585 if (!ctx_obj) { 2586 DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n"); 2587 return -ENOMEM; 2588 } 2589 2590 ringbuf = intel_engine_create_ringbuffer(ring, 4 * PAGE_SIZE); 2591 if (IS_ERR(ringbuf)) { 2592 ret = PTR_ERR(ringbuf); 2593 goto error_deref_obj; 2594 } 2595 2596 ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf); 2597 if (ret) { 2598 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 2599 goto error_ringbuf; 2600 } 2601 2602 ctx->engine[ring->id].ringbuf = ringbuf; 2603 ctx->engine[ring->id].state = ctx_obj; 2604 2605 if (ctx != ctx->i915->kernel_context && ring->init_context) { 2606 struct drm_i915_gem_request *req; 2607 2608 req = i915_gem_request_alloc(ring, ctx); 2609 if (IS_ERR(req)) { 2610 ret = PTR_ERR(req); 2611 DRM_ERROR("ring create req: %d\n", ret); 2612 goto error_ringbuf; 2613 } 2614 2615 ret = ring->init_context(req); 2616 if (ret) { 2617 DRM_ERROR("ring init context: %d\n", 2618 ret); 2619 i915_gem_request_cancel(req); 2620 goto error_ringbuf; 2621 } 2622 i915_add_request_no_flush(req); 2623 } 2624 return 0; 2625 2626 error_ringbuf: 2627 intel_ringbuffer_free(ringbuf); 2628 error_deref_obj: 2629 drm_gem_object_unreference(&ctx_obj->base); 2630 ctx->engine[ring->id].ringbuf = NULL; 2631 ctx->engine[ring->id].state = NULL; 2632 return ret; 2633 } 2634 2635 void intel_lr_context_reset(struct drm_device *dev, 2636 struct intel_context *ctx) 2637 { 2638 struct drm_i915_private *dev_priv = dev->dev_private; 2639 struct intel_engine_cs *ring; 2640 int i; 2641 2642 for_each_ring(ring, dev_priv, i) { 2643 struct drm_i915_gem_object *ctx_obj = 2644 ctx->engine[ring->id].state; 2645 struct intel_ringbuffer *ringbuf = 2646 ctx->engine[ring->id].ringbuf; 2647 uint32_t *reg_state; 2648 struct vm_page *page; 2649 2650 if (!ctx_obj) 2651 continue; 2652 2653 if (i915_gem_object_get_pages(ctx_obj)) { 2654 WARN(1, "Failed get_pages for context obj\n"); 2655 continue; 2656 } 2657 page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); 2658 reg_state = kmap_atomic(page); 2659 2660 reg_state[CTX_RING_HEAD+1] = 0; 2661 reg_state[CTX_RING_TAIL+1] = 0; 2662 2663 kunmap_atomic(reg_state); 2664 2665 ringbuf->head = 0; 2666 ringbuf->tail = 0; 2667 } 2668 } 2669