/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences from the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't one set of those per engine command streamer be enough? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
This 127 * request will then be resubmitted along with a new request for a different context, 128 * which will cause the hardware to continue executing the second request and queue 129 * the new request (the GPU detects the condition of a context getting preempted 130 * with the same context and optimizes the context switch flow by not doing 131 * preemption, but just sampling the new tail pointer). 132 * 133 */ 134 135 #include <drm/drmP.h> 136 #include <drm/i915_drm.h> 137 #include "i915_drv.h" 138 #include "intel_drv.h" 139 #include "intel_mocs.h" 140 141 #define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) 142 #define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE) 143 #define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE) 144 145 #define RING_EXECLIST_QFULL (1 << 0x2) 146 #define RING_EXECLIST1_VALID (1 << 0x3) 147 #define RING_EXECLIST0_VALID (1 << 0x4) 148 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE) 149 #define RING_EXECLIST1_ACTIVE (1 << 0x11) 150 #define RING_EXECLIST0_ACTIVE (1 << 0x12) 151 152 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0) 153 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1) 154 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2) 155 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) 156 #define GEN8_CTX_STATUS_COMPLETE (1 << 4) 157 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) 158 159 #define CTX_LRI_HEADER_0 0x01 160 #define CTX_CONTEXT_CONTROL 0x02 161 #define CTX_RING_HEAD 0x04 162 #define CTX_RING_TAIL 0x06 163 #define CTX_RING_BUFFER_START 0x08 164 #define CTX_RING_BUFFER_CONTROL 0x0a 165 #define CTX_BB_HEAD_U 0x0c 166 #define CTX_BB_HEAD_L 0x0e 167 #define CTX_BB_STATE 0x10 168 #define CTX_SECOND_BB_HEAD_U 0x12 169 #define CTX_SECOND_BB_HEAD_L 0x14 170 #define CTX_SECOND_BB_STATE 0x16 171 #define CTX_BB_PER_CTX_PTR 0x18 172 #define CTX_RCS_INDIRECT_CTX 0x1a 173 #define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c 174 #define CTX_LRI_HEADER_1 0x21 175 #define CTX_CTX_TIMESTAMP 0x22 176 #define CTX_PDP3_UDW 0x24 177 #define CTX_PDP3_LDW 0x26 178 #define CTX_PDP2_UDW 0x28 179 #define CTX_PDP2_LDW 0x2a 180 #define CTX_PDP1_UDW 0x2c 181 #define CTX_PDP1_LDW 0x2e 182 #define CTX_PDP0_UDW 0x30 183 #define CTX_PDP0_LDW 0x32 184 #define CTX_LRI_HEADER_2 0x41 185 #define CTX_R_PWR_CLK_STATE 0x42 186 #define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 187 188 #define GEN8_CTX_VALID (1<<0) 189 #define GEN8_CTX_FORCE_PD_RESTORE (1<<1) 190 #define GEN8_CTX_FORCE_RESTORE (1<<2) 191 #define GEN8_CTX_L3LLC_COHERENT (1<<5) 192 #define GEN8_CTX_PRIVILEGE (1<<8) 193 194 #define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \ 195 (reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \ 196 (reg_state)[(pos)+1] = (val); \ 197 } while (0) 198 199 #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \ 200 const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \ 201 reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \ 202 reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \ 203 } while (0) 204 205 #define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \ 206 reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \ 207 reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \ 208 } while (0) 209 210 enum { 211 ADVANCED_CONTEXT = 0, 212 LEGACY_32B_CONTEXT, 213 ADVANCED_AD_CONTEXT, 214 LEGACY_64B_CONTEXT 215 }; 216 #define GEN8_CTX_ADDRESSING_MODE_SHIFT 3 217 #define GEN8_CTX_ADDRESSING_MODE(dev) (USES_FULL_48BIT_PPGTT(dev) ?\ 218 LEGACY_64B_CONTEXT :\ 219 LEGACY_32B_CONTEXT) 220 enum { 221 FAULT_AND_HANG = 0, 222 FAULT_AND_HALT, /* Debug only */ 223 FAULT_AND_STREAM, 224 FAULT_AND_CONTINUE /* Unsupported 
*/ 225 }; 226 #define GEN8_CTX_ID_SHIFT 32 227 #define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 228 229 static int intel_lr_context_pin(struct drm_i915_gem_request *rq); 230 static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, 231 struct drm_i915_gem_object *default_ctx_obj); 232 233 234 /** 235 * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists 236 * @dev: DRM device. 237 * @enable_execlists: value of i915.enable_execlists module parameter. 238 * 239 * Only certain platforms support Execlists (the prerequisites being 240 * support for Logical Ring Contexts and Aliasing PPGTT or better). 241 * 242 * Return: 1 if Execlists is supported and has to be enabled. 243 */ 244 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists) 245 { 246 WARN_ON(i915.enable_ppgtt == -1); 247 248 /* On platforms with execlist available, vGPU will only 249 * support execlist mode, no ring buffer mode. 250 */ 251 if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev)) 252 return 1; 253 254 if (INTEL_INFO(dev)->gen >= 9) 255 return 1; 256 257 if (enable_execlists == 0) 258 return 0; 259 260 if (HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev) && 261 i915.use_mmio_flip >= 0) 262 return 1; 263 264 return 0; 265 } 266 267 /** 268 * intel_execlists_ctx_id() - get the Execlists Context ID 269 * @ctx_obj: Logical Ring Context backing object. 270 * 271 * Do not confuse with ctx->id! Unfortunately we have a name overload 272 * here: the old context ID we pass to userspace as a handler so that 273 * they can refer to a context, and the new context ID we pass to the 274 * ELSP so that the GPU can inform us of the context status via 275 * interrupts. 276 * 277 * Return: 20-bits globally unique context ID. 278 */ 279 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj) 280 { 281 u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) + 282 LRC_PPHWSP_PN * PAGE_SIZE; 283 284 /* LRCA is required to be 4K aligned so the more significant 20 bits 285 * are globally unique */ 286 return lrca >> 12; 287 } 288 289 static bool disable_lite_restore_wa(struct intel_engine_cs *ring) 290 { 291 struct drm_device *dev = ring->dev; 292 293 return (IS_SKL_REVID(dev, 0, SKL_REVID_B0) || 294 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) && 295 (ring->id == VCS || ring->id == VCS2); 296 } 297 298 uint64_t intel_lr_context_descriptor(struct intel_context *ctx, 299 struct intel_engine_cs *ring) 300 { 301 struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; 302 uint64_t desc; 303 uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) + 304 LRC_PPHWSP_PN * PAGE_SIZE; 305 306 WARN_ON(lrca & 0xFFFFFFFF00000FFFULL); 307 308 desc = GEN8_CTX_VALID; 309 desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT; 310 if (IS_GEN8(ctx_obj->base.dev)) 311 desc |= GEN8_CTX_L3LLC_COHERENT; 312 desc |= GEN8_CTX_PRIVILEGE; 313 desc |= lrca; 314 desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT; 315 316 /* TODO: WaDisableLiteRestore when we start using semaphore 317 * signalling between Command Streamers */ 318 /* desc |= GEN8_CTX_FORCE_RESTORE; */ 319 320 /* WaEnableForceRestoreInCtxtDescForVCS:skl */ 321 /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ 322 if (disable_lite_restore_wa(ring)) 323 desc |= GEN8_CTX_FORCE_RESTORE; 324 325 return desc; 326 } 327 328 static void execlists_elsp_write(struct drm_i915_gem_request *rq0, 329 struct drm_i915_gem_request *rq1) 330 { 331 332 struct intel_engine_cs *ring = rq0->ring; 333 struct drm_device *dev = ring->dev; 334 struct 
drm_i915_private *dev_priv = dev->dev_private; 335 uint64_t desc[2]; 336 337 if (rq1) { 338 desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->ring); 339 rq1->elsp_submitted++; 340 } else { 341 desc[1] = 0; 342 } 343 344 desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring); 345 rq0->elsp_submitted++; 346 347 /* You must always write both descriptors in the order below. */ 348 lockmgr(&dev_priv->uncore.lock, LK_EXCLUSIVE); 349 intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); 350 I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); 351 I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); 352 353 I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0])); 354 /* The context is automatically loaded after the following */ 355 I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0])); 356 357 /* ELSP is a wo register, use another nearby reg for posting */ 358 POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); 359 intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); 360 lockmgr(&dev_priv->uncore.lock, LK_RELEASE); 361 } 362 363 static int execlists_update_context(struct drm_i915_gem_request *rq) 364 { 365 struct intel_engine_cs *ring = rq->ring; 366 struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt; 367 struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; 368 struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj; 369 struct vm_page *page; 370 uint32_t *reg_state; 371 372 BUG_ON(!ctx_obj); 373 WARN_ON(!i915_gem_obj_is_pinned(ctx_obj)); 374 WARN_ON(!i915_gem_obj_is_pinned(rb_obj)); 375 376 page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); 377 reg_state = kmap_atomic(page); 378 379 reg_state[CTX_RING_TAIL+1] = rq->tail; 380 reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj); 381 382 if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) { 383 /* True 32b PPGTT with dynamic page allocation: update PDP 384 * registers and point the unallocated PDPs to scratch page. 385 * PML4 is allocated during ppgtt init, so this is not needed 386 * in 48-bit mode. 387 */ 388 ASSIGN_CTX_PDP(ppgtt, reg_state, 3); 389 ASSIGN_CTX_PDP(ppgtt, reg_state, 2); 390 ASSIGN_CTX_PDP(ppgtt, reg_state, 1); 391 ASSIGN_CTX_PDP(ppgtt, reg_state, 0); 392 } 393 394 kunmap_atomic(reg_state); 395 396 return 0; 397 } 398 399 static void execlists_submit_requests(struct drm_i915_gem_request *rq0, 400 struct drm_i915_gem_request *rq1) 401 { 402 execlists_update_context(rq0); 403 404 if (rq1) 405 execlists_update_context(rq1); 406 407 execlists_elsp_write(rq0, rq1); 408 } 409 410 static void execlists_context_unqueue(struct intel_engine_cs *ring) 411 { 412 struct drm_i915_gem_request *req0 = NULL, *req1 = NULL; 413 struct drm_i915_gem_request *cursor = NULL, *tmp = NULL; 414 415 assert_spin_locked(&ring->execlist_lock); 416 417 /* 418 * If irqs are not active generate a warning as batches that finish 419 * without the irqs may get lost and a GPU Hang may occur. 
420 */ 421 WARN_ON(!intel_irqs_enabled(ring->dev->dev_private)); 422 423 if (list_empty(&ring->execlist_queue)) 424 return; 425 426 /* Try to read in pairs */ 427 list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue, 428 execlist_link) { 429 if (!req0) { 430 req0 = cursor; 431 } else if (req0->ctx == cursor->ctx) { 432 /* Same ctx: ignore first request, as second request 433 * will update tail past first request's workload */ 434 cursor->elsp_submitted = req0->elsp_submitted; 435 list_del(&req0->execlist_link); 436 list_add_tail(&req0->execlist_link, 437 &ring->execlist_retired_req_list); 438 req0 = cursor; 439 } else { 440 req1 = cursor; 441 break; 442 } 443 } 444 445 if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) { 446 /* 447 * WaIdleLiteRestore: make sure we never cause a lite 448 * restore with HEAD==TAIL 449 */ 450 if (req0->elsp_submitted) { 451 /* 452 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL 453 * as we resubmit the request. See gen8_emit_request() 454 * for where we prepare the padding after the end of the 455 * request. 456 */ 457 struct intel_ringbuffer *ringbuf; 458 459 ringbuf = req0->ctx->engine[ring->id].ringbuf; 460 req0->tail += 8; 461 req0->tail &= ringbuf->size - 1; 462 } 463 } 464 465 WARN_ON(req1 && req1->elsp_submitted); 466 467 execlists_submit_requests(req0, req1); 468 } 469 470 static bool execlists_check_remove_request(struct intel_engine_cs *ring, 471 u32 request_id) 472 { 473 struct drm_i915_gem_request *head_req; 474 475 assert_spin_locked(&ring->execlist_lock); 476 477 head_req = list_first_entry_or_null(&ring->execlist_queue, 478 struct drm_i915_gem_request, 479 execlist_link); 480 481 if (head_req != NULL) { 482 struct drm_i915_gem_object *ctx_obj = 483 head_req->ctx->engine[ring->id].state; 484 if (intel_execlists_ctx_id(ctx_obj) == request_id) { 485 WARN(head_req->elsp_submitted == 0, 486 "Never submitted head request\n"); 487 488 if (--head_req->elsp_submitted <= 0) { 489 list_del(&head_req->execlist_link); 490 list_add_tail(&head_req->execlist_link, 491 &ring->execlist_retired_req_list); 492 return true; 493 } 494 } 495 } 496 497 return false; 498 } 499 500 /** 501 * intel_lrc_irq_handler() - handle Context Switch interrupts 502 * @ring: Engine Command Streamer to handle. 503 * 504 * Check the unread Context Status Buffers and manage the submission of new 505 * contexts to the ELSP accordingly. 
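 *
 * As a rough illustration of the pointer walk performed below (a sketch,
 * not an exact excerpt of the code): the driver keeps the last-consumed
 * index in ring->next_context_status_buffer and reads up to the hardware
 * write pointer, unwrapping it when it has lapped the circular buffer of
 * GEN8_CSB_ENTRIES entries:
 *
 *	read = ring->next_context_status_buffer;
 *	write = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
 *	if (read > write)
 *		write += GEN8_CSB_ENTRIES;
 *	while (read < write)
 *		process(CSB[++read % GEN8_CSB_ENTRIES]);
 *
 * where process() merely stands in for the status-event handling done in
 * the loop body below.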
506 */ 507 void intel_lrc_irq_handler(struct intel_engine_cs *ring) 508 { 509 struct drm_i915_private *dev_priv = ring->dev->dev_private; 510 u32 status_pointer; 511 u8 read_pointer; 512 u8 write_pointer; 513 u32 status = 0; 514 u32 status_id; 515 u32 submit_contexts = 0; 516 517 status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring)); 518 519 read_pointer = ring->next_context_status_buffer; 520 write_pointer = status_pointer & GEN8_CSB_PTR_MASK; 521 if (read_pointer > write_pointer) 522 write_pointer += GEN8_CSB_ENTRIES; 523 524 lockmgr(&ring->execlist_lock, LK_EXCLUSIVE); 525 526 while (read_pointer < write_pointer) { 527 read_pointer++; 528 status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer % GEN8_CSB_ENTRIES)); 529 status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer % GEN8_CSB_ENTRIES)); 530 531 if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) 532 continue; 533 534 if (status & GEN8_CTX_STATUS_PREEMPTED) { 535 if (status & GEN8_CTX_STATUS_LITE_RESTORE) { 536 if (execlists_check_remove_request(ring, status_id)) 537 WARN(1, "Lite Restored request removed from queue\n"); 538 } else 539 WARN(1, "Preemption without Lite Restore\n"); 540 } 541 542 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) || 543 (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) { 544 if (execlists_check_remove_request(ring, status_id)) 545 submit_contexts++; 546 } 547 } 548 549 if (disable_lite_restore_wa(ring)) { 550 /* Prevent a ctx to preempt itself */ 551 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) && 552 (submit_contexts != 0)) 553 execlists_context_unqueue(ring); 554 } else if (submit_contexts != 0) { 555 execlists_context_unqueue(ring); 556 } 557 558 lockmgr(&ring->execlist_lock, LK_RELEASE); 559 560 WARN(submit_contexts > 2, "More than two context complete events?\n"); 561 ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES; 562 563 I915_WRITE(RING_CONTEXT_STATUS_PTR(ring), 564 _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8, 565 ((u32)ring->next_context_status_buffer & 566 GEN8_CSB_PTR_MASK) << 8)); 567 } 568 569 static int execlists_context_queue(struct drm_i915_gem_request *request) 570 { 571 struct intel_engine_cs *ring = request->ring; 572 struct drm_i915_gem_request *cursor; 573 int num_elements = 0; 574 575 if (request->ctx != ring->default_context) 576 intel_lr_context_pin(request); 577 578 i915_gem_request_reference(request); 579 580 spin_lock_irq(&ring->execlist_lock); 581 582 list_for_each_entry(cursor, &ring->execlist_queue, execlist_link) 583 if (++num_elements > 2) 584 break; 585 586 if (num_elements > 2) { 587 struct drm_i915_gem_request *tail_req; 588 589 tail_req = list_last_entry(&ring->execlist_queue, 590 struct drm_i915_gem_request, 591 execlist_link); 592 593 if (request->ctx == tail_req->ctx) { 594 WARN(tail_req->elsp_submitted != 0, 595 "More than 2 already-submitted reqs queued\n"); 596 list_del(&tail_req->execlist_link); 597 list_add_tail(&tail_req->execlist_link, 598 &ring->execlist_retired_req_list); 599 } 600 } 601 602 list_add_tail(&request->execlist_link, &ring->execlist_queue); 603 if (num_elements == 0) 604 execlists_context_unqueue(ring); 605 606 spin_unlock_irq(&ring->execlist_lock); 607 608 return 0; 609 } 610 611 static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req) 612 { 613 struct intel_engine_cs *ring = req->ring; 614 uint32_t flush_domains; 615 int ret; 616 617 flush_domains = 0; 618 if (ring->gpu_caches_dirty) 619 flush_domains = I915_GEM_GPU_DOMAINS; 620 621 ret = ring->emit_flush(req, I915_GEM_GPU_DOMAINS, flush_domains); 622 
if (ret) 623 return ret; 624 625 ring->gpu_caches_dirty = false; 626 return 0; 627 } 628 629 static int execlists_move_to_gpu(struct drm_i915_gem_request *req, 630 struct list_head *vmas) 631 { 632 const unsigned other_rings = ~intel_ring_flag(req->ring); 633 struct i915_vma *vma; 634 uint32_t flush_domains = 0; 635 bool flush_chipset = false; 636 int ret; 637 638 list_for_each_entry(vma, vmas, exec_list) { 639 struct drm_i915_gem_object *obj = vma->obj; 640 641 if (obj->active & other_rings) { 642 ret = i915_gem_object_sync(obj, req->ring, &req); 643 if (ret) 644 return ret; 645 } 646 647 if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) 648 flush_chipset |= i915_gem_clflush_object(obj, false); 649 650 flush_domains |= obj->base.write_domain; 651 } 652 653 if (flush_domains & I915_GEM_DOMAIN_GTT) 654 wmb(); 655 656 /* Unconditionally invalidate gpu caches and ensure that we do flush 657 * any residual writes from the previous batch. 658 */ 659 return logical_ring_invalidate_all_caches(req); 660 } 661 662 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request) 663 { 664 int ret; 665 666 request->ringbuf = request->ctx->engine[request->ring->id].ringbuf; 667 668 if (request->ctx != request->ring->default_context) { 669 ret = intel_lr_context_pin(request); 670 if (ret) 671 return ret; 672 } 673 674 return 0; 675 } 676 677 static int logical_ring_wait_for_space(struct drm_i915_gem_request *req, 678 int bytes) 679 { 680 struct intel_ringbuffer *ringbuf = req->ringbuf; 681 struct intel_engine_cs *ring = req->ring; 682 struct drm_i915_gem_request *target; 683 unsigned space; 684 int ret; 685 686 if (intel_ring_space(ringbuf) >= bytes) 687 return 0; 688 689 /* The whole point of reserving space is to not wait! */ 690 WARN_ON(ringbuf->reserved_in_use); 691 692 list_for_each_entry(target, &ring->request_list, list) { 693 /* 694 * The request queue is per-engine, so can contain requests 695 * from multiple ringbuffers. Here, we must ignore any that 696 * aren't from the ringbuffer we're considering. 697 */ 698 if (target->ringbuf != ringbuf) 699 continue; 700 701 /* Would completion of this request free enough space? */ 702 space = __intel_ring_space(target->postfix, ringbuf->tail, 703 ringbuf->size); 704 if (space >= bytes) 705 break; 706 } 707 708 if (WARN_ON(&target->list == &ring->request_list)) 709 return -ENOSPC; 710 711 ret = i915_wait_request(target); 712 if (ret) 713 return ret; 714 715 ringbuf->space = space; 716 return 0; 717 } 718 719 /* 720 * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload 721 * @request: Request to advance the logical ringbuffer of. 722 * 723 * The tail is updated in our logical ringbuffer struct, not in the actual context. What 724 * really happens during submission is that the context and current tail will be placed 725 * on a queue waiting for the ELSP to be ready to accept a new context submission. At that 726 * point, the tail *inside* the context is updated and the ELSP written to. 
727 */ 728 static void 729 intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request) 730 { 731 struct intel_engine_cs *ring = request->ring; 732 struct drm_i915_private *dev_priv = request->i915; 733 734 intel_logical_ring_advance(request->ringbuf); 735 736 request->tail = request->ringbuf->tail; 737 738 if (intel_ring_stopped(ring)) 739 return; 740 741 if (dev_priv->guc.execbuf_client) 742 i915_guc_submit(dev_priv->guc.execbuf_client, request); 743 else 744 execlists_context_queue(request); 745 } 746 747 static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf) 748 { 749 uint32_t __iomem *virt; 750 int rem = ringbuf->size - ringbuf->tail; 751 752 virt = (uint32_t *)(ringbuf->virtual_start + ringbuf->tail); 753 rem /= 4; 754 while (rem--) 755 iowrite32(MI_NOOP, virt++); 756 757 ringbuf->tail = 0; 758 intel_ring_update_space(ringbuf); 759 } 760 761 static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes) 762 { 763 struct intel_ringbuffer *ringbuf = req->ringbuf; 764 int remain_usable = ringbuf->effective_size - ringbuf->tail; 765 int remain_actual = ringbuf->size - ringbuf->tail; 766 int ret, total_bytes, wait_bytes = 0; 767 bool need_wrap = false; 768 769 if (ringbuf->reserved_in_use) 770 total_bytes = bytes; 771 else 772 total_bytes = bytes + ringbuf->reserved_size; 773 774 if (unlikely(bytes > remain_usable)) { 775 /* 776 * Not enough space for the basic request. So need to flush 777 * out the remainder and then wait for base + reserved. 778 */ 779 wait_bytes = remain_actual + total_bytes; 780 need_wrap = true; 781 } else { 782 if (unlikely(total_bytes > remain_usable)) { 783 /* 784 * The base request will fit but the reserved space 785 * falls off the end. So only need to to wait for the 786 * reserved size after flushing out the remainder. 787 */ 788 wait_bytes = remain_actual + ringbuf->reserved_size; 789 need_wrap = true; 790 } else if (total_bytes > ringbuf->space) { 791 /* No wrapping required, just waiting. */ 792 wait_bytes = total_bytes; 793 } 794 } 795 796 if (wait_bytes) { 797 ret = logical_ring_wait_for_space(req, wait_bytes); 798 if (unlikely(ret)) 799 return ret; 800 801 if (need_wrap) 802 __wrap_ring_buffer(ringbuf); 803 } 804 805 return 0; 806 } 807 808 /** 809 * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands 810 * 811 * @req: The request to start some new work for 812 * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. 813 * 814 * The ringbuffer might not be ready to accept the commands right away (maybe it needs to 815 * be wrapped, or wait a bit for the tail to be updated). This function takes care of that 816 * and also preallocates a request (every workload submission is still mediated through 817 * requests, same as it did with legacy ringbuffer submission). 818 * 819 * Return: non-zero if the ringbuffer is not ready to be written to. 
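 *
 * A typical caller in this file follows the begin/emit/advance pattern
 * shown here (an illustrative sketch lifted from the INSTPM update in
 * intel_execlists_submission(), not additional required steps):
 *
 *	ret = intel_logical_ring_begin(req, 4);
 *	if (ret)
 *		return ret;
 *
 *	intel_logical_ring_emit(ringbuf, MI_NOOP);
 *	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
 *	intel_logical_ring_emit_reg(ringbuf, INSTPM);
 *	intel_logical_ring_emit(ringbuf, instp_mask << 16 | instp_mode);
 *	intel_logical_ring_advance(ringbuf);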
 */
int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
	struct drm_i915_private *dev_priv;
	int ret;

	WARN_ON(req == NULL);
	dev_priv = req->ring->dev->dev_private;

	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
				   dev_priv->mm.interruptible);
	if (ret)
		return ret;

	ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
	if (ret)
		return ret;

	req->ringbuf->space -= num_dwords * sizeof(uint32_t);
	return 0;
}

int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
{
	/*
	 * The first call merely notes the reserve request and is common for
	 * all back ends. The subsequent localised _begin() call actually
	 * ensures that the reservation is available. Without the begin, if
	 * the request creator immediately submitted the request without
	 * adding any commands to it then there might not actually be
	 * sufficient room for the submission commands.
	 */
	intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST);

	return intel_logical_ring_begin(request, 0);
}

/**
 * intel_execlists_submission() - submit a batchbuffer for execution, Execlists style
 * @params: execbuffer call parameters (engine command streamer, context,
 *          batch object, batch start offset and translated dispatch flags).
 * @args: execbuffer call arguments.
 * @vmas: list of vmas.
 *
 * This is the evil twin version of i915_gem_ringbuffer_submission. It abstracts
 * away the submission details of the execbuffer ioctl call.
 *
 * Return: non-zero if the submission fails.
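 *
 * In outline (a summary of the body below, not extra behaviour): the vmas
 * are flushed/synchronized via execlists_move_to_gpu(), INSTPM is
 * re-programmed on the render ring if the relative-constants mode changed,
 * the batch is dispatched with ring->emit_bb_start() and, finally, the vmas
 * are moved to the active list and the submission commands are retired.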
873 */ 874 int intel_execlists_submission(struct i915_execbuffer_params *params, 875 struct drm_i915_gem_execbuffer2 *args, 876 struct list_head *vmas) 877 { 878 struct drm_device *dev = params->dev; 879 struct intel_engine_cs *ring = params->ring; 880 struct drm_i915_private *dev_priv = dev->dev_private; 881 struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf; 882 u64 exec_start; 883 int instp_mode; 884 u32 instp_mask; 885 int ret; 886 887 instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK; 888 instp_mask = I915_EXEC_CONSTANTS_MASK; 889 switch (instp_mode) { 890 case I915_EXEC_CONSTANTS_REL_GENERAL: 891 case I915_EXEC_CONSTANTS_ABSOLUTE: 892 case I915_EXEC_CONSTANTS_REL_SURFACE: 893 if (instp_mode != 0 && ring != &dev_priv->ring[RCS]) { 894 DRM_DEBUG("non-0 rel constants mode on non-RCS\n"); 895 return -EINVAL; 896 } 897 898 if (instp_mode != dev_priv->relative_constants_mode) { 899 if (instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) { 900 DRM_DEBUG("rel surface constants mode invalid on gen5+\n"); 901 return -EINVAL; 902 } 903 904 /* The HW changed the meaning on this bit on gen6 */ 905 instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE; 906 } 907 break; 908 default: 909 DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode); 910 return -EINVAL; 911 } 912 913 if (args->flags & I915_EXEC_GEN7_SOL_RESET) { 914 DRM_DEBUG("sol reset is gen7 only\n"); 915 return -EINVAL; 916 } 917 918 ret = execlists_move_to_gpu(params->request, vmas); 919 if (ret) 920 return ret; 921 922 if (ring == &dev_priv->ring[RCS] && 923 instp_mode != dev_priv->relative_constants_mode) { 924 ret = intel_logical_ring_begin(params->request, 4); 925 if (ret) 926 return ret; 927 928 intel_logical_ring_emit(ringbuf, MI_NOOP); 929 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1)); 930 intel_logical_ring_emit_reg(ringbuf, INSTPM); 931 intel_logical_ring_emit(ringbuf, instp_mask << 16 | instp_mode); 932 intel_logical_ring_advance(ringbuf); 933 934 dev_priv->relative_constants_mode = instp_mode; 935 } 936 937 exec_start = params->batch_obj_vm_offset + 938 args->batch_start_offset; 939 940 ret = ring->emit_bb_start(params->request, exec_start, params->dispatch_flags); 941 if (ret) 942 return ret; 943 944 trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); 945 946 i915_gem_execbuffer_move_to_active(vmas, params->request); 947 i915_gem_execbuffer_retire_commands(params); 948 949 return 0; 950 } 951 952 void intel_execlists_retire_requests(struct intel_engine_cs *ring) 953 { 954 struct drm_i915_gem_request *req, *tmp; 955 struct list_head retired_list; 956 957 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 958 if (list_empty(&ring->execlist_retired_req_list)) 959 return; 960 961 INIT_LIST_HEAD(&retired_list); 962 spin_lock_irq(&ring->execlist_lock); 963 list_replace_init(&ring->execlist_retired_req_list, &retired_list); 964 spin_unlock_irq(&ring->execlist_lock); 965 966 list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) { 967 struct intel_context *ctx = req->ctx; 968 struct drm_i915_gem_object *ctx_obj = 969 ctx->engine[ring->id].state; 970 971 if (ctx_obj && (ctx != ring->default_context)) 972 intel_lr_context_unpin(req); 973 list_del(&req->execlist_link); 974 i915_gem_request_unreference(req); 975 } 976 } 977 978 void intel_logical_ring_stop(struct intel_engine_cs *ring) 979 { 980 struct drm_i915_private *dev_priv = ring->dev->dev_private; 981 int ret; 982 983 if (!intel_ring_initialized(ring)) 984 return; 985 986 ret = intel_ring_idle(ring); 987 if (ret && 
!i915_reset_in_progress(&to_i915(ring->dev)->gpu_error)) 988 DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n", 989 ring->name, ret); 990 991 /* TODO: Is this correct with Execlists enabled? */ 992 I915_WRITE_MODE(ring, _MASKED_BIT_ENABLE(STOP_RING)); 993 if (wait_for_atomic((I915_READ_MODE(ring) & MODE_IDLE) != 0, 1000)) { 994 DRM_ERROR("%s :timed out trying to stop ring\n", ring->name); 995 return; 996 } 997 I915_WRITE_MODE(ring, _MASKED_BIT_DISABLE(STOP_RING)); 998 } 999 1000 int logical_ring_flush_all_caches(struct drm_i915_gem_request *req) 1001 { 1002 struct intel_engine_cs *ring = req->ring; 1003 int ret; 1004 1005 if (!ring->gpu_caches_dirty) 1006 return 0; 1007 1008 ret = ring->emit_flush(req, 0, I915_GEM_GPU_DOMAINS); 1009 if (ret) 1010 return ret; 1011 1012 ring->gpu_caches_dirty = false; 1013 return 0; 1014 } 1015 1016 static int intel_lr_context_do_pin(struct intel_engine_cs *ring, 1017 struct drm_i915_gem_object *ctx_obj, 1018 struct intel_ringbuffer *ringbuf) 1019 { 1020 struct drm_device *dev = ring->dev; 1021 struct drm_i915_private *dev_priv = dev->dev_private; 1022 int ret = 0; 1023 1024 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 1025 ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 1026 PIN_OFFSET_BIAS | GUC_WOPCM_TOP); 1027 if (ret) 1028 return ret; 1029 1030 ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf); 1031 if (ret) 1032 goto unpin_ctx_obj; 1033 1034 ctx_obj->dirty = true; 1035 1036 /* Invalidate GuC TLB. */ 1037 if (i915.enable_guc_submission) 1038 I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); 1039 1040 return ret; 1041 1042 unpin_ctx_obj: 1043 i915_gem_object_ggtt_unpin(ctx_obj); 1044 1045 return ret; 1046 } 1047 1048 static int intel_lr_context_pin(struct drm_i915_gem_request *rq) 1049 { 1050 int ret = 0; 1051 struct intel_engine_cs *ring = rq->ring; 1052 struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; 1053 struct intel_ringbuffer *ringbuf = rq->ringbuf; 1054 1055 if (rq->ctx->engine[ring->id].pin_count++ == 0) { 1056 ret = intel_lr_context_do_pin(ring, ctx_obj, ringbuf); 1057 if (ret) 1058 goto reset_pin_count; 1059 } 1060 return ret; 1061 1062 reset_pin_count: 1063 rq->ctx->engine[ring->id].pin_count = 0; 1064 return ret; 1065 } 1066 1067 void intel_lr_context_unpin(struct drm_i915_gem_request *rq) 1068 { 1069 struct intel_engine_cs *ring = rq->ring; 1070 struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; 1071 struct intel_ringbuffer *ringbuf = rq->ringbuf; 1072 1073 if (ctx_obj) { 1074 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 1075 if (--rq->ctx->engine[ring->id].pin_count == 0) { 1076 intel_unpin_ringbuffer_obj(ringbuf); 1077 i915_gem_object_ggtt_unpin(ctx_obj); 1078 } 1079 } 1080 } 1081 1082 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req) 1083 { 1084 int ret, i; 1085 struct intel_engine_cs *ring = req->ring; 1086 struct intel_ringbuffer *ringbuf = req->ringbuf; 1087 struct drm_device *dev = ring->dev; 1088 struct drm_i915_private *dev_priv = dev->dev_private; 1089 struct i915_workarounds *w = &dev_priv->workarounds; 1090 1091 if (WARN_ON_ONCE(w->count == 0)) 1092 return 0; 1093 1094 ring->gpu_caches_dirty = true; 1095 ret = logical_ring_flush_all_caches(req); 1096 if (ret) 1097 return ret; 1098 1099 ret = intel_logical_ring_begin(req, w->count * 2 + 2); 1100 if (ret) 1101 return ret; 1102 1103 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); 1104 for (i = 0; i < w->count; i++) { 1105 
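		/*
		 * Each saved workaround is replayed as a register/value pair
		 * after the single MI_LOAD_REGISTER_IMM(w->count) header
		 * emitted above.
		 */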
		intel_logical_ring_emit_reg(ringbuf, w->reg[i].addr);
		intel_logical_ring_emit(ringbuf, w->reg[i].value);
	}
	intel_logical_ring_emit(ringbuf, MI_NOOP);

	intel_logical_ring_advance(ringbuf);

	ring->gpu_caches_dirty = true;
	ret = logical_ring_flush_all_caches(req);
	if (ret)
		return ret;

	return 0;
}

#define wa_ctx_emit(batch, index, cmd)					\
	do {								\
		int __index = (index)++;				\
		if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
			return -ENOSPC;					\
		}							\
		batch[__index] = (cmd);					\
	} while (0)

#define wa_ctx_emit_reg(batch, index, reg) \
	wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg))

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
 * but there is a slight complication: this is applied in a WA batch where the
 * values are only initialized once, so we cannot read the register value at the
 * beginning and reuse it later; hence we save its value to memory, upload a
 * constant value with bit21 set and then restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We could of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
						uint32_t *const batch,
						uint32_t index)
{
	uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);

	/*
	 * WaDisableLSQCROPERFforOCL:skl
	 * This WA is implemented in skl_init_clock_gating() but since
	 * this batch updates GEN8_L3SQCREG4 with default value we need to
	 * set this bit here to retain the WA during flush.
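	 *
	 * The sequence emitted below is, in short: save GEN8_L3SQCREG4 to a
	 * scratch slot (MI_STORE_REGISTER_MEM), load the modified constant
	 * (MI_LOAD_REGISTER_IMM), perform the PIPE_CONTROL flush, then
	 * restore the saved value (MI_LOAD_REGISTER_MEM).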
1159 */ 1160 if (IS_SKL_REVID(ring->dev, 0, SKL_REVID_E0)) 1161 l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS; 1162 1163 wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | 1164 MI_SRM_LRM_GLOBAL_GTT)); 1165 wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); 1166 wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); 1167 wa_ctx_emit(batch, index, 0); 1168 1169 wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); 1170 wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); 1171 wa_ctx_emit(batch, index, l3sqc4_flush); 1172 1173 wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); 1174 wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL | 1175 PIPE_CONTROL_DC_FLUSH_ENABLE)); 1176 wa_ctx_emit(batch, index, 0); 1177 wa_ctx_emit(batch, index, 0); 1178 wa_ctx_emit(batch, index, 0); 1179 wa_ctx_emit(batch, index, 0); 1180 1181 wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | 1182 MI_SRM_LRM_GLOBAL_GTT)); 1183 wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); 1184 wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); 1185 wa_ctx_emit(batch, index, 0); 1186 1187 return index; 1188 } 1189 1190 static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx, 1191 uint32_t offset, 1192 uint32_t start_alignment) 1193 { 1194 return wa_ctx->offset = ALIGN(offset, start_alignment); 1195 } 1196 1197 static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, 1198 uint32_t offset, 1199 uint32_t size_alignment) 1200 { 1201 wa_ctx->size = offset - wa_ctx->offset; 1202 1203 WARN(wa_ctx->size % size_alignment, 1204 "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n", 1205 wa_ctx->size, size_alignment); 1206 return 0; 1207 } 1208 1209 /** 1210 * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA 1211 * 1212 * @ring: only applicable for RCS 1213 * @wa_ctx: structure representing wa_ctx 1214 * offset: specifies start of the batch, should be cache-aligned. This is updated 1215 * with the offset value received as input. 1216 * size: size of the batch in DWORDS but HW expects in terms of cachelines 1217 * @batch: page in which WA are loaded 1218 * @offset: This field specifies the start of the batch, it should be 1219 * cache-aligned otherwise it is adjusted accordingly. 1220 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 1221 * initialized at the beginning and shared across all contexts but this field 1222 * helps us to have multiple batches at different offsets and select them based 1223 * on a criteria. At the moment this batch always start at the beginning of the page 1224 * and at this point we don't have multiple wa_ctx batch buffers. 1225 * 1226 * The number of WA applied are not known at the beginning; we use this field 1227 * to return the no of DWORDS written. 1228 * 1229 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 1230 * so it adds NOOPs as padding to make it cacheline aligned. 1231 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1232 * makes a complete batch buffer. 1233 * 1234 * Return: non-zero if we exceed the PAGE_SIZE limit. 
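 *
 * As a worked example of the padding (assuming the usual 64-byte cacheline,
 * i.e. CACHELINE_DWORDS == 16): if the emitted workarounds end at index 22,
 * ten MI_NOOPs are appended so that the batch is 32 DWORDs, i.e. exactly two
 * cachelines, which is the unit in which the hardware consumes the length.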
1235 */ 1236 1237 static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring, 1238 struct i915_wa_ctx_bb *wa_ctx, 1239 uint32_t *const batch, 1240 uint32_t *offset) 1241 { 1242 uint32_t scratch_addr; 1243 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1244 1245 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1246 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); 1247 1248 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 1249 if (IS_BROADWELL(ring->dev)) { 1250 int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index); 1251 if (rc < 0) 1252 return rc; 1253 index = rc; 1254 } 1255 1256 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 1257 /* Actual scratch location is at 128 bytes offset */ 1258 scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; 1259 1260 wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); 1261 wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | 1262 PIPE_CONTROL_GLOBAL_GTT_IVB | 1263 PIPE_CONTROL_CS_STALL | 1264 PIPE_CONTROL_QW_WRITE)); 1265 wa_ctx_emit(batch, index, scratch_addr); 1266 wa_ctx_emit(batch, index, 0); 1267 wa_ctx_emit(batch, index, 0); 1268 wa_ctx_emit(batch, index, 0); 1269 1270 /* Pad to end of cacheline */ 1271 while (index % CACHELINE_DWORDS) 1272 wa_ctx_emit(batch, index, MI_NOOP); 1273 1274 /* 1275 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 1276 * execution depends on the length specified in terms of cache lines 1277 * in the register CTX_RCS_INDIRECT_CTX 1278 */ 1279 1280 return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); 1281 } 1282 1283 /** 1284 * gen8_init_perctx_bb() - initialize per ctx batch with WA 1285 * 1286 * @ring: only applicable for RCS 1287 * @wa_ctx: structure representing wa_ctx 1288 * offset: specifies start of the batch, should be cache-aligned. 1289 * size: size of the batch in DWORDS but HW expects in terms of cachelines 1290 * @batch: page in which WA are loaded 1291 * @offset: This field specifies the start of this batch. 1292 * This batch is started immediately after indirect_ctx batch. Since we ensure 1293 * that indirect_ctx ends on a cacheline this batch is aligned automatically. 1294 * 1295 * The number of DWORDS written are returned using this field. 1296 * 1297 * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding 1298 * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. 
1299 */ 1300 static int gen8_init_perctx_bb(struct intel_engine_cs *ring, 1301 struct i915_wa_ctx_bb *wa_ctx, 1302 uint32_t *const batch, 1303 uint32_t *offset) 1304 { 1305 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1306 1307 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1308 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); 1309 1310 wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); 1311 1312 return wa_ctx_end(wa_ctx, *offset = index, 1); 1313 } 1314 1315 static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring, 1316 struct i915_wa_ctx_bb *wa_ctx, 1317 uint32_t *const batch, 1318 uint32_t *offset) 1319 { 1320 int ret; 1321 struct drm_device *dev = ring->dev; 1322 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1323 1324 /* WaDisableCtxRestoreArbitration:skl,bxt */ 1325 if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) || 1326 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) 1327 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); 1328 1329 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */ 1330 ret = gen8_emit_flush_coherentl3_wa(ring, batch, index); 1331 if (ret < 0) 1332 return ret; 1333 index = ret; 1334 1335 /* Pad to end of cacheline */ 1336 while (index % CACHELINE_DWORDS) 1337 wa_ctx_emit(batch, index, MI_NOOP); 1338 1339 return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); 1340 } 1341 1342 static int gen9_init_perctx_bb(struct intel_engine_cs *ring, 1343 struct i915_wa_ctx_bb *wa_ctx, 1344 uint32_t *const batch, 1345 uint32_t *offset) 1346 { 1347 struct drm_device *dev = ring->dev; 1348 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1349 1350 /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */ 1351 if (IS_SKL_REVID(dev, 0, SKL_REVID_B0) || 1352 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) { 1353 wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); 1354 wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0); 1355 wa_ctx_emit(batch, index, 1356 _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING)); 1357 wa_ctx_emit(batch, index, MI_NOOP); 1358 } 1359 1360 /* WaDisableCtxRestoreArbitration:skl,bxt */ 1361 if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) || 1362 IS_BXT_REVID(dev, 0, BXT_REVID_A1)) 1363 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); 1364 1365 wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); 1366 1367 return wa_ctx_end(wa_ctx, *offset = index, 1); 1368 } 1369 1370 static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size) 1371 { 1372 int ret; 1373 1374 ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size)); 1375 if (!ring->wa_ctx.obj) { 1376 DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n"); 1377 return -ENOMEM; 1378 } 1379 1380 ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0); 1381 if (ret) { 1382 DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", 1383 ret); 1384 drm_gem_object_unreference(&ring->wa_ctx.obj->base); 1385 return ret; 1386 } 1387 1388 return 0; 1389 } 1390 1391 static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring) 1392 { 1393 if (ring->wa_ctx.obj) { 1394 i915_gem_object_ggtt_unpin(ring->wa_ctx.obj); 1395 drm_gem_object_unreference(&ring->wa_ctx.obj->base); 1396 ring->wa_ctx.obj = NULL; 1397 } 1398 } 1399 1400 static int intel_init_workaround_bb(struct intel_engine_cs *ring) 1401 { 1402 int ret; 1403 uint32_t *batch; 1404 uint32_t offset; 1405 struct vm_page *page; 1406 struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; 1407 1408 WARN_ON(ring->id != RCS); 1409 1410 /* update this when WA for higher Gen are 
added */ 1411 if (INTEL_INFO(ring->dev)->gen > 9) { 1412 DRM_ERROR("WA batch buffer is not initialized for Gen%d\n", 1413 INTEL_INFO(ring->dev)->gen); 1414 return 0; 1415 } 1416 1417 /* some WA perform writes to scratch page, ensure it is valid */ 1418 if (ring->scratch.obj == NULL) { 1419 DRM_ERROR("scratch page not allocated for %s\n", ring->name); 1420 return -EINVAL; 1421 } 1422 1423 ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE); 1424 if (ret) { 1425 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 1426 return ret; 1427 } 1428 1429 page = i915_gem_object_get_dirty_page(wa_ctx->obj, 0); 1430 batch = kmap_atomic(page); 1431 offset = 0; 1432 1433 if (INTEL_INFO(ring->dev)->gen == 8) { 1434 ret = gen8_init_indirectctx_bb(ring, 1435 &wa_ctx->indirect_ctx, 1436 batch, 1437 &offset); 1438 if (ret) 1439 goto out; 1440 1441 ret = gen8_init_perctx_bb(ring, 1442 &wa_ctx->per_ctx, 1443 batch, 1444 &offset); 1445 if (ret) 1446 goto out; 1447 } else if (INTEL_INFO(ring->dev)->gen == 9) { 1448 ret = gen9_init_indirectctx_bb(ring, 1449 &wa_ctx->indirect_ctx, 1450 batch, 1451 &offset); 1452 if (ret) 1453 goto out; 1454 1455 ret = gen9_init_perctx_bb(ring, 1456 &wa_ctx->per_ctx, 1457 batch, 1458 &offset); 1459 if (ret) 1460 goto out; 1461 } 1462 1463 out: 1464 kunmap_atomic(batch); 1465 if (ret) 1466 lrc_destroy_wa_ctx_obj(ring); 1467 1468 return ret; 1469 } 1470 1471 static int gen8_init_common_ring(struct intel_engine_cs *ring) 1472 { 1473 struct drm_device *dev = ring->dev; 1474 struct drm_i915_private *dev_priv = dev->dev_private; 1475 u8 next_context_status_buffer_hw; 1476 1477 lrc_setup_hardware_status_page(ring, 1478 ring->default_context->engine[ring->id].state); 1479 1480 I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); 1481 I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff); 1482 1483 I915_WRITE(RING_MODE_GEN7(ring), 1484 _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) | 1485 _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)); 1486 POSTING_READ(RING_MODE_GEN7(ring)); 1487 1488 /* 1489 * Instead of resetting the Context Status Buffer (CSB) read pointer to 1490 * zero, we need to read the write pointer from hardware and use its 1491 * value because "this register is power context save restored". 1492 * Effectively, these states have been observed: 1493 * 1494 * | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) | 1495 * BDW | CSB regs not reset | CSB regs reset | 1496 * CHT | CSB regs not reset | CSB regs not reset | 1497 */ 1498 next_context_status_buffer_hw = (I915_READ(RING_CONTEXT_STATUS_PTR(ring)) 1499 & GEN8_CSB_PTR_MASK); 1500 1501 /* 1502 * When the CSB registers are reset (also after power-up / gpu reset), 1503 * CSB write pointer is set to all 1's, which is not valid, use '5' in 1504 * this special case, so the first element read is CSB[0]. 
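	 *
	 * (With GEN8_CSB_ENTRIES == 6 that special value is 5: the interrupt
	 * handler pre-increments the read pointer before indexing, so the
	 * first entry it consumes is (5 + 1) % 6 == CSB[0].)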
1505 */ 1506 if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK) 1507 next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1); 1508 1509 ring->next_context_status_buffer = next_context_status_buffer_hw; 1510 DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name); 1511 1512 memset(&ring->hangcheck, 0, sizeof(ring->hangcheck)); 1513 1514 return 0; 1515 } 1516 1517 static int gen8_init_render_ring(struct intel_engine_cs *ring) 1518 { 1519 struct drm_device *dev = ring->dev; 1520 struct drm_i915_private *dev_priv = dev->dev_private; 1521 int ret; 1522 1523 ret = gen8_init_common_ring(ring); 1524 if (ret) 1525 return ret; 1526 1527 /* We need to disable the AsyncFlip performance optimisations in order 1528 * to use MI_WAIT_FOR_EVENT within the CS. It should already be 1529 * programmed to '1' on all products. 1530 * 1531 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv 1532 */ 1533 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE)); 1534 1535 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); 1536 1537 return init_workarounds_ring(ring); 1538 } 1539 1540 static int gen9_init_render_ring(struct intel_engine_cs *ring) 1541 { 1542 int ret; 1543 1544 ret = gen8_init_common_ring(ring); 1545 if (ret) 1546 return ret; 1547 1548 return init_workarounds_ring(ring); 1549 } 1550 1551 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) 1552 { 1553 struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt; 1554 struct intel_engine_cs *ring = req->ring; 1555 struct intel_ringbuffer *ringbuf = req->ringbuf; 1556 const int num_lri_cmds = GEN8_LEGACY_PDPES * 2; 1557 int i, ret; 1558 1559 ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2); 1560 if (ret) 1561 return ret; 1562 1563 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds)); 1564 for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) { 1565 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 1566 1567 intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_UDW(ring, i)); 1568 intel_logical_ring_emit(ringbuf, upper_32_bits(pd_daddr)); 1569 intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_LDW(ring, i)); 1570 intel_logical_ring_emit(ringbuf, lower_32_bits(pd_daddr)); 1571 } 1572 1573 intel_logical_ring_emit(ringbuf, MI_NOOP); 1574 intel_logical_ring_advance(ringbuf); 1575 1576 return 0; 1577 } 1578 1579 static int gen8_emit_bb_start(struct drm_i915_gem_request *req, 1580 u64 offset, unsigned dispatch_flags) 1581 { 1582 struct intel_ringbuffer *ringbuf = req->ringbuf; 1583 bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE); 1584 int ret; 1585 1586 /* Don't rely in hw updating PDPs, specially in lite-restore. 1587 * Ideally, we should set Force PD Restore in ctx descriptor, 1588 * but we can't. Force Restore would be a second option, but 1589 * it is unsafe in case of lite-restore (because the ctx is 1590 * not idle). PML4 is allocated during ppgtt init so this is 1591 * not needed in 48-bit.*/ 1592 if (req->ctx->ppgtt && 1593 (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) { 1594 if (!USES_FULL_48BIT_PPGTT(req->i915) && 1595 !intel_vgpu_active(req->i915->dev)) { 1596 ret = intel_logical_ring_emit_pdps(req); 1597 if (ret) 1598 return ret; 1599 } 1600 1601 req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring); 1602 } 1603 1604 ret = intel_logical_ring_begin(req, 4); 1605 if (ret) 1606 return ret; 1607 1608 /* FIXME(BDW): Address space and security selectors. 
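	 * The (ppgtt << 8) term below sets what is believed to be the
	 * address-space indicator bit of MI_BATCH_BUFFER_START on gen8,
	 * selecting PPGTT rather than GGTT addressing for non-secure
	 * batches; secure (I915_DISPATCH_SECURE) batches leave it clear.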
*/ 1609 intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 | 1610 (ppgtt<<8) | 1611 (dispatch_flags & I915_DISPATCH_RS ? 1612 MI_BATCH_RESOURCE_STREAMER : 0)); 1613 intel_logical_ring_emit(ringbuf, lower_32_bits(offset)); 1614 intel_logical_ring_emit(ringbuf, upper_32_bits(offset)); 1615 intel_logical_ring_emit(ringbuf, MI_NOOP); 1616 intel_logical_ring_advance(ringbuf); 1617 1618 return 0; 1619 } 1620 1621 static bool gen8_logical_ring_get_irq(struct intel_engine_cs *ring) 1622 { 1623 struct drm_device *dev = ring->dev; 1624 struct drm_i915_private *dev_priv = dev->dev_private; 1625 unsigned long flags; 1626 1627 if (WARN_ON(!intel_irqs_enabled(dev_priv))) 1628 return false; 1629 1630 spin_lock_irqsave(&dev_priv->irq_lock, flags); 1631 if (ring->irq_refcount++ == 0) { 1632 I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); 1633 POSTING_READ(RING_IMR(ring->mmio_base)); 1634 } 1635 spin_unlock_irqrestore(&dev_priv->irq_lock, flags); 1636 1637 return true; 1638 } 1639 1640 static void gen8_logical_ring_put_irq(struct intel_engine_cs *ring) 1641 { 1642 struct drm_device *dev = ring->dev; 1643 struct drm_i915_private *dev_priv = dev->dev_private; 1644 unsigned long flags; 1645 1646 spin_lock_irqsave(&dev_priv->irq_lock, flags); 1647 if (--ring->irq_refcount == 0) { 1648 I915_WRITE_IMR(ring, ~ring->irq_keep_mask); 1649 POSTING_READ(RING_IMR(ring->mmio_base)); 1650 } 1651 spin_unlock_irqrestore(&dev_priv->irq_lock, flags); 1652 } 1653 1654 static int gen8_emit_flush(struct drm_i915_gem_request *request, 1655 u32 invalidate_domains, 1656 u32 unused) 1657 { 1658 struct intel_ringbuffer *ringbuf = request->ringbuf; 1659 struct intel_engine_cs *ring = ringbuf->ring; 1660 struct drm_device *dev = ring->dev; 1661 struct drm_i915_private *dev_priv = dev->dev_private; 1662 uint32_t cmd; 1663 int ret; 1664 1665 ret = intel_logical_ring_begin(request, 4); 1666 if (ret) 1667 return ret; 1668 1669 cmd = MI_FLUSH_DW + 1; 1670 1671 /* We always require a command barrier so that subsequent 1672 * commands, such as breadcrumb interrupts, are strictly ordered 1673 * wrt the contents of the write cache being flushed to memory 1674 * (and thus being coherent from the CPU). 
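	 *
	 * Concretely, the MI_FLUSH_DW emitted below carries a post-sync
	 * operation (MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW) that
	 * writes a dummy zero to the HWS scratch slot; the value written is
	 * irrelevant, only the ordering the write enforces matters.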
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (invalidate_domains & I915_GEM_GPU_DOMAINS) {
		cmd |= MI_INVALIDATE_TLB;
		if (ring == &dev_priv->ring[VCS])
			cmd |= MI_INVALIDATE_BSD;
	}

	intel_logical_ring_emit(ringbuf, cmd);
	intel_logical_ring_emit(ringbuf,
				I915_GEM_HWS_SCRATCH_ADDR |
				MI_FLUSH_DW_USE_GTT);
	intel_logical_ring_emit(ringbuf, 0); /* upper addr */
	intel_logical_ring_emit(ringbuf, 0); /* value */
	intel_logical_ring_advance(ringbuf);

	return 0;
}

static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
				  u32 invalidate_domains,
				  u32 flush_domains)
{
	struct intel_ringbuffer *ringbuf = request->ringbuf;
	struct intel_engine_cs *ring = ringbuf->ring;
	u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	bool vf_flush_wa;
	u32 flags = 0;
	int ret;

	flags |= PIPE_CONTROL_CS_STALL;

	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
	}

	/*
	 * On GEN9+, before VF_CACHE_INVALIDATE we need to emit a NULL pipe
	 * control.
	 */
	vf_flush_wa = INTEL_INFO(ring->dev)->gen >= 9 &&
		      flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;

	ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6);
	if (ret)
		return ret;

	if (vf_flush_wa) {
		intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
	}

	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
	intel_logical_ring_emit(ringbuf, flags);
	intel_logical_ring_emit(ringbuf, scratch_addr);
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_advance(ringbuf);

	return 0;
}

static u32 gen8_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
{
	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
}

static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno)
{
	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
}

static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
{

	/*
	 * On BXT A steppings there is a HW coherency issue whereby the
	 * MI_STORE_DATA_IMM storing the completed request's seqno
	 * occasionally doesn't invalidate the CPU cache. Work around this by
	 * clflushing the corresponding cacheline whenever the caller wants
	 * the coherency to be guaranteed. Note that this cacheline is known
	 * to be clean at this point, since we only write it in
	 * bxt_a_set_seqno(), where we also do a clflush after the write.
	 * So this clflush in practice becomes an invalidate operation.
	 */

	if (!lazy_coherency)
		intel_flush_status_page(ring, I915_GEM_HWS_INDEX);

	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
}

static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
{
	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);

	/* See bxt_a_get_seqno() explaining the reason for the clflush. */
	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
}

static int gen8_emit_request(struct drm_i915_gem_request *request)
{
	struct intel_ringbuffer *ringbuf = request->ringbuf;
	struct intel_engine_cs *ring = ringbuf->ring;
	u32 cmd;
	int ret;

	/*
	 * Reserve space for 2 NOOPs at the end of each request to be
	 * used as a workaround for not being allowed to do lite
	 * restore with HEAD==TAIL (WaIdleLiteRestore).
	 */
	ret = intel_logical_ring_begin(request, 8);
	if (ret)
		return ret;

	cmd = MI_STORE_DWORD_IMM_GEN4;
	cmd |= MI_GLOBAL_GTT;

	intel_logical_ring_emit(ringbuf, cmd);
	intel_logical_ring_emit(ringbuf,
				(ring->status_page.gfx_addr +
				(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)));
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_advance_and_submit(request);

	/*
	 * Here we add two extra NOOPs as padding to avoid
	 * lite restore of a context with HEAD==TAIL.
	 */
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_advance(ringbuf);

	return 0;
}

static int intel_lr_context_render_state_init(struct drm_i915_gem_request *req)
{
	struct render_state so;
	int ret;

	ret = i915_gem_render_state_prepare(req->ring, &so);
	if (ret)
		return ret;

	if (so.rodata == NULL)
		return 0;

	ret = req->ring->emit_bb_start(req, so.ggtt_offset,
				       I915_DISPATCH_SECURE);
	if (ret)
		goto out;

	ret = req->ring->emit_bb_start(req,
				       (so.ggtt_offset + so.aux_batch_offset),
				       I915_DISPATCH_SECURE);
	if (ret)
		goto out;

	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);

out:
	i915_gem_render_state_fini(&so);
	return ret;
}

static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
{
	int ret;

	ret = intel_logical_ring_workarounds_emit(req);
	if (ret)
		return ret;

	ret = intel_rcs_context_init_mocs(req);
	/*
	 * Failing to program the MOCS is non-fatal: the system will not
	 * run at peak performance, so generate an error and carry on.
	 */
	if (ret)
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return intel_lr_context_render_state_init(req);
}

/**
 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
 *
 * @ring: Engine Command Streamer.
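 *
 * This is used on the logical_ring_init() error path below, so it must (and
 * does) tolerate a partially constructed engine; note the NULL checks on
 * ring->buffer and ring->status_page.obj. It is presumably also the teardown
 * path on normal driver unload.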
 *
 */
void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
{
	struct drm_i915_private *dev_priv;

	if (!intel_ring_initialized(ring))
		return;

	dev_priv = ring->dev->dev_private;

	if (ring->buffer) {
		intel_logical_ring_stop(ring);
		WARN_ON((I915_READ_MODE(ring) & MODE_IDLE) == 0);
	}

	if (ring->cleanup)
		ring->cleanup(ring);

	i915_cmd_parser_fini_ring(ring);
	i915_gem_batch_pool_fini(&ring->batch_pool);

	if (ring->status_page.obj) {
		kunmap(sg_page(ring->status_page.obj->pages->sgl));
		ring->status_page.obj = NULL;
	}

	lrc_destroy_wa_ctx_obj(ring);
	ring->dev = NULL;
}

static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
{
	int ret;

	/* Intentionally left blank. */
	ring->buffer = NULL;

	ring->dev = dev;
	INIT_LIST_HEAD(&ring->active_list);
	INIT_LIST_HEAD(&ring->request_list);
	i915_gem_batch_pool_init(dev, &ring->batch_pool);
	init_waitqueue_head(&ring->irq_queue);

	INIT_LIST_HEAD(&ring->buffers);
	INIT_LIST_HEAD(&ring->execlist_queue);
	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
	lockinit(&ring->execlist_lock, "i915el", 0, LK_CANRECURSE);

	ret = i915_cmd_parser_init_ring(ring);
	if (ret)
		goto error;

	ret = intel_lr_context_deferred_alloc(ring->default_context, ring);
	if (ret)
		goto error;

	/* As this is the default context, always pin it */
	ret = intel_lr_context_do_pin(
			ring,
			ring->default_context->engine[ring->id].state,
			ring->default_context->engine[ring->id].ringbuf);
	if (ret) {
		DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
			  ring->name, ret);
		goto error;
	}

	return 0;

error:
	intel_logical_ring_cleanup(ring);
	return ret;
}

static int logical_render_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[RCS];
	int ret;

	ring->name = "render ring";
	ring->id = RCS;
	ring->mmio_base = RENDER_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT;
	if (HAS_L3_DPF(dev))
		ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	if (INTEL_INFO(dev)->gen >= 9)
		ring->init_hw = gen9_init_render_ring;
	else
		ring->init_hw = gen8_init_render_ring;
	ring->init_context = gen8_init_rcs_context;
	ring->cleanup = intel_fini_pipe_control;
	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush_render;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	ring->dev = dev;

	ret = intel_init_pipe_control(ring);
	if (ret)
		return ret;

	ret = intel_init_workaround_bb(ring);
	if (ret) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using the GPU.
		 */
		DRM_ERROR("WA batch buffer initialization failed: %d\n",
			  ret);
	}

	ret = logical_ring_init(dev, ring);
	if (ret) {
		lrc_destroy_wa_ctx_obj(ring);
	}

	return ret;
}

static int logical_bsd_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[VCS];

	ring->name = "bsd ring";
	ring->id = VCS;
	ring->mmio_base = GEN6_BSD_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

static int logical_bsd2_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[VCS2];

	ring->name = "bsd2 ring";
	ring->id = VCS2;
	ring->mmio_base = GEN8_BSD2_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	ring->get_seqno = gen8_get_seqno;
	ring->set_seqno = gen8_set_seqno;
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

static int logical_blt_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[BCS];

	ring->name = "blitter ring";
	ring->id = BCS;
	ring->mmio_base = BLT_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}
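/*
 * The non-render engines above and below all share the same gen8 vfuncs;
 * only the name, id, mmio base, IRQ shift and (where applicable) the BXT
 * A-stepping seqno workaround differ. A further engine would presumably
 * follow the same template:
 *
 *	ring->init_hw = gen8_init_common_ring;
 *	ring->emit_request = gen8_emit_request;
 *	ring->emit_flush = gen8_emit_flush;
 *	ring->irq_get = gen8_logical_ring_get_irq;
 *	ring->irq_put = gen8_logical_ring_put_irq;
 *	ring->emit_bb_start = gen8_emit_bb_start;
 *	return logical_ring_init(dev, ring);
 */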
static int logical_vebox_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[VECS];

	ring->name = "video enhancement ring";
	ring->id = VECS;
	ring->mmio_base = VEBOX_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

/**
 * intel_logical_rings_init() - allocate, populate and init the Engine Command Streamers
 * @dev: DRM device.
 *
 * This function inits the engines for Execlists submission (the equivalent in the
 * legacy ringbuffer submission world would be i915_gem_init_rings). It does so only
 * for those engines that are present in the hardware.
 *
 * Return: non-zero if the initialization failed.
 */
int intel_logical_rings_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	int ret;

	ret = logical_render_ring_init(dev);
	if (ret)
		return ret;

	if (HAS_BSD(dev)) {
		ret = logical_bsd_ring_init(dev);
		if (ret)
			goto cleanup_render_ring;
	}

	if (HAS_BLT(dev)) {
		ret = logical_blt_ring_init(dev);
		if (ret)
			goto cleanup_bsd_ring;
	}

	if (HAS_VEBOX(dev)) {
		ret = logical_vebox_ring_init(dev);
		if (ret)
			goto cleanup_blt_ring;
	}

	if (HAS_BSD2(dev)) {
		ret = logical_bsd2_ring_init(dev);
		if (ret)
			goto cleanup_vebox_ring;
	}

	return 0;

cleanup_vebox_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[VECS]);
cleanup_blt_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[BCS]);
cleanup_bsd_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[VCS]);
cleanup_render_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[RCS]);

	return ret;
}

static u32
make_rpcs(struct drm_device *dev)
{
	u32 rpcs = 0;

	/*
	 * No explicit RPCS request is needed to ensure full
	 * slice/subslice/EU enablement prior to Gen9.
	 */
	if (INTEL_INFO(dev)->gen < 9)
		return 0;

	/*
	 * Starting in Gen9, render power gating can leave
	 * slice/subslice/EU in a partially enabled state. We
	 * must make an explicit request through RPCS for full
	 * enablement.
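	 *
	 * As an illustration only (hypothetical part, not a statement about
	 * any real SKU): with slice power gating and slice_total == 3, the
	 * code below would return GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE |
	 * (3 << GEN8_RPCS_S_CNT_SHIFT), i.e. an explicit request to keep all
	 * three slices powered up.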
	 */
	if (INTEL_INFO(dev)->has_slice_pg) {
		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
		rpcs |= INTEL_INFO(dev)->slice_total <<
			GEN8_RPCS_S_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev)->has_subslice_pg) {
		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
		rpcs |= INTEL_INFO(dev)->subslice_per_slice <<
			GEN8_RPCS_SS_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev)->has_eu_pg) {
		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
			GEN8_RPCS_EU_MIN_SHIFT;
		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
			GEN8_RPCS_EU_MAX_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	return rpcs;
}

static int
populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj,
		    struct intel_engine_cs *ring, struct intel_ringbuffer *ringbuf)
{
	struct drm_device *dev = ring->dev;
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
	struct vm_page *page;
	uint32_t *reg_state;
	int ret;

	if (!ppgtt)
		ppgtt = dev_priv->mm.aliasing_ppgtt;

	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
		return ret;
	}

	ret = i915_gem_object_get_pages(ctx_obj);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not get object pages\n");
		return ret;
	}

	i915_gem_object_pin_pages(ctx_obj);

	/* The second page of the context object contains some fields which must
	 * be set up prior to the first execution. */
	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
	reg_state = kmap_atomic(page);

	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
	 * commands followed by (reg, value) pairs. The values we are setting here are
	 * only for the first context restore: on a subsequent save, the GPU will
	 * recreate this batchbuffer with new values (including all the missing
	 * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */
	reg_state[CTX_LRI_HEADER_0] =
		MI_LOAD_REGISTER_IMM(ring->id == RCS ? 14 : 11) | MI_LRI_FORCE_POSTED;
	ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(ring),
		       _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
					  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
					  CTX_CTRL_RS_CTX_ENABLE));
	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0);
	/* Ring buffer start address is not known until the buffer is pinned.
	 * It is written to the context image in execlists_update_context()
	 */
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
		       ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U, RING_BBADDR_UDW(ring->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L, RING_BBADDR(ring->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_BB_STATE, RING_BBSTATE(ring->mmio_base),
		       RING_BB_PPGTT);
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(ring->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(ring->mmio_base), 0);
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE, RING_SBBSTATE(ring->mmio_base), 0);
	if (ring->id == RCS) {
		ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(ring->mmio_base), 0);
		ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(ring->mmio_base), 0);
		ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET, RING_INDIRECT_CTX_OFFSET(ring->mmio_base), 0);
		if (ring->wa_ctx.obj) {
			struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
			uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);

			reg_state[CTX_RCS_INDIRECT_CTX+1] =
				(ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
				(wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);

			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;

			reg_state[CTX_BB_PER_CTX_PTR+1] =
				(ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
				0x01;
		}
	}
	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
	ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(ring->mmio_base), 0);
	/* PDP values will be assigned later if needed */
	ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(ring, 3), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(ring, 3), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(ring, 2), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(ring, 2), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(ring, 1), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(ring, 1), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(ring, 0), 0);
	ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(ring, 0), 0);

	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, reg_state);
	} else {
		/* 32b PPGTT
		 * PDP*_DESCRIPTOR contains the base address of space supported.
		 * With dynamic page allocation, PDPs may not be allocated at
		 * this point.
		 * Point the unallocated PDPs to the scratch page.
		 */
		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
	}

	if (ring->id == RCS) {
		reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
		ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
			       make_rpcs(dev));
	}

	kunmap_atomic(reg_state);
	i915_gem_object_unpin_pages(ctx_obj);

	return 0;
}

/**
 * intel_lr_context_free() - free the LRC specific bits of a context
 * @ctx: the LR context to free.
 *
 * The real context freeing is done in i915_gem_context_free: this only
 * takes care of the bits that are LRC related: the per-engine backing
 * objects and the logical ringbuffer.
 */
void intel_lr_context_free(struct intel_context *ctx)
{
	int i;

	for (i = 0; i < I915_NUM_RINGS; i++) {
		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;

		if (ctx_obj) {
			struct intel_ringbuffer *ringbuf =
					ctx->engine[i].ringbuf;
			struct intel_engine_cs *ring = ringbuf->ring;

			if (ctx == ring->default_context) {
				intel_unpin_ringbuffer_obj(ringbuf);
				i915_gem_object_ggtt_unpin(ctx_obj);
			}
			WARN_ON(ctx->engine[ring->id].pin_count);
			intel_ringbuffer_free(ringbuf);
			drm_gem_object_unreference(&ctx_obj->base);
		}
	}
}

static uint32_t get_lr_context_size(struct intel_engine_cs *ring)
{
	int ret = 0;

	WARN_ON(INTEL_INFO(ring->dev)->gen < 8);

	switch (ring->id) {
	case RCS:
		if (INTEL_INFO(ring->dev)->gen >= 9)
			ret = GEN9_LR_CONTEXT_RENDER_SIZE;
		else
			ret = GEN8_LR_CONTEXT_RENDER_SIZE;
		break;
	case VCS:
	case BCS:
	case VECS:
	case VCS2:
		ret = GEN8_LR_CONTEXT_OTHER_SIZE;
		break;
	}

	return ret;
}

static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
					   struct drm_i915_gem_object *default_ctx_obj)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
	struct vm_page *page;

	/* The HWSP is part of the default context object in LRC mode. */
	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
			+ LRC_PPHWSP_PN * PAGE_SIZE;
	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
	ring->status_page.page_addr = kmap(page);
	ring->status_page.obj = default_ctx_obj;

	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
		   (u32)ring->status_page.gfx_addr);
	POSTING_READ(RING_HWS_PGA(ring->mmio_base));
}

/**
 * intel_lr_context_deferred_alloc() - create the LRC specific bits of a context
 * @ctx: LR context to create.
 * @ring: engine to be used with the context.
 *
 * This function can be called more than once, with different engines, if we plan
 * to use the context with them. The context backing objects and the ringbuffers
 * (especially the ringbuffer backing objects) suck a lot of memory up, and that's
 * why the creation is a deferred call: it's better to make sure first that we need
 * to use a given ring with the context.
 *
 * Return: non-zero on error.
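 *
 * An illustrative (hypothetical) call site would only allocate on first use of
 * an engine, e.g.:
 *
 *	if (ctx->engine[ring->id].state == NULL)
 *		ret = intel_lr_context_deferred_alloc(ctx, ring);
 *
 * the actual callers live outside this file, apart from logical_ring_init()
 * above, which uses it for the default context.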
2445 */ 2446 2447 int intel_lr_context_deferred_alloc(struct intel_context *ctx, 2448 struct intel_engine_cs *ring) 2449 { 2450 struct drm_device *dev = ring->dev; 2451 struct drm_i915_gem_object *ctx_obj; 2452 uint32_t context_size; 2453 struct intel_ringbuffer *ringbuf; 2454 int ret; 2455 2456 WARN_ON(ctx->legacy_hw_ctx.rcs_state != NULL); 2457 WARN_ON(ctx->engine[ring->id].state); 2458 2459 context_size = round_up(get_lr_context_size(ring), 4096); 2460 2461 /* One extra page as the sharing data between driver and GuC */ 2462 context_size += PAGE_SIZE * LRC_PPHWSP_PN; 2463 2464 ctx_obj = i915_gem_alloc_object(dev, context_size); 2465 if (!ctx_obj) { 2466 DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n"); 2467 return -ENOMEM; 2468 } 2469 2470 ringbuf = intel_engine_create_ringbuffer(ring, 4 * PAGE_SIZE); 2471 if (IS_ERR(ringbuf)) { 2472 ret = PTR_ERR(ringbuf); 2473 goto error_deref_obj; 2474 } 2475 2476 ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf); 2477 if (ret) { 2478 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 2479 goto error_ringbuf; 2480 } 2481 2482 ctx->engine[ring->id].ringbuf = ringbuf; 2483 ctx->engine[ring->id].state = ctx_obj; 2484 2485 if (ctx != ring->default_context && ring->init_context) { 2486 struct drm_i915_gem_request *req; 2487 2488 ret = i915_gem_request_alloc(ring, 2489 ctx, &req); 2490 if (ret) { 2491 DRM_ERROR("ring create req: %d\n", 2492 ret); 2493 goto error_ringbuf; 2494 } 2495 2496 ret = ring->init_context(req); 2497 if (ret) { 2498 DRM_ERROR("ring init context: %d\n", 2499 ret); 2500 i915_gem_request_cancel(req); 2501 goto error_ringbuf; 2502 } 2503 i915_add_request_no_flush(req); 2504 } 2505 return 0; 2506 2507 error_ringbuf: 2508 intel_ringbuffer_free(ringbuf); 2509 error_deref_obj: 2510 drm_gem_object_unreference(&ctx_obj->base); 2511 ctx->engine[ring->id].ringbuf = NULL; 2512 ctx->engine[ring->id].state = NULL; 2513 return ret; 2514 } 2515 2516 void intel_lr_context_reset(struct drm_device *dev, 2517 struct intel_context *ctx) 2518 { 2519 struct drm_i915_private *dev_priv = dev->dev_private; 2520 struct intel_engine_cs *ring; 2521 int i; 2522 2523 for_each_ring(ring, dev_priv, i) { 2524 struct drm_i915_gem_object *ctx_obj = 2525 ctx->engine[ring->id].state; 2526 struct intel_ringbuffer *ringbuf = 2527 ctx->engine[ring->id].ringbuf; 2528 uint32_t *reg_state; 2529 struct vm_page *page; 2530 2531 if (!ctx_obj) 2532 continue; 2533 2534 if (i915_gem_object_get_pages(ctx_obj)) { 2535 WARN(1, "Failed get_pages for context obj\n"); 2536 continue; 2537 } 2538 page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); 2539 reg_state = kmap_atomic(page); 2540 2541 reg_state[CTX_RING_HEAD+1] = 0; 2542 reg_state[CTX_RING_TAIL+1] = 0; 2543 2544 kunmap_atomic(reg_state); 2545 2546 ringbuf->head = 0; 2547 ringbuf->tail = 0; 2548 } 2549 } 2550