/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use it. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
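 *
 * Purely for illustration (a hand-written sketch, not the driver's actual
 * code; the helper name below is hypothetical), the deferred path boils
 * down to something like:
 *
 *	// On an execbuffer ioctl for (ctx, ring), before writing commands:
 *	if (ctx->engine[ring->id].state == NULL) {
 *		// First run of this context on this engine: allocate and
 *		// populate the per-engine backing object and ringbuffer.
 *		ret = populate_lr_context_for_engine(ctx, ring); // hypothetical helper
 *		if (ret)
 *			return ret;
 *	}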
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
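 *
 * As a rough sketch of that step (see intel_execlists_ctx_id() and
 * execlists_elsp_write() below for the real code), the submission ID is
 * simply the upper 20 bits of the 4K-aligned context address, and the two
 * descriptors are written most-significant dword first, element 0 last:
 *
 *	id      = lrca >> 12;			// LRCA is 4K aligned
 *	desc[i] = GEN8_CTX_VALID | ... | lrca | (u64)id << GEN8_CTX_ID_SHIFT;
 *
 *	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));  // element 1 (or 0 if none)
 *	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
 *	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0]));
 *	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0]));  // this write triggers the load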
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "intel_drv.h"
#include "intel_mocs.h"

#define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE)

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define CTX_LRI_HEADER_0		0x01
#define CTX_CONTEXT_CONTROL		0x02
#define CTX_RING_HEAD			0x04
#define CTX_RING_TAIL			0x06
#define CTX_RING_BUFFER_START		0x08
#define CTX_RING_BUFFER_CONTROL		0x0a
#define CTX_BB_HEAD_U			0x0c
#define CTX_BB_HEAD_L			0x0e
#define CTX_BB_STATE			0x10
#define CTX_SECOND_BB_HEAD_U		0x12
#define CTX_SECOND_BB_HEAD_L		0x14
#define CTX_SECOND_BB_STATE		0x16
#define CTX_BB_PER_CTX_PTR		0x18
#define CTX_RCS_INDIRECT_CTX		0x1a
#define CTX_RCS_INDIRECT_CTX_OFFSET	0x1c
#define CTX_LRI_HEADER_1		0x21
#define CTX_CTX_TIMESTAMP		0x22
#define CTX_PDP3_UDW			0x24
#define CTX_PDP3_LDW			0x26
#define CTX_PDP2_UDW			0x28
#define CTX_PDP2_LDW			0x2a
#define CTX_PDP1_UDW			0x2c
#define CTX_PDP1_LDW			0x2e
#define CTX_PDP0_UDW			0x30
#define CTX_PDP0_LDW			0x32
#define CTX_LRI_HEADER_2		0x41
#define CTX_R_PWR_CLK_STATE		0x42
#define CTX_GPGPU_CSR_BASE_ADDRESS	0x44

#define GEN8_CTX_VALID (1<<0)
#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
#define GEN8_CTX_FORCE_RESTORE (1<<2)
#define GEN8_CTX_L3LLC_COHERENT (1<<5)
#define GEN8_CTX_PRIVILEGE (1<<8)

#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \
	const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \
	reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \
	reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \
}

#define ASSIGN_CTX_PML4(ppgtt, reg_state) { \
	reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \
	reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \
}

enum {
	ADVANCED_CONTEXT = 0,
	LEGACY_32B_CONTEXT,
	ADVANCED_AD_CONTEXT,
	LEGACY_64B_CONTEXT
};
#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3
#define GEN8_CTX_ADDRESSING_MODE(dev)  (USES_FULL_48BIT_PPGTT(dev) ?\
		LEGACY_64B_CONTEXT :\
		LEGACY_32B_CONTEXT)
enum {
	FAULT_AND_HANG = 0,
	FAULT_AND_HALT, /* Debug only */
	FAULT_AND_STREAM,
	FAULT_AND_CONTINUE /* Unsupported */
};
#define GEN8_CTX_ID_SHIFT 32
#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17

static int intel_lr_context_pin(struct drm_i915_gem_request *rq);
static void
lrc_setup_hardware_status_page(struct intel_engine_cs *ring, 226 struct drm_i915_gem_object *default_ctx_obj); 227 228 229 /** 230 * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists 231 * @dev: DRM device. 232 * @enable_execlists: value of i915.enable_execlists module parameter. 233 * 234 * Only certain platforms support Execlists (the prerequisites being 235 * support for Logical Ring Contexts and Aliasing PPGTT or better). 236 * 237 * Return: 1 if Execlists is supported and has to be enabled. 238 */ 239 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists) 240 { 241 WARN_ON(i915.enable_ppgtt == -1); 242 243 /* On platforms with execlist available, vGPU will only 244 * support execlist mode, no ring buffer mode. 245 */ 246 if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev)) 247 return 1; 248 249 if (INTEL_INFO(dev)->gen >= 9) 250 return 1; 251 252 if (enable_execlists == 0) 253 return 0; 254 255 if (HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev) && 256 i915.use_mmio_flip >= 0) 257 return 1; 258 259 return 0; 260 } 261 262 /** 263 * intel_execlists_ctx_id() - get the Execlists Context ID 264 * @ctx_obj: Logical Ring Context backing object. 265 * 266 * Do not confuse with ctx->id! Unfortunately we have a name overload 267 * here: the old context ID we pass to userspace as a handler so that 268 * they can refer to a context, and the new context ID we pass to the 269 * ELSP so that the GPU can inform us of the context status via 270 * interrupts. 271 * 272 * Return: 20-bits globally unique context ID. 273 */ 274 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj) 275 { 276 u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) + 277 LRC_PPHWSP_PN * PAGE_SIZE; 278 279 /* LRCA is required to be 4K aligned so the more significant 20 bits 280 * are globally unique */ 281 return lrca >> 12; 282 } 283 284 static bool disable_lite_restore_wa(struct intel_engine_cs *ring) 285 { 286 struct drm_device *dev = ring->dev; 287 288 return ((IS_SKYLAKE(dev) && INTEL_REVID(dev) <= SKL_REVID_B0) || 289 (IS_BROXTON(dev) && INTEL_REVID(dev) == BXT_REVID_A0)) && 290 (ring->id == VCS || ring->id == VCS2); 291 } 292 293 uint64_t intel_lr_context_descriptor(struct intel_context *ctx, 294 struct intel_engine_cs *ring) 295 { 296 struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; 297 uint64_t desc; 298 uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) + 299 LRC_PPHWSP_PN * PAGE_SIZE; 300 301 WARN_ON(lrca & 0xFFFFFFFF00000FFFULL); 302 303 desc = GEN8_CTX_VALID; 304 desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT; 305 if (IS_GEN8(ctx_obj->base.dev)) 306 desc |= GEN8_CTX_L3LLC_COHERENT; 307 desc |= GEN8_CTX_PRIVILEGE; 308 desc |= lrca; 309 desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT; 310 311 /* TODO: WaDisableLiteRestore when we start using semaphore 312 * signalling between Command Streamers */ 313 /* desc |= GEN8_CTX_FORCE_RESTORE; */ 314 315 /* WaEnableForceRestoreInCtxtDescForVCS:skl */ 316 /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ 317 if (disable_lite_restore_wa(ring)) 318 desc |= GEN8_CTX_FORCE_RESTORE; 319 320 return desc; 321 } 322 323 static void execlists_elsp_write(struct drm_i915_gem_request *rq0, 324 struct drm_i915_gem_request *rq1) 325 { 326 327 struct intel_engine_cs *ring = rq0->ring; 328 struct drm_device *dev = ring->dev; 329 struct drm_i915_private *dev_priv = dev->dev_private; 330 uint64_t desc[2]; 331 332 if (rq1) { 333 desc[1] = intel_lr_context_descriptor(rq1->ctx, 
rq1->ring); 334 rq1->elsp_submitted++; 335 } else { 336 desc[1] = 0; 337 } 338 339 desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring); 340 rq0->elsp_submitted++; 341 342 /* You must always write both descriptors in the order below. */ 343 lockmgr(&dev_priv->uncore.lock, LK_EXCLUSIVE); 344 intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); 345 I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); 346 I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); 347 348 I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0])); 349 /* The context is automatically loaded after the following */ 350 I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0])); 351 352 /* ELSP is a wo register, use another nearby reg for posting */ 353 POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); 354 intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); 355 lockmgr(&dev_priv->uncore.lock, LK_RELEASE); 356 } 357 358 static int execlists_update_context(struct drm_i915_gem_request *rq) 359 { 360 struct intel_engine_cs *ring = rq->ring; 361 struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt; 362 struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; 363 struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj; 364 struct vm_page *page; 365 uint32_t *reg_state; 366 367 BUG_ON(!ctx_obj); 368 WARN_ON(!i915_gem_obj_is_pinned(ctx_obj)); 369 WARN_ON(!i915_gem_obj_is_pinned(rb_obj)); 370 371 page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN); 372 reg_state = kmap_atomic(page); 373 374 reg_state[CTX_RING_TAIL+1] = rq->tail; 375 reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj); 376 377 if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) { 378 /* True 32b PPGTT with dynamic page allocation: update PDP 379 * registers and point the unallocated PDPs to scratch page. 380 * PML4 is allocated during ppgtt init, so this is not needed 381 * in 48-bit mode. 382 */ 383 ASSIGN_CTX_PDP(ppgtt, reg_state, 3); 384 ASSIGN_CTX_PDP(ppgtt, reg_state, 2); 385 ASSIGN_CTX_PDP(ppgtt, reg_state, 1); 386 ASSIGN_CTX_PDP(ppgtt, reg_state, 0); 387 } 388 389 kunmap_atomic(reg_state); 390 391 return 0; 392 } 393 394 static void execlists_submit_requests(struct drm_i915_gem_request *rq0, 395 struct drm_i915_gem_request *rq1) 396 { 397 execlists_update_context(rq0); 398 399 if (rq1) 400 execlists_update_context(rq1); 401 402 execlists_elsp_write(rq0, rq1); 403 } 404 405 static void execlists_context_unqueue(struct intel_engine_cs *ring) 406 { 407 struct drm_i915_gem_request *req0 = NULL, *req1 = NULL; 408 struct drm_i915_gem_request *cursor = NULL, *tmp = NULL; 409 410 assert_spin_locked(&ring->execlist_lock); 411 412 /* 413 * If irqs are not active generate a warning as batches that finish 414 * without the irqs may get lost and a GPU Hang may occur. 
415 */ 416 WARN_ON(!intel_irqs_enabled(ring->dev->dev_private)); 417 418 if (list_empty(&ring->execlist_queue)) 419 return; 420 421 /* Try to read in pairs */ 422 list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue, 423 execlist_link) { 424 if (!req0) { 425 req0 = cursor; 426 } else if (req0->ctx == cursor->ctx) { 427 /* Same ctx: ignore first request, as second request 428 * will update tail past first request's workload */ 429 cursor->elsp_submitted = req0->elsp_submitted; 430 list_del(&req0->execlist_link); 431 list_add_tail(&req0->execlist_link, 432 &ring->execlist_retired_req_list); 433 req0 = cursor; 434 } else { 435 req1 = cursor; 436 break; 437 } 438 } 439 440 if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) { 441 /* 442 * WaIdleLiteRestore: make sure we never cause a lite 443 * restore with HEAD==TAIL 444 */ 445 if (req0->elsp_submitted) { 446 /* 447 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL 448 * as we resubmit the request. See gen8_emit_request() 449 * for where we prepare the padding after the end of the 450 * request. 451 */ 452 struct intel_ringbuffer *ringbuf; 453 454 ringbuf = req0->ctx->engine[ring->id].ringbuf; 455 req0->tail += 8; 456 req0->tail &= ringbuf->size - 1; 457 } 458 } 459 460 WARN_ON(req1 && req1->elsp_submitted); 461 462 execlists_submit_requests(req0, req1); 463 } 464 465 static bool execlists_check_remove_request(struct intel_engine_cs *ring, 466 u32 request_id) 467 { 468 struct drm_i915_gem_request *head_req; 469 470 assert_spin_locked(&ring->execlist_lock); 471 472 head_req = list_first_entry_or_null(&ring->execlist_queue, 473 struct drm_i915_gem_request, 474 execlist_link); 475 476 if (head_req != NULL) { 477 struct drm_i915_gem_object *ctx_obj = 478 head_req->ctx->engine[ring->id].state; 479 if (intel_execlists_ctx_id(ctx_obj) == request_id) { 480 WARN(head_req->elsp_submitted == 0, 481 "Never submitted head request\n"); 482 483 if (--head_req->elsp_submitted <= 0) { 484 list_del(&head_req->execlist_link); 485 list_add_tail(&head_req->execlist_link, 486 &ring->execlist_retired_req_list); 487 return true; 488 } 489 } 490 } 491 492 return false; 493 } 494 495 /** 496 * intel_lrc_irq_handler() - handle Context Switch interrupts 497 * @ring: Engine Command Streamer to handle. 498 * 499 * Check the unread Context Status Buffers and manage the submission of new 500 * contexts to the ELSP accordingly. 
501 */ 502 void intel_lrc_irq_handler(struct intel_engine_cs *ring) 503 { 504 struct drm_i915_private *dev_priv = ring->dev->dev_private; 505 u32 status_pointer; 506 u8 read_pointer; 507 u8 write_pointer; 508 u32 status = 0; 509 u32 status_id; 510 u32 submit_contexts = 0; 511 512 status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring)); 513 514 read_pointer = ring->next_context_status_buffer; 515 write_pointer = status_pointer & GEN8_CSB_PTR_MASK; 516 if (read_pointer > write_pointer) 517 write_pointer += GEN8_CSB_ENTRIES; 518 519 lockmgr(&ring->execlist_lock, LK_EXCLUSIVE); 520 521 while (read_pointer < write_pointer) { 522 read_pointer++; 523 status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer % GEN8_CSB_ENTRIES)); 524 status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer % GEN8_CSB_ENTRIES)); 525 526 if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) 527 continue; 528 529 if (status & GEN8_CTX_STATUS_PREEMPTED) { 530 if (status & GEN8_CTX_STATUS_LITE_RESTORE) { 531 if (execlists_check_remove_request(ring, status_id)) 532 WARN(1, "Lite Restored request removed from queue\n"); 533 } else 534 WARN(1, "Preemption without Lite Restore\n"); 535 } 536 537 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) || 538 (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) { 539 if (execlists_check_remove_request(ring, status_id)) 540 submit_contexts++; 541 } 542 } 543 544 if (disable_lite_restore_wa(ring)) { 545 /* Prevent a ctx to preempt itself */ 546 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) && 547 (submit_contexts != 0)) 548 execlists_context_unqueue(ring); 549 } else if (submit_contexts != 0) { 550 execlists_context_unqueue(ring); 551 } 552 553 lockmgr(&ring->execlist_lock, LK_RELEASE); 554 555 WARN(submit_contexts > 2, "More than two context complete events?\n"); 556 ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES; 557 558 I915_WRITE(RING_CONTEXT_STATUS_PTR(ring), 559 _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8, 560 ((u32)ring->next_context_status_buffer & 561 GEN8_CSB_PTR_MASK) << 8)); 562 } 563 564 static int execlists_context_queue(struct drm_i915_gem_request *request) 565 { 566 struct intel_engine_cs *ring = request->ring; 567 struct drm_i915_gem_request *cursor; 568 int num_elements = 0; 569 570 if (request->ctx != ring->default_context) 571 intel_lr_context_pin(request); 572 573 i915_gem_request_reference(request); 574 575 spin_lock_irq(&ring->execlist_lock); 576 577 list_for_each_entry(cursor, &ring->execlist_queue, execlist_link) 578 if (++num_elements > 2) 579 break; 580 581 if (num_elements > 2) { 582 struct drm_i915_gem_request *tail_req; 583 584 tail_req = list_last_entry(&ring->execlist_queue, 585 struct drm_i915_gem_request, 586 execlist_link); 587 588 if (request->ctx == tail_req->ctx) { 589 WARN(tail_req->elsp_submitted != 0, 590 "More than 2 already-submitted reqs queued\n"); 591 list_del(&tail_req->execlist_link); 592 list_add_tail(&tail_req->execlist_link, 593 &ring->execlist_retired_req_list); 594 } 595 } 596 597 list_add_tail(&request->execlist_link, &ring->execlist_queue); 598 if (num_elements == 0) 599 execlists_context_unqueue(ring); 600 601 spin_unlock_irq(&ring->execlist_lock); 602 603 return 0; 604 } 605 606 static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req) 607 { 608 struct intel_engine_cs *ring = req->ring; 609 uint32_t flush_domains; 610 int ret; 611 612 flush_domains = 0; 613 if (ring->gpu_caches_dirty) 614 flush_domains = I915_GEM_GPU_DOMAINS; 615 616 ret = ring->emit_flush(req, I915_GEM_GPU_DOMAINS, flush_domains); 617 
if (ret) 618 return ret; 619 620 ring->gpu_caches_dirty = false; 621 return 0; 622 } 623 624 static int execlists_move_to_gpu(struct drm_i915_gem_request *req, 625 struct list_head *vmas) 626 { 627 const unsigned other_rings = ~intel_ring_flag(req->ring); 628 struct i915_vma *vma; 629 uint32_t flush_domains = 0; 630 bool flush_chipset = false; 631 int ret; 632 633 list_for_each_entry(vma, vmas, exec_list) { 634 struct drm_i915_gem_object *obj = vma->obj; 635 636 if (obj->active & other_rings) { 637 ret = i915_gem_object_sync(obj, req->ring, &req); 638 if (ret) 639 return ret; 640 } 641 642 if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) 643 flush_chipset |= i915_gem_clflush_object(obj, false); 644 645 flush_domains |= obj->base.write_domain; 646 } 647 648 if (flush_domains & I915_GEM_DOMAIN_GTT) 649 wmb(); 650 651 /* Unconditionally invalidate gpu caches and ensure that we do flush 652 * any residual writes from the previous batch. 653 */ 654 return logical_ring_invalidate_all_caches(req); 655 } 656 657 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request) 658 { 659 int ret; 660 661 request->ringbuf = request->ctx->engine[request->ring->id].ringbuf; 662 663 if (request->ctx != request->ring->default_context) { 664 ret = intel_lr_context_pin(request); 665 if (ret) 666 return ret; 667 } 668 669 return 0; 670 } 671 672 static int logical_ring_wait_for_space(struct drm_i915_gem_request *req, 673 int bytes) 674 { 675 struct intel_ringbuffer *ringbuf = req->ringbuf; 676 struct intel_engine_cs *ring = req->ring; 677 struct drm_i915_gem_request *target; 678 unsigned space; 679 int ret; 680 681 if (intel_ring_space(ringbuf) >= bytes) 682 return 0; 683 684 /* The whole point of reserving space is to not wait! */ 685 WARN_ON(ringbuf->reserved_in_use); 686 687 list_for_each_entry(target, &ring->request_list, list) { 688 /* 689 * The request queue is per-engine, so can contain requests 690 * from multiple ringbuffers. Here, we must ignore any that 691 * aren't from the ringbuffer we're considering. 692 */ 693 if (target->ringbuf != ringbuf) 694 continue; 695 696 /* Would completion of this request free enough space? */ 697 space = __intel_ring_space(target->postfix, ringbuf->tail, 698 ringbuf->size); 699 if (space >= bytes) 700 break; 701 } 702 703 if (WARN_ON(&target->list == &ring->request_list)) 704 return -ENOSPC; 705 706 ret = i915_wait_request(target); 707 if (ret) 708 return ret; 709 710 ringbuf->space = space; 711 return 0; 712 } 713 714 /* 715 * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload 716 * @request: Request to advance the logical ringbuffer of. 717 * 718 * The tail is updated in our logical ringbuffer struct, not in the actual context. What 719 * really happens during submission is that the context and current tail will be placed 720 * on a queue waiting for the ELSP to be ready to accept a new context submission. At that 721 * point, the tail *inside* the context is updated and the ELSP written to. 
722 */ 723 static void 724 intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request) 725 { 726 struct intel_engine_cs *ring = request->ring; 727 728 intel_logical_ring_advance(request->ringbuf); 729 730 request->tail = request->ringbuf->tail; 731 732 if (intel_ring_stopped(ring)) 733 return; 734 735 #if 0 736 if (dev_priv->guc.execbuf_client) 737 i915_guc_submit(dev_priv->guc.execbuf_client, request); 738 else 739 #endif 740 execlists_context_queue(request); 741 } 742 743 static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf) 744 { 745 uint32_t __iomem *virt; 746 int rem = ringbuf->size - ringbuf->tail; 747 748 virt = (uint32_t *)(ringbuf->virtual_start + ringbuf->tail); 749 rem /= 4; 750 while (rem--) 751 iowrite32(MI_NOOP, virt++); 752 753 ringbuf->tail = 0; 754 intel_ring_update_space(ringbuf); 755 } 756 757 static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes) 758 { 759 struct intel_ringbuffer *ringbuf = req->ringbuf; 760 int remain_usable = ringbuf->effective_size - ringbuf->tail; 761 int remain_actual = ringbuf->size - ringbuf->tail; 762 int ret, total_bytes, wait_bytes = 0; 763 bool need_wrap = false; 764 765 if (ringbuf->reserved_in_use) 766 total_bytes = bytes; 767 else 768 total_bytes = bytes + ringbuf->reserved_size; 769 770 if (unlikely(bytes > remain_usable)) { 771 /* 772 * Not enough space for the basic request. So need to flush 773 * out the remainder and then wait for base + reserved. 774 */ 775 wait_bytes = remain_actual + total_bytes; 776 need_wrap = true; 777 } else { 778 if (unlikely(total_bytes > remain_usable)) { 779 /* 780 * The base request will fit but the reserved space 781 * falls off the end. So only need to to wait for the 782 * reserved size after flushing out the remainder. 783 */ 784 wait_bytes = remain_actual + ringbuf->reserved_size; 785 need_wrap = true; 786 } else if (total_bytes > ringbuf->space) { 787 /* No wrapping required, just waiting. */ 788 wait_bytes = total_bytes; 789 } 790 } 791 792 if (wait_bytes) { 793 ret = logical_ring_wait_for_space(req, wait_bytes); 794 if (unlikely(ret)) 795 return ret; 796 797 if (need_wrap) 798 __wrap_ring_buffer(ringbuf); 799 } 800 801 return 0; 802 } 803 804 /** 805 * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands 806 * 807 * @req: The request to start some new work for 808 * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. 809 * 810 * The ringbuffer might not be ready to accept the commands right away (maybe it needs to 811 * be wrapped, or wait a bit for the tail to be updated). This function takes care of that 812 * and also preallocates a request (every workload submission is still mediated through 813 * requests, same as it did with legacy ringbuffer submission). 814 * 815 * Return: non-zero if the ringbuffer is not ready to be written to. 
 */
int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
	struct drm_i915_private *dev_priv;
	int ret;

	WARN_ON(req == NULL);
	dev_priv = req->ring->dev->dev_private;

	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
				   dev_priv->mm.interruptible);
	if (ret)
		return ret;

	ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
	if (ret)
		return ret;

	req->ringbuf->space -= num_dwords * sizeof(uint32_t);
	return 0;
}

int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
{
	/*
	 * The first call merely notes the reserve request and is common for
	 * all back ends. The subsequent localised _begin() call actually
	 * ensures that the reservation is available. Without the begin, if
	 * the request creator immediately submitted the request without
	 * adding any commands to it then there might not actually be
	 * sufficient room for the submission commands.
	 */
	intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST);

	return intel_logical_ring_begin(request, 0);
}

/**
 * intel_execlists_submission() - submit a batchbuffer for execution, Execlists style
 * @params: execbuffer call parameters (device, engine, context, batch object,
 *          batch offset, dispatch flags and the request to use).
 * @args: execbuffer call arguments.
 * @vmas: list of vmas.
 *
 * This is the evil twin version of i915_gem_ringbuffer_submission. It abstracts
 * away the submission details of the execbuffer ioctl call.
 *
 * Return: non-zero if the submission fails.
869 */ 870 int intel_execlists_submission(struct i915_execbuffer_params *params, 871 struct drm_i915_gem_execbuffer2 *args, 872 struct list_head *vmas) 873 { 874 struct drm_device *dev = params->dev; 875 struct intel_engine_cs *ring = params->ring; 876 struct drm_i915_private *dev_priv = dev->dev_private; 877 struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf; 878 u64 exec_start; 879 int instp_mode; 880 u32 instp_mask; 881 int ret; 882 883 instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK; 884 instp_mask = I915_EXEC_CONSTANTS_MASK; 885 switch (instp_mode) { 886 case I915_EXEC_CONSTANTS_REL_GENERAL: 887 case I915_EXEC_CONSTANTS_ABSOLUTE: 888 case I915_EXEC_CONSTANTS_REL_SURFACE: 889 if (instp_mode != 0 && ring != &dev_priv->ring[RCS]) { 890 DRM_DEBUG("non-0 rel constants mode on non-RCS\n"); 891 return -EINVAL; 892 } 893 894 if (instp_mode != dev_priv->relative_constants_mode) { 895 if (instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) { 896 DRM_DEBUG("rel surface constants mode invalid on gen5+\n"); 897 return -EINVAL; 898 } 899 900 /* The HW changed the meaning on this bit on gen6 */ 901 instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE; 902 } 903 break; 904 default: 905 DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode); 906 return -EINVAL; 907 } 908 909 if (args->flags & I915_EXEC_GEN7_SOL_RESET) { 910 DRM_DEBUG("sol reset is gen7 only\n"); 911 return -EINVAL; 912 } 913 914 ret = execlists_move_to_gpu(params->request, vmas); 915 if (ret) 916 return ret; 917 918 if (ring == &dev_priv->ring[RCS] && 919 instp_mode != dev_priv->relative_constants_mode) { 920 ret = intel_logical_ring_begin(params->request, 4); 921 if (ret) 922 return ret; 923 924 intel_logical_ring_emit(ringbuf, MI_NOOP); 925 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1)); 926 intel_logical_ring_emit(ringbuf, INSTPM); 927 intel_logical_ring_emit(ringbuf, instp_mask << 16 | instp_mode); 928 intel_logical_ring_advance(ringbuf); 929 930 dev_priv->relative_constants_mode = instp_mode; 931 } 932 933 exec_start = params->batch_obj_vm_offset + 934 args->batch_start_offset; 935 936 ret = ring->emit_bb_start(params->request, exec_start, params->dispatch_flags); 937 if (ret) 938 return ret; 939 940 trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); 941 942 i915_gem_execbuffer_move_to_active(vmas, params->request); 943 i915_gem_execbuffer_retire_commands(params); 944 945 return 0; 946 } 947 948 void intel_execlists_retire_requests(struct intel_engine_cs *ring) 949 { 950 struct drm_i915_gem_request *req, *tmp; 951 struct list_head retired_list; 952 953 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 954 if (list_empty(&ring->execlist_retired_req_list)) 955 return; 956 957 INIT_LIST_HEAD(&retired_list); 958 spin_lock_irq(&ring->execlist_lock); 959 list_replace_init(&ring->execlist_retired_req_list, &retired_list); 960 spin_unlock_irq(&ring->execlist_lock); 961 962 list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) { 963 struct intel_context *ctx = req->ctx; 964 struct drm_i915_gem_object *ctx_obj = 965 ctx->engine[ring->id].state; 966 967 if (ctx_obj && (ctx != ring->default_context)) 968 intel_lr_context_unpin(req); 969 list_del(&req->execlist_link); 970 i915_gem_request_unreference(req); 971 } 972 } 973 974 void intel_logical_ring_stop(struct intel_engine_cs *ring) 975 { 976 struct drm_i915_private *dev_priv = ring->dev->dev_private; 977 int ret; 978 979 if (!intel_ring_initialized(ring)) 980 return; 981 982 ret = intel_ring_idle(ring); 983 if (ret && 
!i915_reset_in_progress(&to_i915(ring->dev)->gpu_error)) 984 DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n", 985 ring->name, ret); 986 987 /* TODO: Is this correct with Execlists enabled? */ 988 I915_WRITE_MODE(ring, _MASKED_BIT_ENABLE(STOP_RING)); 989 if (wait_for_atomic((I915_READ_MODE(ring) & MODE_IDLE) != 0, 1000)) { 990 DRM_ERROR("%s :timed out trying to stop ring\n", ring->name); 991 return; 992 } 993 I915_WRITE_MODE(ring, _MASKED_BIT_DISABLE(STOP_RING)); 994 } 995 996 int logical_ring_flush_all_caches(struct drm_i915_gem_request *req) 997 { 998 struct intel_engine_cs *ring = req->ring; 999 int ret; 1000 1001 if (!ring->gpu_caches_dirty) 1002 return 0; 1003 1004 ret = ring->emit_flush(req, 0, I915_GEM_GPU_DOMAINS); 1005 if (ret) 1006 return ret; 1007 1008 ring->gpu_caches_dirty = false; 1009 return 0; 1010 } 1011 1012 static int intel_lr_context_do_pin(struct intel_engine_cs *ring, 1013 struct drm_i915_gem_object *ctx_obj, 1014 struct intel_ringbuffer *ringbuf) 1015 { 1016 struct drm_device *dev = ring->dev; 1017 struct drm_i915_private *dev_priv = dev->dev_private; 1018 int ret = 0; 1019 1020 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 1021 ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 1022 PIN_OFFSET_BIAS | GUC_WOPCM_TOP); 1023 if (ret) 1024 return ret; 1025 1026 ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf); 1027 if (ret) 1028 goto unpin_ctx_obj; 1029 1030 ctx_obj->dirty = true; 1031 1032 /* Invalidate GuC TLB. */ 1033 if (i915.enable_guc_submission) 1034 I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); 1035 1036 return ret; 1037 1038 unpin_ctx_obj: 1039 i915_gem_object_ggtt_unpin(ctx_obj); 1040 1041 return ret; 1042 } 1043 1044 static int intel_lr_context_pin(struct drm_i915_gem_request *rq) 1045 { 1046 int ret = 0; 1047 struct intel_engine_cs *ring = rq->ring; 1048 struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; 1049 struct intel_ringbuffer *ringbuf = rq->ringbuf; 1050 1051 if (rq->ctx->engine[ring->id].pin_count++ == 0) { 1052 ret = intel_lr_context_do_pin(ring, ctx_obj, ringbuf); 1053 if (ret) 1054 goto reset_pin_count; 1055 } 1056 return ret; 1057 1058 reset_pin_count: 1059 rq->ctx->engine[ring->id].pin_count = 0; 1060 return ret; 1061 } 1062 1063 void intel_lr_context_unpin(struct drm_i915_gem_request *rq) 1064 { 1065 struct intel_engine_cs *ring = rq->ring; 1066 struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; 1067 struct intel_ringbuffer *ringbuf = rq->ringbuf; 1068 1069 if (ctx_obj) { 1070 WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); 1071 if (--rq->ctx->engine[ring->id].pin_count == 0) { 1072 intel_unpin_ringbuffer_obj(ringbuf); 1073 i915_gem_object_ggtt_unpin(ctx_obj); 1074 } 1075 } 1076 } 1077 1078 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req) 1079 { 1080 int ret, i; 1081 struct intel_engine_cs *ring = req->ring; 1082 struct intel_ringbuffer *ringbuf = req->ringbuf; 1083 struct drm_device *dev = ring->dev; 1084 struct drm_i915_private *dev_priv = dev->dev_private; 1085 struct i915_workarounds *w = &dev_priv->workarounds; 1086 1087 if (WARN_ON_ONCE(w->count == 0)) 1088 return 0; 1089 1090 ring->gpu_caches_dirty = true; 1091 ret = logical_ring_flush_all_caches(req); 1092 if (ret) 1093 return ret; 1094 1095 ret = intel_logical_ring_begin(req, w->count * 2 + 2); 1096 if (ret) 1097 return ret; 1098 1099 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); 1100 for (i = 0; i < w->count; i++) { 1101 intel_logical_ring_emit(ringbuf, 
w->reg[i].addr);
		intel_logical_ring_emit(ringbuf, w->reg[i].value);
	}
	intel_logical_ring_emit(ringbuf, MI_NOOP);

	intel_logical_ring_advance(ringbuf);

	ring->gpu_caches_dirty = true;
	ret = logical_ring_flush_all_caches(req);
	if (ret)
		return ret;

	return 0;
}

#define wa_ctx_emit(batch, index, cmd)					\
	do {								\
		int __index = (index)++;				\
		if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
			return -ENOSPC;					\
		}							\
		batch[__index] = (cmd);					\
	} while (0)


/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in WA batch where the
 * values are only initialized once so we cannot take the register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest but it makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
						uint32_t *const batch,
						uint32_t index)
{
	uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);

	/*
	 * WaDisableLSQCROPERFforOCL:skl
	 * This WA is implemented in skl_init_clock_gating() but since
	 * this batch updates GEN8_L3SQCREG4 with default value we need to
	 * set this bit here to retain the WA during flush.
1153 */ 1154 if (IS_SKYLAKE(ring->dev) && INTEL_REVID(ring->dev) <= SKL_REVID_E0) 1155 l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS; 1156 1157 wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | 1158 MI_SRM_LRM_GLOBAL_GTT)); 1159 wa_ctx_emit(batch, index, GEN8_L3SQCREG4); 1160 wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); 1161 wa_ctx_emit(batch, index, 0); 1162 1163 wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); 1164 wa_ctx_emit(batch, index, GEN8_L3SQCREG4); 1165 wa_ctx_emit(batch, index, l3sqc4_flush); 1166 1167 wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); 1168 wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL | 1169 PIPE_CONTROL_DC_FLUSH_ENABLE)); 1170 wa_ctx_emit(batch, index, 0); 1171 wa_ctx_emit(batch, index, 0); 1172 wa_ctx_emit(batch, index, 0); 1173 wa_ctx_emit(batch, index, 0); 1174 1175 wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | 1176 MI_SRM_LRM_GLOBAL_GTT)); 1177 wa_ctx_emit(batch, index, GEN8_L3SQCREG4); 1178 wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); 1179 wa_ctx_emit(batch, index, 0); 1180 1181 return index; 1182 } 1183 1184 static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx, 1185 uint32_t offset, 1186 uint32_t start_alignment) 1187 { 1188 return wa_ctx->offset = ALIGN(offset, start_alignment); 1189 } 1190 1191 static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, 1192 uint32_t offset, 1193 uint32_t size_alignment) 1194 { 1195 wa_ctx->size = offset - wa_ctx->offset; 1196 1197 WARN(wa_ctx->size % size_alignment, 1198 "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n", 1199 wa_ctx->size, size_alignment); 1200 return 0; 1201 } 1202 1203 /** 1204 * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA 1205 * 1206 * @ring: only applicable for RCS 1207 * @wa_ctx: structure representing wa_ctx 1208 * offset: specifies start of the batch, should be cache-aligned. This is updated 1209 * with the offset value received as input. 1210 * size: size of the batch in DWORDS but HW expects in terms of cachelines 1211 * @batch: page in which WA are loaded 1212 * @offset: This field specifies the start of the batch, it should be 1213 * cache-aligned otherwise it is adjusted accordingly. 1214 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 1215 * initialized at the beginning and shared across all contexts but this field 1216 * helps us to have multiple batches at different offsets and select them based 1217 * on a criteria. At the moment this batch always start at the beginning of the page 1218 * and at this point we don't have multiple wa_ctx batch buffers. 1219 * 1220 * The number of WA applied are not known at the beginning; we use this field 1221 * to return the no of DWORDS written. 1222 * 1223 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 1224 * so it adds NOOPs as padding to make it cacheline aligned. 1225 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1226 * makes a complete batch buffer. 1227 * 1228 * Return: non-zero if we exceed the PAGE_SIZE limit. 
1229 */ 1230 1231 static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring, 1232 struct i915_wa_ctx_bb *wa_ctx, 1233 uint32_t *const batch, 1234 uint32_t *offset) 1235 { 1236 uint32_t scratch_addr; 1237 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1238 1239 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1240 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); 1241 1242 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 1243 if (IS_BROADWELL(ring->dev)) { 1244 int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index); 1245 if (rc < 0) 1246 return rc; 1247 index = rc; 1248 } 1249 1250 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 1251 /* Actual scratch location is at 128 bytes offset */ 1252 scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; 1253 1254 wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); 1255 wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | 1256 PIPE_CONTROL_GLOBAL_GTT_IVB | 1257 PIPE_CONTROL_CS_STALL | 1258 PIPE_CONTROL_QW_WRITE)); 1259 wa_ctx_emit(batch, index, scratch_addr); 1260 wa_ctx_emit(batch, index, 0); 1261 wa_ctx_emit(batch, index, 0); 1262 wa_ctx_emit(batch, index, 0); 1263 1264 /* Pad to end of cacheline */ 1265 while (index % CACHELINE_DWORDS) 1266 wa_ctx_emit(batch, index, MI_NOOP); 1267 1268 /* 1269 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 1270 * execution depends on the length specified in terms of cache lines 1271 * in the register CTX_RCS_INDIRECT_CTX 1272 */ 1273 1274 return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); 1275 } 1276 1277 /** 1278 * gen8_init_perctx_bb() - initialize per ctx batch with WA 1279 * 1280 * @ring: only applicable for RCS 1281 * @wa_ctx: structure representing wa_ctx 1282 * offset: specifies start of the batch, should be cache-aligned. 1283 * size: size of the batch in DWORDS but HW expects in terms of cachelines 1284 * @batch: page in which WA are loaded 1285 * @offset: This field specifies the start of this batch. 1286 * This batch is started immediately after indirect_ctx batch. Since we ensure 1287 * that indirect_ctx ends on a cacheline this batch is aligned automatically. 1288 * 1289 * The number of DWORDS written are returned using this field. 1290 * 1291 * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding 1292 * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. 
1293 */ 1294 static int gen8_init_perctx_bb(struct intel_engine_cs *ring, 1295 struct i915_wa_ctx_bb *wa_ctx, 1296 uint32_t *const batch, 1297 uint32_t *offset) 1298 { 1299 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1300 1301 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1302 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); 1303 1304 wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); 1305 1306 return wa_ctx_end(wa_ctx, *offset = index, 1); 1307 } 1308 1309 static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring, 1310 struct i915_wa_ctx_bb *wa_ctx, 1311 uint32_t *const batch, 1312 uint32_t *offset) 1313 { 1314 int ret; 1315 struct drm_device *dev = ring->dev; 1316 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1317 1318 /* WaDisableCtxRestoreArbitration:skl,bxt */ 1319 if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_D0)) || 1320 (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) 1321 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); 1322 1323 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */ 1324 ret = gen8_emit_flush_coherentl3_wa(ring, batch, index); 1325 if (ret < 0) 1326 return ret; 1327 index = ret; 1328 1329 /* Pad to end of cacheline */ 1330 while (index % CACHELINE_DWORDS) 1331 wa_ctx_emit(batch, index, MI_NOOP); 1332 1333 return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); 1334 } 1335 1336 static int gen9_init_perctx_bb(struct intel_engine_cs *ring, 1337 struct i915_wa_ctx_bb *wa_ctx, 1338 uint32_t *const batch, 1339 uint32_t *offset) 1340 { 1341 struct drm_device *dev = ring->dev; 1342 uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); 1343 1344 /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */ 1345 if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_B0)) || 1346 (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) { 1347 wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); 1348 wa_ctx_emit(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0); 1349 wa_ctx_emit(batch, index, 1350 _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING)); 1351 wa_ctx_emit(batch, index, MI_NOOP); 1352 } 1353 1354 /* WaDisableCtxRestoreArbitration:skl,bxt */ 1355 if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_D0)) || 1356 (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) 1357 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); 1358 1359 wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); 1360 1361 return wa_ctx_end(wa_ctx, *offset = index, 1); 1362 } 1363 1364 static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size) 1365 { 1366 int ret; 1367 1368 ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size)); 1369 if (!ring->wa_ctx.obj) { 1370 DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n"); 1371 return -ENOMEM; 1372 } 1373 1374 ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0); 1375 if (ret) { 1376 DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", 1377 ret); 1378 drm_gem_object_unreference(&ring->wa_ctx.obj->base); 1379 return ret; 1380 } 1381 1382 return 0; 1383 } 1384 1385 static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring) 1386 { 1387 if (ring->wa_ctx.obj) { 1388 i915_gem_object_ggtt_unpin(ring->wa_ctx.obj); 1389 drm_gem_object_unreference(&ring->wa_ctx.obj->base); 1390 ring->wa_ctx.obj = NULL; 1391 } 1392 } 1393 1394 static int intel_init_workaround_bb(struct intel_engine_cs *ring) 1395 { 1396 int ret; 1397 uint32_t *batch; 1398 uint32_t offset; 1399 struct vm_page *page; 1400 struct 
i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; 1401 1402 WARN_ON(ring->id != RCS); 1403 1404 /* update this when WA for higher Gen are added */ 1405 if (INTEL_INFO(ring->dev)->gen > 9) { 1406 DRM_ERROR("WA batch buffer is not initialized for Gen%d\n", 1407 INTEL_INFO(ring->dev)->gen); 1408 return 0; 1409 } 1410 1411 /* some WA perform writes to scratch page, ensure it is valid */ 1412 if (ring->scratch.obj == NULL) { 1413 DRM_ERROR("scratch page not allocated for %s\n", ring->name); 1414 return -EINVAL; 1415 } 1416 1417 ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE); 1418 if (ret) { 1419 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 1420 return ret; 1421 } 1422 1423 page = i915_gem_object_get_page(wa_ctx->obj, 0); 1424 batch = kmap_atomic(page); 1425 offset = 0; 1426 1427 if (INTEL_INFO(ring->dev)->gen == 8) { 1428 ret = gen8_init_indirectctx_bb(ring, 1429 &wa_ctx->indirect_ctx, 1430 batch, 1431 &offset); 1432 if (ret) 1433 goto out; 1434 1435 ret = gen8_init_perctx_bb(ring, 1436 &wa_ctx->per_ctx, 1437 batch, 1438 &offset); 1439 if (ret) 1440 goto out; 1441 } else if (INTEL_INFO(ring->dev)->gen == 9) { 1442 ret = gen9_init_indirectctx_bb(ring, 1443 &wa_ctx->indirect_ctx, 1444 batch, 1445 &offset); 1446 if (ret) 1447 goto out; 1448 1449 ret = gen9_init_perctx_bb(ring, 1450 &wa_ctx->per_ctx, 1451 batch, 1452 &offset); 1453 if (ret) 1454 goto out; 1455 } 1456 1457 out: 1458 kunmap_atomic(batch); 1459 if (ret) 1460 lrc_destroy_wa_ctx_obj(ring); 1461 1462 return ret; 1463 } 1464 1465 static int gen8_init_common_ring(struct intel_engine_cs *ring) 1466 { 1467 struct drm_device *dev = ring->dev; 1468 struct drm_i915_private *dev_priv = dev->dev_private; 1469 u8 next_context_status_buffer_hw; 1470 1471 lrc_setup_hardware_status_page(ring, 1472 ring->default_context->engine[ring->id].state); 1473 1474 I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); 1475 I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff); 1476 1477 if (ring->status_page.obj) { 1478 I915_WRITE(RING_HWS_PGA(ring->mmio_base), 1479 (u32)ring->status_page.gfx_addr); 1480 POSTING_READ(RING_HWS_PGA(ring->mmio_base)); 1481 } 1482 1483 I915_WRITE(RING_MODE_GEN7(ring), 1484 _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) | 1485 _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)); 1486 POSTING_READ(RING_MODE_GEN7(ring)); 1487 1488 /* 1489 * Instead of resetting the Context Status Buffer (CSB) read pointer to 1490 * zero, we need to read the write pointer from hardware and use its 1491 * value because "this register is power context save restored". 1492 * Effectively, these states have been observed: 1493 * 1494 * | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) | 1495 * BDW | CSB regs not reset | CSB regs reset | 1496 * CHT | CSB regs not reset | CSB regs not reset | 1497 */ 1498 next_context_status_buffer_hw = (I915_READ(RING_CONTEXT_STATUS_PTR(ring)) 1499 & GEN8_CSB_PTR_MASK); 1500 1501 /* 1502 * When the CSB registers are reset (also after power-up / gpu reset), 1503 * CSB write pointer is set to all 1's, which is not valid, use '5' in 1504 * this special case, so the first element read is CSB[0]. 
1505 */ 1506 if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK) 1507 next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1); 1508 1509 ring->next_context_status_buffer = next_context_status_buffer_hw; 1510 DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name); 1511 1512 memset(&ring->hangcheck, 0, sizeof(ring->hangcheck)); 1513 1514 return 0; 1515 } 1516 1517 static int gen8_init_render_ring(struct intel_engine_cs *ring) 1518 { 1519 struct drm_device *dev = ring->dev; 1520 struct drm_i915_private *dev_priv = dev->dev_private; 1521 int ret; 1522 1523 ret = gen8_init_common_ring(ring); 1524 if (ret) 1525 return ret; 1526 1527 /* We need to disable the AsyncFlip performance optimisations in order 1528 * to use MI_WAIT_FOR_EVENT within the CS. It should already be 1529 * programmed to '1' on all products. 1530 * 1531 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv 1532 */ 1533 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE)); 1534 1535 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); 1536 1537 return init_workarounds_ring(ring); 1538 } 1539 1540 static int gen9_init_render_ring(struct intel_engine_cs *ring) 1541 { 1542 int ret; 1543 1544 ret = gen8_init_common_ring(ring); 1545 if (ret) 1546 return ret; 1547 1548 return init_workarounds_ring(ring); 1549 } 1550 1551 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) 1552 { 1553 struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt; 1554 struct intel_engine_cs *ring = req->ring; 1555 struct intel_ringbuffer *ringbuf = req->ringbuf; 1556 const int num_lri_cmds = GEN8_LEGACY_PDPES * 2; 1557 int i, ret; 1558 1559 ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2); 1560 if (ret) 1561 return ret; 1562 1563 intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds)); 1564 for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) { 1565 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 1566 1567 intel_logical_ring_emit(ringbuf, GEN8_RING_PDP_UDW(ring, i)); 1568 intel_logical_ring_emit(ringbuf, upper_32_bits(pd_daddr)); 1569 intel_logical_ring_emit(ringbuf, GEN8_RING_PDP_LDW(ring, i)); 1570 intel_logical_ring_emit(ringbuf, lower_32_bits(pd_daddr)); 1571 } 1572 1573 intel_logical_ring_emit(ringbuf, MI_NOOP); 1574 intel_logical_ring_advance(ringbuf); 1575 1576 return 0; 1577 } 1578 1579 static int gen8_emit_bb_start(struct drm_i915_gem_request *req, 1580 u64 offset, unsigned dispatch_flags) 1581 { 1582 struct intel_ringbuffer *ringbuf = req->ringbuf; 1583 bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE); 1584 int ret; 1585 1586 /* Don't rely in hw updating PDPs, specially in lite-restore. 1587 * Ideally, we should set Force PD Restore in ctx descriptor, 1588 * but we can't. Force Restore would be a second option, but 1589 * it is unsafe in case of lite-restore (because the ctx is 1590 * not idle). PML4 is allocated during ppgtt init so this is 1591 * not needed in 48-bit.*/ 1592 if (req->ctx->ppgtt && 1593 (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) { 1594 if (!USES_FULL_48BIT_PPGTT(req->i915) && 1595 !intel_vgpu_active(req->i915->dev)) { 1596 ret = intel_logical_ring_emit_pdps(req); 1597 if (ret) 1598 return ret; 1599 } 1600 1601 req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring); 1602 } 1603 1604 ret = intel_logical_ring_begin(req, 4); 1605 if (ret) 1606 return ret; 1607 1608 /* FIXME(BDW): Address space and security selectors. 
*/ 1609 intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 | 1610 (ppgtt<<8) | 1611 (dispatch_flags & I915_DISPATCH_RS ? 1612 MI_BATCH_RESOURCE_STREAMER : 0)); 1613 intel_logical_ring_emit(ringbuf, lower_32_bits(offset)); 1614 intel_logical_ring_emit(ringbuf, upper_32_bits(offset)); 1615 intel_logical_ring_emit(ringbuf, MI_NOOP); 1616 intel_logical_ring_advance(ringbuf); 1617 1618 return 0; 1619 } 1620 1621 static bool gen8_logical_ring_get_irq(struct intel_engine_cs *ring) 1622 { 1623 struct drm_device *dev = ring->dev; 1624 struct drm_i915_private *dev_priv = dev->dev_private; 1625 unsigned long flags; 1626 1627 if (WARN_ON(!intel_irqs_enabled(dev_priv))) 1628 return false; 1629 1630 spin_lock_irqsave(&dev_priv->irq_lock, flags); 1631 if (ring->irq_refcount++ == 0) { 1632 I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); 1633 POSTING_READ(RING_IMR(ring->mmio_base)); 1634 } 1635 spin_unlock_irqrestore(&dev_priv->irq_lock, flags); 1636 1637 return true; 1638 } 1639 1640 static void gen8_logical_ring_put_irq(struct intel_engine_cs *ring) 1641 { 1642 struct drm_device *dev = ring->dev; 1643 struct drm_i915_private *dev_priv = dev->dev_private; 1644 unsigned long flags; 1645 1646 spin_lock_irqsave(&dev_priv->irq_lock, flags); 1647 if (--ring->irq_refcount == 0) { 1648 I915_WRITE_IMR(ring, ~ring->irq_keep_mask); 1649 POSTING_READ(RING_IMR(ring->mmio_base)); 1650 } 1651 spin_unlock_irqrestore(&dev_priv->irq_lock, flags); 1652 } 1653 1654 static int gen8_emit_flush(struct drm_i915_gem_request *request, 1655 u32 invalidate_domains, 1656 u32 unused) 1657 { 1658 struct intel_ringbuffer *ringbuf = request->ringbuf; 1659 struct intel_engine_cs *ring = ringbuf->ring; 1660 struct drm_device *dev = ring->dev; 1661 struct drm_i915_private *dev_priv = dev->dev_private; 1662 uint32_t cmd; 1663 int ret; 1664 1665 ret = intel_logical_ring_begin(request, 4); 1666 if (ret) 1667 return ret; 1668 1669 cmd = MI_FLUSH_DW + 1; 1670 1671 /* We always require a command barrier so that subsequent 1672 * commands, such as breadcrumb interrupts, are strictly ordered 1673 * wrt the contents of the write cache being flushed to memory 1674 * (and thus being coherent from the CPU). 

static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
				  u32 invalidate_domains,
				  u32 flush_domains)
{
	struct intel_ringbuffer *ringbuf = request->ringbuf;
	struct intel_engine_cs *ring = ringbuf->ring;
	u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	bool vf_flush_wa;
	u32 flags = 0;
	int ret;

	flags |= PIPE_CONTROL_CS_STALL;

	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
	}

	/*
	 * On GEN9+, before VF_CACHE_INVALIDATE we need to emit a NULL
	 * pipe control.
	 */
	vf_flush_wa = INTEL_INFO(ring->dev)->gen >= 9 &&
		      flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;

	ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6);
	if (ret)
		return ret;

	if (vf_flush_wa) {
		intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
		intel_logical_ring_emit(ringbuf, 0);
	}

	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
	intel_logical_ring_emit(ringbuf, flags);
	intel_logical_ring_emit(ringbuf, scratch_addr);
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_advance(ringbuf);

	return 0;
}
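
/*
 * Dword accounting for gen8_emit_flush_render() above, for reference: a
 * single GFX_OP_PIPE_CONTROL(6) packet is 6 dwords, so the request reserves
 * 6 dwords normally and 12 when the Gen9 VF-cache workaround (vf_flush_wa)
 * requires a preceding NULL PIPE_CONTROL.
 */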

static u32 gen8_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
{
	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
}

static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno)
{
	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
}

static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
{
	/*
	 * On BXT A steppings there is a HW coherency issue whereby the
	 * MI_STORE_DATA_IMM storing the completed request's seqno
	 * occasionally doesn't invalidate the CPU cache. Work around this by
	 * clflushing the corresponding cacheline whenever the caller wants
	 * the coherency to be guaranteed. Note that this cacheline is known
	 * to be clean at this point, since we only write it in
	 * bxt_a_set_seqno(), where we also do a clflush after the write. So
	 * this clflush in practice becomes an invalidate operation.
	 */
	if (!lazy_coherency)
		intel_flush_status_page(ring, I915_GEM_HWS_INDEX);

	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
}

static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
{
	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);

	/* See bxt_a_get_seqno() explaining the reason for the clflush. */
	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
}

static int gen8_emit_request(struct drm_i915_gem_request *request)
{
	struct intel_ringbuffer *ringbuf = request->ringbuf;
	struct intel_engine_cs *ring = ringbuf->ring;
	u32 cmd;
	int ret;

	/*
	 * Reserve space for 2 NOOPs at the end of each request to be
	 * used as a workaround for not being allowed to do lite
	 * restore with HEAD==TAIL (WaIdleLiteRestore).
	 */
	ret = intel_logical_ring_begin(request, 8);
	if (ret)
		return ret;

	cmd = MI_STORE_DWORD_IMM_GEN4;
	cmd |= MI_GLOBAL_GTT;

	intel_logical_ring_emit(ringbuf, cmd);
	intel_logical_ring_emit(ringbuf,
				(ring->status_page.gfx_addr +
				 (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)));
	intel_logical_ring_emit(ringbuf, 0);
	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_advance_and_submit(request);

	/*
	 * Here we add two extra NOOPs as padding to avoid
	 * lite restore of a context with HEAD==TAIL.
	 */
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_advance(ringbuf);

	return 0;
}
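
/*
 * For reference, an illustrative sketch of how the 8 dwords reserved by
 * gen8_emit_request() above end up laid out in the ring:
 *
 *	[0] MI_STORE_DWORD_IMM_GEN4 | MI_GLOBAL_GTT
 *	[1] HWS address of the seqno slot (I915_GEM_HWS_INDEX)
 *	[2] 0
 *	[3] seqno of the request
 *	[4] MI_USER_INTERRUPT
 *	[5] MI_NOOP
 *	[6] MI_NOOP	(WaIdleLiteRestore padding, emitted after submit)
 *	[7] MI_NOOP	(WaIdleLiteRestore padding, emitted after submit)
 */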

static int intel_lr_context_render_state_init(struct drm_i915_gem_request *req)
{
	struct render_state so;
	int ret;

	ret = i915_gem_render_state_prepare(req->ring, &so);
	if (ret)
		return ret;

	if (so.rodata == NULL)
		return 0;

	ret = req->ring->emit_bb_start(req, so.ggtt_offset,
				       I915_DISPATCH_SECURE);
	if (ret)
		goto out;

	ret = req->ring->emit_bb_start(req,
				       (so.ggtt_offset + so.aux_batch_offset),
				       I915_DISPATCH_SECURE);
	if (ret)
		goto out;

	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);

out:
	i915_gem_render_state_fini(&so);
	return ret;
}

static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
{
	int ret;

	ret = intel_logical_ring_workarounds_emit(req);
	if (ret)
		return ret;

	ret = intel_rcs_context_init_mocs(req);
	/*
	 * Failing to program the MOCS is non-fatal. The system will not
	 * run at peak performance. So generate an error and carry on.
	 */
	if (ret)
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return intel_lr_context_render_state_init(req);
}

/**
 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
 *
 * @ring: Engine Command Streamer.
 */
void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
{
	struct drm_i915_private *dev_priv;

	if (!intel_ring_initialized(ring))
		return;

	dev_priv = ring->dev->dev_private;

	intel_logical_ring_stop(ring);
	WARN_ON((I915_READ_MODE(ring) & MODE_IDLE) == 0);

	if (ring->cleanup)
		ring->cleanup(ring);

	i915_cmd_parser_fini_ring(ring);
	i915_gem_batch_pool_fini(&ring->batch_pool);

	if (ring->status_page.obj) {
		kunmap(sg_page(ring->status_page.obj->pages->sgl));
		ring->status_page.obj = NULL;
	}

	lrc_destroy_wa_ctx_obj(ring);
}

static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
{
	int ret;

	/* Intentionally left blank. */
	ring->buffer = NULL;

	ring->dev = dev;
	INIT_LIST_HEAD(&ring->active_list);
	INIT_LIST_HEAD(&ring->request_list);
	i915_gem_batch_pool_init(dev, &ring->batch_pool);
	init_waitqueue_head(&ring->irq_queue);

	INIT_LIST_HEAD(&ring->execlist_queue);
	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
	lockinit(&ring->execlist_lock, "i915el", 0, LK_CANRECURSE);

	ret = i915_cmd_parser_init_ring(ring);
	if (ret)
		return ret;

	ret = intel_lr_context_deferred_alloc(ring->default_context, ring);
	if (ret)
		return ret;

	/* As this is the default context, always pin it */
	ret = intel_lr_context_do_pin(
			ring,
			ring->default_context->engine[ring->id].state,
			ring->default_context->engine[ring->id].ringbuf);
	if (ret) {
		DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
			  ring->name, ret);
		return ret;
	}

	return ret;
}

static int logical_render_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[RCS];
	int ret;

	ring->name = "render ring";
	ring->id = RCS;
	ring->mmio_base = RENDER_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT;
	if (HAS_L3_DPF(dev))
		ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	if (INTEL_INFO(dev)->gen >= 9)
		ring->init_hw = gen9_init_render_ring;
	else
		ring->init_hw = gen8_init_render_ring;
	ring->init_context = gen8_init_rcs_context;
	ring->cleanup = intel_fini_pipe_control;
	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush_render;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	ring->dev = dev;

	ret = intel_init_pipe_control(ring);
	if (ret)
		return ret;

	ret = intel_init_workaround_bb(ring);
	if (ret) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing critical
		 * enough to prevent us from using the GPU.
		 */
		DRM_ERROR("WA batch buffer initialization failed: %d\n",
			  ret);
	}

	ret = logical_ring_init(dev, ring);
	if (ret) {
		lrc_destroy_wa_ctx_obj(ring);
	}

	return ret;
}

static int logical_bsd_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[VCS];

	ring->name = "bsd ring";
	ring->id = VCS;
	ring->mmio_base = GEN6_BSD_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

static int logical_bsd2_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[VCS2];

	ring->name = "bsd2 ring";
	ring->id = VCS2;
	ring->mmio_base = GEN8_BSD2_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	ring->get_seqno = gen8_get_seqno;
	ring->set_seqno = gen8_set_seqno;
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

static int logical_blt_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[BCS];

	ring->name = "blitter ring";
	ring->id = BCS;
	ring->mmio_base = BLT_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

static int logical_vebox_ring_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring = &dev_priv->ring[VECS];

	ring->name = "video enhancement ring";
	ring->id = VECS;
	ring->mmio_base = VEBOX_RING_BASE;
	ring->irq_enable_mask =
		GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
	ring->irq_keep_mask =
		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT;

	ring->init_hw = gen8_init_common_ring;
	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
		ring->get_seqno = bxt_a_get_seqno;
		ring->set_seqno = bxt_a_set_seqno;
	} else {
		ring->get_seqno = gen8_get_seqno;
		ring->set_seqno = gen8_set_seqno;
	}
	ring->emit_request = gen8_emit_request;
	ring->emit_flush = gen8_emit_flush;
	ring->irq_get = gen8_logical_ring_get_irq;
	ring->irq_put = gen8_logical_ring_put_irq;
	ring->emit_bb_start = gen8_emit_bb_start;

	return logical_ring_init(dev, ring);
}

/**
 * intel_logical_rings_init() - allocate, populate and init the Engine Command Streamers
 * @dev: DRM device.
 *
 * This function initializes the engines for Execlists submission (the legacy
 * ringbuffer equivalent would be i915_gem_init_rings). It does so only for
 * the engines that are present in the hardware.
 *
 * Return: non-zero if the initialization failed.
 */
int intel_logical_rings_init(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	int ret;

	ret = logical_render_ring_init(dev);
	if (ret)
		return ret;

	if (HAS_BSD(dev)) {
		ret = logical_bsd_ring_init(dev);
		if (ret)
			goto cleanup_render_ring;
	}

	if (HAS_BLT(dev)) {
		ret = logical_blt_ring_init(dev);
		if (ret)
			goto cleanup_bsd_ring;
	}

	if (HAS_VEBOX(dev)) {
		ret = logical_vebox_ring_init(dev);
		if (ret)
			goto cleanup_blt_ring;
	}

	if (HAS_BSD2(dev)) {
		ret = logical_bsd2_ring_init(dev);
		if (ret)
			goto cleanup_vebox_ring;
	}

	return 0;

cleanup_vebox_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[VECS]);
cleanup_blt_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[BCS]);
cleanup_bsd_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[VCS]);
cleanup_render_ring:
	intel_logical_ring_cleanup(&dev_priv->ring[RCS]);

	return ret;
}

static u32
make_rpcs(struct drm_device *dev)
{
	u32 rpcs = 0;

	/*
	 * No explicit RPCS request is needed to ensure full
	 * slice/subslice/EU enablement prior to Gen9.
	 */
	if (INTEL_INFO(dev)->gen < 9)
		return 0;

	/*
	 * Starting in Gen9, render power gating can leave
	 * slice/subslice/EU in a partially enabled state. We
	 * must make an explicit request through RPCS for full
	 * enablement.
	 */
	if (INTEL_INFO(dev)->has_slice_pg) {
		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
		rpcs |= INTEL_INFO(dev)->slice_total <<
			GEN8_RPCS_S_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev)->has_subslice_pg) {
		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
		rpcs |= INTEL_INFO(dev)->subslice_per_slice <<
			GEN8_RPCS_SS_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev)->has_eu_pg) {
		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
			GEN8_RPCS_EU_MIN_SHIFT;
		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
			GEN8_RPCS_EU_MAX_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	return rpcs;
}
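
/*
 * Worked example for make_rpcs() above, using made-up counts for a
 * hypothetical Gen9 part with slice, subslice and EU power gating all
 * supported: with slice_total = 3, subslice_per_slice = 4 and
 * eu_per_subslice = 8, the returned value would be
 *
 *	GEN8_RPCS_ENABLE |
 *	GEN8_RPCS_S_CNT_ENABLE  | (3 << GEN8_RPCS_S_CNT_SHIFT)  |
 *	GEN8_RPCS_SS_CNT_ENABLE | (4 << GEN8_RPCS_SS_CNT_SHIFT) |
 *	(8 << GEN8_RPCS_EU_MIN_SHIFT) | (8 << GEN8_RPCS_EU_MAX_SHIFT)
 */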

static int
populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj,
		    struct intel_engine_cs *ring, struct intel_ringbuffer *ringbuf)
{
	struct drm_device *dev = ring->dev;
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
	struct vm_page *page;
	uint32_t *reg_state;
	int ret;

	if (!ppgtt)
		ppgtt = dev_priv->mm.aliasing_ppgtt;

	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
		return ret;
	}

	ret = i915_gem_object_get_pages(ctx_obj);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not get object pages\n");
		return ret;
	}

	i915_gem_object_pin_pages(ctx_obj);

	/* The second page of the context object contains some fields which must
	 * be set up prior to the first execution. */
	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
	reg_state = kmap_atomic(page);

	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
	 * commands followed by (reg, value) pairs. The values we are setting here are
	 * only for the first context restore: on a subsequent save, the GPU will
	 * recreate this batchbuffer with new values (including all the missing
	 * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */
	if (ring->id == RCS)
		reg_state[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(14);
	else
		reg_state[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(11);
	reg_state[CTX_LRI_HEADER_0] |= MI_LRI_FORCE_POSTED;
	reg_state[CTX_CONTEXT_CONTROL] = RING_CONTEXT_CONTROL(ring);
	reg_state[CTX_CONTEXT_CONTROL+1] =
		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
				   CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				   CTX_CTRL_RS_CTX_ENABLE);
	reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base);
	reg_state[CTX_RING_HEAD+1] = 0;
	reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base);
	reg_state[CTX_RING_TAIL+1] = 0;
	reg_state[CTX_RING_BUFFER_START] = RING_START(ring->mmio_base);
	/* Ring buffer start address is not known until the buffer is pinned.
	 * It is written to the context image in execlists_update_context().
	 */
	reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring->mmio_base);
	reg_state[CTX_RING_BUFFER_CONTROL+1] =
		((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID;
	reg_state[CTX_BB_HEAD_U] = ring->mmio_base + 0x168;
	reg_state[CTX_BB_HEAD_U+1] = 0;
	reg_state[CTX_BB_HEAD_L] = ring->mmio_base + 0x140;
	reg_state[CTX_BB_HEAD_L+1] = 0;
	reg_state[CTX_BB_STATE] = ring->mmio_base + 0x110;
	reg_state[CTX_BB_STATE+1] = (1<<5);
	reg_state[CTX_SECOND_BB_HEAD_U] = ring->mmio_base + 0x11c;
	reg_state[CTX_SECOND_BB_HEAD_U+1] = 0;
	reg_state[CTX_SECOND_BB_HEAD_L] = ring->mmio_base + 0x114;
	reg_state[CTX_SECOND_BB_HEAD_L+1] = 0;
	reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
	reg_state[CTX_SECOND_BB_STATE+1] = 0;
	if (ring->id == RCS) {
		reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
		reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
		reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
		reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
		if (ring->wa_ctx.obj) {
			struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
			uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);

			reg_state[CTX_RCS_INDIRECT_CTX+1] =
				(ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
				(wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);

			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;

			reg_state[CTX_BB_PER_CTX_PTR+1] =
				(ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
				0x01;
		}
	}
	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
	reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8;
	reg_state[CTX_CTX_TIMESTAMP+1] = 0;
	reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
	reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
	reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
	reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
	reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
	reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
	reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
	reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);

	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, reg_state);
	} else {
		/* 32b PPGTT
		 * PDP*_DESCRIPTOR contains the base address of space supported.
		 * With dynamic page allocation, PDPs may not be allocated at
		 * this point. Point the unallocated PDPs to the scratch page.
		 */
		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
	}

	if (ring->id == RCS) {
		reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
		reg_state[CTX_R_PWR_CLK_STATE] = GEN8_R_PWR_CLK_STATE;
		reg_state[CTX_R_PWR_CLK_STATE+1] = make_rpcs(dev);
	}

	kunmap_atomic(reg_state);

	ctx_obj->dirty = 1;
	set_page_dirty(page);
	i915_gem_object_unpin_pages(ctx_obj);

	return 0;
}

/**
 * intel_lr_context_free() - free the LRC specific bits of a context
 * @ctx: the LR context to free.
 *
 * The real context freeing is done in i915_gem_context_free: this only
 * takes care of the bits that are LRC related: the per-engine backing
 * objects and the logical ringbuffer.
 */
void intel_lr_context_free(struct intel_context *ctx)
{
	int i;

	for (i = 0; i < I915_NUM_RINGS; i++) {
		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;

		if (ctx_obj) {
			struct intel_ringbuffer *ringbuf =
					ctx->engine[i].ringbuf;
			struct intel_engine_cs *ring = ringbuf->ring;

			if (ctx == ring->default_context) {
				intel_unpin_ringbuffer_obj(ringbuf);
				i915_gem_object_ggtt_unpin(ctx_obj);
			}
			WARN_ON(ctx->engine[ring->id].pin_count);
			intel_ringbuffer_free(ringbuf);
			drm_gem_object_unreference(&ctx_obj->base);
		}
	}
}

static uint32_t get_lr_context_size(struct intel_engine_cs *ring)
{
	int ret = 0;

	WARN_ON(INTEL_INFO(ring->dev)->gen < 8);

	switch (ring->id) {
	case RCS:
		if (INTEL_INFO(ring->dev)->gen >= 9)
			ret = GEN9_LR_CONTEXT_RENDER_SIZE;
		else
			ret = GEN8_LR_CONTEXT_RENDER_SIZE;
		break;
	case VCS:
	case BCS:
	case VECS:
	case VCS2:
		ret = GEN8_LR_CONTEXT_OTHER_SIZE;
		break;
	}

	return ret;
}

static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
		struct drm_i915_gem_object *default_ctx_obj)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
	struct vm_page *page;

	/* The HWSP is part of the default context object in LRC mode. */
	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
			+ LRC_PPHWSP_PN * PAGE_SIZE;
	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
	ring->status_page.page_addr = kmap(page);
	ring->status_page.obj = default_ctx_obj;

	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
		   (u32)ring->status_page.gfx_addr);
	POSTING_READ(RING_HWS_PGA(ring->mmio_base));
}

/**
 * intel_lr_context_deferred_alloc() - create the LRC specific bits of a context
 * @ctx: LR context to create.
 * @ring: engine to be used with the context.
 *
 * This function can be called more than once, with different engines, if we plan
 * to use the context with them. The context backing objects and the ringbuffers
 * (especially the ringbuffer backing objects) suck a lot of memory up, and that's
 * why the creation is a deferred call: it's better to make sure first that we
 * need to use a given ring with the context.
 *
 * Return: non-zero on error.
 */
int intel_lr_context_deferred_alloc(struct intel_context *ctx,
				    struct intel_engine_cs *ring)
{
	struct drm_device *dev = ring->dev;
	struct drm_i915_gem_object *ctx_obj;
	uint32_t context_size;
	struct intel_ringbuffer *ringbuf;
	int ret;

	WARN_ON(ctx->legacy_hw_ctx.rcs_state != NULL);
	WARN_ON(ctx->engine[ring->id].state);

	context_size = round_up(get_lr_context_size(ring), 4096);

	/* One extra page for sharing data between the driver and the GuC */
	context_size += PAGE_SIZE * LRC_PPHWSP_PN;

	ctx_obj = i915_gem_alloc_object(dev, context_size);
	if (!ctx_obj) {
		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
		return -ENOMEM;
	}

	ringbuf = intel_engine_create_ringbuffer(ring, 4 * PAGE_SIZE);
	if (IS_ERR(ringbuf)) {
		ret = PTR_ERR(ringbuf);
		goto error_deref_obj;
	}

	ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
		goto error_ringbuf;
	}

	ctx->engine[ring->id].ringbuf = ringbuf;
	ctx->engine[ring->id].state = ctx_obj;

	if (ctx != ring->default_context && ring->init_context) {
		struct drm_i915_gem_request *req;

		ret = i915_gem_request_alloc(ring, ctx, &req);
		if (ret) {
			DRM_ERROR("ring create req: %d\n", ret);
			goto error_ringbuf;
		}

		ret = ring->init_context(req);
		if (ret) {
			DRM_ERROR("ring init context: %d\n", ret);
			i915_gem_request_cancel(req);
			goto error_ringbuf;
		}
		i915_add_request_no_flush(req);
	}
	return 0;

error_ringbuf:
	intel_ringbuffer_free(ringbuf);
error_deref_obj:
	drm_gem_object_unreference(&ctx_obj->base);
	ctx->engine[ring->id].ringbuf = NULL;
	ctx->engine[ring->id].state = NULL;
	return ret;
}
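
/*
 * Illustrative call pattern only (the actual call sites live elsewhere, e.g.
 * in the execbuffer path and in logical_ring_init() above): a caller is
 * expected to allocate the per-engine LRC bits lazily, along the lines of
 *
 *	if (ctx->engine[ring->id].state == NULL) {
 *		int ret = intel_lr_context_deferred_alloc(ctx, ring);
 *		if (ret)
 *			return ret;
 *	}
 */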

void intel_lr_context_reset(struct drm_device *dev,
			    struct intel_context *ctx)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_engine_cs *ring;
	int i;

	for_each_ring(ring, dev_priv, i) {
		struct drm_i915_gem_object *ctx_obj =
				ctx->engine[ring->id].state;
		struct intel_ringbuffer *ringbuf =
				ctx->engine[ring->id].ringbuf;
		uint32_t *reg_state;
		struct vm_page *page;

		if (!ctx_obj)
			continue;

		if (i915_gem_object_get_pages(ctx_obj)) {
			WARN(1, "Failed get_pages for context obj\n");
			continue;
		}
		page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
		reg_state = kmap_atomic(page);

		reg_state[CTX_RING_HEAD+1] = 0;
		reg_state[CTX_RING_TAIL+1] = 0;

		kunmap_atomic(reg_state);

		ringbuf->head = 0;
		ringbuf->tail = 0;
	}
}