1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2014 Intel Corporation
4 */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22
23 /*
24 * The per-platform tables are u8-encoded in @data. Decode @data and set the
25 * register offsets and commands in @regs. The following encoding is used
26 * for each byte. There are 2 steps: decoding commands and decoding addresses.
27 *
28 * Commands:
29 * [7]: create NOPs - the number of NOPs is set in the lower bits
30 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allow
31 * MI_LRI_FORCE_POSTED to be set
32 * [5:0]: number of NOPs, or number of registers to set values for in the
33 * case of MI_LOAD_REGISTER_IMM
34 *
35 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, "count"
36 * registers at a time. They are set by using the REG/REG16 macros: the former
37 * is used for offsets smaller than 0x200 while the latter is for offsets bigger
38 * than that. Those macros already set all the bits documented below correctly:
39 *
40 * [7]: when a register offset needs more bits than a single byte provides,
41 * this bit is set and additional bytes follow, carrying the lower bits
42 * [6:0]: Register offset, without considering the engine base.
43 *
44 * This function only tweaks the commands and register offsets. Values are not
45 * filled out.
46 */
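/*
 * Worked example (illustration only, not taken from the tables below): the
 * sequence NOP(1), LRI(2, POSTED), REG(0x034), REG16(0x244), END encodes to
 * the bytes 0x81, 0x42, 0x0d, 0x81, 0x11, 0x00. Decoding then skips one
 * register slot (0x81), writes MI_LOAD_REGISTER_IMM(2) with
 * MI_LRI_FORCE_POSTED (0x42), fills in mmio_base + 0x34 (0x0d) and
 * mmio_base + 0x244 (0x81, 0x11 -> ((1 << 7) | 0x11) << 2), and stops at the
 * trailing 0x00 (END).
 */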
47 static void set_offsets(u32 *regs,
48 const u8 *data,
49 const struct intel_engine_cs *engine,
50 bool close)
51 #define NOP(x) (BIT(7) | (x))
52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53 #define POSTED BIT(0)
54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55 #define REG16(x) \
56 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57 (((x) >> 2) & 0x7f)
58 #define END 0
59 {
60 const u32 base = engine->mmio_base;
61
62 while (*data) {
63 u8 count, flags;
64
65 if (*data & BIT(7)) { /* skip */
66 count = *data++ & ~BIT(7);
67 regs += count;
68 continue;
69 }
70
71 count = *data & 0x3f;
72 flags = *data >> 6;
73 data++;
74
75 *regs = MI_LOAD_REGISTER_IMM(count);
76 if (flags & POSTED)
77 *regs |= MI_LRI_FORCE_POSTED;
78 if (GRAPHICS_VER(engine->i915) >= 11)
79 *regs |= MI_LRI_LRM_CS_MMIO;
80 regs++;
81
82 GEM_BUG_ON(!count);
83 do {
84 u32 offset = 0;
85 u8 v;
86
87 do {
88 v = *data++;
89 offset <<= 7;
90 offset |= v & ~BIT(7);
91 } while (v & BIT(7));
92
93 regs[0] = base + (offset << 2);
94 regs += 2;
95 } while (--count);
96 }
97
98 if (close) {
99 /* Close the batch; used mainly by live_lrc_layout() */
100 *regs = MI_BATCH_BUFFER_END;
101 if (GRAPHICS_VER(engine->i915) >= 11)
102 *regs |= BIT(0);
103 }
104 }
105
106 static const u8 gen8_xcs_offsets[] = {
107 NOP(1),
108 LRI(11, 0),
109 REG16(0x244),
110 REG(0x034),
111 REG(0x030),
112 REG(0x038),
113 REG(0x03c),
114 REG(0x168),
115 REG(0x140),
116 REG(0x110),
117 REG(0x11c),
118 REG(0x114),
119 REG(0x118),
120
121 NOP(9),
122 LRI(9, 0),
123 REG16(0x3a8),
124 REG16(0x28c),
125 REG16(0x288),
126 REG16(0x284),
127 REG16(0x280),
128 REG16(0x27c),
129 REG16(0x278),
130 REG16(0x274),
131 REG16(0x270),
132
133 NOP(13),
134 LRI(2, 0),
135 REG16(0x200),
136 REG(0x028),
137
138 END
139 };
140
141 static const u8 gen9_xcs_offsets[] = {
142 NOP(1),
143 LRI(14, POSTED),
144 REG16(0x244),
145 REG(0x034),
146 REG(0x030),
147 REG(0x038),
148 REG(0x03c),
149 REG(0x168),
150 REG(0x140),
151 REG(0x110),
152 REG(0x11c),
153 REG(0x114),
154 REG(0x118),
155 REG(0x1c0),
156 REG(0x1c4),
157 REG(0x1c8),
158
159 NOP(3),
160 LRI(9, POSTED),
161 REG16(0x3a8),
162 REG16(0x28c),
163 REG16(0x288),
164 REG16(0x284),
165 REG16(0x280),
166 REG16(0x27c),
167 REG16(0x278),
168 REG16(0x274),
169 REG16(0x270),
170
171 NOP(13),
172 LRI(1, POSTED),
173 REG16(0x200),
174
175 NOP(13),
176 LRI(44, POSTED),
177 REG(0x028),
178 REG(0x09c),
179 REG(0x0c0),
180 REG(0x178),
181 REG(0x17c),
182 REG16(0x358),
183 REG(0x170),
184 REG(0x150),
185 REG(0x154),
186 REG(0x158),
187 REG16(0x41c),
188 REG16(0x600),
189 REG16(0x604),
190 REG16(0x608),
191 REG16(0x60c),
192 REG16(0x610),
193 REG16(0x614),
194 REG16(0x618),
195 REG16(0x61c),
196 REG16(0x620),
197 REG16(0x624),
198 REG16(0x628),
199 REG16(0x62c),
200 REG16(0x630),
201 REG16(0x634),
202 REG16(0x638),
203 REG16(0x63c),
204 REG16(0x640),
205 REG16(0x644),
206 REG16(0x648),
207 REG16(0x64c),
208 REG16(0x650),
209 REG16(0x654),
210 REG16(0x658),
211 REG16(0x65c),
212 REG16(0x660),
213 REG16(0x664),
214 REG16(0x668),
215 REG16(0x66c),
216 REG16(0x670),
217 REG16(0x674),
218 REG16(0x678),
219 REG16(0x67c),
220 REG(0x068),
221
222 END
223 };
224
225 static const u8 gen12_xcs_offsets[] = {
226 NOP(1),
227 LRI(13, POSTED),
228 REG16(0x244),
229 REG(0x034),
230 REG(0x030),
231 REG(0x038),
232 REG(0x03c),
233 REG(0x168),
234 REG(0x140),
235 REG(0x110),
236 REG(0x1c0),
237 REG(0x1c4),
238 REG(0x1c8),
239 REG(0x180),
240 REG16(0x2b4),
241
242 NOP(5),
243 LRI(9, POSTED),
244 REG16(0x3a8),
245 REG16(0x28c),
246 REG16(0x288),
247 REG16(0x284),
248 REG16(0x280),
249 REG16(0x27c),
250 REG16(0x278),
251 REG16(0x274),
252 REG16(0x270),
253
254 END
255 };
256
257 static const u8 dg2_xcs_offsets[] = {
258 NOP(1),
259 LRI(15, POSTED),
260 REG16(0x244),
261 REG(0x034),
262 REG(0x030),
263 REG(0x038),
264 REG(0x03c),
265 REG(0x168),
266 REG(0x140),
267 REG(0x110),
268 REG(0x1c0),
269 REG(0x1c4),
270 REG(0x1c8),
271 REG(0x180),
272 REG16(0x2b4),
273 REG(0x120),
274 REG(0x124),
275
276 NOP(1),
277 LRI(9, POSTED),
278 REG16(0x3a8),
279 REG16(0x28c),
280 REG16(0x288),
281 REG16(0x284),
282 REG16(0x280),
283 REG16(0x27c),
284 REG16(0x278),
285 REG16(0x274),
286 REG16(0x270),
287
288 END
289 };
290
291 static const u8 gen8_rcs_offsets[] = {
292 NOP(1),
293 LRI(14, POSTED),
294 REG16(0x244),
295 REG(0x034),
296 REG(0x030),
297 REG(0x038),
298 REG(0x03c),
299 REG(0x168),
300 REG(0x140),
301 REG(0x110),
302 REG(0x11c),
303 REG(0x114),
304 REG(0x118),
305 REG(0x1c0),
306 REG(0x1c4),
307 REG(0x1c8),
308
309 NOP(3),
310 LRI(9, POSTED),
311 REG16(0x3a8),
312 REG16(0x28c),
313 REG16(0x288),
314 REG16(0x284),
315 REG16(0x280),
316 REG16(0x27c),
317 REG16(0x278),
318 REG16(0x274),
319 REG16(0x270),
320
321 NOP(13),
322 LRI(1, 0),
323 REG(0x0c8),
324
325 END
326 };
327
328 static const u8 gen9_rcs_offsets[] = {
329 NOP(1),
330 LRI(14, POSTED),
331 REG16(0x244),
332 REG(0x34),
333 REG(0x30),
334 REG(0x38),
335 REG(0x3c),
336 REG(0x168),
337 REG(0x140),
338 REG(0x110),
339 REG(0x11c),
340 REG(0x114),
341 REG(0x118),
342 REG(0x1c0),
343 REG(0x1c4),
344 REG(0x1c8),
345
346 NOP(3),
347 LRI(9, POSTED),
348 REG16(0x3a8),
349 REG16(0x28c),
350 REG16(0x288),
351 REG16(0x284),
352 REG16(0x280),
353 REG16(0x27c),
354 REG16(0x278),
355 REG16(0x274),
356 REG16(0x270),
357
358 NOP(13),
359 LRI(1, 0),
360 REG(0xc8),
361
362 NOP(13),
363 LRI(44, POSTED),
364 REG(0x28),
365 REG(0x9c),
366 REG(0xc0),
367 REG(0x178),
368 REG(0x17c),
369 REG16(0x358),
370 REG(0x170),
371 REG(0x150),
372 REG(0x154),
373 REG(0x158),
374 REG16(0x41c),
375 REG16(0x600),
376 REG16(0x604),
377 REG16(0x608),
378 REG16(0x60c),
379 REG16(0x610),
380 REG16(0x614),
381 REG16(0x618),
382 REG16(0x61c),
383 REG16(0x620),
384 REG16(0x624),
385 REG16(0x628),
386 REG16(0x62c),
387 REG16(0x630),
388 REG16(0x634),
389 REG16(0x638),
390 REG16(0x63c),
391 REG16(0x640),
392 REG16(0x644),
393 REG16(0x648),
394 REG16(0x64c),
395 REG16(0x650),
396 REG16(0x654),
397 REG16(0x658),
398 REG16(0x65c),
399 REG16(0x660),
400 REG16(0x664),
401 REG16(0x668),
402 REG16(0x66c),
403 REG16(0x670),
404 REG16(0x674),
405 REG16(0x678),
406 REG16(0x67c),
407 REG(0x68),
408
409 END
410 };
411
412 static const u8 gen11_rcs_offsets[] = {
413 NOP(1),
414 LRI(15, POSTED),
415 REG16(0x244),
416 REG(0x034),
417 REG(0x030),
418 REG(0x038),
419 REG(0x03c),
420 REG(0x168),
421 REG(0x140),
422 REG(0x110),
423 REG(0x11c),
424 REG(0x114),
425 REG(0x118),
426 REG(0x1c0),
427 REG(0x1c4),
428 REG(0x1c8),
429 REG(0x180),
430
431 NOP(1),
432 LRI(9, POSTED),
433 REG16(0x3a8),
434 REG16(0x28c),
435 REG16(0x288),
436 REG16(0x284),
437 REG16(0x280),
438 REG16(0x27c),
439 REG16(0x278),
440 REG16(0x274),
441 REG16(0x270),
442
443 LRI(1, POSTED),
444 REG(0x1b0),
445
446 NOP(10),
447 LRI(1, 0),
448 REG(0x0c8),
449
450 END
451 };
452
453 static const u8 gen12_rcs_offsets[] = {
454 NOP(1),
455 LRI(13, POSTED),
456 REG16(0x244),
457 REG(0x034),
458 REG(0x030),
459 REG(0x038),
460 REG(0x03c),
461 REG(0x168),
462 REG(0x140),
463 REG(0x110),
464 REG(0x1c0),
465 REG(0x1c4),
466 REG(0x1c8),
467 REG(0x180),
468 REG16(0x2b4),
469
470 NOP(5),
471 LRI(9, POSTED),
472 REG16(0x3a8),
473 REG16(0x28c),
474 REG16(0x288),
475 REG16(0x284),
476 REG16(0x280),
477 REG16(0x27c),
478 REG16(0x278),
479 REG16(0x274),
480 REG16(0x270),
481
482 LRI(3, POSTED),
483 REG(0x1b0),
484 REG16(0x5a8),
485 REG16(0x5ac),
486
487 NOP(6),
488 LRI(1, 0),
489 REG(0x0c8),
490 NOP(3 + 9 + 1),
491
492 LRI(51, POSTED),
493 REG16(0x588),
494 REG16(0x588),
495 REG16(0x588),
496 REG16(0x588),
497 REG16(0x588),
498 REG16(0x588),
499 REG(0x028),
500 REG(0x09c),
501 REG(0x0c0),
502 REG(0x178),
503 REG(0x17c),
504 REG16(0x358),
505 REG(0x170),
506 REG(0x150),
507 REG(0x154),
508 REG(0x158),
509 REG16(0x41c),
510 REG16(0x600),
511 REG16(0x604),
512 REG16(0x608),
513 REG16(0x60c),
514 REG16(0x610),
515 REG16(0x614),
516 REG16(0x618),
517 REG16(0x61c),
518 REG16(0x620),
519 REG16(0x624),
520 REG16(0x628),
521 REG16(0x62c),
522 REG16(0x630),
523 REG16(0x634),
524 REG16(0x638),
525 REG16(0x63c),
526 REG16(0x640),
527 REG16(0x644),
528 REG16(0x648),
529 REG16(0x64c),
530 REG16(0x650),
531 REG16(0x654),
532 REG16(0x658),
533 REG16(0x65c),
534 REG16(0x660),
535 REG16(0x664),
536 REG16(0x668),
537 REG16(0x66c),
538 REG16(0x670),
539 REG16(0x674),
540 REG16(0x678),
541 REG16(0x67c),
542 REG(0x068),
543 REG(0x084),
544 NOP(1),
545
546 END
547 };
548
549 static const u8 xehp_rcs_offsets[] = {
550 NOP(1),
551 LRI(13, POSTED),
552 REG16(0x244),
553 REG(0x034),
554 REG(0x030),
555 REG(0x038),
556 REG(0x03c),
557 REG(0x168),
558 REG(0x140),
559 REG(0x110),
560 REG(0x1c0),
561 REG(0x1c4),
562 REG(0x1c8),
563 REG(0x180),
564 REG16(0x2b4),
565
566 NOP(5),
567 LRI(9, POSTED),
568 REG16(0x3a8),
569 REG16(0x28c),
570 REG16(0x288),
571 REG16(0x284),
572 REG16(0x280),
573 REG16(0x27c),
574 REG16(0x278),
575 REG16(0x274),
576 REG16(0x270),
577
578 LRI(3, POSTED),
579 REG(0x1b0),
580 REG16(0x5a8),
581 REG16(0x5ac),
582
583 NOP(6),
584 LRI(1, 0),
585 REG(0x0c8),
586
587 END
588 };
589
590 static const u8 dg2_rcs_offsets[] = {
591 NOP(1),
592 LRI(15, POSTED),
593 REG16(0x244),
594 REG(0x034),
595 REG(0x030),
596 REG(0x038),
597 REG(0x03c),
598 REG(0x168),
599 REG(0x140),
600 REG(0x110),
601 REG(0x1c0),
602 REG(0x1c4),
603 REG(0x1c8),
604 REG(0x180),
605 REG16(0x2b4),
606 REG(0x120),
607 REG(0x124),
608
609 NOP(1),
610 LRI(9, POSTED),
611 REG16(0x3a8),
612 REG16(0x28c),
613 REG16(0x288),
614 REG16(0x284),
615 REG16(0x280),
616 REG16(0x27c),
617 REG16(0x278),
618 REG16(0x274),
619 REG16(0x270),
620
621 LRI(3, POSTED),
622 REG(0x1b0),
623 REG16(0x5a8),
624 REG16(0x5ac),
625
626 NOP(6),
627 LRI(1, 0),
628 REG(0x0c8),
629
630 END
631 };
632
633 static const u8 mtl_rcs_offsets[] = {
634 NOP(1),
635 LRI(15, POSTED),
636 REG16(0x244),
637 REG(0x034),
638 REG(0x030),
639 REG(0x038),
640 REG(0x03c),
641 REG(0x168),
642 REG(0x140),
643 REG(0x110),
644 REG(0x1c0),
645 REG(0x1c4),
646 REG(0x1c8),
647 REG(0x180),
648 REG16(0x2b4),
649 REG(0x120),
650 REG(0x124),
651
652 NOP(1),
653 LRI(9, POSTED),
654 REG16(0x3a8),
655 REG16(0x28c),
656 REG16(0x288),
657 REG16(0x284),
658 REG16(0x280),
659 REG16(0x27c),
660 REG16(0x278),
661 REG16(0x274),
662 REG16(0x270),
663
664 NOP(2),
665 LRI(2, POSTED),
666 REG16(0x5a8),
667 REG16(0x5ac),
668
669 NOP(6),
670 LRI(1, 0),
671 REG(0x0c8),
672
673 END
674 };
675
676 #undef END
677 #undef REG16
678 #undef REG
679 #undef LRI
680 #undef NOP
681
682 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
683 {
684 /*
685 * The gen12+ lists only have the registers we program in the basic
686 * default state. We rely on the context image using relative
687 * addressing to automatically fix up the register state between the
688 * physical engines for a virtual engine.
689 */
690 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
691 !intel_engine_has_relative_mmio(engine));
692
693 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
694 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
695 return mtl_rcs_offsets;
696 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
697 return dg2_rcs_offsets;
698 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
699 return xehp_rcs_offsets;
700 else if (GRAPHICS_VER(engine->i915) >= 12)
701 return gen12_rcs_offsets;
702 else if (GRAPHICS_VER(engine->i915) >= 11)
703 return gen11_rcs_offsets;
704 else if (GRAPHICS_VER(engine->i915) >= 9)
705 return gen9_rcs_offsets;
706 else
707 return gen8_rcs_offsets;
708 } else {
709 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
710 return dg2_xcs_offsets;
711 else if (GRAPHICS_VER(engine->i915) >= 12)
712 return gen12_xcs_offsets;
713 else if (GRAPHICS_VER(engine->i915) >= 9)
714 return gen9_xcs_offsets;
715 else
716 return gen8_xcs_offsets;
717 }
718 }
719
720 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
721 {
722 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
723 return 0x70;
724 else if (GRAPHICS_VER(engine->i915) >= 12)
725 return 0x60;
726 else if (GRAPHICS_VER(engine->i915) >= 9)
727 return 0x54;
728 else if (engine->class == RENDER_CLASS)
729 return 0x58;
730 else
731 return -1;
732 }
733
734 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
735 {
736 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
737 return 0x80;
738 else if (GRAPHICS_VER(engine->i915) >= 12)
739 return 0x70;
740 else if (GRAPHICS_VER(engine->i915) >= 9)
741 return 0x64;
742 else if (GRAPHICS_VER(engine->i915) >= 8 &&
743 engine->class == RENDER_CLASS)
744 return 0xc4;
745 else
746 return -1;
747 }
748
749 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
750 {
751 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
752 return 0x84;
753 else if (GRAPHICS_VER(engine->i915) >= 12)
754 return 0x74;
755 else if (GRAPHICS_VER(engine->i915) >= 9)
756 return 0x68;
757 else if (engine->class == RENDER_CLASS)
758 return 0xd8;
759 else
760 return -1;
761 }
762
763 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
764 {
765 if (GRAPHICS_VER(engine->i915) >= 12)
766 return 0x12;
767 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
768 return 0x18;
769 else
770 return -1;
771 }
772
773 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
774 {
775 int x;
776
777 x = lrc_ring_wa_bb_per_ctx(engine);
778 if (x < 0)
779 return x;
780
781 return x + 2;
782 }
783
784 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
785 {
786 int x;
787
788 x = lrc_ring_indirect_ptr(engine);
789 if (x < 0)
790 return x;
791
792 return x + 2;
793 }
794
795 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
796 {
797
798 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
799 /*
800 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
801 * simply to match the RCS context image layout.
802 */
803 return 0xc6;
804 else if (engine->class != RENDER_CLASS)
805 return -1;
806 else if (GRAPHICS_VER(engine->i915) >= 12)
807 return 0xb6;
808 else if (GRAPHICS_VER(engine->i915) >= 11)
809 return 0xaa;
810 else
811 return -1;
812 }
813
814 static u32
815 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
816 {
817 if (GRAPHICS_VER(engine->i915) >= 12)
818 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
819 else if (GRAPHICS_VER(engine->i915) >= 11)
820 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
821 else if (GRAPHICS_VER(engine->i915) >= 9)
822 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
823 else if (GRAPHICS_VER(engine->i915) >= 8)
824 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
825
826 GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
827
828 return 0;
829 }
830
831 static void
832 lrc_setup_indirect_ctx(u32 *regs,
833 const struct intel_engine_cs *engine,
834 u32 ctx_bb_ggtt_addr,
835 u32 size)
836 {
837 GEM_BUG_ON(!size);
838 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
839 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
840 regs[lrc_ring_indirect_ptr(engine) + 1] =
841 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
842
843 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
844 regs[lrc_ring_indirect_offset(engine) + 1] =
845 lrc_ring_indirect_offset_default(engine) << 6;
846 }
847
848 static void init_common_regs(u32 * const regs,
849 const struct intel_context *ce,
850 const struct intel_engine_cs *engine,
851 bool inhibit)
852 {
853 u32 ctl;
854 int loc;
855
856 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
857 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
858 if (inhibit)
859 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
860 if (GRAPHICS_VER(engine->i915) < 11)
861 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
862 CTX_CTRL_RS_CTX_ENABLE);
863 regs[CTX_CONTEXT_CONTROL] = ctl;
864
865 regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
866
867 loc = lrc_ring_bb_offset(engine);
868 if (loc != -1)
869 regs[loc + 1] = 0;
870 }
871
872 static void init_wa_bb_regs(u32 * const regs,
873 const struct intel_engine_cs *engine)
874 {
875 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
876
877 if (wa_ctx->per_ctx.size) {
878 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
879
880 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
881 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
882 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
883 }
884
885 if (wa_ctx->indirect_ctx.size) {
886 lrc_setup_indirect_ctx(regs, engine,
887 i915_ggtt_offset(wa_ctx->vma) +
888 wa_ctx->indirect_ctx.offset,
889 wa_ctx->indirect_ctx.size);
890 }
891 }
892
893 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
894 {
895 if (i915_vm_is_4lvl(&ppgtt->vm)) {
896 /* 64b PPGTT (48bit canonical)
897 * PDP0_DESCRIPTOR contains the base address to PML4 and
898 * other PDP Descriptors are ignored.
899 */
900 ASSIGN_CTX_PML4(ppgtt, regs);
901 } else {
902 ASSIGN_CTX_PDP(ppgtt, regs, 3);
903 ASSIGN_CTX_PDP(ppgtt, regs, 2);
904 ASSIGN_CTX_PDP(ppgtt, regs, 1);
905 ASSIGN_CTX_PDP(ppgtt, regs, 0);
906 }
907 }
908
909 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
910 {
911 if (i915_is_ggtt(vm))
912 return i915_vm_to_ggtt(vm)->alias;
913 else
914 return i915_vm_to_ppgtt(vm);
915 }
916
917 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
918 {
919 int x;
920
921 x = lrc_ring_mi_mode(engine);
922 if (x != -1) {
923 regs[x + 1] &= ~STOP_RING;
924 regs[x + 1] |= STOP_RING << 16;
925 }
926 }
927
928 static void __lrc_init_regs(u32 *regs,
929 const struct intel_context *ce,
930 const struct intel_engine_cs *engine,
931 bool inhibit)
932 {
933 /*
934 * A context is actually a big batch buffer with several
935 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
936 * values we are setting here are only for the first context restore:
937 * on a subsequent save, the GPU will recreate this batch buffer with new
938 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
939 * we are not initializing here).
940 *
941 * Must keep consistent with virtual_update_register_offsets().
942 */
943
944 if (inhibit)
945 memset(regs, 0, PAGE_SIZE);
946
947 set_offsets(regs, reg_offsets(engine), engine, inhibit);
948
949 init_common_regs(regs, ce, engine, inhibit);
950 init_ppgtt_regs(regs, vm_alias(ce->vm));
951
952 init_wa_bb_regs(regs, engine);
953
954 __reset_stop_ring(regs, engine);
955 }
956
957 void lrc_init_regs(const struct intel_context *ce,
958 const struct intel_engine_cs *engine,
959 bool inhibit)
960 {
961 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
962 }
963
964 void lrc_reset_regs(const struct intel_context *ce,
965 const struct intel_engine_cs *engine)
966 {
967 __reset_stop_ring(ce->lrc_reg_state, engine);
968 }
969
970 static void
971 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
972 {
973 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
974 return;
975
976 vaddr += engine->context_size;
977
978 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
979 }
980
981 static void
982 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
983 {
984 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
985 return;
986
987 vaddr += engine->context_size;
988
989 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
990 drm_err_once(&engine->i915->drm,
991 "%s context redzone overwritten!\n",
992 engine->name);
993 }
994
995 static u32 context_wa_bb_offset(const struct intel_context *ce)
996 {
997 return PAGE_SIZE * ce->wa_bb_page;
998 }
999
1000 static u32 *context_indirect_bb(const struct intel_context *ce)
1001 {
1002 void *ptr;
1003
1004 GEM_BUG_ON(!ce->wa_bb_page);
1005
1006 ptr = ce->lrc_reg_state;
1007 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008 ptr += context_wa_bb_offset(ce);
1009
1010 return ptr;
1011 }
1012
1013 void lrc_init_state(struct intel_context *ce,
1014 struct intel_engine_cs *engine,
1015 void *state)
1016 {
1017 bool inhibit = true;
1018
1019 set_redzone(state, engine);
1020
1021 if (engine->default_state) {
1022 #ifdef __linux__
1023 shmem_read(engine->default_state, 0,
1024 state, engine->context_size);
1025 #else
1026 uao_read(engine->default_state, 0,
1027 state, engine->context_size);
1028 #endif
1029 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1030 inhibit = false;
1031 }
1032
1033 /* Clear the ppHWSP (inc. per-context counters) */
1034 memset(state, 0, PAGE_SIZE);
1035
1036 /* Clear the indirect wa and storage */
1037 if (ce->wa_bb_page)
1038 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1039
1040 /*
1041 * The second page of the context object contains some registers which
1042 * must be set up prior to the first execution.
1043 */
1044 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1045 }
1046
1047 u32 lrc_indirect_bb(const struct intel_context *ce)
1048 {
1049 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1050 }
1051
1052 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1053 {
1054 /* If predication is active, this will be noop'ed */
1055 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1056 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1057 *cs++ = 0;
1058 *cs++ = 0; /* No predication */
1059
1060 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1061 *cs++ = MI_BATCH_BUFFER_END | BIT(15);
1062 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1063
1064 /* Instructions are no longer predicated (disabled), we can proceed */
1065 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1066 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1067 *cs++ = 0;
1068 *cs++ = 1; /* enable predication before the next BB */
1069
1070 *cs++ = MI_BATCH_BUFFER_END;
1071 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1072
1073 return cs;
1074 }
1075
1076 static struct i915_vma *
1077 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1078 {
1079 struct drm_i915_gem_object *obj;
1080 struct i915_vma *vma;
1081 u32 context_size;
1082
1083 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1084
1085 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1086 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1087
1088 if (GRAPHICS_VER(engine->i915) >= 12) {
1089 ce->wa_bb_page = context_size / PAGE_SIZE;
1090 context_size += PAGE_SIZE;
1091 }
1092
1093 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1094 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1095 context_size += PARENT_SCRATCH_SIZE;
1096 }
1097
1098 obj = i915_gem_object_create_lmem(engine->i915, context_size,
1099 I915_BO_ALLOC_PM_VOLATILE);
1100 if (IS_ERR(obj)) {
1101 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1102 if (IS_ERR(obj))
1103 return ERR_CAST(obj);
1104
1105 /*
1106 * Wa_22016122933: For Media version 13.0, all Media GT shared
1107 * memory needs to be mapped as WC on CPU side and UC (PAT
1108 * index 2) on GPU side.
1109 */
1110 if (intel_gt_needs_wa_22016122933(engine->gt))
1111 i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1112 }
1113
1114 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1115 if (IS_ERR(vma)) {
1116 i915_gem_object_put(obj);
1117 return vma;
1118 }
1119
1120 return vma;
1121 }
1122
1123 static struct intel_timeline *
1124 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1125 {
1126 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1127
1128 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1129 }
1130
1131 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1132 {
1133 struct intel_ring *ring;
1134 struct i915_vma *vma;
1135 int err;
1136
1137 GEM_BUG_ON(ce->state);
1138
1139 vma = __lrc_alloc_state(ce, engine);
1140 if (IS_ERR(vma))
1141 return PTR_ERR(vma);
1142
1143 ring = intel_engine_create_ring(engine, ce->ring_size);
1144 if (IS_ERR(ring)) {
1145 err = PTR_ERR(ring);
1146 goto err_vma;
1147 }
1148
1149 if (!page_mask_bits(ce->timeline)) {
1150 struct intel_timeline *tl;
1151
1152 /*
1153 * Use the static global HWSP for the kernel context, and
1154 * a dynamically allocated cacheline for everyone else.
1155 */
1156 if (unlikely(ce->timeline))
1157 tl = pinned_timeline(ce, engine);
1158 else
1159 tl = intel_timeline_create(engine->gt);
1160 if (IS_ERR(tl)) {
1161 err = PTR_ERR(tl);
1162 goto err_ring;
1163 }
1164
1165 ce->timeline = tl;
1166 }
1167
1168 ce->ring = ring;
1169 ce->state = vma;
1170
1171 return 0;
1172
1173 err_ring:
1174 intel_ring_put(ring);
1175 err_vma:
1176 i915_vma_put(vma);
1177 return err;
1178 }
1179
1180 void lrc_reset(struct intel_context *ce)
1181 {
1182 GEM_BUG_ON(!intel_context_is_pinned(ce));
1183
1184 intel_ring_reset(ce->ring, ce->ring->emit);
1185
1186 /* Scrub away the garbage */
1187 lrc_init_regs(ce, ce->engine, true);
1188 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1189 }
1190
1191 int
1192 lrc_pre_pin(struct intel_context *ce,
1193 struct intel_engine_cs *engine,
1194 struct i915_gem_ww_ctx *ww,
1195 void **vaddr)
1196 {
1197 GEM_BUG_ON(!ce->state);
1198 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1199
1200 *vaddr = i915_gem_object_pin_map(ce->state->obj,
1201 intel_gt_coherent_map_type(ce->engine->gt,
1202 ce->state->obj,
1203 false) |
1204 I915_MAP_OVERRIDE);
1205
1206 return PTR_ERR_OR_ZERO(*vaddr);
1207 }
1208
1209 int
1210 lrc_pin(struct intel_context *ce,
1211 struct intel_engine_cs *engine,
1212 void *vaddr)
1213 {
1214 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1215
1216 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1217 lrc_init_state(ce, engine, vaddr);
1218
1219 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1220 return 0;
1221 }
1222
1223 void lrc_unpin(struct intel_context *ce)
1224 {
1225 if (unlikely(ce->parallel.last_rq)) {
1226 i915_request_put(ce->parallel.last_rq);
1227 ce->parallel.last_rq = NULL;
1228 }
1229 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1230 ce->engine);
1231 }
1232
1233 void lrc_post_unpin(struct intel_context *ce)
1234 {
1235 i915_gem_object_unpin_map(ce->state->obj);
1236 }
1237
1238 void lrc_fini(struct intel_context *ce)
1239 {
1240 if (!ce->state)
1241 return;
1242
1243 intel_ring_put(fetch_and_zero(&ce->ring));
1244 i915_vma_put(fetch_and_zero(&ce->state));
1245 }
1246
1247 void lrc_destroy(struct kref *kref)
1248 {
1249 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1250
1251 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1252 GEM_BUG_ON(intel_context_is_pinned(ce));
1253
1254 lrc_fini(ce);
1255
1256 intel_context_fini(ce);
1257 intel_context_free(ce);
1258 }
1259
1260 static u32 *
1261 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1262 {
1263 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1264 MI_SRM_LRM_GLOBAL_GTT |
1265 MI_LRI_LRM_CS_MMIO;
1266 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1267 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1268 CTX_TIMESTAMP * sizeof(u32);
1269 *cs++ = 0;
1270
1271 *cs++ = MI_LOAD_REGISTER_REG |
1272 MI_LRR_SOURCE_CS_MMIO |
1273 MI_LRI_LRM_CS_MMIO;
1274 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1275 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1276
1277 *cs++ = MI_LOAD_REGISTER_REG |
1278 MI_LRR_SOURCE_CS_MMIO |
1279 MI_LRI_LRM_CS_MMIO;
1280 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1281 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1282
1283 return cs;
1284 }
1285
1286 static u32 *
1287 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1288 {
1289 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1290
1291 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1292 MI_SRM_LRM_GLOBAL_GTT |
1293 MI_LRI_LRM_CS_MMIO;
1294 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1295 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1296 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1297 *cs++ = 0;
1298
1299 return cs;
1300 }
1301
1302 static u32 *
1303 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1304 {
1305 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1306
1307 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1308 MI_SRM_LRM_GLOBAL_GTT |
1309 MI_LRI_LRM_CS_MMIO;
1310 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1311 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1312 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1313 *cs++ = 0;
1314
1315 *cs++ = MI_LOAD_REGISTER_REG |
1316 MI_LRR_SOURCE_CS_MMIO |
1317 MI_LRI_LRM_CS_MMIO;
1318 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1319 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1320
1321 return cs;
1322 }
1323
1324 /*
1325 * The bspec's tuning guide asks us to program a vertical watermark value of
1326 * 0x3FF. However this register is not saved/restored properly by the
1327 * hardware, so we're required to apply the desired value via INDIRECT_CTX
1328 * batch buffer to ensure the value takes effect properly. All other bits
1329 * in this register should remain at 0 (the hardware default).
1330 */
1331 static u32 *
1332 dg2_emit_draw_watermark_setting(u32 *cs)
1333 {
1334 *cs++ = MI_LOAD_REGISTER_IMM(1);
1335 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1336 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1337
1338 return cs;
1339 }
1340
1341 static u32 *
1342 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1343 {
1344 cs = gen12_emit_timestamp_wa(ce, cs);
1345 cs = gen12_emit_cmd_buf_wa(ce, cs);
1346 cs = gen12_emit_restore_scratch(ce, cs);
1347
1348 /* Wa_16013000631:dg2 */
1349 if (IS_DG2_G11(ce->engine->i915))
1350 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1351
1352 cs = gen12_emit_aux_table_inv(ce->engine, cs);
1353
1354 /* Wa_16014892111 */
1355 if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1356 IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1357 IS_DG2(ce->engine->i915))
1358 cs = dg2_emit_draw_watermark_setting(cs);
1359
1360 return cs;
1361 }
1362
1363 static u32 *
1364 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1365 {
1366 cs = gen12_emit_timestamp_wa(ce, cs);
1367 cs = gen12_emit_restore_scratch(ce, cs);
1368
1369 /* Wa_16013000631:dg2 */
1370 if (IS_DG2_G11(ce->engine->i915))
1371 if (ce->engine->class == COMPUTE_CLASS)
1372 cs = gen8_emit_pipe_control(cs,
1373 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1374 0);
1375
1376 return gen12_emit_aux_table_inv(ce->engine, cs);
1377 }
1378
1379 static void
1380 setup_indirect_ctx_bb(const struct intel_context *ce,
1381 const struct intel_engine_cs *engine,
1382 u32 *(*emit)(const struct intel_context *, u32 *))
1383 {
1384 u32 * const start = context_indirect_bb(ce);
1385 u32 *cs;
1386
1387 cs = emit(ce, start);
1388 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1389 while ((unsigned long)cs % CACHELINE_BYTES)
1390 *cs++ = MI_NOOP;
1391
1392 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1393 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1394
1395 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1396 lrc_indirect_bb(ce),
1397 (cs - start) * sizeof(*cs));
1398 }
1399
1400 /*
1401 * The context descriptor encodes various attributes of a context,
1402 * including its GTT address and some flags. Because it's fairly
1403 * expensive to calculate, we'll just do it once and cache the result,
1404 * which remains valid until the context is unpinned.
1405 *
1406 * This is what a descriptor looks like, from LSB to MSB::
1407 *
1408 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
1409 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
1410 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
1411 * bits 53-54: mbz, reserved for use by hardware
1412 * bits 55-63: group ID, currently unused and set to 0
1413 *
1414 * Starting from Gen11, the upper dword of the descriptor has a new format:
1415 *
1416 * bits 32-36: reserved
1417 * bits 37-47: SW context ID
1418 * bits 48-53: engine instance
1419 * bit 54: mbz, reserved for use by hardware
1420 * bits 55-60: SW counter
1421 * bits 61-63: engine class
1422 *
1423 * On Xe_HP, the upper dword of the descriptor has a new format:
1424 *
1425 * bits 32-37: virtual function number
1426 * bit 38: mbz, reserved for use by hardware
1427 * bits 39-54: SW context ID
1428 * bits 55-57: reserved
1429 * bits 58-63: SW counter
1430 *
1431 * engine info, SW context ID and SW counter need to form a unique number
1432 * (Context ID) per lrc.
1433 */
1434 static u32 lrc_descriptor(const struct intel_context *ce)
1435 {
1436 u32 desc;
1437
1438 desc = INTEL_LEGACY_32B_CONTEXT;
1439 if (i915_vm_is_4lvl(ce->vm))
1440 desc = INTEL_LEGACY_64B_CONTEXT;
1441 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1442
1443 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1444 if (GRAPHICS_VER(ce->vm->i915) == 8)
1445 desc |= GEN8_CTX_L3LLC_COHERENT;
1446
1447 return i915_ggtt_offset(ce->state) | desc;
1448 }
1449
1450 u32 lrc_update_regs(const struct intel_context *ce,
1451 const struct intel_engine_cs *engine,
1452 u32 head)
1453 {
1454 struct intel_ring *ring = ce->ring;
1455 u32 *regs = ce->lrc_reg_state;
1456
1457 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1458 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1459
1460 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1461 regs[CTX_RING_HEAD] = head;
1462 regs[CTX_RING_TAIL] = ring->tail;
1463 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1464
1465 /* RPCS */
1466 if (engine->class == RENDER_CLASS) {
1467 regs[CTX_R_PWR_CLK_STATE] =
1468 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1469
1470 i915_oa_init_reg_state(ce, engine);
1471 }
1472
1473 if (ce->wa_bb_page) {
1474 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1475
1476 fn = gen12_emit_indirect_ctx_xcs;
1477 if (ce->engine->class == RENDER_CLASS)
1478 fn = gen12_emit_indirect_ctx_rcs;
1479
1480 /* Mutually exclusive wrt to global indirect bb */
1481 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1482 setup_indirect_ctx_bb(ce, engine, fn);
1483 }
1484
1485 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1486 }
1487
1488 void lrc_update_offsets(struct intel_context *ce,
1489 struct intel_engine_cs *engine)
1490 {
1491 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1492 }
1493
1494 void lrc_check_regs(const struct intel_context *ce,
1495 const struct intel_engine_cs *engine,
1496 const char *when)
1497 {
1498 const struct intel_ring *ring = ce->ring;
1499 u32 *regs = ce->lrc_reg_state;
1500 bool valid = true;
1501 int x;
1502
1503 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1504 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1505 engine->name,
1506 regs[CTX_RING_START],
1507 i915_ggtt_offset(ring->vma));
1508 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1509 valid = false;
1510 }
1511
1512 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1513 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1514 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1515 engine->name,
1516 regs[CTX_RING_CTL],
1517 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1518 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1519 valid = false;
1520 }
1521
1522 x = lrc_ring_mi_mode(engine);
1523 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1524 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1525 engine->name, regs[x + 1]);
1526 regs[x + 1] &= ~STOP_RING;
1527 regs[x + 1] |= STOP_RING << 16;
1528 valid = false;
1529 }
1530
1531 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1532 }
1533
1534 /*
1535 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1536 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1537 * but there is a slight complication as this is applied in WA batch where the
1538 * values are only initialized once so we cannot take register value at the
1539 * beginning and reuse it further; hence we save its value to memory, upload a
1540 * constant value with bit21 set and then we restore it back with the saved value.
1541 * To simplify the WA, a constant value is formed by using the default value
1542 * of this register. This shouldn't be a problem because we are only modifying
1543 * it for a short period and this batch is non-preemptible. We could of course
1544 * use additional instructions that read the actual value of the register
1545 * at that time and set our bit of interest, but that makes the WA more complicated.
1546 *
1547 * This WA is also required for Gen9 so extracting as a function avoids
1548 * code duplication.
1549 */
1550 static u32 *
1551 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1552 {
1553 /* NB no one else is allowed to scribble over scratch + 256! */
1554 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1555 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1556 *batch++ = intel_gt_scratch_offset(engine->gt,
1557 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1558 *batch++ = 0;
1559
1560 *batch++ = MI_LOAD_REGISTER_IMM(1);
1561 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1562 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1563
1564 batch = gen8_emit_pipe_control(batch,
1565 PIPE_CONTROL_CS_STALL |
1566 PIPE_CONTROL_DC_FLUSH_ENABLE,
1567 0);
1568
1569 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1570 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1571 *batch++ = intel_gt_scratch_offset(engine->gt,
1572 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1573 *batch++ = 0;
1574
1575 return batch;
1576 }
1577
1578 /*
1579 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1580 * initialized at the beginning and shared across all contexts but this field
1581 * helps us to have multiple batches at different offsets and select them based
1582 * on some criteria. At the moment this batch always starts at the beginning of the page
1583 * and at this point we don't have multiple wa_ctx batch buffers.
1584 *
1585 * The number of WAs applied is not known at the beginning; we use this field
1586 * to return the number of DWORDs written.
1587 *
1588 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1589 * so it adds NOOPs as padding to make it cacheline aligned.
1590 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
1591 * together make a complete batch buffer.
1592 */
1593 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1594 {
1595 /* WaDisableCtxRestoreArbitration:bdw,chv */
1596 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1597
1598 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1599 if (IS_BROADWELL(engine->i915))
1600 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1601
1602 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1603 /* Actual scratch location is at 128 bytes offset */
1604 batch = gen8_emit_pipe_control(batch,
1605 PIPE_CONTROL_FLUSH_L3 |
1606 PIPE_CONTROL_STORE_DATA_INDEX |
1607 PIPE_CONTROL_CS_STALL |
1608 PIPE_CONTROL_QW_WRITE,
1609 LRC_PPHWSP_SCRATCH_ADDR);
1610
1611 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1612
1613 /* Pad to end of cacheline */
1614 while ((unsigned long)batch % CACHELINE_BYTES)
1615 *batch++ = MI_NOOP;
1616
1617 /*
1618 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1619 * execution depends on the length specified in terms of cache lines
1620 * in the register CTX_RCS_INDIRECT_CTX
1621 */
1622
1623 return batch;
1624 }
1625
1626 struct lri {
1627 i915_reg_t reg;
1628 u32 value;
1629 };
1630
1631 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1632 {
1633 GEM_BUG_ON(!count || count > 63);
1634
1635 *batch++ = MI_LOAD_REGISTER_IMM(count);
1636 do {
1637 *batch++ = i915_mmio_reg_offset(lri->reg);
1638 *batch++ = lri->value;
1639 } while (lri++, --count);
1640 *batch++ = MI_NOOP;
1641
1642 return batch;
1643 }
1644
1645 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1646 {
1647 static const struct lri lri[] = {
1648 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1649 {
1650 COMMON_SLICE_CHICKEN2,
1651 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1652 0),
1653 },
1654
1655 /* BSpec: 11391 */
1656 {
1657 FF_SLICE_CHICKEN,
1658 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1659 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1660 },
1661
1662 /* BSpec: 11299 */
1663 {
1664 _3D_CHICKEN3,
1665 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1666 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1667 }
1668 };
1669
1670 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1671
1672 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1673 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1674
1675 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1676 batch = gen8_emit_pipe_control(batch,
1677 PIPE_CONTROL_FLUSH_L3 |
1678 PIPE_CONTROL_STORE_DATA_INDEX |
1679 PIPE_CONTROL_CS_STALL |
1680 PIPE_CONTROL_QW_WRITE,
1681 LRC_PPHWSP_SCRATCH_ADDR);
1682
1683 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1684
1685 /* WaMediaPoolStateCmdInWABB:bxt,glk */
1686 if (HAS_POOLED_EU(engine->i915)) {
1687 /*
1688 * EU pool configuration is set up along with the golden context
1689 * during context initialization. This value depends on the
1690 * device type (2x6 or 3x6) and needs to be updated based
1691 * on which subslice is disabled, especially for 2x6
1692 * devices; however, it is safe to load the default
1693 * configuration of a 3x6 device instead of masking off
1694 * the corresponding bits because HW ignores bits of a disabled
1695 * subslice and drops down to the appropriate config. Please
1696 * see render_state_setup() in i915_gem_render_state.c for
1697 * possible configurations, to avoid duplication they are
1698 * not shown here again.
1699 */
1700 *batch++ = GEN9_MEDIA_POOL_STATE;
1701 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1702 *batch++ = 0x00777000;
1703 *batch++ = 0;
1704 *batch++ = 0;
1705 *batch++ = 0;
1706 }
1707
1708 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1709
1710 /* Pad to end of cacheline */
1711 while ((unsigned long)batch % CACHELINE_BYTES)
1712 *batch++ = MI_NOOP;
1713
1714 return batch;
1715 }
1716
1717 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1718
1719 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1720 {
1721 struct drm_i915_gem_object *obj;
1722 struct i915_vma *vma;
1723 int err;
1724
1725 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1726 if (IS_ERR(obj))
1727 return PTR_ERR(obj);
1728
1729 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1730 if (IS_ERR(vma)) {
1731 err = PTR_ERR(vma);
1732 goto err;
1733 }
1734
1735 engine->wa_ctx.vma = vma;
1736 return 0;
1737
1738 err:
1739 i915_gem_object_put(obj);
1740 return err;
1741 }
1742
1743 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1744 {
1745 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1746 }
1747
1748 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1749
1750 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1751 {
1752 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1753 struct i915_wa_ctx_bb *wa_bb[] = {
1754 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1755 };
1756 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1757 struct i915_gem_ww_ctx ww;
1758 void *batch, *batch_ptr;
1759 unsigned int i;
1760 int err;
1761
1762 if (GRAPHICS_VER(engine->i915) >= 11 ||
1763 !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1764 return;
1765
1766 if (GRAPHICS_VER(engine->i915) == 9) {
1767 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1768 wa_bb_fn[1] = NULL;
1769 } else if (GRAPHICS_VER(engine->i915) == 8) {
1770 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1771 wa_bb_fn[1] = NULL;
1772 }
1773
1774 err = lrc_create_wa_ctx(engine);
1775 if (err) {
1776 /*
1777 * We continue even if we fail to initialize the WA batch
1778 * because we only expect rare glitches, nothing
1779 * critical enough to prevent us from using the GPU.
1780 */
1781 drm_err(&engine->i915->drm,
1782 "Ignoring context switch w/a allocation error:%d\n",
1783 err);
1784 return;
1785 }
1786
1787 if (!engine->wa_ctx.vma)
1788 return;
1789
1790 i915_gem_ww_ctx_init(&ww, true);
1791 retry:
1792 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1793 if (!err)
1794 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1795 if (err)
1796 goto err;
1797
1798 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1799 if (IS_ERR(batch)) {
1800 err = PTR_ERR(batch);
1801 goto err_unpin;
1802 }
1803
1804 /*
1805 * Emit the two workaround batch buffers, recording the offset from the
1806 * start of the workaround batch buffer object for each and their
1807 * respective sizes.
1808 */
1809 batch_ptr = batch;
1810 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1811 wa_bb[i]->offset = batch_ptr - batch;
1812 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1813 CACHELINE_BYTES))) {
1814 err = -EINVAL;
1815 break;
1816 }
1817 if (wa_bb_fn[i])
1818 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1819 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1820 }
1821 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1822
1823 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1824 __i915_gem_object_release_map(wa_ctx->vma->obj);
1825
1826 /* Verify that we can handle failure to setup the wa_ctx */
1827 if (!err)
1828 err = i915_inject_probe_error(engine->i915, -ENODEV);
1829
1830 err_unpin:
1831 if (err)
1832 i915_vma_unpin(wa_ctx->vma);
1833 err:
1834 if (err == -EDEADLK) {
1835 err = i915_gem_ww_ctx_backoff(&ww);
1836 if (!err)
1837 goto retry;
1838 }
1839 i915_gem_ww_ctx_fini(&ww);
1840
1841 if (err) {
1842 i915_vma_put(engine->wa_ctx.vma);
1843
1844 /* Clear all flags to prevent further use */
1845 memset(wa_ctx, 0, sizeof(*wa_ctx));
1846 }
1847 }
1848
1849 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1850 {
1851 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1852 stats->runtime.num_underflow++;
1853 stats->runtime.max_underflow =
1854 max_t(u32, stats->runtime.max_underflow, -dt);
1855 #endif
1856 }
1857
1858 static u32 lrc_get_runtime(const struct intel_context *ce)
1859 {
1860 /*
1861 * We can use either ppHWSP[16], which is recorded before the context
1862 * switch (and so excludes the cost of context switches), or the
1863 * value from the context image itself, which is saved/restored earlier
1864 * and so includes the cost of the save.
1865 */
1866 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1867 }
1868
1869 void lrc_update_runtime(struct intel_context *ce)
1870 {
1871 struct intel_context_stats *stats = &ce->stats;
1872 u32 old;
1873 s32 dt;
1874
1875 old = stats->runtime.last;
1876 stats->runtime.last = lrc_get_runtime(ce);
1877 dt = stats->runtime.last - old;
1878 if (!dt)
1879 return;
1880
1881 if (unlikely(dt < 0)) {
1882 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1883 old, stats->runtime.last, dt);
1884 st_runtime_underflow(stats, dt);
1885 return;
1886 }
1887
1888 ewma_runtime_add(&stats->runtime.avg, dt);
1889 stats->runtime.total += dt;
1890 }
1891
1892 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1893 #include "selftest_lrc.c"
1894 #endif
1895