1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2016 Intel Corporation
4 */
5
6 #include <linux/kthread.h>
7
8 #include "gem/i915_gem_context.h"
9
10 #include "intel_gt.h"
11 #include "intel_engine_heartbeat.h"
12 #include "intel_engine_pm.h"
13 #include "selftest_engine_heartbeat.h"
14
15 #include "i915_selftest.h"
16 #include "selftests/i915_random.h"
17 #include "selftests/igt_flush_test.h"
18 #include "selftests/igt_reset.h"
19 #include "selftests/igt_atomic.h"
20
21 #include "selftests/mock_drm.h"
22
23 #include "gem/selftests/mock_context.h"
24 #include "gem/selftests/igt_gem_utils.h"
25
26 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
27
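/*
 * Test fixture shared by the hang selftests: a non-bannable kernel context
 * plus two internal objects, one acting as a per-context "hardware status
 * page" (hws/seqno) and one holding the spinning batch (obj/batch).
 */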
28 struct hang {
29 struct intel_gt *gt;
30 struct drm_i915_gem_object *hws;
31 struct drm_i915_gem_object *obj;
32 struct i915_gem_context *ctx;
33 u32 *seqno;
34 u32 *batch;
35 };
36
37 static int hang_init(struct hang *h, struct intel_gt *gt)
38 {
39 void *vaddr;
40 int err;
41
42 memset(h, 0, sizeof(*h));
43 h->gt = gt;
44
45 h->ctx = kernel_context(gt->i915);
46 if (IS_ERR(h->ctx))
47 return PTR_ERR(h->ctx);
48
49 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
50
51 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
52 if (IS_ERR(h->hws)) {
53 err = PTR_ERR(h->hws);
54 goto err_ctx;
55 }
56
57 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
58 if (IS_ERR(h->obj)) {
59 err = PTR_ERR(h->obj);
60 goto err_hws;
61 }
62
63 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
64 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
65 if (IS_ERR(vaddr)) {
66 err = PTR_ERR(vaddr);
67 goto err_obj;
68 }
69 h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
70
71 vaddr = i915_gem_object_pin_map_unlocked(h->obj,
72 i915_coherent_map_type(gt->i915));
73 if (IS_ERR(vaddr)) {
74 err = PTR_ERR(vaddr);
75 goto err_unpin_hws;
76 }
77 h->batch = vaddr;
78
79 return 0;
80
81 err_unpin_hws:
82 i915_gem_object_unpin_map(h->hws);
83 err_obj:
84 i915_gem_object_put(h->obj);
85 err_hws:
86 i915_gem_object_put(h->hws);
87 err_ctx:
88 kernel_context_close(h->ctx);
89 return err;
90 }
91
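/*
 * Each request reports its progress via a u32 slot in the HWS page indexed
 * by its fence context, so hangs on different engines do not clash.
 */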
92 static u64 hws_address(const struct i915_vma *hws,
93 const struct i915_request *rq)
94 {
95 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
96 }
97
98 static int move_to_active(struct i915_vma *vma,
99 struct i915_request *rq,
100 unsigned int flags)
101 {
102 int err;
103
104 i915_vma_lock(vma);
105 err = i915_request_await_object(rq, vma->obj,
106 flags & EXEC_OBJECT_WRITE);
107 if (err == 0)
108 err = i915_vma_move_to_active(vma, rq, flags);
109 i915_vma_unlock(vma);
110
111 return err;
112 }
113
114 static struct i915_request *
115 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
116 {
117 struct intel_gt *gt = h->gt;
118 struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
119 struct drm_i915_gem_object *obj;
120 struct i915_request *rq = NULL;
121 struct i915_vma *hws, *vma;
122 unsigned int flags;
123 void *vaddr;
124 u32 *batch;
125 int err;
126
127 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
128 if (IS_ERR(obj)) {
129 i915_vm_put(vm);
130 return ERR_CAST(obj);
131 }
132
133 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915));
134 if (IS_ERR(vaddr)) {
135 i915_gem_object_put(obj);
136 i915_vm_put(vm);
137 return ERR_CAST(vaddr);
138 }
139
140 i915_gem_object_unpin_map(h->obj);
141 i915_gem_object_put(h->obj);
142
143 h->obj = obj;
144 h->batch = vaddr;
145
146 vma = i915_vma_instance(h->obj, vm, NULL);
147 if (IS_ERR(vma)) {
148 i915_vm_put(vm);
149 return ERR_CAST(vma);
150 }
151
152 hws = i915_vma_instance(h->hws, vm, NULL);
153 if (IS_ERR(hws)) {
154 i915_vm_put(vm);
155 return ERR_CAST(hws);
156 }
157
158 err = i915_vma_pin(vma, 0, 0, PIN_USER);
159 if (err) {
160 i915_vm_put(vm);
161 return ERR_PTR(err);
162 }
163
164 err = i915_vma_pin(hws, 0, 0, PIN_USER);
165 if (err)
166 goto unpin_vma;
167
168 rq = igt_request_alloc(h->ctx, engine);
169 if (IS_ERR(rq)) {
170 err = PTR_ERR(rq);
171 goto unpin_hws;
172 }
173
174 err = move_to_active(vma, rq, 0);
175 if (err)
176 goto cancel_rq;
177
178 err = move_to_active(hws, rq, 0);
179 if (err)
180 goto cancel_rq;
181
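/*
 * Emit a batch that writes the request's seqno into its HWS slot and then
 * branches back to its own start, spinning until either the first dword is
 * replaced with MI_BATCH_BUFFER_END (see hang_fini) or the engine is reset.
 */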
182 batch = h->batch;
183 if (INTEL_GEN(gt->i915) >= 8) {
184 *batch++ = MI_STORE_DWORD_IMM_GEN4;
185 *batch++ = lower_32_bits(hws_address(hws, rq));
186 *batch++ = upper_32_bits(hws_address(hws, rq));
187 *batch++ = rq->fence.seqno;
188 *batch++ = MI_NOOP;
189
190 memset(batch, 0, 1024);
191 batch += 1024 / sizeof(*batch);
192
193 *batch++ = MI_NOOP;
194 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
195 *batch++ = lower_32_bits(vma->node.start);
196 *batch++ = upper_32_bits(vma->node.start);
197 } else if (INTEL_GEN(gt->i915) >= 6) {
198 *batch++ = MI_STORE_DWORD_IMM_GEN4;
199 *batch++ = 0;
200 *batch++ = lower_32_bits(hws_address(hws, rq));
201 *batch++ = rq->fence.seqno;
202 *batch++ = MI_NOOP;
203
204 memset(batch, 0, 1024);
205 batch += 1024 / sizeof(*batch);
206
207 *batch++ = MI_NOOP;
208 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
209 *batch++ = lower_32_bits(vma->node.start);
210 } else if (INTEL_GEN(gt->i915) >= 4) {
211 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
212 *batch++ = 0;
213 *batch++ = lower_32_bits(hws_address(hws, rq));
214 *batch++ = rq->fence.seqno;
215 *batch++ = MI_NOOP;
216
217 memset(batch, 0, 1024);
218 batch += 1024 / sizeof(*batch);
219
220 *batch++ = MI_NOOP;
221 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
222 *batch++ = lower_32_bits(vma->node.start);
223 } else {
224 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
225 *batch++ = lower_32_bits(hws_address(hws, rq));
226 *batch++ = rq->fence.seqno;
227 *batch++ = MI_NOOP;
228
229 memset(batch, 0, 1024);
230 batch += 1024 / sizeof(*batch);
231
232 *batch++ = MI_NOOP;
233 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
234 *batch++ = lower_32_bits(vma->node.start);
235 }
236 *batch++ = MI_BATCH_BUFFER_END; /* not reached */
237 intel_gt_chipset_flush(engine->gt);
238
239 if (rq->engine->emit_init_breadcrumb) {
240 err = rq->engine->emit_init_breadcrumb(rq);
241 if (err)
242 goto cancel_rq;
243 }
244
245 flags = 0;
246 if (INTEL_GEN(gt->i915) <= 5)
247 flags |= I915_DISPATCH_SECURE;
248
249 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
250
251 cancel_rq:
252 if (err) {
253 i915_request_set_error_once(rq, err);
254 i915_request_add(rq);
255 }
256 unpin_hws:
257 i915_vma_unpin(hws);
258 unpin_vma:
259 i915_vma_unpin(vma);
260 i915_vm_put(vm);
261 return err ? ERR_PTR(err) : rq;
262 }
263
264 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
265 {
266 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
267 }
268
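/* Break the spinner by terminating the batch, then release the fixture. */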
269 static void hang_fini(struct hang *h)
270 {
271 *h->batch = MI_BATCH_BUFFER_END;
272 intel_gt_chipset_flush(h->gt);
273
274 i915_gem_object_unpin_map(h->obj);
275 i915_gem_object_put(h->obj);
276
277 i915_gem_object_unpin_map(h->hws);
278 i915_gem_object_put(h->hws);
279
280 kernel_context_close(h->ctx);
281
282 igt_flush_test(h->gt->i915);
283 }
284
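/*
 * A request is considered running once its seqno shows up in the HWS slot
 * written by the spinner; poll for up to 10us, then wait for up to 1s.
 */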
285 static bool wait_until_running(struct hang *h, struct i915_request *rq)
286 {
287 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
288 rq->fence.seqno),
289 10) &&
290 wait_for(i915_seqno_passed(hws_seqno(h, rq),
291 rq->fence.seqno),
292 1000));
293 }
294
295 static int igt_hang_sanitycheck(void *arg)
296 {
297 struct intel_gt *gt = arg;
298 struct i915_request *rq;
299 struct intel_engine_cs *engine;
300 enum intel_engine_id id;
301 struct hang h;
302 int err;
303
304 /* Basic check that we can execute our hanging batch */
305
306 err = hang_init(&h, gt);
307 if (err)
308 return err;
309
310 for_each_engine(engine, gt, id) {
311 struct intel_wedge_me w;
312 long timeout;
313
314 if (!intel_engine_can_store_dword(engine))
315 continue;
316
317 rq = hang_create_request(&h, engine);
318 if (IS_ERR(rq)) {
319 err = PTR_ERR(rq);
320 pr_err("Failed to create request for %s, err=%d\n",
321 engine->name, err);
322 goto fini;
323 }
324
325 i915_request_get(rq);
326
327 *h.batch = MI_BATCH_BUFFER_END;
328 intel_gt_chipset_flush(engine->gt);
329
330 i915_request_add(rq);
331
332 timeout = 0;
333 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
334 timeout = i915_request_wait(rq, 0,
335 MAX_SCHEDULE_TIMEOUT);
336 if (intel_gt_is_wedged(gt))
337 timeout = -EIO;
338
339 i915_request_put(rq);
340
341 if (timeout < 0) {
342 err = timeout;
343 pr_err("Wait for request failed on %s, err=%d\n",
344 engine->name, err);
345 goto fini;
346 }
347 }
348
349 fini:
350 hang_fini(&h);
351 return err;
352 }
353
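/* Wait up to IGT_IDLE_TIMEOUT (ms) for the engine to report idle. */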
354 static bool wait_for_idle(struct intel_engine_cs *engine)
355 {
356 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
357 }
358
359 static int igt_reset_nop(void *arg)
360 {
361 struct intel_gt *gt = arg;
362 struct i915_gpu_error *global = &gt->i915->gpu_error;
363 struct intel_engine_cs *engine;
364 unsigned int reset_count, count;
365 enum intel_engine_id id;
366 IGT_TIMEOUT(end_time);
367 int err = 0;
368
369 /* Check that we can reset during non-user portions of requests */
370
371 reset_count = i915_reset_count(global);
372 count = 0;
373 do {
374 for_each_engine(engine, gt, id) {
375 struct intel_context *ce;
376 int i;
377
378 ce = intel_context_create(engine);
379 if (IS_ERR(ce)) {
380 err = PTR_ERR(ce);
381 break;
382 }
383
384 for (i = 0; i < 16; i++) {
385 struct i915_request *rq;
386
387 rq = intel_context_create_request(ce);
388 if (IS_ERR(rq)) {
389 err = PTR_ERR(rq);
390 break;
391 }
392
393 i915_request_add(rq);
394 }
395
396 intel_context_put(ce);
397 }
398
399 igt_global_reset_lock(gt);
400 intel_gt_reset(gt, ALL_ENGINES, NULL);
401 igt_global_reset_unlock(gt);
402
403 if (intel_gt_is_wedged(gt)) {
404 err = -EIO;
405 break;
406 }
407
408 if (i915_reset_count(global) != reset_count + ++count) {
409 pr_err("Full GPU reset not recorded!\n");
410 err = -EINVAL;
411 break;
412 }
413
414 err = igt_flush_test(gt->i915);
415 if (err)
416 break;
417 } while (time_before(jiffies, end_time));
418 pr_info("%s: %d resets\n", __func__, count);
419
420 if (igt_flush_test(gt->i915))
421 err = -EIO;
422 return err;
423 }
424
425 static int igt_reset_nop_engine(void *arg)
426 {
427 struct intel_gt *gt = arg;
428 struct i915_gpu_error *global = &gt->i915->gpu_error;
429 struct intel_engine_cs *engine;
430 enum intel_engine_id id;
431
432 /* Check that we can engine-reset during non-user portions */
433
434 if (!intel_has_reset_engine(gt))
435 return 0;
436
437 for_each_engine(engine, gt, id) {
438 unsigned int reset_count, reset_engine_count, count;
439 struct intel_context *ce;
440 IGT_TIMEOUT(end_time);
441 int err;
442
443 ce = intel_context_create(engine);
444 if (IS_ERR(ce))
445 return PTR_ERR(ce);
446
447 reset_count = i915_reset_count(global);
448 reset_engine_count = i915_reset_engine_count(global, engine);
449 count = 0;
450
451 st_engine_heartbeat_disable(engine);
452 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
453 do {
454 int i;
455
456 if (!wait_for_idle(engine)) {
457 pr_err("%s failed to idle before reset\n",
458 engine->name);
459 err = -EIO;
460 break;
461 }
462
463 for (i = 0; i < 16; i++) {
464 struct i915_request *rq;
465
466 rq = intel_context_create_request(ce);
467 if (IS_ERR(rq)) {
468 struct drm_printer p =
469 drm_info_printer(gt->i915->drm.dev);
470 intel_engine_dump(engine, &p,
471 "%s(%s): failed to submit request\n",
472 __func__,
473 engine->name);
474
475 GEM_TRACE("%s(%s): failed to submit request\n",
476 __func__,
477 engine->name);
478 GEM_TRACE_DUMP();
479
480 intel_gt_set_wedged(gt);
481
482 err = PTR_ERR(rq);
483 break;
484 }
485
486 i915_request_add(rq);
487 }
488 err = intel_engine_reset(engine, NULL);
489 if (err) {
490 pr_err("intel_engine_reset(%s) failed, err:%d\n",
491 engine->name, err);
492 break;
493 }
494
495 if (i915_reset_count(global) != reset_count) {
496 pr_err("Full GPU reset recorded! (engine reset expected)\n");
497 err = -EINVAL;
498 break;
499 }
500
501 if (i915_reset_engine_count(global, engine) !=
502 reset_engine_count + ++count) {
503 pr_err("%s engine reset not recorded!\n",
504 engine->name);
505 err = -EINVAL;
506 break;
507 }
508 } while (time_before(jiffies, end_time));
509 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
510 st_engine_heartbeat_enable(engine);
511
512 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
513
514 intel_context_put(ce);
515 if (igt_flush_test(gt->i915))
516 err = -EIO;
517 if (err)
518 return err;
519 }
520
521 return 0;
522 }
523
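/*
 * Rig the selftest fault injection so that the next engine reset attempt
 * times out (only effective on gen8+, see igt_reset_fail_engine).
 */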
524 static void force_reset_timeout(struct intel_engine_cs *engine)
525 {
526 engine->reset_timeout.probability = 999;
527 atomic_set(&engine->reset_timeout.times, -1);
528 }
529
530 static void cancel_reset_timeout(struct intel_engine_cs *engine)
531 {
532 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
533 }
534
535 static int igt_reset_fail_engine(void *arg)
536 {
537 struct intel_gt *gt = arg;
538 struct intel_engine_cs *engine;
539 enum intel_engine_id id;
540
541 /* Check that we can recover from engine-reset failures */
542
543 if (!intel_has_reset_engine(gt))
544 return 0;
545
546 for_each_engine(engine, gt, id) {
547 unsigned int count;
548 struct intel_context *ce;
549 IGT_TIMEOUT(end_time);
550 int err;
551
552 ce = intel_context_create(engine);
553 if (IS_ERR(ce))
554 return PTR_ERR(ce);
555
556 st_engine_heartbeat_disable(engine);
557 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
558
559 force_reset_timeout(engine);
560 err = intel_engine_reset(engine, NULL);
561 cancel_reset_timeout(engine);
562 if (err == 0) /* timeouts only generated on gen8+ */
563 goto skip;
564
565 count = 0;
566 do {
567 struct i915_request *last = NULL;
568 int i;
569
570 if (!wait_for_idle(engine)) {
571 pr_err("%s failed to idle before reset\n",
572 engine->name);
573 err = -EIO;
574 break;
575 }
576
577 for (i = 0; i < count % 15; i++) {
578 struct i915_request *rq;
579
580 rq = intel_context_create_request(ce);
581 if (IS_ERR(rq)) {
582 struct drm_printer p =
583 drm_info_printer(gt->i915->drm.dev);
584 intel_engine_dump(engine, &p,
585 "%s(%s): failed to submit request\n",
586 __func__,
587 engine->name);
588
589 GEM_TRACE("%s(%s): failed to submit request\n",
590 __func__,
591 engine->name);
592 GEM_TRACE_DUMP();
593
594 intel_gt_set_wedged(gt);
595 if (last)
596 i915_request_put(last);
597
598 err = PTR_ERR(rq);
599 goto out;
600 }
601
602 if (last)
603 i915_request_put(last);
604 last = i915_request_get(rq);
605 i915_request_add(rq);
606 }
607
608 if (count & 1) {
609 err = intel_engine_reset(engine, NULL);
610 if (err) {
611 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
612 engine->name, err);
613 GEM_TRACE_DUMP();
614 i915_request_put(last);
615 break;
616 }
617 } else {
618 force_reset_timeout(engine);
619 err = intel_engine_reset(engine, NULL);
620 cancel_reset_timeout(engine);
621 if (err != -ETIMEDOUT) {
622 pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
623 engine->name, err);
624 i915_request_put(last);
625 break;
626 }
627 }
628
629 err = 0;
630 if (last) {
631 if (i915_request_wait(last, 0, HZ / 2) < 0) {
632 struct drm_printer p =
633 drm_info_printer(gt->i915->drm.dev);
634
635 intel_engine_dump(engine, &p,
636 "%s(%s): failed to complete request\n",
637 __func__,
638 engine->name);
639
640 GEM_TRACE("%s(%s): failed to complete request\n",
641 __func__,
642 engine->name);
643 GEM_TRACE_DUMP();
644
645 err = -EIO;
646 }
647 i915_request_put(last);
648 }
649 count++;
650 } while (err == 0 && time_before(jiffies, end_time));
651 out:
652 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
653 skip:
654 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
655 st_engine_heartbeat_enable(engine);
656 intel_context_put(ce);
657
658 if (igt_flush_test(gt->i915))
659 err = -EIO;
660 if (err)
661 return err;
662 }
663
664 return 0;
665 }
666
667 static int __igt_reset_engine(struct intel_gt *gt, bool active)
668 {
669 struct i915_gpu_error *global = &gt->i915->gpu_error;
670 struct intel_engine_cs *engine;
671 enum intel_engine_id id;
672 struct hang h;
673 int err = 0;
674
675 /* Check that we can issue an engine reset on an idle engine (no-op) */
676
677 if (!intel_has_reset_engine(gt))
678 return 0;
679
680 if (active) {
681 err = hang_init(&h, gt);
682 if (err)
683 return err;
684 }
685
686 for_each_engine(engine, gt, id) {
687 unsigned int reset_count, reset_engine_count;
688 unsigned long count;
689 IGT_TIMEOUT(end_time);
690
691 if (active && !intel_engine_can_store_dword(engine))
692 continue;
693
694 if (!wait_for_idle(engine)) {
695 pr_err("%s failed to idle before reset\n",
696 engine->name);
697 err = -EIO;
698 break;
699 }
700
701 reset_count = i915_reset_count(global);
702 reset_engine_count = i915_reset_engine_count(global, engine);
703
704 st_engine_heartbeat_disable(engine);
705 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
706 count = 0;
707 do {
708 if (active) {
709 struct i915_request *rq;
710
711 rq = hang_create_request(&h, engine);
712 if (IS_ERR(rq)) {
713 err = PTR_ERR(rq);
714 break;
715 }
716
717 i915_request_get(rq);
718 i915_request_add(rq);
719
720 if (!wait_until_running(&h, rq)) {
721 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
722
723 pr_err("%s: Failed to start request %llx, at %x\n",
724 __func__, rq->fence.seqno, hws_seqno(&h, rq));
725 intel_engine_dump(engine, &p,
726 "%s\n", engine->name);
727
728 i915_request_put(rq);
729 err = -EIO;
730 break;
731 }
732
733 i915_request_put(rq);
734 }
735
736 err = intel_engine_reset(engine, NULL);
737 if (err) {
738 pr_err("intel_engine_reset(%s) failed, err:%d\n",
739 engine->name, err);
740 break;
741 }
742
743 if (i915_reset_count(global) != reset_count) {
744 pr_err("Full GPU reset recorded! (engine reset expected)\n");
745 err = -EINVAL;
746 break;
747 }
748
749 if (i915_reset_engine_count(global, engine) !=
750 ++reset_engine_count) {
751 pr_err("%s engine reset not recorded!\n",
752 engine->name);
753 err = -EINVAL;
754 break;
755 }
756
757 count++;
758 } while (time_before(jiffies, end_time));
759 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
760 st_engine_heartbeat_enable(engine);
761 pr_info("%s: Completed %lu %s resets\n",
762 engine->name, count, active ? "active" : "idle");
763
764 if (err)
765 break;
766
767 err = igt_flush_test(gt->i915);
768 if (err)
769 break;
770 }
771
772 if (intel_gt_is_wedged(gt))
773 err = -EIO;
774
775 if (active)
776 hang_fini(&h);
777
778 return err;
779 }
780
781 static int igt_reset_idle_engine(void *arg)
782 {
783 return __igt_reset_engine(arg, false);
784 }
785
786 static int igt_reset_active_engine(void *arg)
787 {
788 return __igt_reset_engine(arg, true);
789 }
790
791 struct active_engine {
792 struct task_struct *task;
793 struct intel_engine_cs *engine;
794 unsigned long resets;
795 unsigned int flags;
796 };
797
798 #define TEST_ACTIVE BIT(0)
799 #define TEST_OTHERS BIT(1)
800 #define TEST_SELF BIT(2)
801 #define TEST_PRIORITY BIT(3)
802
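/* Wait for a background request to complete, wedging the GT if it hangs. */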
803 static int active_request_put(struct i915_request *rq)
804 {
805 int err = 0;
806
807 if (!rq)
808 return 0;
809
810 if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
811 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
812 rq->engine->name,
813 rq->fence.context,
814 rq->fence.seqno);
815 GEM_TRACE_DUMP();
816
817 intel_gt_set_wedged(rq->engine->gt);
818 err = -EIO;
819 }
820
821 i915_request_put(rq);
822
823 return err;
824 }
825
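/*
 * kthread body: keep an engine busy with a rolling window of eight requests
 * on private contexts while the parent performs resets, optionally tagging
 * each request with a random priority when TEST_PRIORITY is set.
 */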
826 static int active_engine(void *data)
827 {
828 I915_RND_STATE(prng);
829 struct active_engine *arg = data;
830 struct intel_engine_cs *engine = arg->engine;
831 struct i915_request *rq[8] = {};
832 struct intel_context *ce[ARRAY_SIZE(rq)];
833 unsigned long count;
834 int err = 0;
835
836 for (count = 0; count < ARRAY_SIZE(ce); count++) {
837 ce[count] = intel_context_create(engine);
838 if (IS_ERR(ce[count])) {
839 err = PTR_ERR(ce[count]);
840 while (count--) /* unwind all created contexts, including ce[0] */
841 intel_context_put(ce[count]);
842 return err;
843 }
844 }
845
846 count = 0;
847 while (!kthread_should_stop()) {
848 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
849 struct i915_request *old = rq[idx];
850 struct i915_request *new;
851
852 new = intel_context_create_request(ce[idx]);
853 if (IS_ERR(new)) {
854 err = PTR_ERR(new);
855 break;
856 }
857
858 rq[idx] = i915_request_get(new);
859 i915_request_add(new);
860
861 if (engine->schedule && arg->flags & TEST_PRIORITY) {
862 struct i915_sched_attr attr = {
863 .priority =
864 i915_prandom_u32_max_state(512, &prng),
865 };
866 engine->schedule(rq[idx], &attr);
867 }
868
869 err = active_request_put(old);
870 if (err)
871 break;
872
873 cond_resched();
874 }
875
876 for (count = 0; count < ARRAY_SIZE(rq); count++) {
877 int err__ = active_request_put(rq[count]);
878
879 /* Keep the first error */
880 if (!err)
881 err = err__;
882
883 intel_context_put(ce[count]);
884 }
885
886 return err;
887 }
888
889 static int __igt_reset_engines(struct intel_gt *gt,
890 const char *test_name,
891 unsigned int flags)
892 {
893 struct i915_gpu_error *global = &gt->i915->gpu_error;
894 struct intel_engine_cs *engine, *other;
895 enum intel_engine_id id, tmp;
896 struct hang h;
897 int err = 0;
898
899 /* Check that issuing a reset on one engine does not interfere
900 * with any other engine.
901 */
902
903 if (!intel_has_reset_engine(gt))
904 return 0;
905
906 if (flags & TEST_ACTIVE) {
907 err = hang_init(&h, gt);
908 if (err)
909 return err;
910
911 if (flags & TEST_PRIORITY)
912 h.ctx->sched.priority = 1024;
913 }
914
915 for_each_engine(engine, gt, id) {
916 struct active_engine threads[I915_NUM_ENGINES] = {};
917 unsigned long device = i915_reset_count(global);
918 unsigned long count = 0, reported;
919 IGT_TIMEOUT(end_time);
920
921 if (flags & TEST_ACTIVE &&
922 !intel_engine_can_store_dword(engine))
923 continue;
924
925 if (!wait_for_idle(engine)) {
926 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
927 engine->name, test_name);
928 err = -EIO;
929 break;
930 }
931
932 memset(threads, 0, sizeof(threads));
933 for_each_engine(other, gt, tmp) {
934 struct task_struct *tsk;
935
936 threads[tmp].resets =
937 i915_reset_engine_count(global, other);
938
939 if (other == engine && !(flags & TEST_SELF))
940 continue;
941
942 if (other != engine && !(flags & TEST_OTHERS))
943 continue;
944
945 threads[tmp].engine = other;
946 threads[tmp].flags = flags;
947
948 tsk = kthread_run(active_engine, &threads[tmp],
949 "igt/%s", other->name);
950 if (IS_ERR(tsk)) {
951 err = PTR_ERR(tsk);
952 goto unwind;
953 }
954
955 threads[tmp].task = tsk;
956 get_task_struct(tsk);
957 }
958
959 yield(); /* start all threads before we begin */
960
961 st_engine_heartbeat_disable(engine);
962 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
963 do {
964 struct i915_request *rq = NULL;
965
966 if (flags & TEST_ACTIVE) {
967 rq = hang_create_request(&h, engine);
968 if (IS_ERR(rq)) {
969 err = PTR_ERR(rq);
970 break;
971 }
972
973 i915_request_get(rq);
974 i915_request_add(rq);
975
976 if (!wait_until_running(&h, rq)) {
977 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
978
979 pr_err("%s: Failed to start request %llx, at %x\n",
980 __func__, rq->fence.seqno, hws_seqno(&h, rq));
981 intel_engine_dump(engine, &p,
982 "%s\n", engine->name);
983
984 i915_request_put(rq);
985 err = -EIO;
986 break;
987 }
988 }
989
990 err = intel_engine_reset(engine, NULL);
991 if (err) {
992 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
993 engine->name, test_name, err);
994 break;
995 }
996
997 count++;
998
999 if (rq) {
1000 if (rq->fence.error != -EIO) {
1001 pr_err("i915_reset_engine(%s:%s):"
1002 " failed to reset request %llx:%lld\n",
1003 engine->name, test_name,
1004 rq->fence.context,
1005 rq->fence.seqno);
1006 i915_request_put(rq);
1007
1008 GEM_TRACE_DUMP();
1009 intel_gt_set_wedged(gt);
1010 err = -EIO;
1011 break;
1012 }
1013
1014 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1015 struct drm_printer p =
1016 drm_info_printer(gt->i915->drm.dev);
1017
1018 pr_err("i915_reset_engine(%s:%s):"
1019 " failed to complete request %llx:%lld after reset\n",
1020 engine->name, test_name,
1021 rq->fence.context,
1022 rq->fence.seqno);
1023 intel_engine_dump(engine, &p,
1024 "%s\n", engine->name);
1025 i915_request_put(rq);
1026
1027 GEM_TRACE_DUMP();
1028 intel_gt_set_wedged(gt);
1029 err = -EIO;
1030 break;
1031 }
1032
1033 i915_request_put(rq);
1034 }
1035
1036 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1037 struct drm_printer p =
1038 drm_info_printer(gt->i915->drm.dev);
1039
1040 pr_err("i915_reset_engine(%s:%s):"
1041 " failed to idle after reset\n",
1042 engine->name, test_name);
1043 intel_engine_dump(engine, &p,
1044 "%s\n", engine->name);
1045
1046 err = -EIO;
1047 break;
1048 }
1049 } while (time_before(jiffies, end_time));
1050 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1051 st_engine_heartbeat_enable(engine);
1052
1053 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1054 engine->name, test_name, count);
1055
1056 reported = i915_reset_engine_count(global, engine);
1057 reported -= threads[engine->id].resets;
1058 if (reported != count) {
1059 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1060 engine->name, test_name, count, reported);
1061 if (!err)
1062 err = -EINVAL;
1063 }
1064
1065 unwind:
1066 for_each_engine(other, gt, tmp) {
1067 int ret;
1068
1069 if (!threads[tmp].task)
1070 continue;
1071
1072 ret = kthread_stop(threads[tmp].task);
1073 if (ret) {
1074 pr_err("kthread for other engine %s failed, err=%d\n",
1075 other->name, ret);
1076 if (!err)
1077 err = ret;
1078 }
1079 put_task_struct(threads[tmp].task);
1080
1081 if (other->uabi_class != engine->uabi_class &&
1082 threads[tmp].resets !=
1083 i915_reset_engine_count(global, other)) {
1084 pr_err("Innocent engine %s was reset (count=%ld)\n",
1085 other->name,
1086 i915_reset_engine_count(global, other) -
1087 threads[tmp].resets);
1088 if (!err)
1089 err = -EINVAL;
1090 }
1091 }
1092
1093 if (device != i915_reset_count(global)) {
1094 pr_err("Global reset (count=%ld)!\n",
1095 i915_reset_count(global) - device);
1096 if (!err)
1097 err = -EINVAL;
1098 }
1099
1100 if (err)
1101 break;
1102
1103 err = igt_flush_test(gt->i915);
1104 if (err)
1105 break;
1106 }
1107
1108 if (intel_gt_is_wedged(gt))
1109 err = -EIO;
1110
1111 if (flags & TEST_ACTIVE)
1112 hang_fini(&h);
1113
1114 return err;
1115 }
1116
1117 static int igt_reset_engines(void *arg)
1118 {
1119 static const struct {
1120 const char *name;
1121 unsigned int flags;
1122 } phases[] = {
1123 { "idle", 0 },
1124 { "active", TEST_ACTIVE },
1125 { "others-idle", TEST_OTHERS },
1126 { "others-active", TEST_OTHERS | TEST_ACTIVE },
1127 {
1128 "others-priority",
1129 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1130 },
1131 {
1132 "self-priority",
1133 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1134 },
1135 { }
1136 };
1137 struct intel_gt *gt = arg;
1138 typeof(*phases) *p;
1139 int err;
1140
1141 for (p = phases; p->name; p++) {
1142 if (p->flags & TEST_PRIORITY) {
1143 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1144 continue;
1145 }
1146
1147 err = __igt_reset_engines(arg, p->name, p->flags);
1148 if (err)
1149 return err;
1150 }
1151
1152 return 0;
1153 }
1154
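/* Stand in for hangcheck: trigger the reset and return the prior reset count. */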
1155 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1156 {
1157 u32 count = i915_reset_count(&gt->i915->gpu_error);
1158
1159 intel_gt_reset(gt, mask, NULL);
1160
1161 return count;
1162 }
1163
1164 static int igt_reset_wait(void *arg)
1165 {
1166 struct intel_gt *gt = arg;
1167 struct i915_gpu_error *global = &gt->i915->gpu_error;
1168 struct intel_engine_cs *engine = gt->engine[RCS0];
1169 struct i915_request *rq;
1170 unsigned int reset_count;
1171 struct hang h;
1172 long timeout;
1173 int err;
1174
1175 if (!engine || !intel_engine_can_store_dword(engine))
1176 return 0;
1177
1178 /* Check that we detect a stuck waiter and issue a reset */
1179
1180 igt_global_reset_lock(gt);
1181
1182 err = hang_init(&h, gt);
1183 if (err)
1184 goto unlock;
1185
1186 rq = hang_create_request(&h, engine);
1187 if (IS_ERR(rq)) {
1188 err = PTR_ERR(rq);
1189 goto fini;
1190 }
1191
1192 i915_request_get(rq);
1193 i915_request_add(rq);
1194
1195 if (!wait_until_running(&h, rq)) {
1196 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1197
1198 pr_err("%s: Failed to start request %llx, at %x\n",
1199 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1200 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1201
1202 intel_gt_set_wedged(gt);
1203
1204 err = -EIO;
1205 goto out_rq;
1206 }
1207
1208 reset_count = fake_hangcheck(gt, ALL_ENGINES);
1209
1210 timeout = i915_request_wait(rq, 0, 10);
1211 if (timeout < 0) {
1212 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1213 timeout);
1214 err = timeout;
1215 goto out_rq;
1216 }
1217
1218 if (i915_reset_count(global) == reset_count) {
1219 pr_err("No GPU reset recorded!\n");
1220 err = -EINVAL;
1221 goto out_rq;
1222 }
1223
1224 out_rq:
1225 i915_request_put(rq);
1226 fini:
1227 hang_fini(&h);
1228 unlock:
1229 igt_global_reset_unlock(gt);
1230
1231 if (intel_gt_is_wedged(gt))
1232 return -EIO;
1233
1234 return err;
1235 }
1236
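/*
 * Helpers run from a kthread that try to evict (or re-fence) a vma pinned
 * by a hanging request; the completion signals that the thread has started.
 */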
1237 struct evict_vma {
1238 struct completion completion;
1239 struct i915_vma *vma;
1240 };
1241
1242 static int evict_vma(void *data)
1243 {
1244 struct evict_vma *arg = data;
1245 struct i915_address_space *vm = arg->vma->vm;
1246 struct drm_mm_node evict = arg->vma->node;
1247 int err;
1248
1249 complete(&arg->completion);
1250
1251 mutex_lock(&vm->mutex);
1252 err = i915_gem_evict_for_node(vm, &evict, 0);
1253 mutex_unlock(&vm->mutex);
1254
1255 return err;
1256 }
1257
1258 static int evict_fence(void *data)
1259 {
1260 struct evict_vma *arg = data;
1261 int err;
1262
1263 complete(&arg->completion);
1264
1265 /* Mark the fence register as dirty to force the mmio update. */
1266 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1267 if (err) {
1268 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1269 return err;
1270 }
1271
1272 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1273 if (err) {
1274 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1275 return err;
1276 }
1277
1278 err = i915_vma_pin_fence(arg->vma);
1279 i915_vma_unpin(arg->vma);
1280 if (err) {
1281 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1282 return err;
1283 }
1284
1285 i915_vma_unpin_fence(arg->vma);
1286
1287 return 0;
1288 }
1289
1290 static int __igt_reset_evict_vma(struct intel_gt *gt,
1291 struct i915_address_space *vm,
1292 int (*fn)(void *),
1293 unsigned int flags)
1294 {
1295 struct intel_engine_cs *engine = gt->engine[RCS0];
1296 struct drm_i915_gem_object *obj;
1297 struct task_struct *tsk = NULL;
1298 struct i915_request *rq;
1299 struct evict_vma arg;
1300 struct hang h;
1301 unsigned int pin_flags;
1302 int err;
1303
1304 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1305 return 0;
1306
1307 if (!engine || !intel_engine_can_store_dword(engine))
1308 return 0;
1309
1310 /* Check that we can recover an unbind stuck on a hanging request */
1311
1312 err = hang_init(&h, gt);
1313 if (err)
1314 return err;
1315
1316 obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1317 if (IS_ERR(obj)) {
1318 err = PTR_ERR(obj);
1319 goto fini;
1320 }
1321
1322 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1323 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1324 if (err) {
1325 pr_err("Invalid X-tiling settings; err:%d\n", err);
1326 goto out_obj;
1327 }
1328 }
1329
1330 arg.vma = i915_vma_instance(obj, vm, NULL);
1331 if (IS_ERR(arg.vma)) {
1332 err = PTR_ERR(arg.vma);
1333 goto out_obj;
1334 }
1335
1336 rq = hang_create_request(&h, engine);
1337 if (IS_ERR(rq)) {
1338 err = PTR_ERR(rq);
1339 goto out_obj;
1340 }
1341
1342 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1343
1344 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1345 pin_flags |= PIN_MAPPABLE;
1346
1347 err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1348 if (err) {
1349 i915_request_add(rq);
1350 goto out_obj;
1351 }
1352
1353 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1354 err = i915_vma_pin_fence(arg.vma);
1355 if (err) {
1356 pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1357 i915_vma_unpin(arg.vma);
1358 i915_request_add(rq);
1359 goto out_obj;
1360 }
1361 }
1362
1363 i915_vma_lock(arg.vma);
1364 err = i915_request_await_object(rq, arg.vma->obj,
1365 flags & EXEC_OBJECT_WRITE);
1366 if (err == 0)
1367 err = i915_vma_move_to_active(arg.vma, rq, flags);
1368 i915_vma_unlock(arg.vma);
1369
1370 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1371 i915_vma_unpin_fence(arg.vma);
1372 i915_vma_unpin(arg.vma);
1373
1374 i915_request_get(rq);
1375 i915_request_add(rq);
1376 if (err)
1377 goto out_rq;
1378
1379 if (!wait_until_running(&h, rq)) {
1380 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1381
1382 pr_err("%s: Failed to start request %llx, at %x\n",
1383 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1384 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1385
1386 intel_gt_set_wedged(gt);
1387 goto out_reset;
1388 }
1389
1390 init_completion(&arg.completion);
1391
1392 tsk = kthread_run(fn, &arg, "igt/evict_vma");
1393 if (IS_ERR(tsk)) {
1394 err = PTR_ERR(tsk);
1395 tsk = NULL;
1396 goto out_reset;
1397 }
1398 get_task_struct(tsk);
1399
1400 wait_for_completion(&arg.completion);
1401
1402 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1403 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1404
1405 pr_err("igt/evict_vma kthread did not wait\n");
1406 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1407
1408 intel_gt_set_wedged(gt);
1409 goto out_reset;
1410 }
1411
1412 out_reset:
1413 igt_global_reset_lock(gt);
1414 fake_hangcheck(gt, rq->engine->mask);
1415 igt_global_reset_unlock(gt);
1416
1417 if (tsk) {
1418 struct intel_wedge_me w;
1419
1420 /* The reset, even indirectly, should take less than 10ms. */
1421 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1422 err = kthread_stop(tsk);
1423
1424 put_task_struct(tsk);
1425 }
1426
1427 out_rq:
1428 i915_request_put(rq);
1429 out_obj:
1430 i915_gem_object_put(obj);
1431 fini:
1432 hang_fini(&h);
1433 if (intel_gt_is_wedged(gt))
1434 return -EIO;
1435
1436 return err;
1437 }
1438
1439 static int igt_reset_evict_ggtt(void *arg)
1440 {
1441 struct intel_gt *gt = arg;
1442
1443 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1444 evict_vma, EXEC_OBJECT_WRITE);
1445 }
1446
1447 static int igt_reset_evict_ppgtt(void *arg)
1448 {
1449 struct intel_gt *gt = arg;
1450 struct i915_ppgtt *ppgtt;
1451 int err;
1452
1453 /* aliasing == global gtt locking, covered above */
1454 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1455 return 0;
1456
1457 ppgtt = i915_ppgtt_create(gt);
1458 if (IS_ERR(ppgtt))
1459 return PTR_ERR(ppgtt);
1460
1461 err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1462 evict_vma, EXEC_OBJECT_WRITE);
1463 i915_vm_put(&ppgtt->vm);
1464
1465 return err;
1466 }
1467
1468 static int igt_reset_evict_fence(void *arg)
1469 {
1470 struct intel_gt *gt = arg;
1471
1472 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1473 evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1474 }
1475
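/* Wait for every engine other than @exclude to idle. */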
1476 static int wait_for_others(struct intel_gt *gt,
1477 struct intel_engine_cs *exclude)
1478 {
1479 struct intel_engine_cs *engine;
1480 enum intel_engine_id id;
1481
1482 for_each_engine(engine, gt, id) {
1483 if (engine == exclude)
1484 continue;
1485
1486 if (!wait_for_idle(engine))
1487 return -EIO;
1488 }
1489
1490 return 0;
1491 }
1492
1493 static int igt_reset_queue(void *arg)
1494 {
1495 struct intel_gt *gt = arg;
1496 struct i915_gpu_error *global = &gt->i915->gpu_error;
1497 struct intel_engine_cs *engine;
1498 enum intel_engine_id id;
1499 struct hang h;
1500 int err;
1501
1502 /* Check that we replay pending requests following a hang */
1503
1504 igt_global_reset_lock(gt);
1505
1506 err = hang_init(&h, gt);
1507 if (err)
1508 goto unlock;
1509
1510 for_each_engine(engine, gt, id) {
1511 struct i915_request *prev;
1512 IGT_TIMEOUT(end_time);
1513 unsigned int count;
1514
1515 if (!intel_engine_can_store_dword(engine))
1516 continue;
1517
1518 prev = hang_create_request(&h, engine);
1519 if (IS_ERR(prev)) {
1520 err = PTR_ERR(prev);
1521 goto fini;
1522 }
1523
1524 i915_request_get(prev);
1525 i915_request_add(prev);
1526
1527 count = 0;
1528 do {
1529 struct i915_request *rq;
1530 unsigned int reset_count;
1531
1532 rq = hang_create_request(&h, engine);
1533 if (IS_ERR(rq)) {
1534 err = PTR_ERR(rq);
1535 goto fini;
1536 }
1537
1538 i915_request_get(rq);
1539 i915_request_add(rq);
1540
1541 /*
1542 * XXX We don't handle resetting the kernel context
1543 * very well. If we trigger a device reset twice in
1544 * quick succession while the kernel context is
1545 * executing, we may end up skipping the breadcrumb.
1546 * This is really only a problem for the selftest as
1547 * normally there is a large interlude between resets
1548 * (hangcheck), or we focus on resetting just one
1549 * engine and so avoid repeatedly resetting innocents.
1550 */
1551 err = wait_for_others(gt, engine);
1552 if (err) {
1553 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1554 __func__, engine->name);
1555 i915_request_put(rq);
1556 i915_request_put(prev);
1557
1558 GEM_TRACE_DUMP();
1559 intel_gt_set_wedged(gt);
1560 goto fini;
1561 }
1562
1563 if (!wait_until_running(&h, prev)) {
1564 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1565
1566 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1567 __func__, engine->name,
1568 prev->fence.seqno, hws_seqno(&h, prev));
1569 intel_engine_dump(engine, &p,
1570 "%s\n", engine->name);
1571
1572 i915_request_put(rq);
1573 i915_request_put(prev);
1574
1575 intel_gt_set_wedged(gt);
1576
1577 err = -EIO;
1578 goto fini;
1579 }
1580
1581 reset_count = fake_hangcheck(gt, BIT(id));
1582
1583 if (prev->fence.error != -EIO) {
1584 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1585 prev->fence.error);
1586 i915_request_put(rq);
1587 i915_request_put(prev);
1588 err = -EINVAL;
1589 goto fini;
1590 }
1591
1592 if (rq->fence.error) {
1593 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1594 rq->fence.error);
1595 i915_request_put(rq);
1596 i915_request_put(prev);
1597 err = -EINVAL;
1598 goto fini;
1599 }
1600
1601 if (i915_reset_count(global) == reset_count) {
1602 pr_err("No GPU reset recorded!\n");
1603 i915_request_put(rq);
1604 i915_request_put(prev);
1605 err = -EINVAL;
1606 goto fini;
1607 }
1608
1609 i915_request_put(prev);
1610 prev = rq;
1611 count++;
1612 } while (time_before(jiffies, end_time));
1613 pr_info("%s: Completed %d queued resets\n",
1614 engine->name, count);
1615
1616 *h.batch = MI_BATCH_BUFFER_END;
1617 intel_gt_chipset_flush(engine->gt);
1618
1619 i915_request_put(prev);
1620
1621 err = igt_flush_test(gt->i915);
1622 if (err)
1623 break;
1624 }
1625
1626 fini:
1627 hang_fini(&h);
1628 unlock:
1629 igt_global_reset_unlock(gt);
1630
1631 if (intel_gt_is_wedged(gt))
1632 return -EIO;
1633
1634 return err;
1635 }
1636
1637 static int igt_handle_error(void *arg)
1638 {
1639 struct intel_gt *gt = arg;
1640 struct i915_gpu_error *global = &gt->i915->gpu_error;
1641 struct intel_engine_cs *engine = gt->engine[RCS0];
1642 struct hang h;
1643 struct i915_request *rq;
1644 struct i915_gpu_coredump *error;
1645 int err;
1646
1647 /* Check that we can issue a global GPU and engine reset */
1648
1649 if (!intel_has_reset_engine(gt))
1650 return 0;
1651
1652 if (!engine || !intel_engine_can_store_dword(engine))
1653 return 0;
1654
1655 err = hang_init(&h, gt);
1656 if (err)
1657 return err;
1658
1659 rq = hang_create_request(&h, engine);
1660 if (IS_ERR(rq)) {
1661 err = PTR_ERR(rq);
1662 goto err_fini;
1663 }
1664
1665 i915_request_get(rq);
1666 i915_request_add(rq);
1667
1668 if (!wait_until_running(&h, rq)) {
1669 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1670
1671 pr_err("%s: Failed to start request %llx, at %x\n",
1672 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1673 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1674
1675 intel_gt_set_wedged(gt);
1676
1677 err = -EIO;
1678 goto err_request;
1679 }
1680
1681 /* Temporarily disable error capture */
1682 error = xchg(&global->first_error, (void *)-1);
1683
1684 intel_gt_handle_error(gt, engine->mask, 0, NULL);
1685
1686 xchg(&global->first_error, error);
1687
1688 if (rq->fence.error != -EIO) {
1689 pr_err("Guilty request not identified!\n");
1690 err = -EINVAL;
1691 goto err_request;
1692 }
1693
1694 err_request:
1695 i915_request_put(rq);
1696 err_fini:
1697 hang_fini(&h);
1698 return err;
1699 }
1700
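/*
 * Exercise __intel_engine_reset_bh() from inside the given atomic section,
 * with the execlists tasklet disabled so it cannot run underneath the reset.
 */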
1701 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1702 const struct igt_atomic_section *p,
1703 const char *mode)
1704 {
1705 struct tasklet_struct * const t = &engine->execlists.tasklet;
1706 int err;
1707
1708 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1709 engine->name, mode, p->name);
1710
1711 if (t->func)
1712 tasklet_disable(t);
1713 if (strcmp(p->name, "softirq"))
1714 local_bh_disable();
1715 p->critical_section_begin();
1716
1717 err = __intel_engine_reset_bh(engine, NULL);
1718
1719 p->critical_section_end();
1720 if (strcmp(p->name, "softirq"))
1721 local_bh_enable();
1722 if (t->func) {
1723 tasklet_enable(t);
1724 tasklet_hi_schedule(t);
1725 }
1726
1727 if (err)
1728 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1729 engine->name, mode, p->name);
1730
1731 return err;
1732 }
1733
1734 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1735 const struct igt_atomic_section *p)
1736 {
1737 struct i915_request *rq;
1738 struct hang h;
1739 int err;
1740
1741 err = __igt_atomic_reset_engine(engine, p, "idle");
1742 if (err)
1743 return err;
1744
1745 err = hang_init(&h, engine->gt);
1746 if (err)
1747 return err;
1748
1749 rq = hang_create_request(&h, engine);
1750 if (IS_ERR(rq)) {
1751 err = PTR_ERR(rq);
1752 goto out;
1753 }
1754
1755 i915_request_get(rq);
1756 i915_request_add(rq);
1757
1758 if (wait_until_running(&h, rq)) {
1759 err = __igt_atomic_reset_engine(engine, p, "active");
1760 } else {
1761 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1762 __func__, engine->name,
1763 rq->fence.seqno, hws_seqno(&h, rq));
1764 intel_gt_set_wedged(engine->gt);
1765 err = -EIO;
1766 }
1767
1768 if (err == 0) {
1769 struct intel_wedge_me w;
1770
1771 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1772 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1773 if (intel_gt_is_wedged(engine->gt))
1774 err = -EIO;
1775 }
1776
1777 i915_request_put(rq);
1778 out:
1779 hang_fini(&h);
1780 return err;
1781 }
1782
1783 static int igt_reset_engines_atomic(void *arg)
1784 {
1785 struct intel_gt *gt = arg;
1786 const typeof(*igt_atomic_phases) *p;
1787 int err = 0;
1788
1789 /* Check that the engines resets are usable from atomic context */
1790
1791 if (!intel_has_reset_engine(gt))
1792 return 0;
1793
1794 if (intel_uc_uses_guc_submission(&gt->uc))
1795 return 0;
1796
1797 igt_global_reset_lock(gt);
1798
1799 /* Flush any requests before we get started and check basics */
1800 if (!igt_force_reset(gt))
1801 goto unlock;
1802
1803 for (p = igt_atomic_phases; p->name; p++) {
1804 struct intel_engine_cs *engine;
1805 enum intel_engine_id id;
1806
1807 for_each_engine(engine, gt, id) {
1808 err = igt_atomic_reset_engine(engine, p);
1809 if (err)
1810 goto out;
1811 }
1812 }
1813
1814 out:
1815 /* As we poke around the guts, do a full reset before continuing. */
1816 igt_force_reset(gt);
1817 unlock:
1818 igt_global_reset_unlock(gt);
1819
1820 return err;
1821 }
1822
1823 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1824 {
1825 static const struct i915_subtest tests[] = {
1826 SUBTEST(igt_hang_sanitycheck),
1827 SUBTEST(igt_reset_nop),
1828 SUBTEST(igt_reset_nop_engine),
1829 SUBTEST(igt_reset_idle_engine),
1830 SUBTEST(igt_reset_active_engine),
1831 SUBTEST(igt_reset_fail_engine),
1832 SUBTEST(igt_reset_engines),
1833 SUBTEST(igt_reset_engines_atomic),
1834 SUBTEST(igt_reset_queue),
1835 SUBTEST(igt_reset_wait),
1836 SUBTEST(igt_reset_evict_ggtt),
1837 SUBTEST(igt_reset_evict_ppgtt),
1838 SUBTEST(igt_reset_evict_fence),
1839 SUBTEST(igt_handle_error),
1840 };
1841 struct intel_gt *gt = &i915->gt;
1842 intel_wakeref_t wakeref;
1843 int err;
1844
1845 if (!intel_has_gpu_reset(gt))
1846 return 0;
1847
1848 if (intel_gt_is_wedged(gt))
1849 return -EIO; /* we're long past hope of a successful reset */
1850
1851 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1852
1853 err = intel_gt_live_subtests(tests, gt);
1854
1855 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1856
1857 return err;
1858 }
1859