// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

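/*
 * Fixture shared by the hang tests: a kernel context plus two internal
 * objects. "hws" is a CPU-mapped page into which the hanging batch reports
 * its seqno, and "obj" holds the spinning batch itself; "seqno" and "batch"
 * are the CPU mappings of those two objects.
 */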
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

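/*
 * Each request gets its own dword slot in the HWS page, indexed by the
 * fence context, so concurrent hanging batches do not clobber each other's
 * seqno reports.
 */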
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

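/*
 * Build a request whose batch first stores the request's seqno into its HWS
 * slot and then branches back to the start of the batch, spinning forever.
 * The batch only terminates once the CPU overwrites the start of the loop
 * with MI_BATCH_BUFFER_END (see hang_fini() and the individual tests).
 */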
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

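/*
 * The batch is considered running once it has written its seqno to the HWS:
 * poll quickly (10us) for the common case, then fall back to a slow wait of
 * up to a second before declaring the request stuck.
 */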
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

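/*
 * Error-injection helpers: force_reset_timeout() cranks up the selftest
 * reset_timeout injection so that the following intel_engine_reset() is
 * expected to fail with -ETIMEDOUT, and cancel_reset_timeout() restores
 * normal behaviour. The injected timeouts only trigger on gen8+.
 */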
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}

			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

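/*
 * Wait for a background request to complete; if it does not finish within
 * 5s the GT is wedged and -EIO is reported so the parent test fails.
 */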
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

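/*
 * Background load thread: keep a ring of up to 8 requests in flight on the
 * given engine (optionally with randomised priorities) until asked to stop,
 * so that engine resets are exercised while other engines, or the engine
 * itself for the self-* phases, remain busy.
 */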
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (--count)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

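/*
 * Pretend hangcheck fired: sample the global reset count and then trigger a
 * GT reset of the selected engines, returning the count from before the
 * reset for the caller to compare against.
 */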
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

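/*
 * Worker for igt_reset_evict_*: signal that we have started and then try to
 * evict the target node. The eviction must block behind the hanging request
 * and should only be able to proceed once the reset has completed.
 */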
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

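/*
 * Perform an engine reset from within the given atomic section, with the
 * execlists tasklet disabled (and softirqs off unless the section itself is
 * the softirq context), to check that the reset path never sleeps.
 */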
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}