// SPDX-License-Identifier: MIT
/*
 * Copyright © 2017-2018 Intel Corporation
 */

#include <linux/prime_numbers.h>

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_ring.h"
#include "selftest_engine_heartbeat.h"

#include "../selftests/i915_random.h"
#include "../i915_selftest.h"

#include "selftests/igt_flush_test.h"
#include "selftests/lib_sw_fence.h"
#include "selftests/mock_gem_device.h"
#include "selftests/mock_timeline.h"

static struct page *hwsp_page(struct intel_timeline *tl)
{
	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
	return sg_page(obj->mm.pages->sgl);
}

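/*
 * Convert a timeline's HWSP slot into a globally unique index (kernel
 * address of the backing page plus the slot offset, scaled by the seqno
 * stride), so the radix tree below can detect two timelines being handed
 * the same slot.
 */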
static unsigned long hwsp_cacheline(struct intel_timeline *tl)
{
	unsigned long address = (unsigned long)page_address(hwsp_page(tl));

	return (address + offset_in_page(tl->hwsp_offset)) / TIMELINE_SEQNO_BYTES;
}

static int selftest_tl_pin(struct intel_timeline *tl)
{
	struct i915_gem_ww_ctx ww;
	int err;

	i915_gem_ww_ctx_init(&ww, false);
retry:
	err = i915_gem_object_lock(tl->hwsp_ggtt->obj, &ww);
	if (!err)
		err = intel_timeline_pin(tl, &ww);

	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);
	return err;
}

/* Only half of the seqnos are usable, see __intel_timeline_get_seqno() */
#define CACHELINES_PER_PAGE (PAGE_SIZE / TIMELINE_SEQNO_BYTES / 2)

struct mock_hwsp_freelist {
	struct intel_gt *gt;
	struct radix_tree_root cachelines;
	struct intel_timeline **history;
	unsigned long count, max;
	struct rnd_state prng;
};

enum {
	SHUFFLE = BIT(0),
};

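/*
 * Rotate @tl into the history slot at @idx, releasing whatever timeline
 * previously occupied it: drop its radix-tree entry, then unpin and put it.
 * Passing tl == NULL simply evicts the slot.
 */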
static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
			       unsigned int idx,
			       struct intel_timeline *tl)
{
	tl = xchg(&state->history[idx], tl);
	if (tl) {
		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
		intel_timeline_unpin(tl);
		intel_timeline_put(tl);
	}
}

static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
				unsigned int count,
				unsigned int flags)
{
	struct intel_timeline *tl;
	unsigned int idx;

	while (count--) {
		unsigned long cacheline;
		int err;

		tl = intel_timeline_create(state->gt);
		if (IS_ERR(tl))
			return PTR_ERR(tl);

		err = selftest_tl_pin(tl);
		if (err) {
			intel_timeline_put(tl);
			return err;
		}

		cacheline = hwsp_cacheline(tl);
		err = radix_tree_insert(&state->cachelines, cacheline, tl);
		if (err) {
			if (err == -EEXIST) {
				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
				       cacheline);
			}
			intel_timeline_unpin(tl);
			intel_timeline_put(tl);
			return err;
		}

		idx = state->count++ % state->max;
		__mock_hwsp_record(state, idx, tl);
	}

	if (flags & SHUFFLE)
		i915_prandom_shuffle(state->history,
				     sizeof(*state->history),
				     min(state->count, state->max),
				     &state->prng);

	count = i915_prandom_u32_max_state(min(state->count, state->max),
					   &state->prng);
	while (count--) {
		idx = --state->count % state->max;
		__mock_hwsp_record(state, idx, NULL);
	}

	return 0;
}

static int mock_hwsp_freelist(void *arg)
{
	struct mock_hwsp_freelist state;
	struct drm_i915_private *i915;
	const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "linear", 0 },
		{ "shuffled", SHUFFLE },
		{ },
	}, *p;
	unsigned int na;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);

	state.gt = to_gt(i915);

	/*
	 * Create a bunch of timelines and check that their HWSPs do not
	 * overlap. Free some, and try again.
	 */

	state.max = PAGE_SIZE / sizeof(*state.history);
	state.count = 0;
	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
	if (!state.history) {
		err = -ENOMEM;
		goto err_put;
	}

	for (p = phases; p->name; p++) {
		pr_debug("%s(%s)\n", __func__, p->name);
		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
			err = __mock_hwsp_timeline(&state, na, p->flags);
			if (err)
				goto out;
		}
	}

out:
	for (na = 0; na < state.max; na++)
		__mock_hwsp_record(&state, na, NULL);
	kfree(state.history);
err_put:
	mock_destroy_device(i915);
	return err;
}

struct __igt_sync {
	const char *name;
	u32 seqno;
	bool expected;
	bool set;
};

static int __igt_sync(struct intel_timeline *tl,
		      u64 ctx,
		      const struct __igt_sync *p,
		      const char *name)
{
	int ret;

	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
		       name, p->name, ctx, p->seqno, yesno(p->expected));
		return -EINVAL;
	}

	if (p->set) {
		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
		if (ret)
			return ret;
	}

	return 0;
}

static int igt_sync(void *arg)
{
	const struct __igt_sync pass[] = {
		{ "unset", 0, false, false },
		{ "new", 0, false, true },
		{ "0a", 0, true, true },
		{ "1a", 1, false, true },
		{ "1b", 1, true, true },
		{ "0b", 0, true, false },
		{ "2a", 2, false, true },
		{ "4", 4, false, true },
		{ "INT_MAX", INT_MAX, false, true },
		{ "INT_MAX-1", INT_MAX-1, true, false },
		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
		{ "INT_MAX", INT_MAX, true, false },
		{ "UINT_MAX", UINT_MAX, false, true },
		{ "wrap", 0, false, true },
		{ "unwrap", UINT_MAX, true, false },
		{},
	}, *p;
	struct intel_timeline tl;
	int order, offset;
	int ret = -ENODEV;

	mock_timeline_init(&tl, 0);
	for (p = pass; p->name; p++) {
		for (order = 1; order < 64; order++) {
			for (offset = -1; offset <= (order > 1); offset++) {
				u64 ctx = BIT_ULL(order) + offset;

				ret = __igt_sync(&tl, ctx, p, "1");
				if (ret)
					goto out;
			}
		}
	}
	mock_timeline_fini(&tl);

	mock_timeline_init(&tl, 0);
	for (order = 1; order < 64; order++) {
		for (offset = -1; offset <= (order > 1); offset++) {
			u64 ctx = BIT_ULL(order) + offset;

			for (p = pass; p->name; p++) {
				ret = __igt_sync(&tl, ctx, p, "2");
				if (ret)
					goto out;
			}
		}
	}

out:
	mock_timeline_fini(&tl);
	return ret;
}

static unsigned int random_engine(struct rnd_state *rnd)
{
	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
}

static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself become a significant factor in the per-iteration
	 * timings. We try to compensate by measuring the overhead of the prng
	 * and subtracting it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
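	/*
	 * Per-call prng cost in ns, scaled by 2^20 (fixed point) so it can be
	 * subtracted from the timed loops below without losing precision.
	 */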
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();

	/* Benchmark searching for a known context id and changing the seqno */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes
			 * (if any)!
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}

int intel_timeline_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(mock_hwsp_freelist),
		SUBTEST(igt_sync),
		SUBTEST(bench_sync),
	};

	return i915_subtests(tests, NULL);
}

static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (GRAPHICS_VER(rq->engine->i915) >= 8) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = addr;
		*cs++ = 0;
		*cs++ = value;
	} else if (GRAPHICS_VER(rq->engine->i915) >= 4) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = 0;
		*cs++ = addr;
		*cs++ = value;
	} else {
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = addr;
		*cs++ = value;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(rq, cs);

	return 0;
}

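/*
 * Pin @tl, sanity check that its HWSP already holds the expected seqno,
 * then submit a kernel request on @engine that stores @value into the
 * timeline's HWSP slot via the GGTT. Returns the request with a reference
 * held, or an ERR_PTR on failure.
 */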
static struct i915_request *
checked_tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
{
	struct i915_request *rq;
	int err;

	err = selftest_tl_pin(tl);
	if (err) {
		rq = ERR_PTR(err);
		goto out;
	}

	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
		       *tl->hwsp_seqno, tl->seqno);
		intel_timeline_unpin(tl);
		return ERR_PTR(-EINVAL);
	}

	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq))
		goto out_unpin;

	i915_request_get(rq);

	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
	i915_request_add(rq);
	if (err) {
		i915_request_put(rq);
		rq = ERR_PTR(err);
	}

out_unpin:
	intel_timeline_unpin(tl);
out:
	if (IS_ERR(rq))
		pr_err("Failed to write to timeline!\n");
	return rq;
}

static int live_hwsp_engine(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for_each_engine(engine, gt, id) {
		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		for (n = 0; n < NUM_TIMELINES; n++) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = checked_tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_alternate(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots with adjacent
	 * engines.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for (n = 0; n < NUM_TIMELINES; n++) {
		for_each_engine(engine, gt, id) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			if (!intel_engine_can_store_dword(engine))
				continue;

			tl = intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				goto out;
			}

			intel_engine_pm_get(engine);
			rq = checked_tl_write(tl, engine, count);
			intel_engine_pm_put(engine);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				goto out;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb)
		goto out_free;

	err = selftest_tl_pin(tl);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}

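/*
 * Emit a (seqno, hwsp) sample pair into the watcher buffer at *addr: first
 * store the expected seqno, then copy the live HWSP value next to it via a
 * CS GPR and LRM/SRM (hence the gen8+ requirement), advancing *addr past
 * both dwords ready for the next sample.
 */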
static int emit_read_hwsp(struct i915_request *rq,
			  u32 seqno, u32 hwsp,
			  u32 *addr)
{
	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
	u32 *cs;

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = *addr;
	*cs++ = 0;
	*cs++ = seqno;
	*addr += 4;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = hwsp;
	*cs++ = 0;

	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = *addr;
	*cs++ = 0;
	*addr += 4;

	intel_ring_advance(rq, cs);

	return 0;
}

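/*
 * A watcher couples a long-lived request with a GGTT-pinned scratch buffer
 * into which that request samples the timeline's HWSP, either before or
 * after the request under test runs, so the HWSP can be checked across
 * seqno wraps.
 */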
struct hwsp_watcher {
	struct i915_vma *vma;
	struct i915_request *rq;
	u32 addr;
	u32 *map;
};

static bool cmp_lt(u32 a, u32 b)
{
	return a < b;
}

static bool cmp_gte(u32 a, u32 b)
{
	return a >= b;
}

static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;

	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	w->map = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
	if (IS_ERR(w->map)) {
		i915_gem_object_put(obj);
		return PTR_ERR(w->map);
	}

	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return PTR_ERR(vma);
	}

	w->vma = vma;
	w->addr = i915_ggtt_offset(vma);
	return 0;
}

static void switch_tl_lock(struct i915_request *from, struct i915_request *to)
{
	/* some light mutex juggling required; think co-routines */

	if (from) {
		lockdep_unpin_lock(&from->context->timeline->mutex, from->cookie);
		mutex_unlock(&from->context->timeline->mutex);
	}

	if (to) {
		mutex_lock(&to->context->timeline->mutex);
		to->cookie = lockdep_pin_lock(&to->context->timeline->mutex);
	}
}

static int create_watcher(struct hwsp_watcher *w,
			  struct intel_engine_cs *engine,
			  int ringsz)
{
	struct intel_context *ce;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	ce->ring_size = ringsz;
	w->rq = intel_context_create_request(ce);
	intel_context_put(ce);
	if (IS_ERR(w->rq))
		return PTR_ERR(w->rq);

	w->addr = i915_ggtt_offset(w->vma);

	switch_tl_lock(w->rq, NULL);

	return 0;
}

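/*
 * Submit the watcher request and, once it completes, walk the sampled
 * (seqno, hwsp) pairs in the buffer, checking each with @op(hwsp, seqno).
 */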
static int check_watcher(struct hwsp_watcher *w, const char *name,
			 bool (*op)(u32 hwsp, u32 seqno))
{
	struct i915_request *rq = fetch_and_zero(&w->rq);
	u32 offset, end;
	int err;

	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);

	i915_request_get(rq);
	switch_tl_lock(NULL, rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		err = -ETIME;
		goto out;
	}

	err = 0;
	offset = 0;
	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
	while (offset < end) {
		if (!op(w->map[offset + 1], w->map[offset])) {
			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
			       name, w->map[offset + 1], w->map[offset]);
			err = -EINVAL;
		}

		offset += 2;
	}

out:
	i915_request_put(rq);
	return err;
}

static void cleanup_watcher(struct hwsp_watcher *w)
{
	if (w->rq) {
		switch_tl_lock(NULL, w->rq);

		i915_request_add(w->rq);
	}

	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
}

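/*
 * Manually retire any completed requests on @tl; returns true once the
 * timeline has no outstanding last_request, i.e. it is idle.
 */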
static bool retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	mutex_lock(&tl->mutex);
	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
	mutex_unlock(&tl->mutex);

	return !i915_active_fence_isset(&tl->last_request);
}

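/*
 * Keep submitting requests on @rq's context until the timeline seqno wraps
 * around below the original request's seqno, then return one more (post-wrap)
 * request with a reference held; the reference on the incoming @rq is dropped.
 */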
static struct i915_request *wrap_timeline(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	struct intel_timeline *tl = ce->timeline;
	u32 seqno = rq->fence.seqno;

	while (tl->seqno >= seqno) { /* Cause a wrap */
		i915_request_put(rq);
		rq = intel_context_create_request(ce);
		if (IS_ERR(rq))
			return rq;

		i915_request_get(rq);
		i915_request_add(rq);
	}

	i915_request_put(rq);
	rq = i915_request_create(ce);
	if (IS_ERR(rq))
		return rq;

	i915_request_get(rq);
	i915_request_add(rq);

	return rq;
}

static int live_hwsp_read(void *arg)
{
	struct intel_gt *gt = arg;
	struct hwsp_watcher watcher[2] = {};
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;
	int i;

	/*
	 * If we take a reference to the HWSP for reading on the GPU, that
	 * read may be arbitrarily delayed (either by foreign fence or
	 * priority saturation) and a wrap can happen within 30 minutes.
	 * When the GPU read is finally submitted it should be correct,
	 * even across multiple wraps.
	 */

	if (GRAPHICS_VER(gt->i915) < 8) /* CS convenience [SRM/LRM] */
		return 0;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb)
		goto out_free;

	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
		err = setup_watcher(&watcher[i], gt);
		if (err)
			goto out;
	}

	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		/* Create a request we can use for remote reading of the HWSP */
		err = create_watcher(&watcher[1], engine, SZ_512K);
		if (err)
			goto out;

		do {
			struct i915_sw_fence *submit;
			struct i915_request *rq;
			u32 hwsp, dummy;

			submit = heap_fence_create(GFP_KERNEL);
			if (!submit) {
				err = -ENOMEM;
				goto out;
			}

			err = create_watcher(&watcher[0], engine, SZ_4K);
			if (err)
				goto out;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				goto out;
			}

			ce->timeline = intel_timeline_get(tl);

			/* Ensure timeline is mapped, done during first pin */
			err = intel_context_pin(ce);
			if (err) {
				intel_context_put(ce);
				goto out;
			}

			/*
			 * Start at a new wrap, and set seqno right before another wrap,
			 * saving 30 minutes of nops
			 */
			tl->seqno = -12u + 2 * (count & 3);
			__intel_timeline_get_seqno(tl, &dummy);

			rq = i915_request_create(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&rq->submit,
							    &watcher[0].rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_add(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			switch_tl_lock(rq, watcher[0].rq);
			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[0].rq, /* before */
						     rq->fence.seqno, hwsp,
						     &watcher[0].addr);
			switch_tl_lock(watcher[0].rq, rq);
			if (err) {
				i915_request_add(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			switch_tl_lock(rq, watcher[1].rq);
			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[1].rq, /* after */
						     rq->fence.seqno, hwsp,
						     &watcher[1].addr);
			switch_tl_lock(watcher[1].rq, rq);
			if (err) {
				i915_request_add(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			rq = wrap_timeline(rq);
			intel_context_unpin(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
							    &rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_put(rq);
				goto out;
			}

			err = check_watcher(&watcher[0], "before", cmp_lt);
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			if (err) {
				i915_request_put(rq);
				goto out;
			}
			count++;

			/* Flush the timeline before manually wrapping again */
			if (i915_request_wait(rq,
					      I915_WAIT_INTERRUPTIBLE,
					      HZ) < 0) {
				err = -ETIME;
				i915_request_put(rq);
				goto out;
			}
			retire_requests(tl);
			i915_request_put(rq);

			/* Single requests are limited to half a ring at most */
			if (8 * watcher[1].rq->ring->emit >
			    3 * watcher[1].rq->ring->size)
				break;

		} while (!__igt_timeout(end_time, NULL) &&
			 count < (PAGE_SIZE / TIMELINE_SEQNO_BYTES - 1) / 2);

		pr_info("%s: simulated %lu wraps\n", engine->name, count);
		err = check_watcher(&watcher[1], "after", cmp_gte);
		if (err)
			goto out;
	}

out:
	for (i = 0; i < ARRAY_SIZE(watcher); i++)
		cleanup_watcher(&watcher[i]);

	if (igt_flush_test(gt->i915))
		err = -EIO;

out_free:
	intel_timeline_put(tl);
	return err;
}

static int live_hwsp_rollover_kernel(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Run the host for long enough, and even the kernel context will
	 * see a seqno rollover.
	 */

	for_each_engine(engine, gt, id) {
		struct intel_context *ce = engine->kernel_context;
		struct intel_timeline *tl = ce->timeline;
		struct i915_request *rq[3] = {};
		int i;

		st_engine_heartbeat_disable(engine);
		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
			err = -EIO;
			goto out;
		}

		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
		tl->seqno = -2u;
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = i915_request_create(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		st_engine_heartbeat_enable(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_rollover_user(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Simulate a long running user context, and force the seqno wrap
	 * on the user's timeline.
	 */

	for_each_engine(engine, gt, id) {
		struct i915_request *rq[3] = {};
		struct intel_timeline *tl;
		struct intel_context *ce;
		int i;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		err = intel_context_alloc_state(ce);
		if (err)
			goto out;

		tl = ce->timeline;
		if (!tl->has_initial_breadcrumb)
			goto out;

		err = intel_context_pin(ce);
		if (err)
			goto out;

		tl->seqno = -4u;
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = intel_context_create_request(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out_unpin;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out_unpin;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out_unpin;
			}
		}
out_unpin:
		intel_context_unpin(ce);
out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		intel_context_put(ce);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_recycle(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count;
	int err = 0;

	/*
	 * Check seqno writes into one timeline at a time. We expect to
	 * recycle the breadcrumb slot between iterations and neither
	 * want to confuse ourselves or the GPU.
	 */

	count = 0;
	for_each_engine(engine, gt, id) {
		IGT_TIMEOUT(end_time);

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		do {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = checked_tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
				pr_err("Wait for timeline writes timed out!\n");
				i915_request_put(rq);
				intel_timeline_put(tl);
				err = -EIO;
				break;
			}

			if (READ_ONCE(*tl->hwsp_seqno) != count) {
				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
					      count, tl->fence_context,
					      tl->hwsp_offset, *tl->hwsp_seqno);
				GEM_TRACE_DUMP();
				err = -EINVAL;
			}

			i915_request_put(rq);
			intel_timeline_put(tl);
			count++;

			if (err)
				break;
		} while (!__igt_timeout(end_time, NULL));

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	return err;
}

int intel_timeline_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_hwsp_recycle),
		SUBTEST(live_hwsp_engine),
		SUBTEST(live_hwsp_alternate),
		SUBTEST(live_hwsp_wrap),
		SUBTEST(live_hwsp_read),
		SUBTEST(live_hwsp_rollover_kernel),
		SUBTEST(live_hwsp_rollover_user),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return intel_gt_live_subtests(tests, to_gt(i915));
}