xref: /openbsd/sys/dev/pci/drm/i915/gt/intel_workarounds.c (revision 09467b48)
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2014-2018 Intel Corporation
5  */
6 
7 #include "i915_drv.h"
8 #include "intel_context.h"
9 #include "intel_engine_pm.h"
10 #include "intel_gt.h"
11 #include "intel_ring.h"
12 #include "intel_workarounds.h"
13 
14 /**
15  * DOC: Hardware workarounds
16  *
17  * This file is intended as a central place to implement most [1]_ of the
18  * required workarounds for hardware to work as originally intended. They fall
19  * into five basic categories depending on how/when they are applied:
20  *
21  * - Workarounds that touch registers that are saved/restored to/from the HW
22  *   context image. The list is emitted (via Load Register Immediate commands)
23  *   every time a new context is created.
24  * - GT workarounds. The list of these WAs is applied whenever these registers
25  *   revert to default values (on GPU reset, suspend/resume [2]_, etc..).
26  * - Display workarounds. The list is applied during display clock-gating
27  *   initialization.
28  * - Workarounds that whitelist privileged registers, so that UMDs can manage
29  *   them directly. This is just a special case of an MMIO workaround (as we
30  *   write the list of these to-be-whitelisted registers to some special HW
31  *   registers).
32  * - Workaround batchbuffers, that get executed automatically by the hardware
33  *   on every HW context restore.
34  *
35  * .. [1] Please notice that there are other WAs that, due to their nature,
36  *    cannot be applied from a central place. Those are peppered around the rest
37  *    of the code, as needed.
38  *
39  * .. [2] Technically, some registers are power-context saved & restored, so they
40  *    survive a suspend/resume. In practice, writing them again is not too
41  *    costly and simplifies things. We can revisit this in the future.
42  *
43  * Layout
44  * ~~~~~~
45  *
46  * Keep things in this file ordered by WA type, as per the above (context, GT,
47  * display, register whitelist, batchbuffer). Then, inside each type, keep the
48  * following order:
49  *
50  * - Infrastructure functions and macros
51  * - WAs per platform in standard gen/chrono order
52  * - Public functions to init or apply the given workaround type.
53  */
54 
55 static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name)
56 {
57 	wal->name = name;
58 	wal->engine_name = engine_name;
59 }
60 
61 #define WA_LIST_CHUNK (1 << 4)
62 
63 static void wa_init_finish(struct i915_wa_list *wal)
64 {
65 	/* Trim unused entries. */
66 	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
67 		struct i915_wa *list = kmemdup(wal->list,
68 					       wal->count * sizeof(*list),
69 					       GFP_KERNEL);
70 
71 		if (list) {
72 			kfree(wal->list);
73 			wal->list = list;
74 		}
75 	}
76 
77 	if (!wal->count)
78 		return;
79 
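	/*
	 * Note: wa_count tracks every workaround requested (including ones
	 * merged into an existing entry by _wa_add()), so it can be larger
	 * than count, the number of distinct registers in the list.
	 */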
80 	DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n",
81 			 wal->wa_count, wal->name, wal->engine_name);
82 }
83 
84 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
85 {
86 	unsigned int addr = i915_mmio_reg_offset(wa->reg);
87 	unsigned int start = 0, end = wal->count;
88 	const unsigned int grow = WA_LIST_CHUNK;
89 	struct i915_wa *wa_;
90 
91 	GEM_BUG_ON(!is_power_of_2(grow));
92 
93 	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
94 		struct i915_wa *list;
95 
96 		list = kmalloc_array(roundup2(wal->count + 1, grow), sizeof(*wa),
97 				     GFP_KERNEL);
98 		if (!list) {
99 			DRM_ERROR("No space for workaround init!\n");
100 			return;
101 		}
102 
103 		if (wal->list)
104 			memcpy(list, wal->list, sizeof(*wa) * wal->count);
105 
106 		wal->list = list;
107 	}
108 
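	/*
	 * Binary search for an existing entry for this register offset; if
	 * one is found, merge the new clear/set/read masks into it instead
	 * of adding a duplicate (conflicting overwrites are reported).
	 */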
109 	while (start < end) {
110 		unsigned int mid = start + (end - start) / 2;
111 
112 		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
113 			start = mid + 1;
114 		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
115 			end = mid;
116 		} else {
117 			wa_ = &wal->list[mid];
118 
119 			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
120 				DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
121 					  i915_mmio_reg_offset(wa_->reg),
122 					  wa_->clr, wa_->set);
123 
124 				wa_->set &= ~wa->clr;
125 			}
126 
127 			wal->wa_count++;
128 			wa_->set |= wa->set;
129 			wa_->clr |= wa->clr;
130 			wa_->read |= wa->read;
131 			return;
132 		}
133 	}
134 
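	/* No existing entry: append it and sort it into place by register offset. */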
135 	wal->wa_count++;
136 	wa_ = &wal->list[wal->count++];
137 	*wa_ = *wa;
138 
139 	while (wa_-- > wal->list) {
140 		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
141 			   i915_mmio_reg_offset(wa_[1].reg));
142 		if (i915_mmio_reg_offset(wa_[1].reg) >
143 		    i915_mmio_reg_offset(wa_[0].reg))
144 			break;
145 
146 		swap(wa_[1], wa_[0]);
147 	}
148 }
149 
150 static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
151 		   u32 clear, u32 set, u32 read_mask)
152 {
153 	struct i915_wa wa = {
154 		.reg  = reg,
155 		.clr  = clear,
156 		.set  = set,
157 		.read = read_mask,
158 	};
159 
160 	_wa_add(wal, &wa);
161 }
162 
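/*
 * Convenience wrappers around wa_add(). Roughly: wa_write_masked_or()
 * clears the "clear" bits and ORs in "set" (verifying the bits named by
 * "clear" on readback), wa_write() overwrites the whole register,
 * wa_write_or() only ORs bits in, wa_write_clr() only clears bits, and
 * wa_masked_en()/wa_masked_dis() use the masked-write format sketched
 * below the macros.
 */
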
163 static void
164 wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
165 {
166 	wa_add(wal, reg, clear, set, clear);
167 }
168 
169 static void
170 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
171 {
172 	wa_write_masked_or(wal, reg, ~0, set);
173 }
174 
175 static void
176 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
177 {
178 	wa_write_masked_or(wal, reg, set, set);
179 }
180 
181 static void
182 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
183 {
184 	wa_write_masked_or(wal, reg, clr, 0);
185 }
186 
187 static void
188 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
189 {
190 	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val);
191 }
192 
193 static void
194 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
195 {
196 	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val);
197 }
198 
199 #define WA_SET_BIT_MASKED(addr, mask) \
200 	wa_masked_en(wal, (addr), (mask))
201 
202 #define WA_CLR_BIT_MASKED(addr, mask) \
203 	wa_masked_dis(wal, (addr), (mask))
204 
205 #define WA_SET_FIELD_MASKED(addr, mask, value) \
206 	wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value)))
207 
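/*
 * Illustrative sketch (not part of the driver): "masked" registers carry a
 * per-bit write enable in their upper 16 bits, so a single write can touch
 * individual bits without a read-modify-write. With the _MASKED_BIT_*
 * helpers from i915_reg.h this looks roughly like:
 *
 *	_MASKED_BIT_ENABLE(BIT(3))  == (BIT(3) << 16) | BIT(3)  == 0x00080008
 *	_MASKED_BIT_DISABLE(BIT(3)) ==  BIT(3) << 16            == 0x00080000
 *
 * which is why wa_masked_en()/wa_masked_dis() pass the unshifted bit as the
 * read mask: only the low half is expected on readback.
 */
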
208 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
209 				      struct i915_wa_list *wal)
210 {
211 	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
212 
213 	/* WaDisableAsyncFlipPerfMode:bdw,chv */
214 	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);
215 
216 	/* WaDisablePartialInstShootdown:bdw,chv */
217 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
218 			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
219 
220 	/* Use Force Non-Coherent whenever executing a 3D context. This is a
221 	 * workaround for a possible hang in the unlikely event a TLB
222 	 * invalidation occurs during a PSD flush.
223 	 */
224 	/* WaForceEnableNonCoherent:bdw,chv */
225 	/* WaHdcDisableFetchWhenMasked:bdw,chv */
226 	WA_SET_BIT_MASKED(HDC_CHICKEN0,
227 			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
228 			  HDC_FORCE_NON_COHERENT);
229 
230 	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
231 	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
232 	 *  polygons in the same 8x4 pixel/sample area to be processed without
233 	 *  stalling waiting for the earlier ones to write to Hierarchical Z
234 	 *  buffer."
235 	 *
236 	 * This optimization is off by default for BDW and CHV; turn it on.
237 	 */
238 	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
239 
240 	/* Wa4x4STCOptimizationDisable:bdw,chv */
241 	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
242 
243 	/*
244 	 * BSpec recommends 8x4 when MSAA is used,
245 	 * however in practice 16x4 seems fastest.
246 	 *
247 	 * Note that PS/WM thread counts depend on the WIZ hashing
248 	 * disable bit, which we don't touch here, but it's good
249 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
250 	 */
251 	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
252 			    GEN6_WIZ_HASHING_MASK,
253 			    GEN6_WIZ_HASHING_16x4);
254 }
255 
256 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
257 				     struct i915_wa_list *wal)
258 {
259 	struct drm_i915_private *i915 = engine->i915;
260 
261 	gen8_ctx_workarounds_init(engine, wal);
262 
263 	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
264 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
265 
266 	/* WaDisableDopClockGating:bdw
267 	 *
268 	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
269 	 * to disable EUTC clock gating.
270 	 */
271 	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
272 			  DOP_CLOCK_GATING_DISABLE);
273 
274 	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
275 			  GEN8_SAMPLER_POWER_BYPASS_DIS);
276 
277 	WA_SET_BIT_MASKED(HDC_CHICKEN0,
278 			  /* WaForceContextSaveRestoreNonCoherent:bdw */
279 			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
280 			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
281 			  (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
282 }
283 
284 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
285 				     struct i915_wa_list *wal)
286 {
287 	gen8_ctx_workarounds_init(engine, wal);
288 
289 	/* WaDisableThreadStallDopClockGating:chv */
290 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
291 
292 	/* Improve HiZ throughput on CHV. */
293 	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
294 }
295 
296 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
297 				      struct i915_wa_list *wal)
298 {
299 	struct drm_i915_private *i915 = engine->i915;
300 
301 	if (HAS_LLC(i915)) {
302 		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
303 		 *
304 		 * Must match Display Engine. See
305 		 * WaCompressedResourceDisplayNewHashMode.
306 		 */
307 		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
308 				  GEN9_PBE_COMPRESSED_HASH_SELECTION);
309 		WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
310 				  GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
311 	}
312 
313 	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
314 	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
315 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
316 			  FLOW_CONTROL_ENABLE |
317 			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
318 
319 	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
320 	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
321 	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
322 			  GEN9_ENABLE_YV12_BUGFIX |
323 			  GEN9_ENABLE_GPGPU_PREEMPTION);
324 
325 	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
326 	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
327 	WA_SET_BIT_MASKED(CACHE_MODE_1,
328 			  GEN8_4x4_STC_OPTIMIZATION_DISABLE |
329 			  GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
330 
331 	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
332 	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
333 			  GEN9_CCS_TLB_PREFETCH_ENABLE);
334 
335 	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
336 	WA_SET_BIT_MASKED(HDC_CHICKEN0,
337 			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
338 			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
339 
340 	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
341 	 * both tied to WaForceContextSaveRestoreNonCoherent
342 	 * in some hsds for skl. We keep the tie for all gen9. The
343 	 * documentation is a bit hazy and so we want to get common behaviour,
344 	 * even though there is no clear evidence we would need both on kbl/bxt.
345 	 * This area has been a source of system hangs, so we play it safe
346 	 * and mimic the skl regardless of what bspec says.
347 	 *
348 	 * Use Force Non-Coherent whenever executing a 3D context. This
349 	 * is a workaround for a possible hang in the unlikely event
350 	 * a TLB invalidation occurs during a PSD flush.
351 	 */
352 
353 	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
354 	WA_SET_BIT_MASKED(HDC_CHICKEN0,
355 			  HDC_FORCE_NON_COHERENT);
356 
357 	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
358 	if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915))
359 		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
360 				  GEN8_SAMPLER_POWER_BYPASS_DIS);
361 
362 	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
363 	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
364 
365 	/*
366 	 * Supporting preemption with fine-granularity requires changes in the
367 	 * batch buffer programming. Since we can't break old userspace, we
368 	 * need to set our default preemption level to safe value. Userspace is
369 	 * still able to use more fine-grained preemption levels, since in
370 	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
371 	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
372 	 * not real HW workarounds, but merely a way to start using preemption
373 	 * while maintaining old contract with userspace.
374 	 */
375 
376 	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
377 	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
378 
379 	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
380 	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
381 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
382 			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
383 
384 	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
385 	if (IS_GEN9_LP(i915))
386 		WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
387 }
388 
389 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
390 				struct i915_wa_list *wal)
391 {
392 	struct drm_i915_private *i915 = engine->i915;
393 	u8 vals[3] = { 0, 0, 0 };
394 	unsigned int i;
395 
396 	for (i = 0; i < 3; i++) {
397 		u8 ss;
398 
399 		/*
400 		 * Only consider slices where one, and only one, subslice has 7
401 		 * EUs
402 		 */
403 		if (!is_power_of_2(RUNTIME_INFO(i915)->sseu.subslice_7eu[i]))
404 			continue;
405 
406 		/*
407 		 * subslice_7eu[i] != 0 (because of the check above) and
408 		 * ss_max == 4 (maximum number of subslices possible per slice)
409 		 *
410 		 * ->    0 <= ss <= 3;
411 		 */
412 		ss = ffs(RUNTIME_INFO(i915)->sseu.subslice_7eu[i]) - 1;
413 		vals[i] = 3 - ss;
414 	}
415 
416 	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
417 		return;
418 
419 	/* Tune IZ hashing. See intel_device_info_runtime_init() */
420 	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
421 			    GEN9_IZ_HASHING_MASK(2) |
422 			    GEN9_IZ_HASHING_MASK(1) |
423 			    GEN9_IZ_HASHING_MASK(0),
424 			    GEN9_IZ_HASHING(2, vals[2]) |
425 			    GEN9_IZ_HASHING(1, vals[1]) |
426 			    GEN9_IZ_HASHING(0, vals[0]));
427 }
428 
429 static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
430 				     struct i915_wa_list *wal)
431 {
432 	gen9_ctx_workarounds_init(engine, wal);
433 	skl_tune_iz_hashing(engine, wal);
434 }
435 
436 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
437 				     struct i915_wa_list *wal)
438 {
439 	gen9_ctx_workarounds_init(engine, wal);
440 
441 	/* WaDisableThreadStallDopClockGating:bxt */
442 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
443 			  STALL_DOP_GATING_DISABLE);
444 
445 	/* WaToEnableHwFixForPushConstHWBug:bxt */
446 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
447 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
448 }
449 
450 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
451 				     struct i915_wa_list *wal)
452 {
453 	struct drm_i915_private *i915 = engine->i915;
454 
455 	gen9_ctx_workarounds_init(engine, wal);
456 
457 	/* WaToEnableHwFixForPushConstHWBug:kbl */
458 	if (IS_KBL_REVID(i915, KBL_REVID_C0, REVID_FOREVER))
459 		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
460 				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
461 
462 	/* WaDisableSbeCacheDispatchPortSharing:kbl */
463 	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
464 			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
465 }
466 
467 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
468 				     struct i915_wa_list *wal)
469 {
470 	gen9_ctx_workarounds_init(engine, wal);
471 
472 	/* WaToEnableHwFixForPushConstHWBug:glk */
473 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
474 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
475 }
476 
477 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
478 				     struct i915_wa_list *wal)
479 {
480 	gen9_ctx_workarounds_init(engine, wal);
481 
482 	/* WaToEnableHwFixForPushConstHWBug:cfl */
483 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
484 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
485 
486 	/* WaDisableSbeCacheDispatchPortSharing:cfl */
487 	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
488 			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
489 }
490 
491 static void cnl_ctx_workarounds_init(struct intel_engine_cs *engine,
492 				     struct i915_wa_list *wal)
493 {
494 	struct drm_i915_private *i915 = engine->i915;
495 
496 	/* WaForceContextSaveRestoreNonCoherent:cnl */
497 	WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0,
498 			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT);
499 
500 	/* WaThrottleEUPerfToAvoidTDBackPressure:cnl(pre-prod) */
501 	if (IS_CNL_REVID(i915, CNL_REVID_B0, CNL_REVID_B0))
502 		WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, THROTTLE_12_5);
503 
504 	/* WaDisableReplayBufferBankArbitrationOptimization:cnl */
505 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
506 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
507 
508 	/* WaDisableEnhancedSBEVertexCaching:cnl (pre-prod) */
509 	if (IS_CNL_REVID(i915, 0, CNL_REVID_B0))
510 		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
511 				  GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE);
512 
513 	/* WaPushConstantDereferenceHoldDisable:cnl */
514 	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE);
515 
516 	/* FtrEnableFastAnisoL1BankingFix:cnl */
517 	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX);
518 
519 	/* WaDisable3DMidCmdPreemption:cnl */
520 	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
521 
522 	/* WaDisableGPGPUMidCmdPreemption:cnl */
523 	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
524 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
525 			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
526 
527 	/* WaDisableEarlyEOT:cnl */
528 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT);
529 }
530 
531 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
532 				     struct i915_wa_list *wal)
533 {
534 	struct drm_i915_private *i915 = engine->i915;
535 
536 	/* WaDisableBankHangMode:icl */
537 	wa_write(wal,
538 		 GEN8_L3CNTLREG,
539 		 intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
540 		 GEN8_ERRDETBCTRL);
541 
542 	/* Wa_1604370585:icl (pre-prod)
543 	 * Formerly known as WaPushConstantDereferenceHoldDisable
544 	 */
545 	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
546 		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
547 				  PUSH_CONSTANT_DEREF_DISABLE);
548 
549 	/* WaForceEnableNonCoherent:icl
550 	 * This is not the same workaround as in early Gen9 platforms, where
551 	 * lacking this could cause system hangs, but coherency performance
552 	 * overhead is high and only a few compute workloads really need it
553 	 * (the register is whitelisted in hardware now, so UMDs can opt in
554 	 * for coherency if they have a good reason).
555 	 */
556 	WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
557 
558 	/* Wa_2006611047:icl (pre-prod)
559 	 * Formerly known as WaDisableImprovedTdlClkGating
560 	 */
561 	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
562 		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
563 				  GEN11_TDL_CLOCK_GATING_FIX_DISABLE);
564 
565 	/* Wa_2006665173:icl (pre-prod) */
566 	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
567 		WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
568 				  GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC);
569 
570 	/* WaEnableFloatBlendOptimization:icl */
571 	wa_write_masked_or(wal,
572 			   GEN10_CACHE_MODE_SS,
573 			   0, /* write-only, so skip validation */
574 			   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE));
575 
576 	/* WaDisableGPGPUMidThreadPreemption:icl */
577 	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
578 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
579 			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
580 
581 	/* allow headerless messages for preemptible GPGPU context */
582 	WA_SET_BIT_MASKED(GEN10_SAMPLER_MODE,
583 			  GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
584 
585 	/* Wa_1604278689:icl,ehl */
586 	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
587 	wa_write_masked_or(wal, IVB_FBC_RT_BASE_UPPER,
588 			   0, /* write-only register; skip validation */
589 			   0xFFFFFFFF);
590 
591 	/* Wa_1406306137:icl,ehl */
592 	wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
593 }
594 
595 static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine,
596 				     struct i915_wa_list *wal)
597 {
598 	/*
599 	 * Wa_1409142259:tgl
600 	 * Wa_1409347922:tgl
601 	 * Wa_1409252684:tgl
602 	 * Wa_1409217633:tgl
603 	 * Wa_1409207793:tgl
604 	 * Wa_1409178076:tgl
605 	 * Wa_1408979724:tgl
606 	 */
607 	WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
608 			  GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
609 
610 	/*
611 	 * Wa_1604555607:gen12 and Wa_1608008084:gen12
612 	 * FF_MODE2 register will return the wrong value when read. The default
613 	 * value for this register is zero for all fields and there are no bit
614 	 * masks. So instead of doing a RMW we should just write the TDS timer
615 	 * value for Wa_1604555607.
616 	 */
617 	wa_add(wal, FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
618 	       FF_MODE2_TDS_TIMER_128, 0);
619 
620 	/* WaDisableGPGPUMidThreadPreemption:tgl */
621 	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
622 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
623 			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
624 }
625 
626 static void
627 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
628 			   struct i915_wa_list *wal,
629 			   const char *name)
630 {
631 	struct drm_i915_private *i915 = engine->i915;
632 
633 	if (engine->class != RENDER_CLASS)
634 		return;
635 
636 	wa_init_start(wal, name, engine->name);
637 
638 	if (IS_GEN(i915, 12))
639 		tgl_ctx_workarounds_init(engine, wal);
640 	else if (IS_GEN(i915, 11))
641 		icl_ctx_workarounds_init(engine, wal);
642 	else if (IS_CANNONLAKE(i915))
643 		cnl_ctx_workarounds_init(engine, wal);
644 	else if (IS_COFFEELAKE(i915))
645 		cfl_ctx_workarounds_init(engine, wal);
646 	else if (IS_GEMINILAKE(i915))
647 		glk_ctx_workarounds_init(engine, wal);
648 	else if (IS_KABYLAKE(i915))
649 		kbl_ctx_workarounds_init(engine, wal);
650 	else if (IS_BROXTON(i915))
651 		bxt_ctx_workarounds_init(engine, wal);
652 	else if (IS_SKYLAKE(i915))
653 		skl_ctx_workarounds_init(engine, wal);
654 	else if (IS_CHERRYVIEW(i915))
655 		chv_ctx_workarounds_init(engine, wal);
656 	else if (IS_BROADWELL(i915))
657 		bdw_ctx_workarounds_init(engine, wal);
658 	else if (INTEL_GEN(i915) < 8)
659 		return;
660 	else
661 		MISSING_CASE(INTEL_GEN(i915));
662 
663 	wa_init_finish(wal);
664 }
665 
666 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
667 {
668 	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
669 }
670 
671 int intel_engine_emit_ctx_wa(struct i915_request *rq)
672 {
673 	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
674 	struct i915_wa *wa;
675 	unsigned int i;
676 	u32 *cs;
677 	int ret;
678 
679 	if (wal->count == 0)
680 		return 0;
681 
682 	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
683 	if (ret)
684 		return ret;
685 
686 	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
687 	if (IS_ERR(cs))
688 		return PTR_ERR(cs);
689 
690 	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
691 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
692 		*cs++ = i915_mmio_reg_offset(wa->reg);
693 		*cs++ = wa->set;
694 	}
695 	*cs++ = MI_NOOP;
696 
697 	intel_ring_advance(rq, cs);
698 
699 	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
700 	if (ret)
701 		return ret;
702 
703 	return 0;
704 }
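
/*
 * Illustrative sketch (not part of the driver): for a context workaround
 * list with two entries, intel_engine_emit_ctx_wa() above emits a command
 * sequence along the lines of:
 *
 *	MI_LOAD_REGISTER_IMM(2)
 *	    <offset of reg A>  <value for reg A>
 *	    <offset of reg B>  <value for reg B>
 *	MI_NOOP
 *
 * bracketed by EMIT_BARRIER flushes so the register writes are ordered
 * with the rest of the context setup.
 */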
705 
706 static void
707 gen4_gt_workarounds_init(struct drm_i915_private *i915,
708 			 struct i915_wa_list *wal)
709 {
710 	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
711 	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
712 }
713 
714 static void
715 g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
716 {
717 	gen4_gt_workarounds_init(i915, wal);
718 
719 	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
720 	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
721 }
722 
723 static void
724 ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
725 {
726 	g4x_gt_workarounds_init(i915, wal);
727 
728 	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
729 }
730 
731 static void
732 snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
733 {
734 	/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
735 	wa_masked_en(wal,
736 		     _3D_CHICKEN,
737 		     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
738 
739 	/* WaDisable_RenderCache_OperationalFlush:snb */
740 	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
741 
742 	/*
743 	 * BSpec recommends 8x4 when MSAA is used,
744 	 * however in practice 16x4 seems fastest.
745 	 *
746 	 * Note that PS/WM thread counts depend on the WIZ hashing
747 	 * disable bit, which we don't touch here, but it's good
748 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
749 	 */
750 	wa_add(wal,
751 	       GEN6_GT_MODE, 0,
752 	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
753 	       GEN6_WIZ_HASHING_16x4);
754 
755 	wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB);
756 
757 	wa_masked_en(wal,
758 		     _3D_CHICKEN3,
759 		     /* WaStripsFansDisableFastClipPerformanceFix:snb */
760 		     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
761 		     /*
762 		      * Bspec says:
763 		      * "This bit must be set if 3DSTATE_CLIP clip mode is set
764 		      * to normal and 3DSTATE_SF number of SF output attributes
765 		      * is more than 16."
766 		      */
767 		   _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
768 }
769 
770 static void
771 ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
772 {
773 	/* WaDisableEarlyCull:ivb */
774 	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
775 
776 	/* WaDisablePSDDualDispatchEnable:ivb */
777 	if (IS_IVB_GT1(i915))
778 		wa_masked_en(wal,
779 			     GEN7_HALF_SLICE_CHICKEN1,
780 			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
781 
782 	/* WaDisable_RenderCache_OperationalFlush:ivb */
783 	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
784 
785 	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
786 	wa_masked_dis(wal,
787 		      GEN7_COMMON_SLICE_CHICKEN1,
788 		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
789 
790 	/* WaApplyL3ControlAndL3ChickenMode:ivb */
791 	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
792 	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
793 
794 	/* WaForceL3Serialization:ivb */
795 	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
796 
797 	/*
798 	 * WaVSThreadDispatchOverride:ivb,vlv
799 	 *
800 	 * This actually overrides the dispatch
801 	 * mode for all thread types.
802 	 */
803 	wa_write_masked_or(wal, GEN7_FF_THREAD_MODE,
804 			   GEN7_FF_SCHED_MASK,
805 			   GEN7_FF_TS_SCHED_HW |
806 			   GEN7_FF_VS_SCHED_HW |
807 			   GEN7_FF_DS_SCHED_HW);
808 
809 	if (0) { /* causes HiZ corruption on ivb:gt1 */
810 		/* enable HiZ Raw Stall Optimization */
811 		wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
812 	}
813 
814 	/* WaDisable4x2SubspanOptimization:ivb */
815 	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
816 
817 	/*
818 	 * BSpec recommends 8x4 when MSAA is used,
819 	 * however in practice 16x4 seems fastest.
820 	 *
821 	 * Note that PS/WM thread counts depend on the WIZ hashing
822 	 * disable bit, which we don't touch here, but it's good
823 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
824 	 */
825 	wa_add(wal, GEN7_GT_MODE, 0,
826 	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
827 	       GEN6_WIZ_HASHING_16x4);
828 }
829 
830 static void
831 vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
832 {
833 	/* WaDisableEarlyCull:vlv */
834 	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
835 
836 	/* WaPsdDispatchEnable:vlv */
837 	/* WaDisablePSDDualDispatchEnable:vlv */
838 	wa_masked_en(wal,
839 		     GEN7_HALF_SLICE_CHICKEN1,
840 		     GEN7_MAX_PS_THREAD_DEP |
841 		     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
842 
843 	/* WaDisable_RenderCache_OperationalFlush:vlv */
844 	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
845 
846 	/* WaForceL3Serialization:vlv */
847 	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
848 
849 	/*
850 	 * WaVSThreadDispatchOverride:ivb,vlv
851 	 *
852 	 * This actually overrides the dispatch
853 	 * mode for all thread types.
854 	 */
855 	wa_write_masked_or(wal,
856 			   GEN7_FF_THREAD_MODE,
857 			   GEN7_FF_SCHED_MASK,
858 			   GEN7_FF_TS_SCHED_HW |
859 			   GEN7_FF_VS_SCHED_HW |
860 			   GEN7_FF_DS_SCHED_HW);
861 
862 	/*
863 	 * BSpec says this must be set, even though
864 	 * WaDisable4x2SubspanOptimization isn't listed for VLV.
865 	 */
866 	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
867 
868 	/*
869 	 * BSpec recommends 8x4 when MSAA is used,
870 	 * however in practice 16x4 seems fastest.
871 	 *
872 	 * Note that PS/WM thread counts depend on the WIZ hashing
873 	 * disable bit, which we don't touch here, but it's good
874 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
875 	 */
876 	wa_add(wal, GEN7_GT_MODE, 0,
877 	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
878 	       GEN6_WIZ_HASHING_16x4);
879 
880 	/*
881 	 * WaIncreaseL3CreditsForVLVB0:vlv
882 	 * This is the hardware default actually.
883 	 */
884 	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
885 }
886 
887 static void
888 hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
889 {
890 	/* L3 caching of data atomics doesn't work -- disable it. */
891 	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
892 
893 	wa_add(wal,
894 	       HSW_ROW_CHICKEN3, 0,
895 	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
896 		0 /* XXX does this reg exist? */);
897 
898 	/* WaVSRefCountFullforceMissDisable:hsw */
899 	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
900 
901 	wa_masked_dis(wal,
902 		      CACHE_MODE_0_GEN7,
903 		      /* WaDisable_RenderCache_OperationalFlush:hsw */
904 		      RC_OP_FLUSH_ENABLE |
905 		      /* enable HiZ Raw Stall Optimization */
906 		      HIZ_RAW_STALL_OPT_DISABLE);
907 
908 	/* WaDisable4x2SubspanOptimization:hsw */
909 	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
910 
911 	/*
912 	 * BSpec recommends 8x4 when MSAA is used,
913 	 * however in practice 16x4 seems fastest.
914 	 *
915 	 * Note that PS/WM thread counts depend on the WIZ hashing
916 	 * disable bit, which we don't touch here, but it's good
917 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
918 	 */
919 	wa_add(wal, GEN7_GT_MODE, 0,
920 	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
921 	       GEN6_WIZ_HASHING_16x4);
922 
923 	/* WaSampleCChickenBitEnable:hsw */
924 	wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
925 }
926 
927 static void
928 gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
929 {
930 	/* WaDisableKillLogic:bxt,skl,kbl */
931 	if (!IS_COFFEELAKE(i915))
932 		wa_write_or(wal,
933 			    GAM_ECOCHK,
934 			    ECOCHK_DIS_TLB);
935 
936 	if (HAS_LLC(i915)) {
937 		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
938 		 *
939 		 * Must match Display Engine. See
940 		 * WaCompressedResourceDisplayNewHashMode.
941 		 */
942 		wa_write_or(wal,
943 			    MMCD_MISC_CTRL,
944 			    MMCD_PCLA | MMCD_HOTSPOT_EN);
945 	}
946 
947 	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
948 	wa_write_or(wal,
949 		    GAM_ECOCHK,
950 		    BDW_DISABLE_HDC_INVALIDATION);
951 }
952 
953 static void
954 skl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
955 {
956 	gen9_gt_workarounds_init(i915, wal);
957 
958 	/* WaDisableGafsUnitClkGating:skl */
959 	wa_write_or(wal,
960 		    GEN7_UCGCTL4,
961 		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
962 
963 	/* WaInPlaceDecompressionHang:skl */
964 	if (IS_SKL_REVID(i915, SKL_REVID_H0, REVID_FOREVER))
965 		wa_write_or(wal,
966 			    GEN9_GAMT_ECO_REG_RW_IA,
967 			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
968 }
969 
970 static void
971 bxt_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
972 {
973 	gen9_gt_workarounds_init(i915, wal);
974 
975 	/* WaInPlaceDecompressionHang:bxt */
976 	wa_write_or(wal,
977 		    GEN9_GAMT_ECO_REG_RW_IA,
978 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
979 }
980 
981 static void
982 kbl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
983 {
984 	gen9_gt_workarounds_init(i915, wal);
985 
986 	/* WaDisableDynamicCreditSharing:kbl */
987 	if (IS_KBL_REVID(i915, 0, KBL_REVID_B0))
988 		wa_write_or(wal,
989 			    GAMT_CHKN_BIT_REG,
990 			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
991 
992 	/* WaDisableGafsUnitClkGating:kbl */
993 	wa_write_or(wal,
994 		    GEN7_UCGCTL4,
995 		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
996 
997 	/* WaInPlaceDecompressionHang:kbl */
998 	wa_write_or(wal,
999 		    GEN9_GAMT_ECO_REG_RW_IA,
1000 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1001 }
1002 
1003 static void
1004 glk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1005 {
1006 	gen9_gt_workarounds_init(i915, wal);
1007 }
1008 
1009 static void
1010 cfl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1011 {
1012 	gen9_gt_workarounds_init(i915, wal);
1013 
1014 	/* WaDisableGafsUnitClkGating:cfl */
1015 	wa_write_or(wal,
1016 		    GEN7_UCGCTL4,
1017 		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1018 
1019 	/* WaInPlaceDecompressionHang:cfl */
1020 	wa_write_or(wal,
1021 		    GEN9_GAMT_ECO_REG_RW_IA,
1022 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1023 }
1024 
1025 static void
1026 wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1027 {
1028 	const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu;
1029 	unsigned int slice, subslice;
1030 	u32 l3_en, mcr, mcr_mask;
1031 
1032 	GEM_BUG_ON(INTEL_GEN(i915) < 10);
1033 
1034 	/*
1035 	 * WaProgramMgsrForL3BankSpecificMmioReads:cnl,icl
1036 	 * L3Banks could be fused off in single slice scenario. If that is
1037 	 * the case, we might need to program MCR select to a valid L3Bank
1038 	 * by default, to make sure we correctly read certain registers
1039 	 * later on (in the range 0xB100 - 0xB3FF).
1040 	 *
1041 	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:cnl,icl
1042 	 * Before any MMIO read into slice/subslice specific registers, MCR
1043 	 * packet control register needs to be programmed to point to any
1044 	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1045 	 * This means each subsequent MMIO read will be forwarded to a
1046 	 * specific s/ss combination, but this is OK since these registers
1047 	 * are consistent across s/ss in almost all cases. On the rare
1048 	 * occasions, such as INSTDONE, where this value is dependent
1049 	 * on s/ss combo, the read should be done with read_subslice_reg.
1050 	 *
1051 	 * Since GEN8_MCR_SELECTOR contains dual-purpose bits which select both
1052 	 * to which subslice, or to which L3 bank, the respective mmio reads
1053 	 * will go, we have to find a common index which works for both
1054 	 * accesses.
1055 	 *
1056 	 * The case where we cannot find a common index fortunately should not
1057 	 * happen in production hardware, so we only emit a warning instead of
1058 	 * implementing something more complex that requires checking the range
1059 	 * of every MMIO read.
1060 	 */
1061 
1062 	if (INTEL_GEN(i915) >= 10 && is_power_of_2(sseu->slice_mask)) {
1063 		u32 l3_fuse =
1064 			intel_uncore_read(&i915->uncore, GEN10_MIRROR_FUSE3) &
1065 			GEN10_L3BANK_MASK;
1066 
1067 		DRM_DEBUG_DRIVER("L3 fuse = %x\n", l3_fuse);
1068 		l3_en = ~(l3_fuse << GEN10_L3BANK_PAIR_COUNT | l3_fuse);
1069 	} else {
1070 		l3_en = ~0;
1071 	}
1072 
1073 	slice = fls(sseu->slice_mask) - 1;
1074 	subslice = fls(l3_en & intel_sseu_get_subslices(sseu, slice));
1075 	if (!subslice) {
1076 		DRM_WARN("No common index found between subslice mask %x and L3 bank mask %x!\n",
1077 			 intel_sseu_get_subslices(sseu, slice), l3_en);
1078 		subslice = fls(l3_en);
1079 		drm_WARN_ON(&i915->drm, !subslice);
1080 	}
1081 	subslice--;
1082 
1083 	if (INTEL_GEN(i915) >= 11) {
1084 		mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1085 		mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1086 	} else {
1087 		mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1088 		mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1089 	}
1090 
1091 	DRM_DEBUG_DRIVER("MCR slice/subslice = %x\n", mcr);
1092 
1093 	wa_write_masked_or(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1094 }
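
/*
 * Worked example (hypothetical fuse values, for illustration only): with a
 * single enabled slice (slice_mask == 0x1) and l3_en == ~0, wa_init_mcr()
 * above picks slice 0 and the highest enabled subslice of that slice, e.g.
 * subslice 7 for an 8-subslice mask of 0xff, and programs GEN8_MCR_SELECTOR
 * so that slice/subslice-specific MMIO reads are steered to that s/ss pair.
 */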
1095 
1096 static void
1097 cnl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1098 {
1099 	wa_init_mcr(i915, wal);
1100 
1101 	/* WaDisableI2mCycleOnWRPort:cnl (pre-prod) */
1102 	if (IS_CNL_REVID(i915, CNL_REVID_B0, CNL_REVID_B0))
1103 		wa_write_or(wal,
1104 			    GAMT_CHKN_BIT_REG,
1105 			    GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT);
1106 
1107 	/* WaInPlaceDecompressionHang:cnl */
1108 	wa_write_or(wal,
1109 		    GEN9_GAMT_ECO_REG_RW_IA,
1110 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1111 }
1112 
1113 static void
1114 icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1115 {
1116 	wa_init_mcr(i915, wal);
1117 
1118 	/* WaInPlaceDecompressionHang:icl */
1119 	wa_write_or(wal,
1120 		    GEN9_GAMT_ECO_REG_RW_IA,
1121 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1122 
1123 	/* WaModifyGamTlbPartitioning:icl */
1124 	wa_write_masked_or(wal,
1125 			   GEN11_GACB_PERF_CTRL,
1126 			   GEN11_HASH_CTRL_MASK,
1127 			   GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1128 
1129 	/* Wa_1405766107:icl
1130 	 * Formerly known as WaCL2SFHalfMaxAlloc
1131 	 */
1132 	wa_write_or(wal,
1133 		    GEN11_LSN_UNSLCVC,
1134 		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1135 		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1136 
1137 	/* Wa_220166154:icl
1138 	 * Formerly known as WaDisCtxReload
1139 	 */
1140 	wa_write_or(wal,
1141 		    GEN8_GAMW_ECO_DEV_RW_IA,
1142 		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1143 
1144 	/* Wa_1405779004:icl (pre-prod) */
1145 	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
1146 		wa_write_or(wal,
1147 			    SLICE_UNIT_LEVEL_CLKGATE,
1148 			    MSCUNIT_CLKGATE_DIS);
1149 
1150 	/* Wa_1406838659:icl (pre-prod) */
1151 	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
1152 		wa_write_or(wal,
1153 			    INF_UNIT_LEVEL_CLKGATE,
1154 			    CGPSF_CLKGATE_DIS);
1155 
1156 	/* Wa_1406463099:icl
1157 	 * Formerly known as WaGamTlbPendError
1158 	 */
1159 	wa_write_or(wal,
1160 		    GAMT_CHKN_BIT_REG,
1161 		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1162 
1163 	/* Wa_1607087056:icl */
1164 	wa_write_or(wal,
1165 		    SLICE_UNIT_LEVEL_CLKGATE,
1166 		    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1167 }
1168 
1169 static void
1170 tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1171 {
1172 	/* Wa_1409420604:tgl */
1173 	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0))
1174 		wa_write_or(wal,
1175 			    SUBSLICE_UNIT_LEVEL_CLKGATE2,
1176 			    CPSSUNIT_CLKGATE_DIS);
1177 
1178 	/* Wa_1607087056:tgl also known as BUG:1409180338 */
1179 	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0))
1180 		wa_write_or(wal,
1181 			    SLICE_UNIT_LEVEL_CLKGATE,
1182 			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1183 }
1184 
1185 static void
1186 gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal)
1187 {
1188 	if (IS_GEN(i915, 12))
1189 		tgl_gt_workarounds_init(i915, wal);
1190 	else if (IS_GEN(i915, 11))
1191 		icl_gt_workarounds_init(i915, wal);
1192 	else if (IS_CANNONLAKE(i915))
1193 		cnl_gt_workarounds_init(i915, wal);
1194 	else if (IS_COFFEELAKE(i915))
1195 		cfl_gt_workarounds_init(i915, wal);
1196 	else if (IS_GEMINILAKE(i915))
1197 		glk_gt_workarounds_init(i915, wal);
1198 	else if (IS_KABYLAKE(i915))
1199 		kbl_gt_workarounds_init(i915, wal);
1200 	else if (IS_BROXTON(i915))
1201 		bxt_gt_workarounds_init(i915, wal);
1202 	else if (IS_SKYLAKE(i915))
1203 		skl_gt_workarounds_init(i915, wal);
1204 	else if (IS_HASWELL(i915))
1205 		hsw_gt_workarounds_init(i915, wal);
1206 	else if (IS_VALLEYVIEW(i915))
1207 		vlv_gt_workarounds_init(i915, wal);
1208 	else if (IS_IVYBRIDGE(i915))
1209 		ivb_gt_workarounds_init(i915, wal);
1210 	else if (IS_GEN(i915, 6))
1211 		snb_gt_workarounds_init(i915, wal);
1212 	else if (IS_GEN(i915, 5))
1213 		ilk_gt_workarounds_init(i915, wal);
1214 	else if (IS_G4X(i915))
1215 		g4x_gt_workarounds_init(i915, wal);
1216 	else if (IS_GEN(i915, 4))
1217 		gen4_gt_workarounds_init(i915, wal);
1218 	else if (INTEL_GEN(i915) <= 8)
1219 		return;
1220 	else
1221 		MISSING_CASE(INTEL_GEN(i915));
1222 }
1223 
1224 void intel_gt_init_workarounds(struct drm_i915_private *i915)
1225 {
1226 	struct i915_wa_list *wal = &i915->gt_wa_list;
1227 
1228 	wa_init_start(wal, "GT", "global");
1229 	gt_init_workarounds(i915, wal);
1230 	wa_init_finish(wal);
1231 }
1232 
1233 static enum forcewake_domains
1234 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1235 {
1236 	enum forcewake_domains fw = 0;
1237 	struct i915_wa *wa;
1238 	unsigned int i;
1239 
1240 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1241 		fw |= intel_uncore_forcewake_for_reg(uncore,
1242 						     wa->reg,
1243 						     FW_REG_READ |
1244 						     FW_REG_WRITE);
1245 
1246 	return fw;
1247 }
1248 
1249 static bool
1250 wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
1251 {
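	/*
	 * Only bits covered by wa->read are compared, so masked-register
	 * entries (whose set value carries the write-enable mask in the
	 * upper 16 bits) are verified only on the bits they actually change.
	 */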
1252 	if ((cur ^ wa->set) & wa->read) {
1253 		DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x)\n",
1254 			  name, from, i915_mmio_reg_offset(wa->reg),
1255 			  cur, cur & wa->read, wa->set);
1256 
1257 		return false;
1258 	}
1259 
1260 	return true;
1261 }
1262 
1263 static void
1264 wa_list_apply(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1265 {
1266 	enum forcewake_domains fw;
1267 	unsigned long flags;
1268 	struct i915_wa *wa;
1269 	unsigned int i;
1270 
1271 	if (!wal->count)
1272 		return;
1273 
1274 	fw = wal_get_fw_for_rmw(uncore, wal);
1275 
1276 	spin_lock_irqsave(&uncore->lock, flags);
1277 	intel_uncore_forcewake_get__locked(uncore, fw);
1278 
1279 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
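		/*
		 * Entries with a clear mask are applied as a read-modify-write;
		 * entries without one (e.g. masked registers) are written
		 * directly.
		 */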
1280 		if (wa->clr)
1281 			intel_uncore_rmw_fw(uncore, wa->reg, wa->clr, wa->set);
1282 		else
1283 			intel_uncore_write_fw(uncore, wa->reg, wa->set);
1284 		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1285 			wa_verify(wa,
1286 				  intel_uncore_read_fw(uncore, wa->reg),
1287 				  wal->name, "application");
1288 	}
1289 
1290 	intel_uncore_forcewake_put__locked(uncore, fw);
1291 	spin_unlock_irqrestore(&uncore->lock, flags);
1292 }
1293 
1294 void intel_gt_apply_workarounds(struct intel_gt *gt)
1295 {
1296 	wa_list_apply(gt->uncore, &gt->i915->gt_wa_list);
1297 }
1298 
1299 static bool wa_list_verify(struct intel_uncore *uncore,
1300 			   const struct i915_wa_list *wal,
1301 			   const char *from)
1302 {
1303 	struct i915_wa *wa;
1304 	unsigned int i;
1305 	bool ok = true;
1306 
1307 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1308 		ok &= wa_verify(wa,
1309 				intel_uncore_read(uncore, wa->reg),
1310 				wal->name, from);
1311 
1312 	return ok;
1313 }
1314 
1315 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1316 {
1317 	return wa_list_verify(gt->uncore, &gt->i915->gt_wa_list, from);
1318 }
1319 
1320 static inline bool is_nonpriv_flags_valid(u32 flags)
1321 {
1322 	/* Check only valid flag bits are set */
1323 	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1324 		return false;
1325 
1326 	/* NB: Only 3 out of 4 enum values are valid for the access field */
1327 	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1328 	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1329 		return false;
1330 
1331 	return true;
1332 }
1333 
1334 static void
1335 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1336 {
1337 	struct i915_wa wa = {
1338 		.reg = reg
1339 	};
1340 
1341 	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1342 		return;
1343 
1344 	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1345 		return;
1346 
1347 	wa.reg.reg |= flags;
1348 	_wa_add(wal, &wa);
1349 }
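
/*
 * The access/range flags are OR'd into spare bits of the register offset
 * here and later written verbatim into a RING_FORCE_TO_NONPRIV slot by
 * intel_engine_apply_whitelist(), which is what actually exposes the
 * register to userspace.
 */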
1350 
1351 static void
1352 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1353 {
1354 	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1355 }
1356 
1357 static void gen9_whitelist_build(struct i915_wa_list *w)
1358 {
1359 	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1360 	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1361 
1362 	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1363 	whitelist_reg(w, GEN8_CS_CHICKEN1);
1364 
1365 	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1366 	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1367 
1368 	/* WaSendPushConstantsFromMMIO:skl,bxt */
1369 	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1370 }
1371 
1372 static void skl_whitelist_build(struct intel_engine_cs *engine)
1373 {
1374 	struct i915_wa_list *w = &engine->whitelist;
1375 
1376 	if (engine->class != RENDER_CLASS)
1377 		return;
1378 
1379 	gen9_whitelist_build(w);
1380 
1381 	/* WaDisableLSQCROPERFforOCL:skl */
1382 	whitelist_reg(w, GEN8_L3SQCREG4);
1383 }
1384 
1385 static void bxt_whitelist_build(struct intel_engine_cs *engine)
1386 {
1387 	if (engine->class != RENDER_CLASS)
1388 		return;
1389 
1390 	gen9_whitelist_build(&engine->whitelist);
1391 }
1392 
1393 static void kbl_whitelist_build(struct intel_engine_cs *engine)
1394 {
1395 	struct i915_wa_list *w = &engine->whitelist;
1396 
1397 	if (engine->class != RENDER_CLASS)
1398 		return;
1399 
1400 	gen9_whitelist_build(w);
1401 
1402 	/* WaDisableLSQCROPERFforOCL:kbl */
1403 	whitelist_reg(w, GEN8_L3SQCREG4);
1404 }
1405 
1406 static void glk_whitelist_build(struct intel_engine_cs *engine)
1407 {
1408 	struct i915_wa_list *w = &engine->whitelist;
1409 
1410 	if (engine->class != RENDER_CLASS)
1411 		return;
1412 
1413 	gen9_whitelist_build(w);
1414 
1415 	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1416 	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1417 }
1418 
1419 static void cfl_whitelist_build(struct intel_engine_cs *engine)
1420 {
1421 	struct i915_wa_list *w = &engine->whitelist;
1422 
1423 	if (engine->class != RENDER_CLASS)
1424 		return;
1425 
1426 	gen9_whitelist_build(w);
1427 
1428 	/*
1429 	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1430 	 *
1431 	 * This covers 4 registers which are next to one another:
1432 	 *   - PS_INVOCATION_COUNT
1433 	 *   - PS_INVOCATION_COUNT_UDW
1434 	 *   - PS_DEPTH_COUNT
1435 	 *   - PS_DEPTH_COUNT_UDW
1436 	 */
1437 	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1438 			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1439 			  RING_FORCE_TO_NONPRIV_RANGE_4);
1440 }
1441 
1442 static void cnl_whitelist_build(struct intel_engine_cs *engine)
1443 {
1444 	struct i915_wa_list *w = &engine->whitelist;
1445 
1446 	if (engine->class != RENDER_CLASS)
1447 		return;
1448 
1449 	/* WaEnablePreemptionGranularityControlByUMD:cnl */
1450 	whitelist_reg(w, GEN8_CS_CHICKEN1);
1451 }
1452 
1453 static void icl_whitelist_build(struct intel_engine_cs *engine)
1454 {
1455 	struct i915_wa_list *w = &engine->whitelist;
1456 
1457 	switch (engine->class) {
1458 	case RENDER_CLASS:
1459 		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1460 		whitelist_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1461 
1462 		/* WaAllowUMDToModifySamplerMode:icl */
1463 		whitelist_reg(w, GEN10_SAMPLER_MODE);
1464 
1465 		/* WaEnableStateCacheRedirectToCS:icl */
1466 		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1467 
1468 		/*
1469 		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1470 		 *
1471 		 * This covers 4 registers which are next to one another:
1472 		 *   - PS_INVOCATION_COUNT
1473 		 *   - PS_INVOCATION_COUNT_UDW
1474 		 *   - PS_DEPTH_COUNT
1475 		 *   - PS_DEPTH_COUNT_UDW
1476 		 */
1477 		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1478 				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1479 				  RING_FORCE_TO_NONPRIV_RANGE_4);
1480 		break;
1481 
1482 	case VIDEO_DECODE_CLASS:
1483 		/* hucStatusRegOffset */
1484 		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
1485 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1486 		/* hucUKernelHdrInfoRegOffset */
1487 		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
1488 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1489 		/* hucStatus2RegOffset */
1490 		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
1491 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1492 		break;
1493 
1494 	default:
1495 		break;
1496 	}
1497 }
1498 
1499 static void tgl_whitelist_build(struct intel_engine_cs *engine)
1500 {
1501 	struct i915_wa_list *w = &engine->whitelist;
1502 
1503 	switch (engine->class) {
1504 	case RENDER_CLASS:
1505 		/*
1506 		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
1507 		 * Wa_1408556865:tgl
1508 		 *
1509 		 * This covers 4 registers which are next to one another:
1510 		 *   - PS_INVOCATION_COUNT
1511 		 *   - PS_INVOCATION_COUNT_UDW
1512 		 *   - PS_DEPTH_COUNT
1513 		 *   - PS_DEPTH_COUNT_UDW
1514 		 */
1515 		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1516 				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1517 				  RING_FORCE_TO_NONPRIV_RANGE_4);
1518 
1519 		/* Wa_1808121037:tgl */
1520 		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
1521 
1522 		/* Wa_1806527549:tgl */
1523 		whitelist_reg(w, HIZ_CHICKEN);
1524 		break;
1525 	default:
1526 		break;
1527 	}
1528 }
1529 
1530 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
1531 {
1532 	struct drm_i915_private *i915 = engine->i915;
1533 	struct i915_wa_list *w = &engine->whitelist;
1534 
1535 	wa_init_start(w, "whitelist", engine->name);
1536 
1537 	if (IS_GEN(i915, 12))
1538 		tgl_whitelist_build(engine);
1539 	else if (IS_GEN(i915, 11))
1540 		icl_whitelist_build(engine);
1541 	else if (IS_CANNONLAKE(i915))
1542 		cnl_whitelist_build(engine);
1543 	else if (IS_COFFEELAKE(i915))
1544 		cfl_whitelist_build(engine);
1545 	else if (IS_GEMINILAKE(i915))
1546 		glk_whitelist_build(engine);
1547 	else if (IS_KABYLAKE(i915))
1548 		kbl_whitelist_build(engine);
1549 	else if (IS_BROXTON(i915))
1550 		bxt_whitelist_build(engine);
1551 	else if (IS_SKYLAKE(i915))
1552 		skl_whitelist_build(engine);
1553 	else if (INTEL_GEN(i915) <= 8)
1554 		return;
1555 	else
1556 		MISSING_CASE(INTEL_GEN(i915));
1557 
1558 	wa_init_finish(w);
1559 }
1560 
1561 void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
1562 {
1563 	const struct i915_wa_list *wal = &engine->whitelist;
1564 	struct intel_uncore *uncore = engine->uncore;
1565 	const u32 base = engine->mmio_base;
1566 	struct i915_wa *wa;
1567 	unsigned int i;
1568 
1569 	if (!wal->count)
1570 		return;
1571 
1572 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1573 		intel_uncore_write(uncore,
1574 				   RING_FORCE_TO_NONPRIV(base, i),
1575 				   i915_mmio_reg_offset(wa->reg));
1576 
1577 	/* And clear the rest just in case of garbage */
1578 	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
1579 		intel_uncore_write(uncore,
1580 				   RING_FORCE_TO_NONPRIV(base, i),
1581 				   i915_mmio_reg_offset(RING_NOPID(base)));
1582 }
1583 
1584 static void
1585 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1586 {
1587 	struct drm_i915_private *i915 = engine->i915;
1588 
1589 	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) {
1590 		/*
1591 		 * Wa_1607138336:tgl
1592 		 * Wa_1607063988:tgl
1593 		 */
1594 		wa_write_or(wal,
1595 			    GEN9_CTX_PREEMPT_REG,
1596 			    GEN12_DISABLE_POSH_BUSY_FF_DOP_CG);
1597 
1598 		/*
1599 		 * Wa_1607030317:tgl
1600 		 * Wa_1607186500:tgl
1601 		 * Wa_1607297627:tgl: there are 3 entries for this WA in the BSpec;
1602 		 * 2 of them say it is fixed on B0, the other one says it is
1603 		 * permanent.
1604 		 */
1605 		wa_masked_en(wal,
1606 			     GEN6_RC_SLEEP_PSMI_CONTROL,
1607 			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
1608 			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
1609 
1610 		/*
1611 		 * Wa_1606679103:tgl
1612 		 * (see also Wa_1606682166:icl)
1613 		 */
1614 		wa_write_or(wal,
1615 			    GEN7_SARCHKMD,
1616 			    GEN7_DISABLE_SAMPLER_PREFETCH);
1617 
1618 		/* Wa_1407928979:tgl */
1619 		wa_write_or(wal,
1620 			    GEN7_FF_THREAD_MODE,
1621 			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
1622 
1623 		/* Wa_1408615072:tgl */
1624 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1625 			    VSUNIT_CLKGATE_DIS_TGL);
1626 	}
1627 
1628 	if (IS_TIGERLAKE(i915)) {
1629 		/* Wa_1606931601:tgl */
1630 		wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
1631 
1632 		/* Wa_1409804808:tgl */
1633 		wa_masked_en(wal, GEN7_ROW_CHICKEN2,
1634 			     GEN12_PUSH_CONST_DEREF_HOLD_DIS);
1635 
1636 		/* Wa_1606700617:tgl */
1637 		wa_masked_en(wal,
1638 			     GEN9_CS_DEBUG_MODE1,
1639 			     FF_DOP_CLOCK_GATE_DISABLE);
1640 
1641 		/*
1642 		 * Wa_1409085225:tgl
1643 		 * Wa_14010229206:tgl
1644 		 */
1645 		wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
1646 	}
1647 
1648 	if (IS_GEN(i915, 11)) {
1649 		/* This is not a WA. Enable for better image quality. */
1650 		wa_masked_en(wal,
1651 			     _3D_CHICKEN3,
1652 			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
1653 
1654 		/* WaPipelineFlushCoherentLines:icl */
1655 		wa_write_or(wal,
1656 			    GEN8_L3SQCREG4,
1657 			    GEN8_LQSC_FLUSH_COHERENT_LINES);
1658 
1659 		/*
1660 		 * Wa_1405543622:icl
1661 		 * Formerly known as WaGAPZPriorityScheme
1662 		 */
1663 		wa_write_or(wal,
1664 			    GEN8_GARBCNTL,
1665 			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
1666 
1667 		/*
1668 		 * Wa_1604223664:icl
1669 		 * Formerly known as WaL3BankAddressHashing
1670 		 */
1671 		wa_write_masked_or(wal,
1672 				   GEN8_GARBCNTL,
1673 				   GEN11_HASH_CTRL_EXCL_MASK,
1674 				   GEN11_HASH_CTRL_EXCL_BIT0);
1675 		wa_write_masked_or(wal,
1676 				   GEN11_GLBLINVL,
1677 				   GEN11_BANK_HASH_ADDR_EXCL_MASK,
1678 				   GEN11_BANK_HASH_ADDR_EXCL_BIT0);
1679 
1680 		/*
1681 		 * Wa_1405733216:icl
1682 		 * Formerly known as WaDisableCleanEvicts
1683 		 */
1684 		wa_write_or(wal,
1685 			    GEN8_L3SQCREG4,
1686 			    GEN11_LQSC_CLEAN_EVICT_DISABLE);
1687 
1688 		/* WaForwardProgressSoftReset:icl */
1689 		wa_write_or(wal,
1690 			    GEN10_SCRATCH_LNCF2,
1691 			    PMFLUSHDONE_LNICRSDROP |
1692 			    PMFLUSH_GAPL3UNBLOCK |
1693 			    PMFLUSHDONE_LNEBLK);
1694 
1695 		/* Wa_1406609255:icl (pre-prod) */
1696 		if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
1697 			wa_write_or(wal,
1698 				    GEN7_SARCHKMD,
1699 				    GEN7_DISABLE_DEMAND_PREFETCH);
1700 
1701 		/* Wa_1606682166:icl */
1702 		wa_write_or(wal,
1703 			    GEN7_SARCHKMD,
1704 			    GEN7_DISABLE_SAMPLER_PREFETCH);
1705 
1706 		/* Wa_1409178092:icl */
1707 		wa_write_masked_or(wal,
1708 				   GEN11_SCRATCH2,
1709 				   GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
1710 				   0);
1711 
1712 		/* WaEnable32PlaneMode:icl */
1713 		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
1714 			     GEN11_ENABLE_32_PLANE_MODE);
1715 
1716 		/*
1717 		 * Wa_1408615072:icl,ehl  (vsunit)
1718 		 * Wa_1407596294:icl,ehl  (hsunit)
1719 		 */
1720 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1721 			    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1722 
1723 		/* Wa_1407352427:icl,ehl */
1724 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1725 			    PSDUNIT_CLKGATE_DIS);
1726 
1727 		/* Wa_1406680159:icl,ehl */
1728 		wa_write_or(wal,
1729 			    SUBSLICE_UNIT_LEVEL_CLKGATE,
1730 			    GWUNIT_CLKGATE_DIS);
1731 
1732 		/*
1733 		 * Wa_1408767742:icl[a2..forever],ehl[all]
1734 		 * Wa_1605460711:icl[a0..c0]
1735 		 */
1736 		wa_write_or(wal,
1737 			    GEN7_FF_THREAD_MODE,
1738 			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
1739 	}
1740 
1741 	if (IS_GEN_RANGE(i915, 9, 12)) {
1742 		/* FtrPerCtxtPreemptionGranularityControl:skl,bxt,kbl,cfl,cnl,icl,tgl */
1743 		wa_masked_en(wal,
1744 			     GEN7_FF_SLICE_CS_CHICKEN1,
1745 			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
1746 	}
1747 
1748 	if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915)) {
1749 		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
1750 		wa_write_or(wal,
1751 			    GEN8_GARBCNTL,
1752 			    GEN9_GAPS_TSV_CREDIT_DISABLE);
1753 	}
1754 
1755 	if (IS_BROXTON(i915)) {
1756 		/* WaDisablePooledEuLoadBalancingFix:bxt */
1757 		wa_masked_en(wal,
1758 			     FF_SLICE_CS_CHICKEN2,
1759 			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
1760 	}
1761 
1762 	if (IS_GEN(i915, 9)) {
1763 		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
1764 		wa_masked_en(wal,
1765 			     GEN9_CSFE_CHICKEN1_RCS,
1766 			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
1767 
1768 		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
1769 		wa_write_or(wal,
1770 			    BDW_SCRATCH1,
1771 			    GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
1772 
1773 		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
1774 		if (IS_GEN9_LP(i915))
1775 			wa_write_masked_or(wal,
1776 					   GEN8_L3SQCREG1,
1777 					   L3_PRIO_CREDITS_MASK,
1778 					   L3_GENERAL_PRIO_CREDITS(62) |
1779 					   L3_HIGH_PRIO_CREDITS(2));
1780 
1781 		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
1782 		wa_write_or(wal,
1783 			    GEN8_L3SQCREG4,
1784 			    GEN8_LQSC_FLUSH_COHERENT_LINES);
1785 	}
1786 
1787 	if (IS_GEN(i915, 7))
1788 		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
1789 		wa_masked_en(wal,
1790 			     GFX_MODE_GEN7,
1791 			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
1792 
1793 	if (IS_GEN_RANGE(i915, 6, 7))
1794 		/*
1795 		 * We need to disable the AsyncFlip performance optimisations in
1796 		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
1797 		 * order to use MI_WAIT_FOR_EVENT within the CS. This bit should
1798 		 * already be programmed to '1' on all products.
1799 		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
1800 		 */
1801 		wa_masked_en(wal,
1802 			     MI_MODE,
1803 			     ASYNC_FLIP_PERF_DISABLE);
1804 
1805 	if (IS_GEN(i915, 6)) {
1806 		/*
1807 		 * Required for the hardware to program scanline values for
1808 		 * waiting.
1809 		 * WaEnableFlushTlbInvalidationMode:snb
1810 		 */
1811 		wa_masked_en(wal,
1812 			     GFX_MODE,
1813 			     GFX_TLB_INVALIDATE_EXPLICIT);
1814 
1815 		/*
1816 		 * From the Sandybridge PRM, volume 1 part 3, page 24:
1817 		 * "If this bit is set, STCunit will have LRA as replacement
1818 		 *  policy. [...] This bit must be reset. LRA replacement
1819 		 *  policy is not supported."
1820 		 */
1821 		wa_masked_dis(wal,
1822 			      CACHE_MODE_0,
1823 			      CM0_STC_EVICT_DISABLE_LRA_SNB);
1824 	}
1825 
1826 	if (IS_GEN_RANGE(i915, 4, 6))
1827 		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
1828 		wa_add(wal, MI_MODE,
1829 		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
1830 		       /* XXX bit doesn't stick on Broadwater */
1831 		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH);
1832 }
1833 
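/*
 * Workarounds for the non-render (xCS) engines. At the moment this only
 * covers the KBL VECS semaphore wait poll interval set below.
 */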
1834 static void
1835 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1836 {
1837 	struct drm_i915_private *i915 = engine->i915;
1838 
1839 	/* WaKBLVECSSemaphoreWaitPoll:kbl */
1840 	if (IS_KBL_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) {
1841 		wa_write(wal,
1842 			 RING_SEMA_WAIT_POLL(engine->mmio_base),
1843 			 1);
1844 	}
1845 }
1846 
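/*
 * Build the per-engine workaround list, using the render flavour for RCS and
 * the generic xCS flavour for everything else. The gen < 4 check only matters
 * for selftests, as intel_engine_init_workarounds() already bails out on
 * those platforms.
 */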
1847 static void
1848 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1849 {
1850 	if (I915_SELFTEST_ONLY(INTEL_GEN(engine->i915) < 4))
1851 		return;
1852 
1853 	if (engine->class == RENDER_CLASS)
1854 		rcs_engine_wa_init(engine, wal);
1855 	else
1856 		xcs_engine_wa_init(engine, wal);
1857 }
1858 
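/*
 * Record the engine's workarounds into engine->wa_list; platforms older than
 * gen4 are skipped entirely.
 */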
1859 void intel_engine_init_workarounds(struct intel_engine_cs *engine)
1860 {
1861 	struct i915_wa_list *wal = &engine->wa_list;
1862 
1863 	if (INTEL_GEN(engine->i915) < 4)
1864 		return;
1865 
1866 	wa_init_start(wal, "engine", engine->name);
1867 	engine_init_workarounds(engine, wal);
1868 	wa_init_finish(wal);
1869 }
1870 
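/* Write the engine's recorded workaround list to the hardware via MMIO. */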
1871 void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
1872 {
1873 	wa_list_apply(engine->uncore, &engine->wa_list);
1874 }
1875 
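/*
 * Allocate an internal object big enough for one u32 result slot per
 * workaround, bind it into the given address space and pin it for GPU use.
 */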
1876 static struct i915_vma *
1877 create_scratch(struct i915_address_space *vm, int count)
1878 {
1879 	struct drm_i915_gem_object *obj;
1880 	struct i915_vma *vma;
1881 	unsigned int size;
1882 	int err;
1883 
1884 	size = round_up(count * sizeof(u32), PAGE_SIZE);
1885 	obj = i915_gem_object_create_internal(vm->i915, size);
1886 	if (IS_ERR(obj))
1887 		return ERR_CAST(obj);
1888 
1889 	i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
1890 
1891 	vma = i915_vma_instance(obj, vm, NULL);
1892 	if (IS_ERR(vma)) {
1893 		err = PTR_ERR(vma);
1894 		goto err_obj;
1895 	}
1896 
1897 	err = i915_vma_pin(vma, 0, 0,
1898 			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
1899 	if (err)
1900 		goto err_obj;
1901 
1902 	return vma;
1903 
1904 err_obj:
1905 	i915_gem_object_put(obj);
1906 	return ERR_PTR(err);
1907 }
1908 
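/*
 * Gen8+ register ranges that are steered by the MCR selector; see
 * mcr_range() below for why readbacks of these cannot be verified.
 */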
1909 static const struct {
1910 	u32 start;
1911 	u32 end;
1912 } mcr_ranges_gen8[] = {
1913 	{ .start = 0x5500, .end = 0x55ff },
1914 	{ .start = 0x7000, .end = 0x7fff },
1915 	{ .start = 0x9400, .end = 0x97ff },
1916 	{ .start = 0xb000, .end = 0xb3ff },
1917 	{ .start = 0xe000, .end = 0xe7ff },
1918 	{},
1919 };
1920 
1921 static bool mcr_range(struct drm_i915_private *i915, u32 offset)
1922 {
1923 	int i;
1924 
1925 	if (INTEL_GEN(i915) < 8)
1926 		return false;
1927 
1928 	/*
1929 	 * Registers in these ranges are affected by the MCR selector
1930 	 * which only controls CPU initiated MMIO. Routing does not
1931 	 * which only controls CPU-initiated MMIO. Routing does not
1932 	 */
1933 	for (i = 0; mcr_ranges_gen8[i].start; i++)
1934 		if (offset >= mcr_ranges_gen8[i].start &&
1935 		    offset <= mcr_ranges_gen8[i].end)
1936 			return true;
1937 
1938 	return false;
1939 }
1940 
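/*
 * Emit one MI_STORE_REGISTER_MEM per workaround register (skipping the
 * MCR-steered ones) so the values currently held by the hardware are dumped
 * into the scratch buffer, one u32 slot per list index.
 */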
1941 static int
1942 wa_list_srm(struct i915_request *rq,
1943 	    const struct i915_wa_list *wal,
1944 	    struct i915_vma *vma)
1945 {
1946 	struct drm_i915_private *i915 = rq->i915;
1947 	unsigned int i, count = 0;
1948 	const struct i915_wa *wa;
1949 	u32 srm, *cs;
1950 
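	/*
	 * Gen8+ uses the 64-bit form of MI_STORE_REGISTER_MEM, which is one
	 * dword longer; the trailing zero emitted for each register below then
	 * carries the upper address half (and is harmless padding on older
	 * gens).
	 */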
1951 	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1952 	if (INTEL_GEN(i915) >= 8)
1953 		srm++;
1954 
1955 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1956 		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
1957 			count++;
1958 	}
1959 
1960 	cs = intel_ring_begin(rq, 4 * count);
1961 	if (IS_ERR(cs))
1962 		return PTR_ERR(cs);
1963 
1964 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1965 		u32 offset = i915_mmio_reg_offset(wa->reg);
1966 
1967 		if (mcr_range(i915, offset))
1968 			continue;
1969 
1970 		*cs++ = srm;
1971 		*cs++ = offset;
1972 		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
1973 		*cs++ = 0;
1974 	}
1975 	intel_ring_advance(rq, cs);
1976 
1977 	return 0;
1978 }
1979 
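/*
 * Read every workaround register back from the command streamer on the given
 * context and check that the expected bits are still set, reporting any
 * mismatch through wa_verify().
 */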
1980 static int engine_wa_list_verify(struct intel_context *ce,
1981 				 const struct i915_wa_list * const wal,
1982 				 const char *from)
1983 {
1984 	const struct i915_wa *wa;
1985 	struct i915_request *rq;
1986 	struct i915_vma *vma;
1987 	unsigned int i;
1988 	u32 *results;
1989 	int err;
1990 
1991 	if (!wal->count)
1992 		return 0;
1993 
1994 	vma = create_scratch(&ce->engine->gt->ggtt->vm, wal->count);
1995 	if (IS_ERR(vma))
1996 		return PTR_ERR(vma);
1997 
1998 	intel_engine_pm_get(ce->engine);
1999 	rq = intel_context_create_request(ce);
2000 	intel_engine_pm_put(ce->engine);
2001 	if (IS_ERR(rq)) {
2002 		err = PTR_ERR(rq);
2003 		goto err_vma;
2004 	}
2005 
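	/*
	 * Serialise the request against other users of the scratch buffer and
	 * mark the buffer as being written by this request.
	 */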
2006 	i915_vma_lock(vma);
2007 	err = i915_request_await_object(rq, vma->obj, true);
2008 	if (err == 0)
2009 		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
2010 	i915_vma_unlock(vma);
2011 	if (err) {
2012 		i915_request_add(rq);
2013 		goto err_vma;
2014 	}
2015 
2016 	err = wa_list_srm(rq, wal, vma);
2017 	if (err)
2018 		goto err_vma;
2019 
2020 	i915_request_get(rq);
2021 	i915_request_add(rq);
2022 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
2023 		err = -ETIME;
2024 		goto err_rq;
2025 	}
2026 
2027 	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
2028 	if (IS_ERR(results)) {
2029 		err = PTR_ERR(results);
2030 		goto err_rq;
2031 	}
2032 
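	/*
	 * Compare each readback with its expected value, skipping registers
	 * behind the MCR selector.
	 */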
2033 	err = 0;
2034 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2035 		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
2036 			continue;
2037 
2038 		if (!wa_verify(wa, results[i], wal->name, from))
2039 			err = -ENXIO;
2040 	}
2041 
2042 	i915_gem_object_unpin_map(vma->obj);
2043 
2044 err_rq:
2045 	i915_request_put(rq);
2046 err_vma:
2047 	i915_vma_unpin(vma);
2048 	i915_vma_put(vma);
2049 	return err;
2050 }
2051 
2052 int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
2053 				    const char *from)
2054 {
2055 	return engine_wa_list_verify(engine->kernel_context,
2056 				     &engine->wa_list,
2057 				     from);
2058 }
2059 
2060 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2061 #include "selftest_workarounds.c"
2062 #endif
2063