1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2014-2018 Intel Corporation
4 */
5
6 #include "i915_drv.h"
7 #include "i915_reg.h"
8 #include "intel_context.h"
9 #include "intel_engine_pm.h"
10 #include "intel_engine_regs.h"
11 #include "intel_gpu_commands.h"
12 #include "intel_gt.h"
13 #include "intel_gt_ccs_mode.h"
14 #include "intel_gt_mcr.h"
15 #include "intel_gt_regs.h"
16 #include "intel_ring.h"
17 #include "intel_workarounds.h"
18
19 /**
20 * DOC: Hardware workarounds
21 *
22 * Hardware workarounds are register programming documented to be executed in
23 * the driver that fall outside of the normal programming sequences for a
24 * platform. There are some basic categories of workarounds, depending on
25 * how/when they are applied:
26 *
27 * - Context workarounds: workarounds that touch registers that are
28 * saved/restored to/from the HW context image. The list is emitted (via Load
29 * Register Immediate commands) once when initializing the device and saved in
30 * the default context. That default context is then used on every context
31 * creation to have a "primed golden context", i.e. a context image that
32 * already contains the changes needed to all the registers.
33 *
34 * Context workarounds should be implemented in the \*_ctx_workarounds_init()
35 * variants respective to the targeted platforms.
36 *
37 * - Engine workarounds: the list of these WAs is applied whenever the specific
38 * engine is reset. It's also possible that a set of engine classes share a
39 * common power domain and they are reset together. This happens on some
40 * platforms with render and compute engines. In this case (at least) one of
41 * them need to keeep the workaround programming: the approach taken in the
42 * driver is to tie those workarounds to the first compute/render engine that
43 * is registered. When executing with GuC submission, engine resets are
44 * outside of kernel driver control, hence the list of registers involved in
45 * written once, on engine initialization, and then passed to GuC, that
46 * saves/restores their values before/after the reset takes place. See
47 * ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
48 *
49 * Workarounds for registers specific to RCS and CCS should be implemented in
50 * rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
51 * registers belonging to BCS, VCS or VECS should be implemented in
52 * xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
53 * engine's MMIO range but that are part of of the common RCS/CCS reset domain
54 * should be implemented in general_render_compute_wa_init(). The settings
55 * about the CCS load balancing should be added in ccs_engine_wa_mode().
56 *
57 * - GT workarounds: the list of these WAs is applied whenever these registers
58 * revert to their default values: on GPU reset, suspend/resume [1]_, etc.
59 *
60 * GT workarounds should be implemented in the \*_gt_workarounds_init()
61 * variants respective to the targeted platforms.
62 *
63 * - Register whitelist: some workarounds need to be implemented in userspace,
64 * but need to touch privileged registers. The whitelist in the kernel
65 * instructs the hardware to allow the access to happen. From the kernel side,
66 * this is just a special case of a MMIO workaround (as we write the list of
67 * these to/be-whitelisted registers to some special HW registers).
68 *
69 * Register whitelisting should be done in the \*_whitelist_build() variants
70 * respective to the targeted platforms.
71 *
72 * - Workaround batchbuffers: buffers that get executed automatically by the
73 * hardware on every HW context restore. These buffers are created and
74 * programmed in the default context so the hardware always go through those
75 * programming sequences when switching contexts. The support for workaround
76 * batchbuffers is enabled these hardware mechanisms:
77 *
78 * #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
79 * context, pointing the hardware to jump to that location when that offset
80 * is reached in the context restore. Workaround batchbuffer in the driver
81 * currently uses this mechanism for all platforms.
82 *
83 * #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
84 * pointing the hardware to a buffer to continue executing after the
85 * engine registers are restored in a context restore sequence. This is
86 * currently not used in the driver.
87 *
88 * - Other: There are WAs that, due to their nature, cannot be applied from a
89 * central place. Those are peppered around the rest of the code, as needed.
90 * Workarounds related to the display IP are the main example.
91 *
92 * .. [1] Technically, some registers are powercontext saved & restored, so they
93 * survive a suspend/resume. In practice, writing them again is not too
94 * costly and simplifies things, so it's the approach taken in the driver.
95 */
96
wa_init_start(struct i915_wa_list * wal,struct intel_gt * gt,const char * name,const char * engine_name)97 static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
98 const char *name, const char *engine_name)
99 {
100 wal->gt = gt;
101 wal->name = name;
102 wal->engine_name = engine_name;
103 }
104
105 #define WA_LIST_CHUNK (1 << 4)
106
wa_init_finish(struct i915_wa_list * wal)107 static void wa_init_finish(struct i915_wa_list *wal)
108 {
109 /* Trim unused entries. */
110 if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
111 struct i915_wa *list = kmemdup(wal->list,
112 wal->count * sizeof(*list),
113 GFP_KERNEL);
114
115 if (list) {
116 kfree(wal->list);
117 wal->list = list;
118 }
119 }
120
121 if (!wal->count)
122 return;
123
124 drm_dbg(&wal->gt->i915->drm, "Initialized %u %s workarounds on %s\n",
125 wal->wa_count, wal->name, wal->engine_name);
126 }
127
128 static enum forcewake_domains
wal_get_fw_for_rmw(struct intel_uncore * uncore,const struct i915_wa_list * wal)129 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
130 {
131 enum forcewake_domains fw = 0;
132 struct i915_wa *wa;
133 unsigned int i;
134
135 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
136 fw |= intel_uncore_forcewake_for_reg(uncore,
137 wa->reg,
138 FW_REG_READ |
139 FW_REG_WRITE);
140
141 return fw;
142 }
143
_wa_add(struct i915_wa_list * wal,const struct i915_wa * wa)144 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
145 {
146 unsigned int addr = i915_mmio_reg_offset(wa->reg);
147 struct drm_i915_private *i915 = wal->gt->i915;
148 unsigned int start = 0, end = wal->count;
149 const unsigned int grow = WA_LIST_CHUNK;
150 struct i915_wa *wa_;
151
152 GEM_BUG_ON(!is_power_of_2(grow));
153
154 if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
155 struct i915_wa *list;
156
157 list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
158 GFP_KERNEL);
159 if (!list) {
160 drm_err(&i915->drm, "No space for workaround init!\n");
161 return;
162 }
163
164 if (wal->list) {
165 memcpy(list, wal->list, sizeof(*wa) * wal->count);
166 kfree(wal->list);
167 }
168
169 wal->list = list;
170 }
171
172 while (start < end) {
173 unsigned int mid = start + (end - start) / 2;
174
175 if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
176 start = mid + 1;
177 } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
178 end = mid;
179 } else {
180 wa_ = &wal->list[mid];
181
182 if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
183 drm_err(&i915->drm,
184 "Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
185 i915_mmio_reg_offset(wa_->reg),
186 wa_->clr, wa_->set);
187
188 wa_->set &= ~wa->clr;
189 }
190
191 wal->wa_count++;
192 wa_->set |= wa->set;
193 wa_->clr |= wa->clr;
194 wa_->read |= wa->read;
195 return;
196 }
197 }
198
199 wal->wa_count++;
200 wa_ = &wal->list[wal->count++];
201 *wa_ = *wa;
202
203 while (wa_-- > wal->list) {
204 GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
205 i915_mmio_reg_offset(wa_[1].reg));
206 if (i915_mmio_reg_offset(wa_[1].reg) >
207 i915_mmio_reg_offset(wa_[0].reg))
208 break;
209
210 swap(wa_[1], wa_[0]);
211 }
212 }
213
wa_add(struct i915_wa_list * wal,i915_reg_t reg,u32 clear,u32 set,u32 read_mask,bool masked_reg)214 static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
215 u32 clear, u32 set, u32 read_mask, bool masked_reg)
216 {
217 struct i915_wa wa = {
218 .reg = reg,
219 .clr = clear,
220 .set = set,
221 .read = read_mask,
222 .masked_reg = masked_reg,
223 };
224
225 _wa_add(wal, &wa);
226 }
227
wa_mcr_add(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 clear,u32 set,u32 read_mask,bool masked_reg)228 static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
229 u32 clear, u32 set, u32 read_mask, bool masked_reg)
230 {
231 struct i915_wa wa = {
232 .mcr_reg = reg,
233 .clr = clear,
234 .set = set,
235 .read = read_mask,
236 .masked_reg = masked_reg,
237 .is_mcr = 1,
238 };
239
240 _wa_add(wal, &wa);
241 }
242
243 static void
wa_write_clr_set(struct i915_wa_list * wal,i915_reg_t reg,u32 clear,u32 set)244 wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
245 {
246 wa_add(wal, reg, clear, set, clear | set, false);
247 }
248
249 static void
wa_mcr_write_clr_set(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 clear,u32 set)250 wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
251 {
252 wa_mcr_add(wal, reg, clear, set, clear | set, false);
253 }
254
255 static void
wa_write(struct i915_wa_list * wal,i915_reg_t reg,u32 set)256 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
257 {
258 wa_write_clr_set(wal, reg, ~0, set);
259 }
260
261 static void
wa_mcr_write(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 set)262 wa_mcr_write(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
263 {
264 wa_mcr_write_clr_set(wal, reg, ~0, set);
265 }
266
267 static void
wa_write_or(struct i915_wa_list * wal,i915_reg_t reg,u32 set)268 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
269 {
270 wa_write_clr_set(wal, reg, set, set);
271 }
272
273 static void
wa_mcr_write_or(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 set)274 wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
275 {
276 wa_mcr_write_clr_set(wal, reg, set, set);
277 }
278
279 static void
wa_write_clr(struct i915_wa_list * wal,i915_reg_t reg,u32 clr)280 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
281 {
282 wa_write_clr_set(wal, reg, clr, 0);
283 }
284
285 static void
wa_mcr_write_clr(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 clr)286 wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
287 {
288 wa_mcr_write_clr_set(wal, reg, clr, 0);
289 }
290
291 /*
292 * WA operations on "masked register". A masked register has the upper 16 bits
293 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
294 * portion of the register without a rmw: you simply write in the upper 16 bits
295 * the mask of bits you are going to modify.
296 *
297 * The wa_masked_* family of functions already does the necessary operations to
298 * calculate the mask based on the parameters passed, so user only has to
299 * provide the lower 16 bits of that register.
300 */
301
302 static void
wa_masked_en(struct i915_wa_list * wal,i915_reg_t reg,u32 val)303 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
304 {
305 wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
306 }
307
308 static void
wa_mcr_masked_en(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 val)309 wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
310 {
311 wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
312 }
313
314 static void
wa_masked_dis(struct i915_wa_list * wal,i915_reg_t reg,u32 val)315 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
316 {
317 wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
318 }
319
320 static void
wa_mcr_masked_dis(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 val)321 wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
322 {
323 wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
324 }
325
326 static void
wa_masked_field_set(struct i915_wa_list * wal,i915_reg_t reg,u32 mask,u32 val)327 wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
328 u32 mask, u32 val)
329 {
330 wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
331 }
332
333 static void
wa_mcr_masked_field_set(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 mask,u32 val)334 wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
335 u32 mask, u32 val)
336 {
337 wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
338 }
339
gen6_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)340 static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
341 struct i915_wa_list *wal)
342 {
343 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
344 }
345
gen7_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)346 static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
347 struct i915_wa_list *wal)
348 {
349 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
350 }
351
gen8_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)352 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
353 struct i915_wa_list *wal)
354 {
355 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
356
357 /* WaDisableAsyncFlipPerfMode:bdw,chv */
358 wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
359
360 /* WaDisablePartialInstShootdown:bdw,chv */
361 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
362 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
363
364 /* Use Force Non-Coherent whenever executing a 3D context. This is a
365 * workaround for a possible hang in the unlikely event a TLB
366 * invalidation occurs during a PSD flush.
367 */
368 /* WaForceEnableNonCoherent:bdw,chv */
369 /* WaHdcDisableFetchWhenMasked:bdw,chv */
370 wa_masked_en(wal, HDC_CHICKEN0,
371 HDC_DONOT_FETCH_MEM_WHEN_MASKED |
372 HDC_FORCE_NON_COHERENT);
373
374 /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
375 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
376 * polygons in the same 8x4 pixel/sample area to be processed without
377 * stalling waiting for the earlier ones to write to Hierarchical Z
378 * buffer."
379 *
380 * This optimization is off by default for BDW and CHV; turn it on.
381 */
382 wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
383
384 /* Wa4x4STCOptimizationDisable:bdw,chv */
385 wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
386
387 /*
388 * BSpec recommends 8x4 when MSAA is used,
389 * however in practice 16x4 seems fastest.
390 *
391 * Note that PS/WM thread counts depend on the WIZ hashing
392 * disable bit, which we don't touch here, but it's good
393 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
394 */
395 wa_masked_field_set(wal, GEN7_GT_MODE,
396 GEN6_WIZ_HASHING_MASK,
397 GEN6_WIZ_HASHING_16x4);
398 }
399
bdw_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)400 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
401 struct i915_wa_list *wal)
402 {
403 struct drm_i915_private *i915 = engine->i915;
404
405 gen8_ctx_workarounds_init(engine, wal);
406
407 /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
408 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
409
410 /* WaDisableDopClockGating:bdw
411 *
412 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
413 * to disable EUTC clock gating.
414 */
415 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
416 DOP_CLOCK_GATING_DISABLE);
417
418 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
419 GEN8_SAMPLER_POWER_BYPASS_DIS);
420
421 wa_masked_en(wal, HDC_CHICKEN0,
422 /* WaForceContextSaveRestoreNonCoherent:bdw */
423 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
424 /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
425 (IS_BROADWELL_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
426 }
427
chv_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)428 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
429 struct i915_wa_list *wal)
430 {
431 gen8_ctx_workarounds_init(engine, wal);
432
433 /* WaDisableThreadStallDopClockGating:chv */
434 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
435
436 /* Improve HiZ throughput on CHV. */
437 wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
438 }
439
gen9_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)440 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
441 struct i915_wa_list *wal)
442 {
443 struct drm_i915_private *i915 = engine->i915;
444
445 if (HAS_LLC(i915)) {
446 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
447 *
448 * Must match Display Engine. See
449 * WaCompressedResourceDisplayNewHashMode.
450 */
451 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
452 GEN9_PBE_COMPRESSED_HASH_SELECTION);
453 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
454 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
455 }
456
457 /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
458 /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
459 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
460 FLOW_CONTROL_ENABLE |
461 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
462
463 /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
464 /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
465 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
466 GEN9_ENABLE_YV12_BUGFIX |
467 GEN9_ENABLE_GPGPU_PREEMPTION);
468
469 /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
470 /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
471 wa_masked_en(wal, CACHE_MODE_1,
472 GEN8_4x4_STC_OPTIMIZATION_DISABLE |
473 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
474
475 /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
476 wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
477 GEN9_CCS_TLB_PREFETCH_ENABLE);
478
479 /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
480 wa_masked_en(wal, HDC_CHICKEN0,
481 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
482 HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
483
484 /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
485 * both tied to WaForceContextSaveRestoreNonCoherent
486 * in some hsds for skl. We keep the tie for all gen9. The
487 * documentation is a bit hazy and so we want to get common behaviour,
488 * even though there is no clear evidence we would need both on kbl/bxt.
489 * This area has been source of system hangs so we play it safe
490 * and mimic the skl regardless of what bspec says.
491 *
492 * Use Force Non-Coherent whenever executing a 3D context. This
493 * is a workaround for a possible hang in the unlikely event
494 * a TLB invalidation occurs during a PSD flush.
495 */
496
497 /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
498 wa_masked_en(wal, HDC_CHICKEN0,
499 HDC_FORCE_NON_COHERENT);
500
501 /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
502 if (IS_SKYLAKE(i915) ||
503 IS_KABYLAKE(i915) ||
504 IS_COFFEELAKE(i915) ||
505 IS_COMETLAKE(i915))
506 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
507 GEN8_SAMPLER_POWER_BYPASS_DIS);
508
509 /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
510 wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
511
512 /*
513 * Supporting preemption with fine-granularity requires changes in the
514 * batch buffer programming. Since we can't break old userspace, we
515 * need to set our default preemption level to safe value. Userspace is
516 * still able to use more fine-grained preemption levels, since in
517 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
518 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
519 * not real HW workarounds, but merely a way to start using preemption
520 * while maintaining old contract with userspace.
521 */
522
523 /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
524 wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
525
526 /* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
527 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
528 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
529 GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
530
531 /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
532 if (IS_GEN9_LP(i915))
533 wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
534 }
535
skl_tune_iz_hashing(struct intel_engine_cs * engine,struct i915_wa_list * wal)536 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
537 struct i915_wa_list *wal)
538 {
539 struct intel_gt *gt = engine->gt;
540 u8 vals[3] = { 0, 0, 0 };
541 unsigned int i;
542
543 for (i = 0; i < 3; i++) {
544 u8 ss;
545
546 /*
547 * Only consider slices where one, and only one, subslice has 7
548 * EUs
549 */
550 if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
551 continue;
552
553 /*
554 * subslice_7eu[i] != 0 (because of the check above) and
555 * ss_max == 4 (maximum number of subslices possible per slice)
556 *
557 * -> 0 <= ss <= 3;
558 */
559 ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
560 vals[i] = 3 - ss;
561 }
562
563 if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
564 return;
565
566 /* Tune IZ hashing. See intel_device_info_runtime_init() */
567 wa_masked_field_set(wal, GEN7_GT_MODE,
568 GEN9_IZ_HASHING_MASK(2) |
569 GEN9_IZ_HASHING_MASK(1) |
570 GEN9_IZ_HASHING_MASK(0),
571 GEN9_IZ_HASHING(2, vals[2]) |
572 GEN9_IZ_HASHING(1, vals[1]) |
573 GEN9_IZ_HASHING(0, vals[0]));
574 }
575
skl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)576 static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
577 struct i915_wa_list *wal)
578 {
579 gen9_ctx_workarounds_init(engine, wal);
580 skl_tune_iz_hashing(engine, wal);
581 }
582
bxt_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)583 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
584 struct i915_wa_list *wal)
585 {
586 gen9_ctx_workarounds_init(engine, wal);
587
588 /* WaDisableThreadStallDopClockGating:bxt */
589 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
590 STALL_DOP_GATING_DISABLE);
591
592 /* WaToEnableHwFixForPushConstHWBug:bxt */
593 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
594 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
595 }
596
kbl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)597 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
598 struct i915_wa_list *wal)
599 {
600 struct drm_i915_private *i915 = engine->i915;
601
602 gen9_ctx_workarounds_init(engine, wal);
603
604 /* WaToEnableHwFixForPushConstHWBug:kbl */
605 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
606 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
607 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
608
609 /* WaDisableSbeCacheDispatchPortSharing:kbl */
610 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
611 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
612 }
613
glk_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)614 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
615 struct i915_wa_list *wal)
616 {
617 gen9_ctx_workarounds_init(engine, wal);
618
619 /* WaToEnableHwFixForPushConstHWBug:glk */
620 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
621 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
622 }
623
cfl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)624 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
625 struct i915_wa_list *wal)
626 {
627 gen9_ctx_workarounds_init(engine, wal);
628
629 /* WaToEnableHwFixForPushConstHWBug:cfl */
630 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
631 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
632
633 /* WaDisableSbeCacheDispatchPortSharing:cfl */
634 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
635 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
636 }
637
icl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)638 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
639 struct i915_wa_list *wal)
640 {
641 /* Wa_1406697149 (WaDisableBankHangMode:icl) */
642 wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
643
644 /* WaForceEnableNonCoherent:icl
645 * This is not the same workaround as in early Gen9 platforms, where
646 * lacking this could cause system hangs, but coherency performance
647 * overhead is high and only a few compute workloads really need it
648 * (the register is whitelisted in hardware now, so UMDs can opt in
649 * for coherency if they have a good reason).
650 */
651 wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
652
653 /* WaEnableFloatBlendOptimization:icl */
654 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
655 _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
656 0 /* write-only, so skip validation */,
657 true);
658
659 /* WaDisableGPGPUMidThreadPreemption:icl */
660 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
661 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
662 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
663
664 /* allow headerless messages for preemptible GPGPU context */
665 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
666 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
667
668 /* Wa_1604278689:icl,ehl */
669 wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
670 wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
671 0,
672 0xFFFFFFFF);
673
674 /* Wa_1406306137:icl,ehl */
675 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
676 }
677
678 /*
679 * These settings aren't actually workarounds, but general tuning settings that
680 * need to be programmed on dg2 platform.
681 */
dg2_ctx_gt_tuning_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)682 static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
683 struct i915_wa_list *wal)
684 {
685 wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
686 wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
687 REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
688 wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
689 FF_MODE2_TDS_TIMER_128);
690 }
691
gen12_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)692 static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
693 struct i915_wa_list *wal)
694 {
695 struct drm_i915_private *i915 = engine->i915;
696
697 /*
698 * Wa_1409142259:tgl,dg1,adl-p
699 * Wa_1409347922:tgl,dg1,adl-p
700 * Wa_1409252684:tgl,dg1,adl-p
701 * Wa_1409217633:tgl,dg1,adl-p
702 * Wa_1409207793:tgl,dg1,adl-p
703 * Wa_1409178076:tgl,dg1,adl-p
704 * Wa_1408979724:tgl,dg1,adl-p
705 * Wa_14010443199:tgl,rkl,dg1,adl-p
706 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
707 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
708 */
709 wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
710 GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
711
712 /* WaDisableGPGPUMidThreadPreemption:gen12 */
713 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
714 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
715 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
716
717 /*
718 * Wa_16011163337 - GS_TIMER
719 *
720 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
721 * need to program it even on those that don't explicitly list that
722 * workaround.
723 *
724 * Note that the programming of GEN12_FF_MODE2 is further modified
725 * according to the FF_MODE2 guidance given by Wa_1608008084.
726 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
727 * value when read from the CPU.
728 *
729 * The default value for this register is zero for all fields.
730 * So instead of doing a RMW we should just write the desired values
731 * for TDS and GS timers. Note that since the readback can't be trusted,
732 * the clear mask is just set to ~0 to make sure other bits are not
733 * inadvertently set. For the same reason read verification is ignored.
734 */
735 wa_add(wal,
736 GEN12_FF_MODE2,
737 ~0,
738 FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
739 0, false);
740
741 if (!IS_DG1(i915)) {
742 /* Wa_1806527549 */
743 wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
744
745 /* Wa_1606376872 */
746 wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
747 }
748 }
749
dg1_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)750 static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
751 struct i915_wa_list *wal)
752 {
753 gen12_ctx_workarounds_init(engine, wal);
754
755 /* Wa_1409044764 */
756 wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
757 DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
758
759 /* Wa_22010493298 */
760 wa_masked_en(wal, HIZ_CHICKEN,
761 DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
762 }
763
dg2_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)764 static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
765 struct i915_wa_list *wal)
766 {
767 dg2_ctx_gt_tuning_init(engine, wal);
768
769 /* Wa_16013271637:dg2 */
770 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
771 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
772
773 /* Wa_14014947963:dg2 */
774 wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
775
776 /* Wa_18018764978:dg2 */
777 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
778
779 /* Wa_15010599737:dg2 */
780 wa_mcr_masked_en(wal, CHICKEN_RASTER_1, DIS_SF_ROUND_NEAREST_EVEN);
781
782 /* Wa_18019271663:dg2 */
783 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
784
785 /* Wa_14019877138:dg2 */
786 wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
787 }
788
xelpg_ctx_gt_tuning_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)789 static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
790 struct i915_wa_list *wal)
791 {
792 struct intel_gt *gt = engine->gt;
793
794 dg2_ctx_gt_tuning_init(engine, wal);
795
796 /*
797 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
798 * gen12_emit_indirect_ctx_rcs() rather than here on some early
799 * steppings.
800 */
801 if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
802 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
803 wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
804 }
805
xelpg_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)806 static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
807 struct i915_wa_list *wal)
808 {
809 struct intel_gt *gt = engine->gt;
810
811 xelpg_ctx_gt_tuning_init(engine, wal);
812
813 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
814 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
815 /* Wa_14014947963 */
816 wa_masked_field_set(wal, VF_PREEMPTION,
817 PREEMPTION_VERTEX_COUNT, 0x4000);
818
819 /* Wa_16013271637 */
820 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
821 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
822
823 /* Wa_18019627453 */
824 wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
825
826 /* Wa_18018764978 */
827 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
828 }
829
830 /* Wa_18019271663 */
831 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
832
833 /* Wa_14019877138 */
834 wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
835 }
836
fakewa_disable_nestedbb_mode(struct intel_engine_cs * engine,struct i915_wa_list * wal)837 static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
838 struct i915_wa_list *wal)
839 {
840 /*
841 * This is a "fake" workaround defined by software to ensure we
842 * maintain reliable, backward-compatible behavior for userspace with
843 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
844 *
845 * The per-context setting of MI_MODE[12] determines whether the bits
846 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
847 * in the traditional manner or whether they should instead use a new
848 * tgl+ meaning that breaks backward compatibility, but allows nesting
849 * into 3rd-level batchbuffers. When this new capability was first
850 * added in TGL, it remained off by default unless a context
851 * intentionally opted in to the new behavior. However Xe_HPG now
852 * flips this on by default and requires that we explicitly opt out if
853 * we don't want the new behavior.
854 *
855 * From a SW perspective, we want to maintain the backward-compatible
856 * behavior for userspace, so we'll apply a fake workaround to set it
857 * back to the legacy behavior on platforms where the hardware default
858 * is to break compatibility. At the moment there is no Linux
859 * userspace that utilizes third-level batchbuffers, so this will avoid
860 * userspace from needing to make any changes. using the legacy
861 * meaning is the correct thing to do. If/when we have userspace
862 * consumers that want to utilize third-level batch nesting, we can
863 * provide a context parameter to allow them to opt-in.
864 */
865 wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
866 }
867
gen12_ctx_gt_mocs_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)868 static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
869 struct i915_wa_list *wal)
870 {
871 u8 mocs;
872
873 /*
874 * Some blitter commands do not have a field for MOCS, those
875 * commands will use MOCS index pointed by BLIT_CCTL.
876 * BLIT_CCTL registers are needed to be programmed to un-cached.
877 */
878 if (engine->class == COPY_ENGINE_CLASS) {
879 mocs = engine->gt->mocs.uc_index;
880 wa_write_clr_set(wal,
881 BLIT_CCTL(engine->mmio_base),
882 BLIT_CCTL_MASK,
883 BLIT_CCTL_MOCS(mocs, mocs));
884 }
885 }
886
887 /*
888 * gen12_ctx_gt_fake_wa_init() aren't programmingan official workaround
889 * defined by the hardware team, but it programming general context registers.
890 * Adding those context register programming in context workaround
891 * allow us to use the wa framework for proper application and validation.
892 */
893 static void
gen12_ctx_gt_fake_wa_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)894 gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
895 struct i915_wa_list *wal)
896 {
897 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
898 fakewa_disable_nestedbb_mode(engine, wal);
899
900 gen12_ctx_gt_mocs_init(engine, wal);
901 }
902
903 static void
__intel_engine_init_ctx_wa(struct intel_engine_cs * engine,struct i915_wa_list * wal,const char * name)904 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
905 struct i915_wa_list *wal,
906 const char *name)
907 {
908 struct drm_i915_private *i915 = engine->i915;
909
910 wa_init_start(wal, engine->gt, name, engine->name);
911
912 /* Applies to all engines */
913 /*
914 * Fake workarounds are not the actual workaround but
915 * programming of context registers using workaround framework.
916 */
917 if (GRAPHICS_VER(i915) >= 12)
918 gen12_ctx_gt_fake_wa_init(engine, wal);
919
920 if (engine->class != RENDER_CLASS)
921 goto done;
922
923 if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
924 xelpg_ctx_workarounds_init(engine, wal);
925 else if (IS_PONTEVECCHIO(i915))
926 ; /* noop; none at this time */
927 else if (IS_DG2(i915))
928 dg2_ctx_workarounds_init(engine, wal);
929 else if (IS_XEHPSDV(i915))
930 ; /* noop; none at this time */
931 else if (IS_DG1(i915))
932 dg1_ctx_workarounds_init(engine, wal);
933 else if (GRAPHICS_VER(i915) == 12)
934 gen12_ctx_workarounds_init(engine, wal);
935 else if (GRAPHICS_VER(i915) == 11)
936 icl_ctx_workarounds_init(engine, wal);
937 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
938 cfl_ctx_workarounds_init(engine, wal);
939 else if (IS_GEMINILAKE(i915))
940 glk_ctx_workarounds_init(engine, wal);
941 else if (IS_KABYLAKE(i915))
942 kbl_ctx_workarounds_init(engine, wal);
943 else if (IS_BROXTON(i915))
944 bxt_ctx_workarounds_init(engine, wal);
945 else if (IS_SKYLAKE(i915))
946 skl_ctx_workarounds_init(engine, wal);
947 else if (IS_CHERRYVIEW(i915))
948 chv_ctx_workarounds_init(engine, wal);
949 else if (IS_BROADWELL(i915))
950 bdw_ctx_workarounds_init(engine, wal);
951 else if (GRAPHICS_VER(i915) == 7)
952 gen7_ctx_workarounds_init(engine, wal);
953 else if (GRAPHICS_VER(i915) == 6)
954 gen6_ctx_workarounds_init(engine, wal);
955 else if (GRAPHICS_VER(i915) < 8)
956 ;
957 else
958 MISSING_CASE(GRAPHICS_VER(i915));
959
960 done:
961 wa_init_finish(wal);
962 }
963
intel_engine_init_ctx_wa(struct intel_engine_cs * engine)964 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
965 {
966 __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
967 }
968
intel_engine_emit_ctx_wa(struct i915_request * rq)969 int intel_engine_emit_ctx_wa(struct i915_request *rq)
970 {
971 struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
972 struct intel_uncore *uncore = rq->engine->uncore;
973 enum forcewake_domains fw;
974 unsigned long flags;
975 struct i915_wa *wa;
976 unsigned int i;
977 u32 *cs;
978 int ret;
979
980 if (wal->count == 0)
981 return 0;
982
983 ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
984 if (ret)
985 return ret;
986
987 cs = intel_ring_begin(rq, (wal->count * 2 + 2));
988 if (IS_ERR(cs))
989 return PTR_ERR(cs);
990
991 fw = wal_get_fw_for_rmw(uncore, wal);
992
993 intel_gt_mcr_lock(wal->gt, &flags);
994 spin_lock(&uncore->lock);
995 intel_uncore_forcewake_get__locked(uncore, fw);
996
997 *cs++ = MI_LOAD_REGISTER_IMM(wal->count);
998 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
999 u32 val;
1000
1001 /* Skip reading the register if it's not really needed */
1002 if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
1003 val = wa->set;
1004 } else {
1005 val = wa->is_mcr ?
1006 intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1007 intel_uncore_read_fw(uncore, wa->reg);
1008 val &= ~wa->clr;
1009 val |= wa->set;
1010 }
1011
1012 *cs++ = i915_mmio_reg_offset(wa->reg);
1013 *cs++ = val;
1014 }
1015 *cs++ = MI_NOOP;
1016
1017 intel_uncore_forcewake_put__locked(uncore, fw);
1018 spin_unlock(&uncore->lock);
1019 intel_gt_mcr_unlock(wal->gt, flags);
1020
1021 intel_ring_advance(rq, cs);
1022
1023 ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1024 if (ret)
1025 return ret;
1026
1027 return 0;
1028 }
1029
1030 static void
gen4_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1031 gen4_gt_workarounds_init(struct intel_gt *gt,
1032 struct i915_wa_list *wal)
1033 {
1034 /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1035 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1036 }
1037
1038 static void
g4x_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1039 g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1040 {
1041 gen4_gt_workarounds_init(gt, wal);
1042
1043 /* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1044 wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1045 }
1046
1047 static void
ilk_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1048 ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1049 {
1050 g4x_gt_workarounds_init(gt, wal);
1051
1052 wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1053 }
1054
1055 static void
snb_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1056 snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1057 {
1058 }
1059
1060 static void
ivb_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1061 ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1062 {
1063 /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1064 wa_masked_dis(wal,
1065 GEN7_COMMON_SLICE_CHICKEN1,
1066 GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1067
1068 /* WaApplyL3ControlAndL3ChickenMode:ivb */
1069 wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1070 wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1071
1072 /* WaForceL3Serialization:ivb */
1073 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1074 }
1075
1076 static void
vlv_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1077 vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1078 {
1079 /* WaForceL3Serialization:vlv */
1080 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1081
1082 /*
1083 * WaIncreaseL3CreditsForVLVB0:vlv
1084 * This is the hardware default actually.
1085 */
1086 wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1087 }
1088
1089 static void
hsw_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1090 hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1091 {
1092 /* L3 caching of data atomics doesn't work -- disable it. */
1093 wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1094
1095 wa_add(wal,
1096 HSW_ROW_CHICKEN3, 0,
1097 _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1098 0 /* XXX does this reg exist? */, true);
1099
1100 /* WaVSRefCountFullforceMissDisable:hsw */
1101 wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1102 }
1103
1104 static void
gen9_wa_init_mcr(struct drm_i915_private * i915,struct i915_wa_list * wal)1105 gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1106 {
1107 const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1108 unsigned int slice, subslice;
1109 u32 mcr, mcr_mask;
1110
1111 GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1112
1113 /*
1114 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1115 * Before any MMIO read into slice/subslice specific registers, MCR
1116 * packet control register needs to be programmed to point to any
1117 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1118 * This means each subsequent MMIO read will be forwarded to an
1119 * specific s/ss combination, but this is OK since these registers
1120 * are consistent across s/ss in almost all cases. In the rare
1121 * occasions, such as INSTDONE, where this value is dependent
1122 * on s/ss combo, the read should be done with read_subslice_reg.
1123 */
1124 slice = ffs(sseu->slice_mask) - 1;
1125 GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1126 subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1127 GEM_BUG_ON(!subslice);
1128 subslice--;
1129
1130 /*
1131 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1132 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1133 */
1134 mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1135 mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1136
1137 drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1138
1139 wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1140 }
1141
1142 static void
gen9_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1143 gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1144 {
1145 struct drm_i915_private *i915 = gt->i915;
1146
1147 /* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1148 gen9_wa_init_mcr(i915, wal);
1149
1150 /* WaDisableKillLogic:bxt,skl,kbl */
1151 if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1152 wa_write_or(wal,
1153 GAM_ECOCHK,
1154 ECOCHK_DIS_TLB);
1155
1156 if (HAS_LLC(i915)) {
1157 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1158 *
1159 * Must match Display Engine. See
1160 * WaCompressedResourceDisplayNewHashMode.
1161 */
1162 wa_write_or(wal,
1163 MMCD_MISC_CTRL,
1164 MMCD_PCLA | MMCD_HOTSPOT_EN);
1165 }
1166
1167 /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1168 wa_write_or(wal,
1169 GAM_ECOCHK,
1170 BDW_DISABLE_HDC_INVALIDATION);
1171 }
1172
1173 static void
skl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1174 skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1175 {
1176 gen9_gt_workarounds_init(gt, wal);
1177
1178 /* WaDisableGafsUnitClkGating:skl */
1179 wa_write_or(wal,
1180 GEN7_UCGCTL4,
1181 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1182
1183 /* WaInPlaceDecompressionHang:skl */
1184 if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1185 wa_write_or(wal,
1186 GEN9_GAMT_ECO_REG_RW_IA,
1187 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1188 }
1189
1190 static void
kbl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1191 kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1192 {
1193 gen9_gt_workarounds_init(gt, wal);
1194
1195 /* WaDisableDynamicCreditSharing:kbl */
1196 if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1197 wa_write_or(wal,
1198 GAMT_CHKN_BIT_REG,
1199 GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1200
1201 /* WaDisableGafsUnitClkGating:kbl */
1202 wa_write_or(wal,
1203 GEN7_UCGCTL4,
1204 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1205
1206 /* WaInPlaceDecompressionHang:kbl */
1207 wa_write_or(wal,
1208 GEN9_GAMT_ECO_REG_RW_IA,
1209 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1210 }
1211
1212 static void
glk_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1213 glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1214 {
1215 gen9_gt_workarounds_init(gt, wal);
1216 }
1217
1218 static void
cfl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1219 cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1220 {
1221 gen9_gt_workarounds_init(gt, wal);
1222
1223 /* WaDisableGafsUnitClkGating:cfl */
1224 wa_write_or(wal,
1225 GEN7_UCGCTL4,
1226 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1227
1228 /* WaInPlaceDecompressionHang:cfl */
1229 wa_write_or(wal,
1230 GEN9_GAMT_ECO_REG_RW_IA,
1231 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1232 }
1233
__set_mcr_steering(struct i915_wa_list * wal,i915_reg_t steering_reg,unsigned int slice,unsigned int subslice)1234 static void __set_mcr_steering(struct i915_wa_list *wal,
1235 i915_reg_t steering_reg,
1236 unsigned int slice, unsigned int subslice)
1237 {
1238 u32 mcr, mcr_mask;
1239
1240 mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1241 mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1242
1243 wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1244 }
1245
debug_dump_steering(struct intel_gt * gt)1246 static void debug_dump_steering(struct intel_gt *gt)
1247 {
1248 struct drm_printer p = drm_debug_printer("MCR Steering:");
1249
1250 if (drm_debug_enabled(DRM_UT_DRIVER))
1251 intel_gt_mcr_report_steering(&p, gt, false);
1252 }
1253
__add_mcr_wa(struct intel_gt * gt,struct i915_wa_list * wal,unsigned int slice,unsigned int subslice)1254 static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1255 unsigned int slice, unsigned int subslice)
1256 {
1257 __set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1258
1259 gt->default_steering.groupid = slice;
1260 gt->default_steering.instanceid = subslice;
1261
1262 debug_dump_steering(gt);
1263 }
1264
1265 static void
icl_wa_init_mcr(struct intel_gt * gt,struct i915_wa_list * wal)1266 icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1267 {
1268 const struct sseu_dev_info *sseu = >->info.sseu;
1269 unsigned int subslice;
1270
1271 GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1272 GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1273
1274 /*
1275 * Although a platform may have subslices, we need to always steer
1276 * reads to the lowest instance that isn't fused off. When Render
1277 * Power Gating is enabled, grabbing forcewake will only power up a
1278 * single subslice (the "minconfig") if there isn't a real workload
1279 * that needs to be run; this means that if we steer register reads to
1280 * one of the higher subslices, we run the risk of reading back 0's or
1281 * random garbage.
1282 */
1283 subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1284
1285 /*
1286 * If the subslice we picked above also steers us to a valid L3 bank,
1287 * then we can just rely on the default steering and won't need to
1288 * worry about explicitly re-steering L3BANK reads later.
1289 */
1290 if (gt->info.l3bank_mask & BIT(subslice))
1291 gt->steering_table[L3BANK] = NULL;
1292
1293 __add_mcr_wa(gt, wal, 0, subslice);
1294 }
1295
1296 static void
xehp_init_mcr(struct intel_gt * gt,struct i915_wa_list * wal)1297 xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1298 {
1299 const struct sseu_dev_info *sseu = >->info.sseu;
1300 unsigned long slice, subslice = 0, slice_mask = 0;
1301 u32 lncf_mask = 0;
1302 int i;
1303
1304 /*
1305 * On Xe_HP the steering increases in complexity. There are now several
1306 * more units that require steering and we're not guaranteed to be able
1307 * to find a common setting for all of them. These are:
1308 * - GSLICE (fusable)
1309 * - DSS (sub-unit within gslice; fusable)
1310 * - L3 Bank (fusable)
1311 * - MSLICE (fusable)
1312 * - LNCF (sub-unit within mslice; always present if mslice is present)
1313 *
1314 * We'll do our default/implicit steering based on GSLICE (in the
1315 * sliceid field) and DSS (in the subsliceid field). If we can
1316 * find overlap between the valid MSLICE and/or LNCF values with
1317 * a suitable GSLICE, then we can just re-use the default value and
1318 * skip and explicit steering at runtime.
1319 *
1320 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1321 * a valid sliceid value. DSS steering is the only type of steering
1322 * that utilizes the 'subsliceid' bits.
1323 *
1324 * Also note that, even though the steering domain is called "GSlice"
1325 * and it is encoded in the register using the gslice format, the spec
1326 * says that the combined (geometry | compute) fuse should be used to
1327 * select the steering.
1328 */
1329
1330 /* Find the potential gslice candidates */
1331 slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1332 GEN_DSS_PER_GSLICE);
1333
1334 /*
1335 * Find the potential LNCF candidates. Either LNCF within a valid
1336 * mslice is fine.
1337 */
1338 for_each_set_bit(i, >->info.mslice_mask, GEN12_MAX_MSLICES)
1339 lncf_mask |= (0x3 << (i * 2));
1340
1341 /*
1342 * Are there any sliceid values that work for both GSLICE and LNCF
1343 * steering?
1344 */
1345 if (slice_mask & lncf_mask) {
1346 slice_mask &= lncf_mask;
1347 gt->steering_table[LNCF] = NULL;
1348 }
1349
1350 /* How about sliceid values that also work for MSLICE steering? */
1351 if (slice_mask & gt->info.mslice_mask) {
1352 slice_mask &= gt->info.mslice_mask;
1353 gt->steering_table[MSLICE] = NULL;
1354 }
1355
1356 if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
1357 gt->steering_table[GAM] = NULL;
1358
1359 slice = __ffs(slice_mask);
1360 subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1361 GEN_DSS_PER_GSLICE;
1362
1363 __add_mcr_wa(gt, wal, slice, subslice);
1364
1365 /*
1366 * SQIDI ranges are special because they use different steering
1367 * registers than everything else we work with. On XeHP SDV and
1368 * DG2-G10, any value in the steering registers will work fine since
1369 * all instances are present, but DG2-G11 only has SQIDI instances at
1370 * ID's 2 and 3, so we need to steer to one of those. For simplicity
1371 * we'll just steer to a hardcoded "2" since that value will work
1372 * everywhere.
1373 */
1374 __set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1375 __set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1376
1377 /*
1378 * On DG2, GAM registers have a dedicated steering control register
1379 * and must always be programmed to a hardcoded groupid of "1."
1380 */
1381 if (IS_DG2(gt->i915))
1382 __set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1383 }
1384
1385 static void
pvc_init_mcr(struct intel_gt * gt,struct i915_wa_list * wal)1386 pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1387 {
1388 unsigned int dss;
1389
1390 /*
1391 * Setup implicit steering for COMPUTE and DSS ranges to the first
1392 * non-fused-off DSS. All other types of MCR registers will be
1393 * explicitly steered.
1394 */
1395 dss = intel_sseu_find_first_xehp_dss(>->info.sseu, 0, 0);
1396 __add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1397 }
1398
1399 static void
icl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1400 icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1401 {
1402 struct drm_i915_private *i915 = gt->i915;
1403
1404 icl_wa_init_mcr(gt, wal);
1405
1406 /* WaModifyGamTlbPartitioning:icl */
1407 wa_write_clr_set(wal,
1408 GEN11_GACB_PERF_CTRL,
1409 GEN11_HASH_CTRL_MASK,
1410 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1411
1412 /* Wa_1405766107:icl
1413 * Formerly known as WaCL2SFHalfMaxAlloc
1414 */
1415 wa_write_or(wal,
1416 GEN11_LSN_UNSLCVC,
1417 GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1418 GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1419
1420 /* Wa_220166154:icl
1421 * Formerly known as WaDisCtxReload
1422 */
1423 wa_write_or(wal,
1424 GEN8_GAMW_ECO_DEV_RW_IA,
1425 GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1426
1427 /* Wa_1406463099:icl
1428 * Formerly known as WaGamTlbPendError
1429 */
1430 wa_write_or(wal,
1431 GAMT_CHKN_BIT_REG,
1432 GAMT_CHKN_DISABLE_L3_COH_PIPE);
1433
1434 /*
1435 * Wa_1408615072:icl,ehl (vsunit)
1436 * Wa_1407596294:icl,ehl (hsunit)
1437 */
1438 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1439 VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1440
1441 /* Wa_1407352427:icl,ehl */
1442 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1443 PSDUNIT_CLKGATE_DIS);
1444
1445 /* Wa_1406680159:icl,ehl */
1446 wa_mcr_write_or(wal,
1447 GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1448 GWUNIT_CLKGATE_DIS);
1449
1450 /* Wa_1607087056:icl,ehl,jsl */
1451 if (IS_ICELAKE(i915) ||
1452 ((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1453 IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1454 wa_write_or(wal,
1455 GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1456 L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1457
1458 /*
1459 * This is not a documented workaround, but rather an optimization
1460 * to reduce sampler power.
1461 */
1462 wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1463 }
1464
1465 /*
1466 * Though there are per-engine instances of these registers,
1467 * they retain their value through engine resets and should
1468 * only be provided on the GT workaround list rather than
1469 * the engine-specific workaround list.
1470 */
1471 static void
wa_14011060649(struct intel_gt * gt,struct i915_wa_list * wal)1472 wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1473 {
1474 struct intel_engine_cs *engine;
1475 int id;
1476
1477 for_each_engine(engine, gt, id) {
1478 if (engine->class != VIDEO_DECODE_CLASS ||
1479 (engine->instance % 2))
1480 continue;
1481
1482 wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1483 IECPUNIT_CLKGATE_DIS);
1484 }
1485 }
1486
1487 static void
gen12_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1488 gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1489 {
1490 icl_wa_init_mcr(gt, wal);
1491
1492 /* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1493 wa_14011060649(gt, wal);
1494
1495 /* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1496 wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1497
1498 /*
1499 * Wa_14015795083
1500 *
1501 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1502 * preventing i915 from modifying it for this workaround. Skip the
1503 * readback verification for this workaround on debug builds; if the
1504 * workaround doesn't stick due to firmware behavior, it's not an error
1505 * that we want CI to flag.
1506 */
1507 wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1508 0, 0, false);
1509 }
1510
1511 static void
dg1_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1512 dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1513 {
1514 gen12_gt_workarounds_init(gt, wal);
1515
1516 /* Wa_1409420604:dg1 */
1517 wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1518 CPSSUNIT_CLKGATE_DIS);
1519
1520 /* Wa_1408615072:dg1 */
1521 /* Empirical testing shows this register is unaffected by engine reset. */
1522 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1523 }
1524
1525 static void
xehpsdv_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1526 xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1527 {
1528 struct drm_i915_private *i915 = gt->i915;
1529
1530 xehp_init_mcr(gt, wal);
1531
1532 /* Wa_1409757795:xehpsdv */
1533 wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1534
1535 /* Wa_18011725039:xehpsdv */
1536 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
1537 wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
1538 wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
1539 }
1540
1541 /* Wa_16011155590:xehpsdv */
1542 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1543 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1544 TSGUNIT_CLKGATE_DIS);
1545
1546 /* Wa_14011780169:xehpsdv */
1547 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1548 wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1549 GAMTLBVDBOX7_CLKGATE_DIS |
1550 GAMTLBVDBOX6_CLKGATE_DIS |
1551 GAMTLBVDBOX5_CLKGATE_DIS |
1552 GAMTLBVDBOX4_CLKGATE_DIS |
1553 GAMTLBVDBOX3_CLKGATE_DIS |
1554 GAMTLBVDBOX2_CLKGATE_DIS |
1555 GAMTLBVDBOX1_CLKGATE_DIS |
1556 GAMTLBVDBOX0_CLKGATE_DIS |
1557 GAMTLBKCR_CLKGATE_DIS |
1558 GAMTLBGUC_CLKGATE_DIS |
1559 GAMTLBBLT_CLKGATE_DIS);
1560 wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1561 GAMTLBGFXA1_CLKGATE_DIS |
1562 GAMTLBCOMPA0_CLKGATE_DIS |
1563 GAMTLBCOMPA1_CLKGATE_DIS |
1564 GAMTLBCOMPB0_CLKGATE_DIS |
1565 GAMTLBCOMPB1_CLKGATE_DIS |
1566 GAMTLBCOMPC0_CLKGATE_DIS |
1567 GAMTLBCOMPC1_CLKGATE_DIS |
1568 GAMTLBCOMPD0_CLKGATE_DIS |
1569 GAMTLBCOMPD1_CLKGATE_DIS |
1570 GAMTLBMERT_CLKGATE_DIS |
1571 GAMTLBVEBOX3_CLKGATE_DIS |
1572 GAMTLBVEBOX2_CLKGATE_DIS |
1573 GAMTLBVEBOX1_CLKGATE_DIS |
1574 GAMTLBVEBOX0_CLKGATE_DIS);
1575 }
1576
1577 /* Wa_16012725990:xehpsdv */
1578 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1579 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1580
1581 /* Wa_14011060649:xehpsdv */
1582 wa_14011060649(gt, wal);
1583
1584 /* Wa_14012362059:xehpsdv */
1585 wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
1586
1587 /* Wa_14014368820:xehpsdv */
1588 wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1589 INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1590
1591 /* Wa_14010670810:xehpsdv */
1592 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1593 }
1594
1595 static void
1596 dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1597 {
1598 xehp_init_mcr(gt, wal);
1599
1600 /* Wa_14011060649:dg2 */
1601 wa_14011060649(gt, wal);
1602
1603 if (IS_DG2_G10(gt->i915)) {
1604 /* Wa_22010523718:dg2 */
1605 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1606 CG3DDISCFEG_CLKGATE_DIS);
1607
1608 /* Wa_14011006942:dg2 */
1609 wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1610 DSS_ROUTER_CLKGATE_DIS);
1611 }
1612
1613 /* Wa_14014830051:dg2 */
1614 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1615
1616 /* Wa_14015795083 */
1617 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1618
1619 /* Wa_18018781329 */
1620 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1621 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1622 wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1623 wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1624
1625 /* Wa_1509235366:dg2 */
1626 wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1627 INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1628
1629 /* Wa_14010648519:dg2 */
1630 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1631 }
1632
1633 static void
1634 pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1635 {
1636 pvc_init_mcr(gt, wal);
1637
1638 /* Wa_14015795083 */
1639 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1640
1641 /* Wa_18018781329 */
1642 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1643 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1644 wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1645 wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1646
1647 /* Wa_16016694945 */
1648 wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
1649 }
1650
1651 static void
1652 xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1653 {
1654 /* Wa_14018575942 / Wa_18018781329 */
1655 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1656 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1657
1658 /* Wa_22016670082 */
1659 wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1660
1661 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1662 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1663 /* Wa_14014830051 */
1664 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1665
1666 /* Wa_14015795083 */
1667 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1668 }
1669
1670 /*
1671 * Unlike older platforms, we no longer set up implicit steering here;
1672 * all MCR accesses are explicitly steered.
1673 */
1674 debug_dump_steering(gt);
1675 }
1676
1677 static void
1678 xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1679 {
1680 /*
1681 * Wa_14018778641
1682 * Wa_18018781329
1683 *
1684 * Note that although these registers are MCR on the primary
1685 * GT, the media GT's versions are regular singleton registers.
1686 */
1687 wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1688
1689 debug_dump_steering(gt);
1690 }
1691
1692 /*
1693 * The bspec performance guide has recommended MMIO tuning settings. These
1694 * aren't truly "workarounds" but we want to program them through the
1695 * workaround infrastructure to make sure they're (re)applied at the proper
1696 * times.
1697 *
1698 * The programming in this function is for settings that persist through
1699 * engine resets and also are not part of any engine's register state context.
1700 * I.e., settings that only need to be re-applied in the event of a full GT
1701 * reset.
1702 */
1703 static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1704 {
1705 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1706 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1707 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1708 }
1709
1710 if (IS_PONTEVECCHIO(gt->i915)) {
1711 wa_mcr_write(wal, XEHPC_L3SCRUB,
1712 SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
1713 wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_HOSTCACHEEN);
1714 }
1715
1716 if (IS_DG2(gt->i915)) {
1717 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1718 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1719 }
1720 }
1721
1722 static void
1723 gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1724 {
1725 struct drm_i915_private *i915 = gt->i915;
1726
1727 gt_tuning_settings(gt, wal);
1728
1729 if (gt->type == GT_MEDIA) {
1730 if (MEDIA_VER(i915) >= 13)
1731 xelpmp_gt_workarounds_init(gt, wal);
1732 else
1733 MISSING_CASE(MEDIA_VER(i915));
1734
1735 return;
1736 }
1737
1738 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1739 xelpg_gt_workarounds_init(gt, wal);
1740 else if (IS_PONTEVECCHIO(i915))
1741 pvc_gt_workarounds_init(gt, wal);
1742 else if (IS_DG2(i915))
1743 dg2_gt_workarounds_init(gt, wal);
1744 else if (IS_XEHPSDV(i915))
1745 xehpsdv_gt_workarounds_init(gt, wal);
1746 else if (IS_DG1(i915))
1747 dg1_gt_workarounds_init(gt, wal);
1748 else if (GRAPHICS_VER(i915) == 12)
1749 gen12_gt_workarounds_init(gt, wal);
1750 else if (GRAPHICS_VER(i915) == 11)
1751 icl_gt_workarounds_init(gt, wal);
1752 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1753 cfl_gt_workarounds_init(gt, wal);
1754 else if (IS_GEMINILAKE(i915))
1755 glk_gt_workarounds_init(gt, wal);
1756 else if (IS_KABYLAKE(i915))
1757 kbl_gt_workarounds_init(gt, wal);
1758 else if (IS_BROXTON(i915))
1759 gen9_gt_workarounds_init(gt, wal);
1760 else if (IS_SKYLAKE(i915))
1761 skl_gt_workarounds_init(gt, wal);
1762 else if (IS_HASWELL(i915))
1763 hsw_gt_workarounds_init(gt, wal);
1764 else if (IS_VALLEYVIEW(i915))
1765 vlv_gt_workarounds_init(gt, wal);
1766 else if (IS_IVYBRIDGE(i915))
1767 ivb_gt_workarounds_init(gt, wal);
1768 else if (GRAPHICS_VER(i915) == 6)
1769 snb_gt_workarounds_init(gt, wal);
1770 else if (GRAPHICS_VER(i915) == 5)
1771 ilk_gt_workarounds_init(gt, wal);
1772 else if (IS_G4X(i915))
1773 g4x_gt_workarounds_init(gt, wal);
1774 else if (GRAPHICS_VER(i915) == 4)
1775 gen4_gt_workarounds_init(gt, wal);
1776 else if (GRAPHICS_VER(i915) <= 8)
1777 ;
1778 else
1779 MISSING_CASE(GRAPHICS_VER(i915));
1780 }
1781
1782 void intel_gt_init_workarounds(struct intel_gt *gt)
1783 {
1784 struct i915_wa_list *wal = &gt->wa_list;
1785
1786 wa_init_start(wal, gt, "GT", "global");
1787 gt_init_workarounds(gt, wal);
1788 wa_init_finish(wal);
1789 }
1790
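/*
 * A workaround is considered "lost" when any bit covered by the readable
 * mask differs from the expected value. Worked example (values chosen only
 * for illustration): with wa->set = 0x4, wa->read = 0xc and a current
 * readback of cur = 0x8, (cur ^ wa->set) & wa->read evaluates to 0xc,
 * which is non-zero, so wa_verify() reports the workaround as lost.
 */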
1791 static bool
1792 wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1793 const char *name, const char *from)
1794 {
1795 if ((cur ^ wa->set) & wa->read) {
1796 drm_err(&gt->i915->drm,
1797 "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1798 name, from, i915_mmio_reg_offset(wa->reg),
1799 cur, cur & wa->read, wa->set & wa->read);
1800
1801 return false;
1802 }
1803
1804 return true;
1805 }
1806
1807 static void wa_list_apply(const struct i915_wa_list *wal)
1808 {
1809 struct intel_gt *gt = wal->gt;
1810 struct intel_uncore *uncore = gt->uncore;
1811 enum forcewake_domains fw;
1812 unsigned long flags;
1813 struct i915_wa *wa;
1814 unsigned int i;
1815
1816 if (!wal->count)
1817 return;
1818
1819 fw = wal_get_fw_for_rmw(uncore, wal);
1820
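/*
 * Hold the MCR steering lock, the uncore lock and the required forcewake
 * domains across the whole list so that each read-modify-write below is
 * applied atomically with respect to concurrent MCR/MMIO users.
 */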
1821 intel_gt_mcr_lock(gt, &flags);
1822 spin_lock(&uncore->lock);
1823 intel_uncore_forcewake_get__locked(uncore, fw);
1824
1825 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1826 u32 val, old = 0;
1827
1828 /* open-coded rmw due to steering */
1829 if (wa->clr)
1830 old = wa->is_mcr ?
1831 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1832 intel_uncore_read_fw(uncore, wa->reg);
1833 val = (old & ~wa->clr) | wa->set;
1834 if (val != old || !wa->clr) {
1835 if (wa->is_mcr)
1836 intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1837 else
1838 intel_uncore_write_fw(uncore, wa->reg, val);
1839 }
1840
1841 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1842 u32 val = wa->is_mcr ?
1843 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1844 intel_uncore_read_fw(uncore, wa->reg);
1845
1846 wa_verify(gt, wa, val, wal->name, "application");
1847 }
1848 }
1849
1850 intel_uncore_forcewake_put__locked(uncore, fw);
1851 spin_unlock(&uncore->lock);
1852 intel_gt_mcr_unlock(gt, flags);
1853 }
1854
1855 void intel_gt_apply_workarounds(struct intel_gt *gt)
1856 {
1857 wa_list_apply(&gt->wa_list);
1858 }
1859
1860 static bool wa_list_verify(struct intel_gt *gt,
1861 const struct i915_wa_list *wal,
1862 const char *from)
1863 {
1864 struct intel_uncore *uncore = gt->uncore;
1865 struct i915_wa *wa;
1866 enum forcewake_domains fw;
1867 unsigned long flags;
1868 unsigned int i;
1869 bool ok = true;
1870
1871 fw = wal_get_fw_for_rmw(uncore, wal);
1872
1873 intel_gt_mcr_lock(gt, &flags);
1874 spin_lock(&uncore->lock);
1875 intel_uncore_forcewake_get__locked(uncore, fw);
1876
1877 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1878 ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1879 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1880 intel_uncore_read_fw(uncore, wa->reg),
1881 wal->name, from);
1882
1883 intel_uncore_forcewake_put__locked(uncore, fw);
1884 spin_unlock(&uncore->lock);
1885 intel_gt_mcr_unlock(gt, flags);
1886
1887 return ok;
1888 }
1889
1890 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1891 {
1892 return wa_list_verify(gt, &gt->wa_list, from);
1893 }
1894
1895 __maybe_unused
1896 static bool is_nonpriv_flags_valid(u32 flags)
1897 {
1898 /* Check only valid flag bits are set */
1899 if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1900 return false;
1901
1902 /* NB: Only 3 out of 4 enum values are valid for access field */
1903 if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1904 RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1905 return false;
1906
1907 return true;
1908 }
1909
1910 static void
1911 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1912 {
1913 struct i915_wa wa = {
1914 .reg = reg
1915 };
1916
1917 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1918 return;
1919
1920 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1921 return;
1922
1923 wa.reg.reg |= flags;
1924 _wa_add(wal, &wa);
1925 }
1926
1927 static void
1928 whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1929 {
1930 struct i915_wa wa = {
1931 .mcr_reg = reg,
1932 .is_mcr = 1,
1933 };
1934
1935 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1936 return;
1937
1938 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1939 return;
1940
1941 wa.mcr_reg.reg |= flags;
1942 _wa_add(wal, &wa);
1943 }
1944
1945 static void
1946 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1947 {
1948 whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1949 }
1950
1951 static void
1952 whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1953 {
1954 whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1955 }
1956
1957 static void gen9_whitelist_build(struct i915_wa_list *w)
1958 {
1959 /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1960 whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1961
1962 /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1963 whitelist_reg(w, GEN8_CS_CHICKEN1);
1964
1965 /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1966 whitelist_reg(w, GEN8_HDC_CHICKEN1);
1967
1968 /* WaSendPushConstantsFromMMIO:skl,bxt */
1969 whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1970 }
1971
1972 static void skl_whitelist_build(struct intel_engine_cs *engine)
1973 {
1974 struct i915_wa_list *w = &engine->whitelist;
1975
1976 if (engine->class != RENDER_CLASS)
1977 return;
1978
1979 gen9_whitelist_build(w);
1980
1981 /* WaDisableLSQCROPERFforOCL:skl */
1982 whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1983 }
1984
1985 static void bxt_whitelist_build(struct intel_engine_cs *engine)
1986 {
1987 if (engine->class != RENDER_CLASS)
1988 return;
1989
1990 gen9_whitelist_build(&engine->whitelist);
1991 }
1992
1993 static void kbl_whitelist_build(struct intel_engine_cs *engine)
1994 {
1995 struct i915_wa_list *w = &engine->whitelist;
1996
1997 if (engine->class != RENDER_CLASS)
1998 return;
1999
2000 gen9_whitelist_build(w);
2001
2002 /* WaDisableLSQCROPERFforOCL:kbl */
2003 whitelist_mcr_reg(w, GEN8_L3SQCREG4);
2004 }
2005
2006 static void glk_whitelist_build(struct intel_engine_cs *engine)
2007 {
2008 struct i915_wa_list *w = &engine->whitelist;
2009
2010 if (engine->class != RENDER_CLASS)
2011 return;
2012
2013 gen9_whitelist_build(w);
2014
2015 /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
2016 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2017 }
2018
2019 static void cfl_whitelist_build(struct intel_engine_cs *engine)
2020 {
2021 struct i915_wa_list *w = &engine->whitelist;
2022
2023 if (engine->class != RENDER_CLASS)
2024 return;
2025
2026 gen9_whitelist_build(w);
2027
2028 /*
2029 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
2030 *
2031 * This covers 4 registers which are next to one another:
2032 * - PS_INVOCATION_COUNT
2033 * - PS_INVOCATION_COUNT_UDW
2034 * - PS_DEPTH_COUNT
2035 * - PS_DEPTH_COUNT_UDW
2036 */
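/*
 * A single RING_FORCE_TO_NONPRIV_RANGE_4 entry covers four consecutive
 * registers, so one whitelist slot is enough for the whole block above.
 */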
2037 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2038 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2039 RING_FORCE_TO_NONPRIV_RANGE_4);
2040 }
2041
2042 static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
2043 {
2044 struct i915_wa_list *w = &engine->whitelist;
2045
2046 if (engine->class != RENDER_CLASS)
2047 whitelist_reg_ext(w,
2048 RING_CTX_TIMESTAMP(engine->mmio_base),
2049 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2050 }
2051
2052 static void cml_whitelist_build(struct intel_engine_cs *engine)
2053 {
2054 allow_read_ctx_timestamp(engine);
2055
2056 cfl_whitelist_build(engine);
2057 }
2058
2059 static void icl_whitelist_build(struct intel_engine_cs *engine)
2060 {
2061 struct i915_wa_list *w = &engine->whitelist;
2062
2063 allow_read_ctx_timestamp(engine);
2064
2065 switch (engine->class) {
2066 case RENDER_CLASS:
2067 /* WaAllowUMDToModifyHalfSliceChicken7:icl */
2068 whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
2069
2070 /* WaAllowUMDToModifySamplerMode:icl */
2071 whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
2072
2073 /* WaEnableStateCacheRedirectToCS:icl */
2074 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2075
2076 /*
2077 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
2078 *
2079 * This covers 4 registers which are next to one another:
2080 * - PS_INVOCATION_COUNT
2081 * - PS_INVOCATION_COUNT_UDW
2082 * - PS_DEPTH_COUNT
2083 * - PS_DEPTH_COUNT_UDW
2084 */
2085 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2086 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2087 RING_FORCE_TO_NONPRIV_RANGE_4);
2088 break;
2089
2090 case VIDEO_DECODE_CLASS:
2091 /* hucStatusRegOffset */
2092 whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2093 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2094 /* hucUKernelHdrInfoRegOffset */
2095 whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2096 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2097 /* hucStatus2RegOffset */
2098 whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2099 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2100 break;
2101
2102 default:
2103 break;
2104 }
2105 }
2106
2107 static void tgl_whitelist_build(struct intel_engine_cs *engine)
2108 {
2109 struct i915_wa_list *w = &engine->whitelist;
2110
2111 allow_read_ctx_timestamp(engine);
2112
2113 switch (engine->class) {
2114 case RENDER_CLASS:
2115 /*
2116 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2117 * Wa_1408556865:tgl
2118 *
2119 * This covers 4 registers which are next to one another:
2120 * - PS_INVOCATION_COUNT
2121 * - PS_INVOCATION_COUNT_UDW
2122 * - PS_DEPTH_COUNT
2123 * - PS_DEPTH_COUNT_UDW
2124 */
2125 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2126 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2127 RING_FORCE_TO_NONPRIV_RANGE_4);
2128
2129 /*
2130 * Wa_1808121037:tgl
2131 * Wa_14012131227:dg1
2132 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2133 */
2134 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2135
2136 /* Wa_1806527549:tgl */
2137 whitelist_reg(w, HIZ_CHICKEN);
2138
2139 /* Required by recommended tuning setting (not a workaround) */
2140 whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2141
2142 break;
2143 default:
2144 break;
2145 }
2146 }
2147
2148 static void dg2_whitelist_build(struct intel_engine_cs *engine)
2149 {
2150 struct i915_wa_list *w = &engine->whitelist;
2151
2152 switch (engine->class) {
2153 case RENDER_CLASS:
2154 /* Required by recommended tuning setting (not a workaround) */
2155 whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2156
2157 break;
2158 default:
2159 break;
2160 }
2161 }
2162
2163 static void blacklist_trtt(struct intel_engine_cs *engine)
2164 {
2165 struct i915_wa_list *w = &engine->whitelist;
2166
2167 /*
2168 * Prevent read/write access to [0x4400, 0x4600) which covers
2169 * the TRTT range across all engines. Note that normally userspace
2170 * cannot access the other engines' trtt control, but for simplicity
2171 * we cover the entire range on each engine.
2172 */
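/*
 * Each RANGE_64 deny entry spans 64 consecutive dword registers (0x100
 * bytes), so the two entries below together block [0x4400, 0x4600).
 */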
2173 whitelist_reg_ext(w, _MMIO(0x4400),
2174 RING_FORCE_TO_NONPRIV_DENY |
2175 RING_FORCE_TO_NONPRIV_RANGE_64);
2176 whitelist_reg_ext(w, _MMIO(0x4500),
2177 RING_FORCE_TO_NONPRIV_DENY |
2178 RING_FORCE_TO_NONPRIV_RANGE_64);
2179 }
2180
2181 static void pvc_whitelist_build(struct intel_engine_cs *engine)
2182 {
2183 /* Wa_16014440446:pvc */
2184 blacklist_trtt(engine);
2185 }
2186
2187 static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2188 {
2189 struct i915_wa_list *w = &engine->whitelist;
2190
2191 switch (engine->class) {
2192 case RENDER_CLASS:
2193 /* Required by recommended tuning setting (not a workaround) */
2194 whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2195
2196 break;
2197 default:
2198 break;
2199 }
2200 }
2201
2202 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2203 {
2204 struct drm_i915_private *i915 = engine->i915;
2205 struct i915_wa_list *w = &engine->whitelist;
2206
2207 wa_init_start(w, engine->gt, "whitelist", engine->name);
2208
2209 if (engine->gt->type == GT_MEDIA)
2210 ; /* none yet */
2211 else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2212 xelpg_whitelist_build(engine);
2213 else if (IS_PONTEVECCHIO(i915))
2214 pvc_whitelist_build(engine);
2215 else if (IS_DG2(i915))
2216 dg2_whitelist_build(engine);
2217 else if (IS_XEHPSDV(i915))
2218 ; /* none needed */
2219 else if (GRAPHICS_VER(i915) == 12)
2220 tgl_whitelist_build(engine);
2221 else if (GRAPHICS_VER(i915) == 11)
2222 icl_whitelist_build(engine);
2223 else if (IS_COMETLAKE(i915))
2224 cml_whitelist_build(engine);
2225 else if (IS_COFFEELAKE(i915))
2226 cfl_whitelist_build(engine);
2227 else if (IS_GEMINILAKE(i915))
2228 glk_whitelist_build(engine);
2229 else if (IS_KABYLAKE(i915))
2230 kbl_whitelist_build(engine);
2231 else if (IS_BROXTON(i915))
2232 bxt_whitelist_build(engine);
2233 else if (IS_SKYLAKE(i915))
2234 skl_whitelist_build(engine);
2235 else if (GRAPHICS_VER(i915) <= 8)
2236 ;
2237 else
2238 MISSING_CASE(GRAPHICS_VER(i915));
2239
2240 wa_init_finish(w);
2241 }
2242
2243 void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2244 {
2245 const struct i915_wa_list *wal = &engine->whitelist;
2246 struct intel_uncore *uncore = engine->uncore;
2247 const u32 base = engine->mmio_base;
2248 struct i915_wa *wa;
2249 unsigned int i;
2250
2251 if (!wal->count)
2252 return;
2253
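/*
 * Each whitelisted register is programmed into one of the engine's
 * RING_FORCE_TO_NONPRIV slots; the access/range flags OR'ed into the
 * register offset by whitelist_reg_ext() are written along with it.
 */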
2254 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2255 intel_uncore_write(uncore,
2256 RING_FORCE_TO_NONPRIV(base, i),
2257 i915_mmio_reg_offset(wa->reg));
2258
2259 /* And clear the rest just in case of garbage */
2260 for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2261 intel_uncore_write(uncore,
2262 RING_FORCE_TO_NONPRIV(base, i),
2263 i915_mmio_reg_offset(RING_NOPID(base)));
2264 }
2265
2266 /*
2267 * engine_fake_wa_init(): a placeholder to program registers that are
2268 * not part of an official workaround defined by the
2269 * hardware team.
2270 * Adding the programming of those registers here lets us reuse the wa
2271 * framework for proper application and verification.
2272 */
2273 static void
2274 engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2275 {
2276 u8 mocs_w, mocs_r;
2277
2278 /*
2279 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2280 * by the command streamer when executing commands that don't have
2281 * a way to explicitly specify a MOCS setting. The default should
2282 * usually reference whichever MOCS entry corresponds to uncached
2283 * behavior, although use of a WB cached entry is recommended by the
2284 * spec in certain circumstances on specific platforms.
2285 */
2286 if (GRAPHICS_VER(engine->i915) >= 12) {
2287 mocs_r = engine->gt->mocs.uc_index;
2288 mocs_w = engine->gt->mocs.uc_index;
2289
2290 if (HAS_L3_CCS_READ(engine->i915) &&
2291 engine->class == COMPUTE_CLASS) {
2292 mocs_r = engine->gt->mocs.wb_index;
2293
2294 /*
2295 * Even on the few platforms where MOCS 0 is a
2296 * legitimate table entry, it's never the correct
2297 * setting to use here; we can assume the MOCS init
2298 * just forgot to initialize wb_index.
2299 */
2300 drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2301 }
2302
2303 wa_masked_field_set(wal,
2304 RING_CMD_CCTL(engine->mmio_base),
2305 CMD_CCTL_MOCS_MASK,
2306 CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2307 }
2308 }
2309
2310 static void
2311 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2312 {
2313 struct drm_i915_private *i915 = engine->i915;
2314 struct intel_gt *gt = engine->gt;
2315
2316 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2317 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2318 /* Wa_22014600077 */
2319 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2320 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2321 }
2322
2323 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2324 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2325 IS_DG2(i915)) {
2326 /* Wa_1509727124 */
2327 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2328 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2329 }
2330
2331 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2332 IS_DG2(i915)) {
2333 /* Wa_22012856258 */
2334 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2335 GEN12_DISABLE_READ_SUPPRESSION);
2336 }
2337
2338 if (IS_DG2(i915)) {
2339 /*
2340 * Wa_22010960976:dg2
2341 * Wa_14013347512:dg2
2342 */
2343 wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2344 LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2345 }
2346
2347 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
2348 IS_DG2(i915)) {
2349 /* Wa_14015150844 */
2350 wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
2351 _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
2352 0, true);
2353 }
2354
2355 if (IS_DG2_G11(i915) || IS_DG2_G10(i915)) {
2356 /* Wa_22014600077:dg2 */
2357 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2358 _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH),
2359 0 /* Wa_14012342262 write-only reg, so skip verification */,
2360 true);
2361 }
2362
2363 if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2364 IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2365 /*
2366 * Wa_1606700617:tgl,dg1,adl-p
2367 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2368 * Wa_14010826681:tgl,dg1,rkl,adl-p
2369 * Wa_18019627453:dg2
2370 */
2371 wa_masked_en(wal,
2372 GEN9_CS_DEBUG_MODE1,
2373 FF_DOP_CLOCK_GATE_DISABLE);
2374 }
2375
2376 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2377 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2378 /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2379 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2380
2381 /*
2382 * Wa_1407928979:tgl A*
2383 * Wa_18011464164:tgl[B0+],dg1[B0+]
2384 * Wa_22010931296:tgl[B0+],dg1[B0+]
2385 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2386 */
2387 wa_write_or(wal, GEN7_FF_THREAD_MODE,
2388 GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2389
2390 /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2391 wa_mcr_masked_en(wal,
2392 GEN10_SAMPLER_MODE,
2393 ENABLE_SMALLPL);
2394 }
2395
2396 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2397 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2398 /* Wa_1409804808 */
2399 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2400 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2401
2402 /* Wa_14010229206 */
2403 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2404 }
2405
2406 if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2407 /*
2408 * Wa_1607297627
2409 *
2410 * On TGL and RKL there are multiple entries for this WA in the
2411 * BSpec; some indicate this is an A0-only WA, others indicate
2412 * it applies to all steppings, so we trust the "all steppings" entries.
2413 */
2414 wa_masked_en(wal,
2415 RING_PSMI_CTL(RENDER_RING_BASE),
2416 GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2417 GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2418 }
2419
2420 if (GRAPHICS_VER(i915) == 11) {
2421 /* This is not a Wa. Enable for better image quality. */
2422 wa_masked_en(wal,
2423 _3D_CHICKEN3,
2424 _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2425
2426 /*
2427 * Wa_1405543622:icl
2428 * Formerly known as WaGAPZPriorityScheme
2429 */
2430 wa_write_or(wal,
2431 GEN8_GARBCNTL,
2432 GEN11_ARBITRATION_PRIO_ORDER_MASK);
2433
2434 /*
2435 * Wa_1604223664:icl
2436 * Formerly known as WaL3BankAddressHashing
2437 */
2438 wa_write_clr_set(wal,
2439 GEN8_GARBCNTL,
2440 GEN11_HASH_CTRL_EXCL_MASK,
2441 GEN11_HASH_CTRL_EXCL_BIT0);
2442 wa_write_clr_set(wal,
2443 GEN11_GLBLINVL,
2444 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2445 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2446
2447 /*
2448 * Wa_1405733216:icl
2449 * Formerly known as WaDisableCleanEvicts
2450 */
2451 wa_mcr_write_or(wal,
2452 GEN8_L3SQCREG4,
2453 GEN11_LQSC_CLEAN_EVICT_DISABLE);
2454
2455 /* Wa_1606682166:icl */
2456 wa_write_or(wal,
2457 GEN7_SARCHKMD,
2458 GEN7_DISABLE_SAMPLER_PREFETCH);
2459
2460 /* Wa_1409178092:icl */
2461 wa_mcr_write_clr_set(wal,
2462 GEN11_SCRATCH2,
2463 GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2464 0);
2465
2466 /* WaEnable32PlaneMode:icl */
2467 wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2468 GEN11_ENABLE_32_PLANE_MODE);
2469
2470 /*
2471 * Wa_1408767742:icl[a2..forever],ehl[all]
2472 * Wa_1605460711:icl[a0..c0]
2473 */
2474 wa_write_or(wal,
2475 GEN7_FF_THREAD_MODE,
2476 GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2477
2478 /* Wa_22010271021 */
2479 wa_masked_en(wal,
2480 GEN9_CS_DEBUG_MODE1,
2481 FF_DOP_CLOCK_GATE_DISABLE);
2482 }
2483
2484 /*
2485 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2486 * beyond) allow the kernel-mode driver to choose between two different
2487 * options for controlling preemption granularity and behavior.
2488 *
2489 * Option 1 (hardware default):
2490 * Preemption settings are controlled in a global manner via
2491 * kernel-only register CS_DEBUG_MODE1 (0x20EC). Any granularity
2492 * and settings chosen by the kernel-mode driver will apply to all
2493 * userspace clients.
2494 *
2495 * Option 2:
2496 * Preemption settings are controlled on a per-context basis via
2497 * register CS_CHICKEN1 (0x2580). CS_CHICKEN1 is saved/restored on
2498 * context switch and is writable by userspace (e.g., via
2499 * MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2500 * which allows different userspace drivers/clients to select
2501 * different settings, or to change those settings on the fly in
2502 * response to runtime needs. This option was known by name
2503 * "FtrPerCtxtPreemptionGranularityControl" at one time, although
2504 * that name is somewhat misleading as other non-granularity
2505 * preemption settings are also impacted by this decision.
2506 *
2507 * On Linux, our policy has always been to let userspace drivers
2508 * control preemption granularity/settings (Option 2). This was
2509 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2510 * userspace developed before object-level preemption was enabled would
2511 * not behave well if i915 were to go with Option 1 and enable that
2512 * preemption in a global manner). On gen9 each context would have
2513 * object-level preemption disabled by default (see
2514 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2515 * userspace drivers could opt-in to object-level preemption as they
2516 * saw fit. For post-gen9 platforms, we continue to utilize Option 2;
2517 * even though it is no longer necessary for ABI compatibility when
2518 * enabling a new platform, it does ensure that userspace will be able
2519 * to implement any workarounds that show up requiring temporary
2520 * adjustments to preemption behavior at runtime.
2521 *
2522 * Notes/Workarounds:
2523 * - Wa_14015141709: On DG2 and early steppings of MTL,
2524 * CS_CHICKEN1[0] does not disable object-level preemption as
2525 * it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2526 * using Option 1). Effectively this means userspace is unable
2527 * to disable object-level preemption on these platforms/steppings
2528 * despite the setting here.
2529 *
2530 * - Wa_16013994831: May require that userspace program
2531 * CS_CHICKEN1[10] when certain runtime conditions are true.
2532 * Userspace requires Option 2 to be in effect for their update of
2533 * CS_CHICKEN1[10] to be effective.
2534 *
2535 * Other workarounds may appear in the future that will also require
2536 * Option 2 behavior to allow proper userspace implementation.
2537 */
2538 if (GRAPHICS_VER(i915) >= 9)
2539 wa_masked_en(wal,
2540 GEN7_FF_SLICE_CS_CHICKEN1,
2541 GEN9_FFSC_PERCTX_PREEMPT_CTRL);
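/*
 * Illustrative sketch only (not taken from this driver or from any
 * specific userspace implementation): with Option 2 in effect, a
 * userspace driver could toggle a CS_CHICKEN1 bit from its own batch
 * roughly like this, where "bit" stands for whichever preemption
 * control it wants to change:
 *
 *	*cs++ = MI_LOAD_REGISTER_IMM(1);
 *	*cs++ = i915_mmio_reg_offset(GEN8_CS_CHICKEN1);
 *	*cs++ = _MASKED_BIT_ENABLE(bit);
 */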
2542
2543 if (IS_SKYLAKE(i915) ||
2544 IS_KABYLAKE(i915) ||
2545 IS_COFFEELAKE(i915) ||
2546 IS_COMETLAKE(i915)) {
2547 /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2548 wa_write_or(wal,
2549 GEN8_GARBCNTL,
2550 GEN9_GAPS_TSV_CREDIT_DISABLE);
2551 }
2552
2553 if (IS_BROXTON(i915)) {
2554 /* WaDisablePooledEuLoadBalancingFix:bxt */
2555 wa_masked_en(wal,
2556 FF_SLICE_CS_CHICKEN2,
2557 GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2558 }
2559
2560 if (GRAPHICS_VER(i915) == 9) {
2561 /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2562 wa_masked_en(wal,
2563 GEN9_CSFE_CHICKEN1_RCS,
2564 GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2565
2566 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2567 wa_mcr_write_or(wal,
2568 BDW_SCRATCH1,
2569 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2570
2571 /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2572 if (IS_GEN9_LP(i915))
2573 wa_mcr_write_clr_set(wal,
2574 GEN8_L3SQCREG1,
2575 L3_PRIO_CREDITS_MASK,
2576 L3_GENERAL_PRIO_CREDITS(62) |
2577 L3_HIGH_PRIO_CREDITS(2));
2578
2579 /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2580 wa_mcr_write_or(wal,
2581 GEN8_L3SQCREG4,
2582 GEN8_LQSC_FLUSH_COHERENT_LINES);
2583
2584 /* Disable atomics in L3 to prevent unrecoverable hangs */
2585 wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2586 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2587 wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2588 GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2589 wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2590 EVICTION_PERF_FIX_ENABLE, 0);
2591 }
2592
2593 if (IS_HASWELL(i915)) {
2594 /* WaSampleCChickenBitEnable:hsw */
2595 wa_masked_en(wal,
2596 HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2597
2598 wa_masked_dis(wal,
2599 CACHE_MODE_0_GEN7,
2600 /* enable HiZ Raw Stall Optimization */
2601 HIZ_RAW_STALL_OPT_DISABLE);
2602 }
2603
2604 if (IS_VALLEYVIEW(i915)) {
2605 /* WaDisableEarlyCull:vlv */
2606 wa_masked_en(wal,
2607 _3D_CHICKEN3,
2608 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2609
2610 /*
2611 * WaVSThreadDispatchOverride:ivb,vlv
2612 *
2613 * This actually overrides the dispatch
2614 * mode for all thread types.
2615 */
2616 wa_write_clr_set(wal,
2617 GEN7_FF_THREAD_MODE,
2618 GEN7_FF_SCHED_MASK,
2619 GEN7_FF_TS_SCHED_HW |
2620 GEN7_FF_VS_SCHED_HW |
2621 GEN7_FF_DS_SCHED_HW);
2622
2623 /* WaPsdDispatchEnable:vlv */
2624 /* WaDisablePSDDualDispatchEnable:vlv */
2625 wa_masked_en(wal,
2626 GEN7_HALF_SLICE_CHICKEN1,
2627 GEN7_MAX_PS_THREAD_DEP |
2628 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2629 }
2630
2631 if (IS_IVYBRIDGE(i915)) {
2632 /* WaDisableEarlyCull:ivb */
2633 wa_masked_en(wal,
2634 _3D_CHICKEN3,
2635 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2636
2637 if (0) { /* causes HiZ corruption on ivb:gt1 */
2638 /* enable HiZ Raw Stall Optimization */
2639 wa_masked_dis(wal,
2640 CACHE_MODE_0_GEN7,
2641 HIZ_RAW_STALL_OPT_DISABLE);
2642 }
2643
2644 /*
2645 * WaVSThreadDispatchOverride:ivb,vlv
2646 *
2647 * This actually overrides the dispatch
2648 * mode for all thread types.
2649 */
2650 wa_write_clr_set(wal,
2651 GEN7_FF_THREAD_MODE,
2652 GEN7_FF_SCHED_MASK,
2653 GEN7_FF_TS_SCHED_HW |
2654 GEN7_FF_VS_SCHED_HW |
2655 GEN7_FF_DS_SCHED_HW);
2656
2657 /* WaDisablePSDDualDispatchEnable:ivb */
2658 if (IS_IVB_GT1(i915))
2659 wa_masked_en(wal,
2660 GEN7_HALF_SLICE_CHICKEN1,
2661 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2662 }
2663
2664 if (GRAPHICS_VER(i915) == 7) {
2665 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2666 wa_masked_en(wal,
2667 RING_MODE_GEN7(RENDER_RING_BASE),
2668 GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2669
2670 /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2671 wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2672
2673 /*
2674 * BSpec says this must be set, even though
2675 * WaDisable4x2SubspanOptimization:ivb,hsw
2676 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2677 */
2678 wa_masked_en(wal,
2679 CACHE_MODE_1,
2680 PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2681
2682 /*
2683 * BSpec recommends 8x4 when MSAA is used,
2684 * however in practice 16x4 seems fastest.
2685 *
2686 * Note that PS/WM thread counts depend on the WIZ hashing
2687 * disable bit, which we don't touch here, but it's good
2688 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2689 */
2690 wa_masked_field_set(wal,
2691 GEN7_GT_MODE,
2692 GEN6_WIZ_HASHING_MASK,
2693 GEN6_WIZ_HASHING_16x4);
2694 }
2695
2696 if (IS_GRAPHICS_VER(i915, 6, 7))
2697 /*
2698 * We need to disable the AsyncFlip performance optimisations in
2699 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2700 * already be programmed to '1' on all products.
2701 *
2702 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2703 */
2704 wa_masked_en(wal,
2705 RING_MI_MODE(RENDER_RING_BASE),
2706 ASYNC_FLIP_PERF_DISABLE);
2707
2708 if (GRAPHICS_VER(i915) == 6) {
2709 /*
2710 * Required for the hardware to program scanline values for
2711 * waiting
2712 * WaEnableFlushTlbInvalidationMode:snb
2713 */
2714 wa_masked_en(wal,
2715 GFX_MODE,
2716 GFX_TLB_INVALIDATE_EXPLICIT);
2717
2718 /* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2719 wa_masked_en(wal,
2720 _3D_CHICKEN,
2721 _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2722
2723 wa_masked_en(wal,
2724 _3D_CHICKEN3,
2725 /* WaStripsFansDisableFastClipPerformanceFix:snb */
2726 _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2727 /*
2728 * Bspec says:
2729 * "This bit must be set if 3DSTATE_CLIP clip mode is set
2730 * to normal and 3DSTATE_SF number of SF output attributes
2731 * is more than 16."
2732 */
2733 _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2734
2735 /*
2736 * BSpec recommends 8x4 when MSAA is used,
2737 * however in practice 16x4 seems fastest.
2738 *
2739 * Note that PS/WM thread counts depend on the WIZ hashing
2740 * disable bit, which we don't touch here, but it's good
2741 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2742 */
2743 wa_masked_field_set(wal,
2744 GEN6_GT_MODE,
2745 GEN6_WIZ_HASHING_MASK,
2746 GEN6_WIZ_HASHING_16x4);
2747
2748 /* WaDisable_RenderCache_OperationalFlush:snb */
2749 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2750
2751 /*
2752 * From the Sandybridge PRM, volume 1 part 3, page 24:
2753 * "If this bit is set, STCunit will have LRA as replacement
2754 * policy. [...] This bit must be reset. LRA replacement
2755 * policy is not supported."
2756 */
2757 wa_masked_dis(wal,
2758 CACHE_MODE_0,
2759 CM0_STC_EVICT_DISABLE_LRA_SNB);
2760 }
2761
2762 if (IS_GRAPHICS_VER(i915, 4, 6))
2763 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2764 wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2765 0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2766 /* XXX bit doesn't stick on Broadwater */
2767 IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2768
2769 if (GRAPHICS_VER(i915) == 4)
2770 /*
2771 * Disable CONSTANT_BUFFER before it is loaded from the context
2772 * image. As soon as it is loaded, it is executed and the stored
2773 * address may no longer be valid, leading to a GPU hang.
2774 *
2775 * This imposes the requirement that userspace reload their
2776 * CONSTANT_BUFFER on every batch, fortunately a requirement
2777 * they are already accustomed to from before contexts were
2778 * enabled.
2779 */
2780 wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2781 0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2782 0 /* XXX bit doesn't stick on Broadwater */,
2783 true);
2784 }
2785
2786 static void
2787 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2788 {
2789 struct drm_i915_private *i915 = engine->i915;
2790
2791 /* WaKBLVECSSemaphoreWaitPoll:kbl */
2792 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2793 wa_write(wal,
2794 RING_SEMA_WAIT_POLL(engine->mmio_base),
2795 1);
2796 }
2797 }
2798
2799 static void
2800 ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2801 {
2802 if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2803 /* Wa_14014999345:pvc */
2804 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2805 }
2806 }
2807
2808 /*
2809 * The bspec performance guide has recommended MMIO tuning settings. These
2810 * aren't truly "workarounds" but we want to program them with the same
2811 * workaround infrastructure to ensure that they're automatically added to
2812 * the GuC save/restore lists, re-applied at the right times, and checked for
2813 * any conflicting programming requested by real workarounds.
2814 *
2815 * Programming settings should be added here only if their registers are not
2816 * part of an engine's register state context. If a register is part of a
2817 * context, then any tuning settings should be programmed in an appropriate
2818 * function invoked by __intel_engine_init_ctx_wa().
2819 */
2820 static void
2821 add_render_compute_tuning_settings(struct intel_gt *gt,
2822 struct i915_wa_list *wal)
2823 {
2824 struct drm_i915_private *i915 = gt->i915;
2825
2826 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2827 wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2828
2829 /*
2830 * This tuning setting proves beneficial only on ATS-M designs; the
2831 * default "age based" setting is optimal on regular DG2 and other
2832 * platforms.
2833 */
2834 if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2835 wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2836 THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2837
2838 if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
2839 wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2840 }
2841
2842 static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2843 {
2844 struct intel_gt *gt = engine->gt;
2845 u32 mode;
2846
2847 if (!IS_DG2(gt->i915))
2848 return;
2849
2850 /*
2851 * Wa_14019159160: This workaround, along with others, leads to
2852 * significant challenges in utilizing load balancing among the
2853 * CCS slices. Consequently, an architectural decision has been
2854 * made to completely disable automatic CCS load balancing.
2855 */
2856 wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2857
2858 /*
2859 * After having disabled automatic load balancing we need to
2860 * assign all slices to a single CCS. We will call it CCS mode 1
2861 */
2862 mode = intel_gt_apply_ccs_mode(gt);
2863 wa_masked_en(wal, XEHP_CCS_MODE, mode);
2864 }
2865
2866 /*
2867 * The workarounds in this function apply to shared registers in
2868 * the general render reset domain that aren't tied to a
2869 * specific engine. Since all render+compute engines get reset
2870 * together, and the contents of these registers are lost during
2871 * the shared render domain reset, we'll define such workarounds
2872 * here and then add them to just a single RCS or CCS engine's
2873 * workaround list (whichever engine has the I915_ENGINE_FIRST_RENDER_COMPUTE flag).
2874 */
2875 static void
2876 general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2877 {
2878 struct drm_i915_private *i915 = engine->i915;
2879 struct intel_gt *gt = engine->gt;
2880
2881 add_render_compute_tuning_settings(gt, wal);
2882
2883 if (GRAPHICS_VER(i915) >= 11) {
2884 /* This is not a Wa (although referred to as
2885 * WaSetInidrectStateOverride in places); it allows
2886 * applications that reference sampler states through
2887 * the BindlessSamplerStateBaseAddress to have their
2888 * border color relative to DynamicStateBaseAddress
2889 * rather than BindlessSamplerStateBaseAddress.
2890 *
2891 * Otherwise SAMPLER_STATE border colors have to be
2892 * copied in multiple heaps (DynamicStateBaseAddress &
2893 * BindlessSamplerStateBaseAddress)
2894 *
2895 * BSpec: 46052
2896 */
2897 wa_mcr_masked_en(wal,
2898 GEN10_SAMPLER_MODE,
2899 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2900 }
2901
2902 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2903 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2904 IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) {
2905 /* Wa_14017856879 */
2906 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2907
2908 /* Wa_14020495402 */
2909 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING);
2910 }
2911
2912 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2913 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2914 /*
2915 * Wa_14017066071
2916 * Wa_14017654203
2917 */
2918 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2919 MTL_DISABLE_SAMPLER_SC_OOO);
2920
2921 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2922 /* Wa_22015279794 */
2923 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2924 DISABLE_PREFETCH_INTO_IC);
2925
2926 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2927 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2928 IS_DG2(i915)) {
2929 /* Wa_22013037850 */
2930 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2931 DISABLE_128B_EVICTION_COMMAND_UDW);
2932
2933 /* Wa_18017747507 */
2934 wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2935 }
2936
2937 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2938 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2939 IS_PONTEVECCHIO(i915) ||
2940 IS_DG2(i915)) {
2941 /* Wa_22014226127 */
2942 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2943 }
2944
2945 if (IS_PONTEVECCHIO(i915) || IS_DG2(i915)) {
2946 /* Wa_14015227452:dg2,pvc */
2947 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2948
2949 /* Wa_16015675438:dg2,pvc */
2950 wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2951 }
2952
2953 if (IS_DG2(i915)) {
2954 /*
2955 * Wa_16011620976:dg2_g11
2956 * Wa_22015475538:dg2
2957 */
2958 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2959 }
2960
2961 if (IS_DG2_G11(i915)) {
2962 /*
2963 * Wa_22012826095:dg2
2964 * Wa_22013059131:dg2
2965 */
2966 wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2967 MAXREQS_PER_BANK,
2968 REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2969
2970 /* Wa_22013059131:dg2 */
2971 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2972 FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2973
2974 /*
2975 * Wa_22012654132
2976 *
2977 * Note that register 0xE420 is write-only and cannot be read
2978 * back for verification on DG2 (due to Wa_14012342262), so
2979 * we need to explicitly skip the readback.
2980 */
2981 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2982 _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2983 0 /* write-only, so skip validation */,
2984 true);
2985 }
2986
2987 if (IS_XEHPSDV(i915)) {
2988 /* Wa_1409954639 */
2989 wa_mcr_masked_en(wal,
2990 GEN8_ROW_CHICKEN,
2991 SYSTOLIC_DOP_CLOCK_GATING_DIS);
2992
2993 /* Wa_1607196519 */
2994 wa_mcr_masked_en(wal,
2995 GEN9_ROW_CHICKEN4,
2996 GEN12_DISABLE_GRF_CLEAR);
2997
2998 /* Wa_14010449647:xehpsdv */
2999 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
3000 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
3001 }
3002 }
3003
3004 static void
3005 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
3006 {
3007 if (GRAPHICS_VER(engine->i915) < 4)
3008 return;
3009
3010 engine_fake_wa_init(engine, wal);
3011
3012 /*
3013 * These are common workarounds that just need to be applied
3014 * to a single RCS/CCS engine's workaround list since
3015 * they're reset as part of the general render domain reset.
3016 */
3017 if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
3018 general_render_compute_wa_init(engine, wal);
3019 ccs_engine_wa_mode(engine, wal);
3020 }
3021
3022 if (engine->class == COMPUTE_CLASS)
3023 ccs_engine_wa_init(engine, wal);
3024 else if (engine->class == RENDER_CLASS)
3025 rcs_engine_wa_init(engine, wal);
3026 else
3027 xcs_engine_wa_init(engine, wal);
3028 }
3029
3030 void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3031 {
3032 struct i915_wa_list *wal = &engine->wa_list;
3033
3034 wa_init_start(wal, engine->gt, "engine", engine->name);
3035 engine_init_workarounds(engine, wal);
3036 wa_init_finish(wal);
3037 }
3038
3039 void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3040 {
3041 wa_list_apply(&engine->wa_list);
3042 }
3043
3044 static const struct i915_range mcr_ranges_gen8[] = {
3045 { .start = 0x5500, .end = 0x55ff },
3046 { .start = 0x7000, .end = 0x7fff },
3047 { .start = 0x9400, .end = 0x97ff },
3048 { .start = 0xb000, .end = 0xb3ff },
3049 { .start = 0xe000, .end = 0xe7ff },
3050 {},
3051 };
3052
3053 static const struct i915_range mcr_ranges_gen12[] = {
3054 { .start = 0x8150, .end = 0x815f },
3055 { .start = 0x9520, .end = 0x955f },
3056 { .start = 0xb100, .end = 0xb3ff },
3057 { .start = 0xde80, .end = 0xe8ff },
3058 { .start = 0x24a00, .end = 0x24a7f },
3059 {},
3060 };
3061
3062 static const struct i915_range mcr_ranges_xehp[] = {
3063 { .start = 0x4000, .end = 0x4aff },
3064 { .start = 0x5200, .end = 0x52ff },
3065 { .start = 0x5400, .end = 0x7fff },
3066 { .start = 0x8140, .end = 0x815f },
3067 { .start = 0x8c80, .end = 0x8dff },
3068 { .start = 0x94d0, .end = 0x955f },
3069 { .start = 0x9680, .end = 0x96ff },
3070 { .start = 0xb000, .end = 0xb3ff },
3071 { .start = 0xc800, .end = 0xcfff },
3072 { .start = 0xd800, .end = 0xd8ff },
3073 { .start = 0xdc00, .end = 0xffff },
3074 { .start = 0x17000, .end = 0x17fff },
3075 { .start = 0x24a00, .end = 0x24a7f },
3076 {},
3077 };
3078
3079 static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3080 {
3081 const struct i915_range *mcr_ranges;
3082 int i;
3083
3084 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3085 mcr_ranges = mcr_ranges_xehp;
3086 else if (GRAPHICS_VER(i915) >= 12)
3087 mcr_ranges = mcr_ranges_gen12;
3088 else if (GRAPHICS_VER(i915) >= 8)
3089 mcr_ranges = mcr_ranges_gen8;
3090 else
3091 return false;
3092
3093 /*
3094 * Registers in these ranges are affected by the MCR selector
3095 * which only controls CPU-initiated MMIO. Routing does not
3096 * work for CS access, so we cannot verify them on this path.
3097 */
3098 for (i = 0; mcr_ranges[i].start; i++)
3099 if (offset >= mcr_ranges[i].start &&
3100 offset <= mcr_ranges[i].end)
3101 return true;
3102
3103 return false;
3104 }
3105
3106 static int
3107 wa_list_srm(struct i915_request *rq,
3108 const struct i915_wa_list *wal,
3109 struct i915_vma *vma)
3110 {
3111 struct drm_i915_private *i915 = rq->i915;
3112 unsigned int i, count = 0;
3113 const struct i915_wa *wa;
3114 u32 srm, *cs;
3115
3116 srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
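/*
 * On Gen8+ MI_STORE_REGISTER_MEM takes a 64-bit address, so the
 * command's dword length field is one larger.
 */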
3117 if (GRAPHICS_VER(i915) >= 8)
3118 srm++;
3119
3120 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3121 if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3122 count++;
3123 }
3124
3125 cs = intel_ring_begin(rq, 4 * count);
3126 if (IS_ERR(cs))
3127 return PTR_ERR(cs);
3128
3129 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3130 u32 offset = i915_mmio_reg_offset(wa->reg);
3131
3132 if (mcr_range(i915, offset))
3133 continue;
3134
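/*
 * Store into slot i (the workaround's index in the list) so that the
 * readback in engine_wa_list_verify() can index results[] directly,
 * even though MCR registers are skipped here.
 */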
3135 *cs++ = srm;
3136 *cs++ = offset;
3137 *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3138 *cs++ = 0;
3139 }
3140 intel_ring_advance(rq, cs);
3141
3142 return 0;
3143 }
3144
3145 static int engine_wa_list_verify(struct intel_context *ce,
3146 const struct i915_wa_list * const wal,
3147 const char *from)
3148 {
3149 const struct i915_wa *wa;
3150 struct i915_request *rq;
3151 struct i915_vma *vma;
3152 struct i915_gem_ww_ctx ww;
3153 unsigned int i;
3154 u32 *results;
3155 int err;
3156
3157 if (!wal->count)
3158 return 0;
3159
3160 vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3161 wal->count * sizeof(u32));
3162 if (IS_ERR(vma))
3163 return PTR_ERR(vma);
3164
3165 intel_engine_pm_get(ce->engine);
3166 i915_gem_ww_ctx_init(&ww, false);
3167 retry:
3168 err = i915_gem_object_lock(vma->obj, &ww);
3169 if (err == 0)
3170 err = intel_context_pin_ww(ce, &ww);
3171 if (err)
3172 goto err_pm;
3173
3174 err = i915_vma_pin_ww(vma, &ww, 0, 0,
3175 i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3176 if (err)
3177 goto err_unpin;
3178
3179 rq = i915_request_create(ce);
3180 if (IS_ERR(rq)) {
3181 err = PTR_ERR(rq);
3182 goto err_vma;
3183 }
3184
3185 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3186 if (err == 0)
3187 err = wa_list_srm(rq, wal, vma);
3188
3189 i915_request_get(rq);
3190 if (err)
3191 i915_request_set_error_once(rq, err);
3192 i915_request_add(rq);
3193
3194 if (err)
3195 goto err_rq;
3196
3197 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3198 err = -ETIME;
3199 goto err_rq;
3200 }
3201
3202 results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3203 if (IS_ERR(results)) {
3204 err = PTR_ERR(results);
3205 goto err_rq;
3206 }
3207
3208 err = 0;
3209 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3210 if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3211 continue;
3212
3213 if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3214 err = -ENXIO;
3215 }
3216
3217 i915_gem_object_unpin_map(vma->obj);
3218
3219 err_rq:
3220 i915_request_put(rq);
3221 err_vma:
3222 i915_vma_unpin(vma);
3223 err_unpin:
3224 intel_context_unpin(ce);
3225 err_pm:
3226 if (err == -EDEADLK) {
3227 err = i915_gem_ww_ctx_backoff(&ww);
3228 if (!err)
3229 goto retry;
3230 }
3231 i915_gem_ww_ctx_fini(&ww);
3232 intel_engine_pm_put(ce->engine);
3233 i915_vma_put(vma);
3234 return err;
3235 }
3236
3237 int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3238 const char *from)
3239 {
3240 return engine_wa_list_verify(engine->kernel_context,
3241 &engine->wa_list,
3242 from);
3243 }
3244
3245 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3246 #include "selftest_workarounds.c"
3247 #endif
3248