1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include "intel_device_info.h"
31 #include "intel/common/intel_gem.h"
32 #include "util/bitscan.h"
33 #include "util/debug.h"
34 #include "util/log.h"
35 #include "util/macros.h"
36
37 #include "drm-uapi/i915_drm.h"
38
/* Lookup table from a short platform name to one representative PCI device
 * id for that platform.  Searched linearly by
 * intel_device_name_to_pci_device_id().
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { .name = "lpt", .pci_id = 0x27a2 },
   { .name = "brw", .pci_id = 0x2a02 },
   { .name = "g4x", .pci_id = 0x2a42 },
   { .name = "ilk", .pci_id = 0x0042 },
   { .name = "snb", .pci_id = 0x0126 },
   { .name = "ivb", .pci_id = 0x016a },
   { .name = "hsw", .pci_id = 0x0d2e },
   { .name = "byt", .pci_id = 0x0f33 },
   { .name = "bdw", .pci_id = 0x162e },
   { .name = "chv", .pci_id = 0x22b3 },
   { .name = "skl", .pci_id = 0x1912 },
   { .name = "bxt", .pci_id = 0x5a85 },
   { .name = "kbl", .pci_id = 0x5912 },
   { .name = "aml", .pci_id = 0x591c },
   { .name = "glk", .pci_id = 0x3185 },
   { .name = "cfl", .pci_id = 0x3e9b },
   { .name = "whl", .pci_id = 0x3ea1 },
   { .name = "cml", .pci_id = 0x9b41 },
   { .name = "icl", .pci_id = 0x8a52 },
   { .name = "ehl", .pci_id = 0x4500 },
   { .name = "jsl", .pci_id = 0x4e71 },
   { .name = "tgl", .pci_id = 0x9a49 },
   { .name = "rkl", .pci_id = 0x4c8a },
   { .name = "dg1", .pci_id = 0x4905 },
   { .name = "adl", .pci_id = 0x4680 },
   { .name = "sg1", .pci_id = 0x4907 },
};
70
71 /**
72 * Get the PCI ID for the device name.
73 *
74 * Returns -1 if the device is not known.
75 */
76 int
intel_device_name_to_pci_device_id(const char * name)77 intel_device_name_to_pci_device_id(const char *name)
78 {
79 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
80 if (!strcmp(name_map[i].name, name))
81 return name_map[i].pci_id;
82 }
83
84 return -1;
85 }
86
/* Device description for version 3 hardware.  Only .ver, .simulator_id and
 * .cs_prefetch_size are given; every other field is zero-initialized.
 */
static const struct intel_device_info intel_device_info_gfx3 = {
   .ver = 3,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};
92
/* Device description for version 4 hardware (i965): one slice with one
 * subslice, 8 EUs per subslice and 4 threads per EU.  max_wm_threads is
 * written as 8 * 4, i.e. EUs times threads per EU.
 */
static const struct intel_device_info intel_device_info_i965 = {
   .ver = 4,
   .has_negative_rhw_bug = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 8,
   .num_thread_per_eu = 4,
   .max_vs_threads = 16,
   .max_gs_threads = 2,
   .max_wm_threads = 8 * 4,
   .urb = {
      .size = 256,
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};
110
/* Device description for G4X: version 4 with verx10 set to 45.  One slice,
 * one subslice, 10 EUs per subslice and 5 threads per EU; max_wm_threads is
 * written as 10 * 5.
 */
static const struct intel_device_info intel_device_info_g4x = {
   .ver = 4,
   .verx10 = 45,
   .has_pln = true,
   .has_compr4 = true,
   .has_surface_tile_offset = true,
   .is_g4x = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 5,
   .max_vs_threads = 32,
   .max_gs_threads = 2,
   .max_wm_threads = 10 * 5,
   .urb = {
      .size = 384,
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};
132
/* Device description for ILK (version 5): one slice, one subslice, 12 EUs
 * per subslice and 6 threads per EU; max_wm_threads is written as 12 * 6.
 */
static const struct intel_device_info intel_device_info_ilk = {
   .ver = 5,
   .has_pln = true,
   .has_compr4 = true,
   .has_surface_tile_offset = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 12,
   .num_thread_per_eu = 6,
   .max_vs_threads = 72,
   .max_gs_threads = 32,
   .max_wm_threads = 12 * 6,
   .urb = {
      .size = 1024,
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};
152
/* Device description for SNB GT1 (version 6, .gt = 1): 6 EUs per subslice,
 * URB size 32, with per-stage URB entry limits for the vertex and geometry
 * stages only.
 */
static const struct intel_device_info intel_device_info_snb_gt1 = {
   .ver = 6,
   .gt = 1,
   .has_hiz_and_separate_stencil = true,
   .has_llc = true,
   .has_pln = true,
   .has_surface_tile_offset = true,
   .needs_unlit_centroid_workaround = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 6,
   .num_thread_per_eu = 6, /* Not confirmed */
   .max_vs_threads = 24,
   .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
   .max_wm_threads = 40,
   .urb = {
      .size = 32,
      .min_entries = {
         [MESA_SHADER_VERTEX] = 24,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 256,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};
182
/* Device description for SNB GT2 (version 6, .gt = 2).  Same feature flags
 * as intel_device_info_snb_gt1, with 12 EUs per subslice, URB size 64 and
 * higher thread limits.
 */
static const struct intel_device_info intel_device_info_snb_gt2 = {
   .ver = 6,
   .gt = 2,
   .has_hiz_and_separate_stencil = true,
   .has_llc = true,
   .has_pln = true,
   .has_surface_tile_offset = true,
   .needs_unlit_centroid_workaround = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 12,
   .num_thread_per_eu = 6, /* Not confirmed */
   .max_vs_threads = 60,
   .max_gs_threads = 60,
   .max_wm_threads = 80,
   .urb = {
      .size = 64,
      .min_entries = {
         [MESA_SHADER_VERTEX] = 24,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 256,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};
212
/* Designated-initializer fields shared by every version 7 description below
 * (Ivybridge, Baytrail and, via HSW_FEATURES, Haswell).  The expansion has no
 * trailing comma, so users write "GFX7_FEATURES, ..." and may override any of
 * these fields with a later designator (e.g. Baytrail's .has_llc = false).
 */
#define GFX7_FEATURES                               \
   .ver = 7,                                        \
   .has_hiz_and_separate_stencil = true,            \
   .must_use_separate_stencil = true,               \
   .has_llc = true,                                 \
   .has_pln = true,                                 \
   .has_64bit_float = true,                         \
   .has_surface_tile_offset = true,                 \
   .timestamp_frequency = 12500000,                 \
   .max_constant_urb_size_kb = 16,                  \
   .cs_prefetch_size = 512
224
/* Device description for Ivybridge GT1: GFX7_FEATURES plus a single subslice
 * of 6 EUs with 6 threads each, 2 L3 banks and the per-stage thread and URB
 * entry limits listed below.
 */
static const struct intel_device_info intel_device_info_ivb_gt1 = {
   GFX7_FEATURES, .is_ivybridge = true, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 6,
   .num_thread_per_eu = 6,
   .l3_banks = 2,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
   .max_gs_threads = 36,
   .max_wm_threads = 48,
   .max_cs_threads = 36,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 512,
         [MESA_SHADER_TESS_CTRL] = 32,
         [MESA_SHADER_TESS_EVAL] = 288,
         [MESA_SHADER_GEOMETRY] = 192,
      },
   },
   .simulator_id = 7,
};
252
/* Device description for Ivybridge GT2: GFX7_FEATURES plus a single subslice
 * of 12 EUs with 8 threads each, 4 L3 banks and larger thread and URB entry
 * limits than GT1.
 */
static const struct intel_device_info intel_device_info_ivb_gt2 = {
   GFX7_FEATURES, .is_ivybridge = true, .gt = 2,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 12,
   .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
                            * @max_wm_threads ... */
   .l3_banks = 4,
   .max_vs_threads = 128,
   .max_tcs_threads = 128,
   .max_tes_threads = 128,
   .max_gs_threads = 128,
   .max_wm_threads = 172,
   .max_cs_threads = 64,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 704,
         [MESA_SHADER_TESS_CTRL] = 64,
         [MESA_SHADER_TESS_EVAL] = 448,
         [MESA_SHADER_GEOMETRY] = 320,
      },
   },
   .simulator_id = 7,
};
281
/* Device description for Baytrail: GFX7_FEATURES with .has_llc overridden to
 * false, a single subslice of 4 EUs with 8 threads each and 1 L3 bank.
 */
static const struct intel_device_info intel_device_info_byt = {
   GFX7_FEATURES, .is_baytrail = true, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 4,
   .num_thread_per_eu = 8,
   .l3_banks = 1,
   .has_llc = false, /* overrides the true from GFX7_FEATURES */
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
   .max_gs_threads = 36,
   .max_wm_threads = 48,
   .max_cs_threads = 32,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 512,
         [MESA_SHADER_TESS_CTRL] = 32,
         [MESA_SHADER_TESS_EVAL] = 288,
         [MESA_SHADER_GEOMETRY] = 192,
      },
   },
   .simulator_id = 10,
};
310
/* Fields shared by the Haswell descriptions: everything in GFX7_FEATURES
 * plus .is_haswell, verx10 of 75 and SIMD16 three-source support.
 */
#define HSW_FEATURES                                \
   GFX7_FEATURES,                                   \
   .is_haswell = true,                              \
   .verx10 = 75,                                    \
   .supports_simd16_3src = true
316
/* Device description for Haswell GT1: HSW_FEATURES plus one slice with one
 * subslice of 10 EUs, 7 threads per EU and 2 L3 banks.
 */
static const struct intel_device_info intel_device_info_hsw_gt1 = {
   HSW_FEATURES, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 2,
   .max_vs_threads = 70,
   .max_tcs_threads = 70,
   .max_tes_threads = 70,
   .max_gs_threads = 70,
   .max_wm_threads = 102,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 640,
         [MESA_SHADER_TESS_CTRL] = 64,
         [MESA_SHADER_TESS_EVAL] = 384,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .simulator_id = 9,
};
344
/* Device description for Haswell GT2: HSW_FEATURES plus one slice with two
 * subslices of 10 EUs, 7 threads per EU and 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_hsw_gt2 = {
   HSW_FEATURES, .gt = 2,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 4,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
   .max_gs_threads = 256,
   .max_wm_threads = 204,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 1664,
         [MESA_SHADER_TESS_CTRL] = 128,
         [MESA_SHADER_TESS_EVAL] = 960,
         [MESA_SHADER_GEOMETRY] = 640,
      },
   },
   .simulator_id = 9,
};
372
/* Device description for Haswell GT3: HSW_FEATURES plus two slices of two
 * subslices each, 8 L3 banks, and max_constant_urb_size_kb raised from the
 * GFX7_FEATURES value of 16 to 32.
 */
static const struct intel_device_info intel_device_info_hsw_gt3 = {
   HSW_FEATURES, .gt = 3,
   .num_slices = 2,
   .num_subslices = { 2, 2, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 8,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
   .max_gs_threads = 256,
   .max_wm_threads = 408,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 1664,
         [MESA_SHADER_TESS_CTRL] = 128,
         [MESA_SHADER_TESS_EVAL] = 960,
         [MESA_SHADER_GEOMETRY] = 640,
      },
   },
   .max_constant_urb_size_kb = 32,
   .simulator_id = 9,
};
401
/* Designated-initializer fields shared by the version 8 descriptions, and
 * reused as the base of the GFX9, GFX11 and GFX12 feature macros below, which
 * override individual fields (including .ver) with later designators.
 *
 * It's unclear how well supported sampling from the hiz buffer is on GFX8,
 * so keep things conservative for now and set has_sample_with_hiz = false.
 */
#define GFX8_FEATURES                               \
   .ver = 8,                                        \
   .has_hiz_and_separate_stencil = true,            \
   .must_use_separate_stencil = true,               \
   .has_llc = true,                                 \
   .has_sample_with_hiz = false,                    \
   .has_pln = true,                                 \
   .has_integer_dword_mul = true,                   \
   .has_64bit_float = true,                         \
   .has_64bit_int = true,                           \
   .supports_simd16_3src = true,                    \
   .has_surface_tile_offset = true,                 \
   .num_thread_per_eu = 7,                          \
   .max_vs_threads = 504,                           \
   .max_tcs_threads = 504,                          \
   .max_tes_threads = 504,                          \
   .max_gs_threads = 504,                           \
   .max_wm_threads = 384,                           \
   .timestamp_frequency = 12500000,                 \
   .max_constant_urb_size_kb = 32,                  \
   .cs_prefetch_size = 512
426
/* Device description for Broadwell GT1: GFX8_FEATURES plus one slice with
 * two subslices of 6 EUs and 2 L3 banks.  The geometry URB max_entries is
 * 690 here versus 960 on the GT2/GT3 descriptions.
 */
static const struct intel_device_info intel_device_info_bdw_gt1 = {
   GFX8_FEATURES, .gt = 1,
   .is_broadwell = true,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   .max_cs_threads = 42,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
         [MESA_SHADER_GEOMETRY] = 690,
      },
   },
   .simulator_id = 11,
};
450
/* Device description for Broadwell GT2: GFX8_FEATURES plus one slice with
 * three subslices of 8 EUs and 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_bdw_gt2 = {
   GFX8_FEATURES, .gt = 2,
   .is_broadwell = true,
   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .max_cs_threads = 56,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         [MESA_SHADER_GEOMETRY] = 960,
      },
   },
   .simulator_id = 11,
};
473
/* Device description for Broadwell GT3: GFX8_FEATURES plus two slices of
 * three subslices each, 8 EUs per subslice and 8 L3 banks.
 */
static const struct intel_device_info intel_device_info_bdw_gt3 = {
   GFX8_FEATURES, .gt = 3,
   .is_broadwell = true,
   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .max_cs_threads = 56,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         [MESA_SHADER_GEOMETRY] = 960,
      },
   },
   .simulator_id = 11,
};
496
/* Device description for Cherryview: GFX8_FEATURES with .has_llc and
 * .has_integer_dword_mul overridden to false and all per-stage thread limits
 * lowered from the GFX8_FEATURES defaults.  max_cs_threads is written as
 * 6 * 7.
 */
static const struct intel_device_info intel_device_info_chv = {
   GFX8_FEATURES, .is_cherryview = 1, .gt = 1,
   .has_llc = false,
   .has_integer_dword_mul = false,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 8,
   .l3_banks = 2,
   .max_vs_threads = 80,
   .max_tcs_threads = 80,
   .max_tes_threads = 80,
   .max_gs_threads = 80,
   .max_wm_threads = 128,
   .max_cs_threads = 6 * 7,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 34,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 640,
         [MESA_SHADER_TESS_CTRL] = 80,
         [MESA_SHADER_TESS_EVAL] = 384,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .simulator_id = 13,
};
525
/* Version 9 fields meant to follow GFX8_FEATURES in an initializer: sets
 * .ver to 9 and replaces the thread limits, timestamp frequency and URB
 * entry limits.  Used by GFX9_FEATURES and GFX9_LP_FEATURES.
 */
#define GFX9_HW_INFO                                \
   .ver = 9,                                        \
   .max_vs_threads = 336,                           \
   .max_gs_threads = 336,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 336,                          \
   .max_cs_threads = 56,                            \
   .timestamp_frequency = 12000000,                 \
   .cs_prefetch_size = 512,                         \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX] = 64,                 \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX] = 1856,               \
         [MESA_SHADER_TESS_CTRL] = 672,             \
         [MESA_SHADER_TESS_EVAL] = 1120,            \
         [MESA_SHADER_GEOMETRY] = 640,              \
      },                                            \
   }
547
/* Fields shared by the version 9 "LP" descriptions (Broxton, Geminilake).
 * Starts from GFX8_FEATURES and GFX9_HW_INFO, then overrides several of
 * their fields with later designators: no LLC, 6 threads per EU, lower
 * thread limits, a different timestamp frequency and a smaller .urb table
 * that replaces the one from GFX9_HW_INFO.
 */
#define GFX9_LP_FEATURES                            \
   GFX8_FEATURES,                                   \
   GFX9_HW_INFO,                                    \
   .has_integer_dword_mul = false,                  \
   .gt = 1,                                         \
   .has_llc = false,                                \
   .has_sample_with_hiz = true,                     \
   .num_slices = 1,                                 \
   .num_thread_per_eu = 6,                          \
   .max_vs_threads = 112,                           \
   .max_tcs_threads = 112,                          \
   .max_tes_threads = 112,                          \
   .max_gs_threads = 112,                           \
   .max_cs_threads = 6 * 6,                         \
   .timestamp_frequency = 19200000,                 \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX] = 34,                 \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX] = 704,                \
         [MESA_SHADER_TESS_CTRL] = 256,             \
         [MESA_SHADER_TESS_EVAL] = 416,             \
         [MESA_SHADER_GEOMETRY] = 256,              \
      },                                            \
   }
575
/* GFX9_LP_FEATURES for the configuration with 3 subslices of 6 EUs each. */
#define GFX9_LP_FEATURES_3X6                        \
   GFX9_LP_FEATURES,                                \
   .num_subslices = { 3, },                         \
   .num_eu_per_subslice = 6
580
/* GFX9_LP_FEATURES for the configuration with 2 subslices of 6 EUs each.
 * Compared with the 3X6 variant, the VS/TCS/TES/GS thread limits and all URB
 * max_entries values are halved; .urb replaces the table from
 * GFX9_LP_FEATURES.
 */
#define GFX9_LP_FEATURES_2X6                        \
   GFX9_LP_FEATURES,                                \
   .num_subslices = { 2, },                         \
   .num_eu_per_subslice = 6,                        \
   .max_vs_threads = 56,                            \
   .max_tcs_threads = 56,                           \
   .max_tes_threads = 56,                           \
   .max_gs_threads = 56,                            \
   .max_cs_threads = 6 * 6,                         \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX] = 34,                 \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX] = 352,                \
         [MESA_SHADER_TESS_CTRL] = 128,             \
         [MESA_SHADER_TESS_EVAL] = 208,             \
         [MESA_SHADER_GEOMETRY] = 128,              \
      },                                            \
   }
602
/* Fields shared by the Skylake, Kabylake and Coffeelake descriptions:
 * GFX8_FEATURES, then GFX9_HW_INFO, with has_sample_with_hiz turned on.
 */
#define GFX9_FEATURES                               \
   GFX8_FEATURES,                                   \
   GFX9_HW_INFO,                                    \
   .has_sample_with_hiz = true
607
/* Device description for Skylake GT1: GFX9_FEATURES plus one slice with two
 * subslices of 6 EUs and 2 L3 banks.  The vertex URB max_entries is lowered
 * from the GFX9_HW_INFO value of 1856 to 928.
 */
static const struct intel_device_info intel_device_info_skl_gt1 = {
   GFX9_FEATURES, .gt = 1,
   .is_skylake = true,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .simulator_id = 12,
};
621
/* Device description for Skylake GT2: GFX9_FEATURES plus one slice with
 * three subslices of 8 EUs and 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_skl_gt2 = {
   GFX9_FEATURES, .gt = 2,
   .is_skylake = true,
   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 12,
};
631
/* Device description for Skylake GT3: GFX9_FEATURES plus two slices of
 * three subslices each, 8 EUs per subslice and 8 L3 banks.
 */
static const struct intel_device_info intel_device_info_skl_gt3 = {
   GFX9_FEATURES, .gt = 3,
   .is_skylake = true,
   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 12,
};
641
/* Device description for Skylake GT4: GFX9_FEATURES plus three slices of
 * three subslices each, 8 EUs per subslice and 12 L3 banks.
 */
static const struct intel_device_info intel_device_info_skl_gt4 = {
   GFX9_FEATURES, .gt = 4,
   .is_skylake = true,
   .num_slices = 3,
   .num_subslices = { 3, 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 12,
   /* From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions. This is not a
    * restriction of the L3 implementation, but of the FF and other clients.
    * Therefore, in a GT4 implementation it is possible for the programmed
    * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
    * only 1008KB of this will be used."
    */
   .simulator_id = 12,
};
659
/* Device description for Broxton in the 3x6 configuration
 * (GFX9_LP_FEATURES_3X6) with 2 L3 banks.
 */
static const struct intel_device_info intel_device_info_bxt = {
   GFX9_LP_FEATURES_3X6,
   .is_broxton = true,
   .l3_banks = 2,
   .simulator_id = 14,
};
666
/* Device description for Broxton in the 2x6 configuration
 * (GFX9_LP_FEATURES_2X6) with 1 L3 bank.
 */
static const struct intel_device_info intel_device_info_bxt_2x6 = {
   GFX9_LP_FEATURES_2X6,
   .is_broxton = true,
   .l3_banks = 1,
   .simulator_id = 14,
};
/*
 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
 * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
 */

/* Device description for Kabylake GT1: GFX9_FEATURES plus one slice with two
 * subslices of 6 EUs and 2 L3 banks.  max_cs_threads is written as 7 * 6,
 * and the vertex and geometry URB max_entries are lowered from the
 * GFX9_HW_INFO values.
 */
static const struct intel_device_info intel_device_info_kbl_gt1 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 1,

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
   .simulator_id = 16,
};
695
/* Device description for Kabylake GT1.5: GFX9_FEATURES plus one slice with
 * three subslices of 6 EUs and 4 L3 banks.  Note that .gt is 1 here.
 */
static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 1,

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 6,
   .l3_banks = 4,
   .simulator_id = 16,
};
708
/* Device description for Kabylake GT2: GFX9_FEATURES plus one slice with
 * three subslices of 8 EUs and 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_kbl_gt2 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 2,

   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 16,
};
720
/* Device description for Kabylake GT3: GFX9_FEATURES plus two slices of
 * three subslices each, 8 EUs per subslice and 8 L3 banks.
 */
static const struct intel_device_info intel_device_info_kbl_gt3 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 3,

   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 16,
};
732
/* Device description for Kabylake GT4: GFX9_FEATURES plus three slices of
 * three subslices each, 8 EUs per subslice and 12 L3 banks.
 */
static const struct intel_device_info intel_device_info_kbl_gt4 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 4,

   /*
    * From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions.  This
    * is not a restriction of the L3 implementation, but of the FF and
    * other clients.  Therefore, in a GT4 implementation it is
    * possible for the programmed allocation of the L3 data array to
    * provide 3*384KB=1152KB for URB, but only 1008KB of this
    * will be used."
    */
   .num_slices = 3,
   .num_subslices = { 3, 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 12,
   .simulator_id = 16,
};
754
/* Device description for Geminilake in the 3x6 configuration
 * (GFX9_LP_FEATURES_3X6) with 2 L3 banks.
 */
static const struct intel_device_info intel_device_info_glk = {
   GFX9_LP_FEATURES_3X6,
   .is_geminilake = true,
   .l3_banks = 2,
   .simulator_id = 17,
};
761
/* Device description for Geminilake in the 2x6 configuration
 * (GFX9_LP_FEATURES_2X6).  Unlike intel_device_info_bxt_2x6, l3_banks is 2.
 */
static const struct intel_device_info intel_device_info_glk_2x6 = {
   GFX9_LP_FEATURES_2X6,
   .is_geminilake = true,
   .l3_banks = 2,
   .simulator_id = 17,
};
768
/* Device description for Coffeelake GT1: GFX9_FEATURES plus one slice with
 * two subslices of 6 EUs and 2 L3 banks.  The vertex and geometry URB
 * max_entries are lowered from the GFX9_HW_INFO values.
 */
static const struct intel_device_info intel_device_info_cfl_gt1 = {
   GFX9_FEATURES,
   .is_coffeelake = true,
   .gt = 1,

   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
   .simulator_id = 24,
};
/* Device description for Coffeelake GT2: GFX9_FEATURES plus one slice with
 * three subslices of 8 EUs and 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_cfl_gt2 = {
   GFX9_FEATURES,
   .is_coffeelake = true,
   .gt = 2,

   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 24,
};
796
/* Device description for Coffeelake GT3: GFX9_FEATURES plus two slices of
 * three subslices each, 8 EUs per subslice and 8 L3 banks.
 */
static const struct intel_device_info intel_device_info_cfl_gt3 = {
   GFX9_FEATURES,
   .is_coffeelake = true,
   .gt = 3,

   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 24,
};
808
/* Wraps its arguments in a brace-enclosed initializer list so that a list of
 * per-slice subslice counts can be passed as the single _subslices argument
 * of GFX11_FEATURES().  Uses the GNU named-variadic macro syntax (args...).
 */
#define subslices(args...) { args, }
810
/* Version 11 fields meant to follow GFX8_FEATURES in an initializer: sets
 * .ver to 11, turns has_pln off and replaces the per-stage thread limits.
 */
#define GFX11_HW_INFO                               \
   .ver = 11,                                       \
   .has_pln = false,                                \
   .max_vs_threads = 364,                           \
   .max_gs_threads = 224,                           \
   .max_tcs_threads = 224,                          \
   .max_tes_threads = 364,                          \
   .max_cs_threads = 56,                            \
   .cs_prefetch_size = 512
820
/* Common initializer for the version 11 descriptions.
 *
 *   _gt        value for .gt
 *   _slices    value for .num_slices
 *   _subslices brace-enclosed initializer for .num_subslices, normally built
 *              with subslices()
 *   _l3        value for .l3_banks
 *
 * Starts from GFX8_FEATURES and GFX11_HW_INFO, then turns off 64-bit float,
 * 64-bit int, integer dword multiply and sample-with-hiz.  Defaults to 8 EUs
 * per subslice; some Elkhartlake descriptions override that.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3) \
   GFX8_FEATURES,                                     \
   GFX11_HW_INFO,                                     \
   .has_64bit_float = false,                          \
   .has_64bit_int = false,                            \
   .has_integer_dword_mul = false,                    \
   .has_sample_with_hiz = false,                      \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .num_subslices = _subslices,                       \
   .num_eu_per_subslice = 8
831
/* URB min/max entry tables shared by the version 11 descriptions.  Intended
 * to be expanded inside a ".urb = { ... }" initializer.
 */
#define GFX11_URB_MIN_MAX_ENTRIES                   \
   .min_entries = {                                 \
      [MESA_SHADER_VERTEX] = 64,                    \
      [MESA_SHADER_TESS_EVAL] = 34,                 \
   },                                               \
   .max_entries = {                                 \
      [MESA_SHADER_VERTEX] = 2384,                  \
      [MESA_SHADER_TESS_CTRL] = 1032,               \
      [MESA_SHADER_TESS_EVAL] = 2384,               \
      [MESA_SHADER_GEOMETRY] = 1032,                \
   }
843
/* Device description for ICL GT2: gt 2, one slice with 8 subslices and
 * 8 L3 banks.
 */
static const struct intel_device_info intel_device_info_icl_gt2 = {
   GFX11_FEATURES(2, 1, subslices(8), 8),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};
851
/* Device description for ICL GT1.5: gt 1, one slice with 6 subslices and
 * 6 L3 banks.
 */
static const struct intel_device_info intel_device_info_icl_gt1_5 = {
   GFX11_FEATURES(1, 1, subslices(6), 6),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};
859
/* Device description for ICL GT1: gt 1, one slice with 4 subslices and
 * 6 L3 banks.
 */
static const struct intel_device_info intel_device_info_icl_gt1 = {
   GFX11_FEATURES(1, 1, subslices(4), 6),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};
867
/* Device description for ICL GT0.5: gt 1, one slice with 1 subslice and
 * 6 L3 banks.
 */
static const struct intel_device_info intel_device_info_icl_gt0_5 = {
   GFX11_FEATURES(1, 1, subslices(1), 6),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};
875
/* Fields shared by the Elkhartlake descriptions, meant to follow a
 * GFX11_FEATURES(...) expansion: marks the device as Elkhartlake, supplies
 * the version 11 URB entry tables, sets disable_ccs_repack and uses
 * simulator id 28.
 */
#define GFX11_LP_FEATURES                           \
   .is_elkhartlake = true,                          \
   .urb = {                                         \
      GFX11_URB_MIN_MAX_ENTRIES,                    \
   },                                               \
   .disable_ccs_repack = true,                      \
   .simulator_id = 28
883
/* Elkhartlake with 4 subslices and the GFX11_FEATURES default of 8 EUs per
 * subslice; 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_ehl_4x8 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
};
888
/* Elkhartlake with 4 subslices; num_eu_per_subslice overridden to 6. */
static const struct intel_device_info intel_device_info_ehl_4x6 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice = 6,
};
894
/* Elkhartlake with 4 subslices; num_eu_per_subslice overridden to 5. */
static const struct intel_device_info intel_device_info_ehl_4x5 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice = 5,
};
900
/* Elkhartlake with 4 subslices; num_eu_per_subslice overridden to 4. */
static const struct intel_device_info intel_device_info_ehl_4x4 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice = 4,
};
906
/* Elkhartlake with 2 subslices and the GFX11_FEATURES default of 8 EUs per
 * subslice; 4 L3 banks.
 */
static const struct intel_device_info intel_device_info_ehl_2x8 = {
   GFX11_FEATURES(1, 1, subslices(2), 4),
   GFX11_LP_FEATURES,
};
911
/* Elkhartlake with 2 subslices; num_eu_per_subslice overridden to 4. */
static const struct intel_device_info intel_device_info_ehl_2x4 = {
   GFX11_FEATURES(1, 1, subslices(2), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice =4,
};
917
/* URB min/max entry tables shared by the version 12 descriptions.  Intended
 * to be expanded inside a ".urb = { ... }" initializer (see GFX12_HW_INFO).
 * The geometry max_entries value carries the Wa_14013840143 workaround tag.
 */
#define GFX12_URB_MIN_MAX_ENTRIES                   \
   .min_entries = {                                 \
      [MESA_SHADER_VERTEX] = 64,                    \
      [MESA_SHADER_TESS_EVAL] = 34,                 \
   },                                               \
   .max_entries = {                                 \
      [MESA_SHADER_VERTEX] = 3576,                  \
      [MESA_SHADER_TESS_CTRL] = 1548,               \
      [MESA_SHADER_TESS_EVAL] = 3576,               \
      /* Wa_14013840143 */                          \
      [MESA_SHADER_GEOMETRY] = 1536,                \
   }
930
/* Version 12 fields meant to follow GFX8_FEATURES in an initializer: sets
 * .ver to 12, turns has_pln and has_sample_with_hiz off, enables has_aux_map,
 * and replaces the per-stage thread limits and the URB entry tables.
 */
#define GFX12_HW_INFO                               \
   .ver = 12,                                       \
   .has_pln = false,                                \
   .has_sample_with_hiz = false,                    \
   .has_aux_map = true,                             \
   .max_vs_threads = 546,                           \
   .max_gs_threads = 336,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 546,                          \
   .max_cs_threads = 112, /* threads per DSS */     \
   .urb = {                                         \
      GFX12_URB_MIN_MAX_ENTRIES,                    \
   }
944
/* Common initializer for the version 12 descriptions.
 *
 *   _gt      value for .gt
 *   _slices  value for .num_slices
 *   _l3      value for .l3_banks
 *
 * Starts from GFX8_FEATURES and GFX12_HW_INFO, then turns off 64-bit float,
 * 64-bit int and integer dword multiply, and sets simulator id 22 and 16 EUs
 * per subslice.  .num_subslices is not set here; the GFX12_GT*_FEATURES
 * wrappers add it.
 */
#define GFX12_FEATURES(_gt, _slices, _l3)               \
   GFX8_FEATURES,                                       \
   GFX12_HW_INFO,                                       \
   .has_64bit_float = false,                            \
   .has_64bit_int = false,                              \
   .has_integer_dword_mul = false,                      \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3,   \
   .simulator_id = 22,                                  \
   .num_eu_per_subslice = 16,                           \
   .cs_prefetch_size = 512
955
/* Wraps its arguments in a brace-enclosed initializer list for
 * .num_subslices in the version 12 feature macros.  The expansion is
 * identical to subslices().
 */
#define dual_subslices(args...) { args, }
957
/* Version 12 "GT0.5" configuration: gt 1, one slice, 4 L3 banks and a
 * num_subslices entry of 1.
 */
#define GFX12_GT05_FEATURES                             \
   GFX12_FEATURES(1, 1, 4),                             \
   .num_subslices = dual_subslices(1)
961
/* Version 12 configuration selected by _gt, with one slice:
 *   _gt == 1: 4 L3 banks, num_subslices entry of 2
 *   otherwise: 8 L3 banks, num_subslices entry of 6
 * _gt is substituted unparenthesized into "_gt == 1", so pass a simple
 * constant.
 */
#define GFX12_GT_FEATURES(_gt)                          \
   GFX12_FEATURES(_gt, 1, _gt == 1 ? 4 : 8),            \
   .num_subslices = dual_subslices(_gt == 1 ? 2 : 6)
965
/* Device description for Tigerlake GT1 (GFX12_GT_FEATURES(1)). */
static const struct intel_device_info intel_device_info_tgl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_tigerlake = true,
};
970
/* Device description for Tigerlake GT2 (GFX12_GT_FEATURES(2)). */
static const struct intel_device_info intel_device_info_tgl_gt2 = {
   GFX12_GT_FEATURES(2),
   .is_tigerlake = true,
};
975
/* Device description for Rocketlake GT0.5 (GFX12_GT05_FEATURES). */
static const struct intel_device_info intel_device_info_rkl_gt05 = {
   GFX12_GT05_FEATURES,
   .is_rocketlake = true,
};
980
/* Device description for Rocketlake GT1 (GFX12_GT_FEATURES(1)). */
static const struct intel_device_info intel_device_info_rkl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_rocketlake = true,
};
985
/* Device description for Alderlake GT0.5 (GFX12_GT05_FEATURES). */
static const struct intel_device_info intel_device_info_adl_gt05 = {
   GFX12_GT05_FEATURES,
   .is_alderlake = true,
};
990
/* Device description for Alderlake GT1 (GFX12_GT_FEATURES(1)). */
static const struct intel_device_info intel_device_info_adl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_alderlake = true,
};
995
/* Device description for Alderlake GT2 (GFX12_GT_FEATURES(2)).  This is the
 * only description in this group that sets display_ver (to 13).
 */
static const struct intel_device_info intel_device_info_adl_gt2 = {
   GFX12_GT_FEATURES(2),
   .is_alderlake = true,
   .display_ver = 13,
};
1001
/* Fields shared by the DG1 and SG1 descriptions: the GT2 version 12
 * configuration with .is_dg1 set, no LLC, local memory, an explicit URB size
 * of 768 and simulator id 30 (overriding the 22 from GFX12_FEATURES).
 */
#define GFX12_DG1_SG1_FEATURES                  \
   GFX12_GT_FEATURES(2),                        \
   .is_dg1 = true,                              \
   .has_llc = false,                            \
   .has_local_mem = true,                       \
   .urb.size = 768,                             \
   .simulator_id = 30
1009
/* Device description for DG1; consists solely of GFX12_DG1_SG1_FEATURES. */
static const struct intel_device_info intel_device_info_dg1 = {
   GFX12_DG1_SG1_FEATURES,
};
1013
/* Device description for SG1; identical in content to intel_device_info_dg1
 * (both expand only GFX12_DG1_SG1_FEATURES).
 */
static const struct intel_device_info intel_device_info_sg1 = {
   GFX12_DG1_SG1_FEATURES,
};
1017
1018 static void
reset_masks(struct intel_device_info * devinfo)1019 reset_masks(struct intel_device_info *devinfo)
1020 {
1021 devinfo->subslice_slice_stride = 0;
1022 devinfo->eu_subslice_stride = 0;
1023 devinfo->eu_slice_stride = 0;
1024
1025 devinfo->num_slices = 0;
1026 devinfo->num_eu_per_subslice = 0;
1027 memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1028
1029 memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1030 memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1031 memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1032 memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1033 }
1034
1035 static void
update_from_topology(struct intel_device_info * devinfo,const struct drm_i915_query_topology_info * topology)1036 update_from_topology(struct intel_device_info *devinfo,
1037 const struct drm_i915_query_topology_info *topology)
1038 {
1039 reset_masks(devinfo);
1040
1041 assert(topology->max_slices > 0);
1042 assert(topology->max_subslices > 0);
1043 assert(topology->max_eus_per_subslice > 0);
1044
1045 devinfo->subslice_slice_stride = topology->subslice_stride;
1046
1047 devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
1048 devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
1049
1050 assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
1051 memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
1052 devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1053 devinfo->max_slices = topology->max_slices;
1054 devinfo->max_subslices_per_slice = topology->max_subslices;
1055 devinfo->max_eu_per_subslice = topology->max_eus_per_subslice;
1056
1057 uint32_t subslice_mask_len =
1058 topology->max_slices * topology->subslice_stride;
1059 assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
1060 memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
1061 subslice_mask_len);
1062
1063 uint32_t n_subslices = 0;
1064 for (int s = 0; s < topology->max_slices; s++) {
1065 if ((devinfo->slice_masks & (1 << s)) == 0)
1066 continue;
1067
1068 for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1069 devinfo->num_subslices[s] +=
1070 __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1071 }
1072 n_subslices += devinfo->num_subslices[s];
1073 }
1074 assert(n_subslices > 0);
1075
1076 if (devinfo->ver >= 11) {
1077 /* On current ICL+ hardware we only have one slice. */
1078 assert(devinfo->slice_masks == 1);
1079
1080 /* Count the number of subslices on each pixel pipe. Assume that every
1081 * contiguous group of 4 subslices in the mask belong to the same pixel
1082 * pipe. However note that on TGL the kernel returns a mask of enabled
1083 * *dual* subslices instead of actual subslices somewhat confusingly, so
1084 * each pixel pipe only takes 2 bits in the mask even though it's still
1085 * 4 subslices.
1086 */
1087 const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1088 for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1089 const unsigned ppipe_mask = BITFIELD_RANGE(p * ppipe_bits, ppipe_bits);
1090 devinfo->ppipe_subslices[p] =
1091 __builtin_popcount(devinfo->subslice_masks[0] & ppipe_mask);
1092 }
1093 }
1094
1095 if (devinfo->ver == 12 && devinfo->num_slices == 1) {
1096 if (n_subslices >= 6) {
1097 assert(n_subslices == 6);
1098 devinfo->l3_banks = 8;
1099 } else if (n_subslices > 2) {
1100 devinfo->l3_banks = 6;
1101 } else {
1102 devinfo->l3_banks = 4;
1103 }
1104 }
1105
1106 uint32_t eu_mask_len =
1107 topology->eu_stride * topology->max_subslices * topology->max_slices;
1108 assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
1109 memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
1110
1111 uint32_t n_eus = 0;
1112 for (int b = 0; b < eu_mask_len; b++)
1113 n_eus += __builtin_popcount(devinfo->eu_masks[b]);
1114
1115 #ifdef __DragonFly__
1116 /* XXX avoid SIGFPE on divzero */
1117 if (n_subslices == 0)
1118 devinfo->num_eu_per_subslice = 0;
1119 else
1120 #endif
1121 devinfo->num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1122 }
1123
1124 /* Generate detailed mask from the I915_PARAM_SLICE_MASK,
1125 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
1126 */
1127 static bool
update_from_masks(struct intel_device_info * devinfo,uint32_t slice_mask,uint32_t subslice_mask,uint32_t n_eus)1128 update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
1129 uint32_t subslice_mask, uint32_t n_eus)
1130 {
1131 struct drm_i915_query_topology_info *topology;
1132
1133 assert((slice_mask & 0xff) == slice_mask);
1134
1135 size_t data_length = 100;
1136
1137 topology = calloc(1, sizeof(*topology) + data_length);
1138 if (!topology)
1139 return false;
1140
1141 topology->max_slices = util_last_bit(slice_mask);
1142 topology->max_subslices = util_last_bit(subslice_mask);
1143
1144 topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
1145 topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
1146
1147 uint32_t n_subslices = __builtin_popcount(slice_mask) *
1148 __builtin_popcount(subslice_mask);
1149 uint32_t num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1150 uint32_t eu_mask = (1U << num_eu_per_subslice) - 1;
1151
1152 topology->max_eus_per_subslice = num_eu_per_subslice;
1153 topology->eu_offset = topology->subslice_offset +
1154 topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
1155 topology->eu_stride = DIV_ROUND_UP(num_eu_per_subslice, 8);
1156
1157 /* Set slice mask in topology */
1158 for (int b = 0; b < topology->subslice_offset; b++)
1159 topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
1160
1161 for (int s = 0; s < topology->max_slices; s++) {
1162
1163 /* Set subslice mask in topology */
1164 for (int b = 0; b < topology->subslice_stride; b++) {
1165 int subslice_offset = topology->subslice_offset +
1166 s * topology->subslice_stride + b;
1167
1168 topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
1169 }
1170
1171 /* Set eu mask in topology */
1172 for (int ss = 0; ss < topology->max_subslices; ss++) {
1173 for (int b = 0; b < topology->eu_stride; b++) {
1174 int eu_offset = topology->eu_offset +
1175 (s * topology->max_subslices + ss) * topology->eu_stride + b;
1176
1177 topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
1178 }
1179 }
1180 }
1181
1182 update_from_topology(devinfo, topology);
1183 free(topology);
1184
1185 return true;
1186 }
1187
1188 /* Generate mask from the device data. */
1189 static void
fill_masks(struct intel_device_info * devinfo)1190 fill_masks(struct intel_device_info *devinfo)
1191 {
1192 /* All of our internal device descriptions assign the same number of
1193 * subslices for each slice. Just verify that this is true.
1194 */
1195 for (int s = 1; s < devinfo->num_slices; s++)
1196 assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1197
1198 update_from_masks(devinfo,
1199 (1U << devinfo->num_slices) - 1,
1200 (1U << devinfo->num_subslices[0]) - 1,
1201 devinfo->num_slices * devinfo->num_subslices[0] *
1202 devinfo->num_eu_per_subslice);
1203 }
1204
1205 static bool
getparam(int fd,uint32_t param,int * value)1206 getparam(int fd, uint32_t param, int *value)
1207 {
1208 int tmp;
1209
1210 struct drm_i915_getparam gp = {
1211 .param = param,
1212 .value = &tmp,
1213 };
1214
1215 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
1216 if (ret != 0)
1217 return false;
1218
1219 *value = tmp;
1220 return true;
1221 }
1222
1223 static void
update_cs_workgroup_threads(struct intel_device_info * devinfo)1224 update_cs_workgroup_threads(struct intel_device_info *devinfo)
1225 {
1226 /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1227 * can program is 64 without going up to a rectangular group. This only
1228 * impacts Haswell and TGL which have higher thread counts.
1229 *
1230 * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1231 * is 10 bits so we have no such restrictions.
1232 */
1233 devinfo->max_cs_workgroup_threads =
1234 devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1235 MIN2(devinfo->max_cs_threads, 64);
1236 }
1237
1238 bool
intel_get_device_info_from_pci_id(int pci_id,struct intel_device_info * devinfo)1239 intel_get_device_info_from_pci_id(int pci_id,
1240 struct intel_device_info *devinfo)
1241 {
1242 switch (pci_id) {
1243 #undef CHIPSET
1244 #define CHIPSET(id, family, fam_str, name) \
1245 case id: *devinfo = intel_device_info_##family; break;
1246 #include "pci_ids/i965_pci_ids.h"
1247 #include "pci_ids/iris_pci_ids.h"
1248
1249 #undef CHIPSET
1250 #define CHIPSET(id, fam_str, name) \
1251 case id: *devinfo = intel_device_info_gfx3; break;
1252 #include "pci_ids/i915_pci_ids.h"
1253
1254 default:
1255 mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
1256 return false;
1257 }
1258
1259 switch (pci_id) {
1260 #undef CHIPSET
1261 #define CHIPSET(_id, _family, _fam_str, _name) \
1262 case _id: \
1263 /* sizeof(str_literal) includes the null */ \
1264 STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
1265 sizeof(devinfo->name)); \
1266 strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
1267 break;
1268 #include "pci_ids/i965_pci_ids.h"
1269 #include "pci_ids/iris_pci_ids.h"
1270 default:
1271 strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
1272 }
1273
1274 fill_masks(devinfo);
1275
1276 /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
1277 *
1278 * "Scratch Space per slice is computed based on 4 sub-slices. SW must
1279 * allocate scratch space enough so that each slice has 4 slices allowed."
1280 *
1281 * The equivalent internal documentation says that this programming note
1282 * applies to all Gfx9+ platforms.
1283 *
1284 * The hardware typically calculates the scratch space pointer by taking
1285 * the base address, and adding per-thread-scratch-space * thread ID.
1286 * Extra padding can be necessary depending how the thread IDs are
1287 * calculated for a particular shader stage.
1288 */
1289
1290 switch(devinfo->ver) {
1291 case 9:
1292 devinfo->max_wm_threads = 64 /* threads-per-PSD */
1293 * devinfo->num_slices
1294 * 4; /* effective subslices per slice */
1295 break;
1296 case 11:
1297 case 12:
1298 devinfo->max_wm_threads = 128 /* threads-per-PSD */
1299 * devinfo->num_slices
1300 * 8; /* subslices per slice */
1301 break;
1302 default:
1303 assert(devinfo->ver < 9);
1304 break;
1305 }
1306
1307 assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));
1308
1309 if (devinfo->verx10 == 0)
1310 devinfo->verx10 = devinfo->ver * 10;
1311
1312 if (devinfo->display_ver == 0)
1313 devinfo->display_ver = devinfo->ver;
1314
1315 update_cs_workgroup_threads(devinfo);
1316
1317 devinfo->chipset_id = pci_id;
1318 return true;
1319 }
1320
1321 /**
1322 * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
1323 * (kernel 4.13+)
1324 */
1325 static bool
getparam_topology(struct intel_device_info * devinfo,int fd)1326 getparam_topology(struct intel_device_info *devinfo, int fd)
1327 {
1328 int slice_mask = 0;
1329 if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
1330 goto maybe_warn;
1331
1332 int n_eus;
1333 if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
1334 goto maybe_warn;
1335
1336 int subslice_mask = 0;
1337 if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
1338 goto maybe_warn;
1339
1340 return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
1341
1342 maybe_warn:
1343 /* Only with Gfx8+ are we starting to see devices with fusing that can only
1344 * be detected at runtime.
1345 */
1346 if (devinfo->ver >= 8)
1347 mesa_logw("Kernel 4.1 required to properly query GPU properties.");
1348
1349 return false;
1350 }
1351
1352 /**
1353 * preferred API for updating the topology in devinfo (kernel 4.17+)
1354 */
1355 static bool
query_topology(struct intel_device_info * devinfo,int fd)1356 query_topology(struct intel_device_info *devinfo, int fd)
1357 {
1358 struct drm_i915_query_topology_info *topo_info =
1359 intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO);
1360 if (topo_info == NULL)
1361 return false;
1362
1363 update_from_topology(devinfo, topo_info);
1364
1365 free(topo_info);
1366
1367 return true;
1368
1369 }
1370
1371 int
intel_get_aperture_size(int fd,uint64_t * size)1372 intel_get_aperture_size(int fd, uint64_t *size)
1373 {
1374 struct drm_i915_gem_get_aperture aperture = { 0 };
1375
1376 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
1377 if (ret == 0 && size)
1378 *size = aperture.aper_size;
1379
1380 return ret;
1381 }
1382
1383 static bool
has_get_tiling(int fd)1384 has_get_tiling(int fd)
1385 {
1386 int ret;
1387
1388 struct drm_i915_gem_create gem_create = {
1389 .size = 4096,
1390 };
1391
1392 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1393 unreachable("Failed to create GEM BO");
1394 return false;
1395 }
1396
1397 struct drm_i915_gem_get_tiling get_tiling = {
1398 .handle = gem_create.handle,
1399 };
1400 ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling);
1401
1402 struct drm_gem_close close = {
1403 .handle = gem_create.handle,
1404 };
1405 intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1406
1407 return ret == 0;
1408 }
1409
1410 static void
fixup_chv_device_info(struct intel_device_info * devinfo)1411 fixup_chv_device_info(struct intel_device_info *devinfo)
1412 {
1413 assert(devinfo->is_cherryview);
1414
1415 /* Cherryview is annoying. The number of EUs is depending on fusing and
1416 * isn't determinable from the PCI ID alone. We default to the minimum
1417 * available for that PCI ID and then compute the real value from the
1418 * subslice information we get from the kernel.
1419 */
1420 const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
1421 const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1422
1423 /* Logical CS threads = EUs per subslice * num threads per EU */
1424 uint32_t max_cs_threads =
1425 eu_total / subslice_total * devinfo->num_thread_per_eu;
1426
1427 /* Fuse configurations may give more threads than expected, never less. */
1428 if (max_cs_threads > devinfo->max_cs_threads)
1429 devinfo->max_cs_threads = max_cs_threads;
1430
1431 update_cs_workgroup_threads(devinfo);
1432
1433 /* Braswell is even more annoying. Its marketing name isn't determinable
1434 * from the PCI ID and is also dependent on fusing.
1435 */
1436 if (devinfo->chipset_id != 0x22B1)
1437 return;
1438
1439 char *bsw_model;
1440 switch (eu_total) {
1441 case 16: bsw_model = "405"; break;
1442 case 12: bsw_model = "400"; break;
1443 default: bsw_model = " "; break;
1444 }
1445
1446 char *needle = strstr(devinfo->name, "XXX");
1447 assert(needle);
1448 if (needle)
1449 memcpy(needle, bsw_model, 3);
1450 }
1451
1452 static void
init_max_scratch_ids(struct intel_device_info * devinfo)1453 init_max_scratch_ids(struct intel_device_info *devinfo)
1454 {
1455 /* Determine the max number of subslices that potentially might be used in
1456 * scratch space ids.
1457 *
1458 * For, Gfx11+, scratch space allocation is based on the number of threads
1459 * in the base configuration.
1460 *
1461 * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1462 * we wish to view that there are 4 subslices per slice instead of the
1463 * actual number of subslices per slice. The documentation for 3DSTATE_PS
1464 * "Scratch Space Base Pointer" says:
1465 *
1466 * "Scratch Space per slice is computed based on 4 sub-slices. SW
1467 * must allocate scratch space enough so that each slice has 4
1468 * slices allowed."
1469 *
1470 * According to the other driver team, this applies to compute shaders
1471 * as well. This is not currently documented at all.
1472 *
1473 * For Gfx8 and older we user devinfo->subslice_total.
1474 */
1475 unsigned subslices;
1476 if (devinfo->verx10 == 125)
1477 subslices = 32;
1478 else if (devinfo->ver == 12)
1479 subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
1480 else if (devinfo->ver == 11)
1481 subslices = 8;
1482 else if (devinfo->ver >= 9 && devinfo->ver < 11)
1483 subslices = 4 * devinfo->num_slices;
1484 else
1485 subslices = devinfo->subslice_total;
1486 assert(subslices >= devinfo->subslice_total);
1487
1488 unsigned scratch_ids_per_subslice;
1489 if (devinfo->ver >= 12) {
1490 /* Same as ICL below, but with 16 EUs. */
1491 scratch_ids_per_subslice = 16 * 8;
1492 } else if (devinfo->ver >= 11) {
1493 /* The MEDIA_VFE_STATE docs say:
1494 *
1495 * "Starting with this configuration, the Maximum Number of
1496 * Threads must be set to (#EU * 8) for GPGPU dispatches.
1497 *
1498 * Although there are only 7 threads per EU in the configuration,
1499 * the FFTID is calculated as if there are 8 threads per EU,
1500 * which in turn requires a larger amount of Scratch Space to be
1501 * allocated by the driver."
1502 */
1503 scratch_ids_per_subslice = 8 * 8;
1504 } else if (devinfo->is_haswell) {
1505 /* WaCSScratchSize:hsw
1506 *
1507 * Haswell's scratch space address calculation appears to be sparse
1508 * rather than tightly packed. The Thread ID has bits indicating
1509 * which subslice, EU within a subslice, and thread within an EU it
1510 * is. There's a maximum of two slices and two subslices, so these
1511 * can be stored with a single bit. Even though there are only 10 EUs
1512 * per subslice, this is stored in 4 bits, so there's an effective
1513 * maximum value of 16 EUs. Similarly, although there are only 7
1514 * threads per EU, this is stored in a 3 bit number, giving an
1515 * effective maximum value of 8 threads per EU.
1516 *
1517 * This means that we need to use 16 * 8 instead of 10 * 7 for the
1518 * number of threads per subslice.
1519 */
1520 scratch_ids_per_subslice = 16 * 8;
1521 } else if (devinfo->is_cherryview) {
1522 /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1523 * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1524 * as if it had 8 EUs.
1525 */
1526 scratch_ids_per_subslice = 8 * 7;
1527 } else {
1528 scratch_ids_per_subslice = devinfo->max_cs_threads;
1529 }
1530
1531 unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1532
1533 if (devinfo->verx10 >= 125) {
1534 /* On GFX version 12.5, scratch access changed to a surface-based model.
1535 * Instead of each shader type having its own layout based on IDs passed
1536 * from the relevant fixed-function unit, all scratch access is based on
1537 * thread IDs like it always has been for compute.
1538 */
1539 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1540 devinfo->max_scratch_ids[i] = max_thread_ids;
1541 } else {
1542 unsigned max_scratch_ids[] = {
1543 [MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
1544 [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1545 [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1546 [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
1547 [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
1548 [MESA_SHADER_COMPUTE] = max_thread_ids,
1549 };
1550 STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1551 memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1552 sizeof(devinfo->max_scratch_ids));
1553 }
1554 }
1555
1556 bool
intel_get_device_info_from_fd(int fd,struct intel_device_info * devinfo)1557 intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
1558 {
1559 int devid = 0;
1560
1561 const char *devid_override = getenv("INTEL_DEVID_OVERRIDE");
1562 if (devid_override && strlen(devid_override) > 0) {
1563 if (geteuid() == getuid()) {
1564 devid = intel_device_name_to_pci_device_id(devid_override);
1565 /* Fallback to PCI ID. */
1566 if (devid <= 0)
1567 devid = strtol(devid_override, NULL, 0);
1568 if (devid <= 0) {
1569 mesa_loge("Invalid INTEL_DEVID_OVERRIDE=\"%s\". "
1570 "Use a valid numeric PCI ID or one of the supported "
1571 "platform names:", devid_override);
1572 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++)
1573 mesa_loge(" %s", name_map[i].name);
1574 return false;
1575 }
1576 } else {
1577 mesa_logi("Ignoring INTEL_DEVID_OVERRIDE=\"%s\" because "
1578 "real and effective user ID don't match.", devid_override);
1579 }
1580 }
1581
1582 if (devid > 0) {
1583 if (!intel_get_device_info_from_pci_id(devid, devinfo))
1584 return false;
1585 devinfo->no_hw = true;
1586 } else {
1587 /* query the device id */
1588 if (!getparam(fd, I915_PARAM_CHIPSET_ID, &devid))
1589 return false;
1590 if (!intel_get_device_info_from_pci_id(devid, devinfo))
1591 return false;
1592 devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);
1593 }
1594
1595 if (devinfo->ver == 10) {
1596 mesa_loge("Gfx10 support is redacted.");
1597 return false;
1598 }
1599
1600 /* remaining initializion queries the kernel for device info */
1601 if (devinfo->no_hw)
1602 return true;
1603
1604 int timestamp_frequency;
1605 if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
1606 ×tamp_frequency))
1607 devinfo->timestamp_frequency = timestamp_frequency;
1608 else if (devinfo->ver >= 10) {
1609 mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
1610 return false;
1611 }
1612
1613 if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
1614 devinfo->revision = 0;
1615
1616 if (!query_topology(devinfo, fd)) {
1617 if (devinfo->ver >= 10) {
1618 /* topology uAPI required for CNL+ (kernel 4.17+) */
1619 return false;
1620 }
1621
1622 /* else use the kernel 4.13+ api for gfx8+. For older kernels, topology
1623 * will be wrong, affecting GPU metrics. In this case, fail silently.
1624 */
1625 getparam_topology(devinfo, fd);
1626 }
1627
1628 if (devinfo->is_cherryview)
1629 fixup_chv_device_info(devinfo);
1630
1631 intel_get_aperture_size(fd, &devinfo->aperture_bytes);
1632 devinfo->has_tiling_uapi = has_get_tiling(fd);
1633
1634 devinfo->subslice_total = 0;
1635 for (uint32_t i = 0; i < devinfo->max_slices; i++)
1636 devinfo->subslice_total += __builtin_popcount(devinfo->subslice_masks[i]);
1637
1638 /* Gfx7 and older do not support EU/Subslice info */
1639 assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
1640 devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
1641
1642 init_max_scratch_ids(devinfo);
1643
1644 return true;
1645 }
1646