1 /*
2  * Copyright © 2013 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include "intel_device_info.h"
31 #include "intel/common/intel_gem.h"
32 #include "util/bitscan.h"
33 #include "util/debug.h"
34 #include "util/log.h"
35 #include "util/macros.h"
36 
37 #include "drm-uapi/i915_drm.h"
38 
/* Short platform names accepted by intel_device_name_to_pci_device_id(),
 * each paired with the PCI device ID that is returned for it.
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { "lpt", 0x27a2 },
   { "brw", 0x2a02 },
   { "g4x", 0x2a42 },
   { "ilk", 0x0042 },
   { "snb", 0x0126 },
   { "ivb", 0x016a },
   { "hsw", 0x0d2e },
   { "byt", 0x0f33 },
   { "bdw", 0x162e },
   { "chv", 0x22B3 },
   { "skl", 0x1912 },
   { "bxt", 0x5A85 },
   { "kbl", 0x5912 },
   { "aml", 0x591C },
   { "glk", 0x3185 },
   { "cfl", 0x3E9B },
   { "whl", 0x3EA1 },
   { "cml", 0x9b41 },
   { "icl", 0x8a52 },
   { "ehl", 0x4500 },
   { "jsl", 0x4E71 },
   { "tgl", 0x9a49 },
   { "rkl", 0x4c8a },
   { "dg1", 0x4905 },
   { "adl", 0x4680 },
   { "sg1", 0x4907 },
};

/**
 * Get the PCI ID for the device name.
 *
 * \param name  Short platform name such as "skl".  It is compared
 *              case-sensitively against the entries of name_map and may
 *              be NULL.
 *
 * Returns the PCI ID paired with the name, or -1 if the name is NULL or the
 * device is not known.
 */
int
intel_device_name_to_pci_device_id(const char *name)
{
   /* strcmp() on a null pointer is undefined behavior; treat NULL like any
    * other unknown name.
    */
   if (name == NULL)
      return -1;

   const size_t count = sizeof(name_map) / sizeof(name_map[0]);

   for (size_t i = 0; i < count; i++) {
      if (strcmp(name_map[i].name, name) == 0)
         return name_map[i].pci_id;
   }

   return -1;
}
86 
/* Device description for .ver = 3 hardware.  Only the version, simulator_id
 * and cs_prefetch_size are given; every other member is implicitly zero.
 */
87 static const struct intel_device_info intel_device_info_gfx3 = {
88    .ver = 3,
89    .simulator_id = -1,
90    .cs_prefetch_size = 512,
91 };
92 
/* Device description for the first .ver = 4 table, with has_negative_rhw_bug
 * set.  max_wm_threads is spelled as 8 * 4, the same factors given for
 * num_eu_per_subslice and num_thread_per_eu.
 */
93 static const struct intel_device_info intel_device_info_i965 = {
94    .ver = 4,
95    .has_negative_rhw_bug = true,
96    .num_slices = 1,
97    .num_subslices = { 1, },
98    .num_eu_per_subslice = 8,
99    .num_thread_per_eu = 4,
100    .max_vs_threads = 16,
101    .max_gs_threads = 2,
102    .max_wm_threads = 8 * 4,
103    .urb = {
104       .size = 256,
105    },
106    .timestamp_frequency = 12500000,
107    .simulator_id = -1,
108    .cs_prefetch_size = 512,
109 };
110 
/* Device description for the is_g4x variant: .ver = 4 with .verx10 = 45.
 * max_wm_threads is spelled as 10 * 5, the same factors given for
 * num_eu_per_subslice and num_thread_per_eu.
 */
111 static const struct intel_device_info intel_device_info_g4x = {
112    .ver = 4,
113    .verx10 = 45,
114    .has_pln = true,
115    .has_compr4 = true,
116    .has_surface_tile_offset = true,
117    .is_g4x = true,
118    .num_slices = 1,
119    .num_subslices = { 1, },
120    .num_eu_per_subslice = 10,
121    .num_thread_per_eu = 5,
122    .max_vs_threads = 32,
123    .max_gs_threads = 2,
124    .max_wm_threads = 10 * 5,
125    .urb = {
126       .size = 384,
127    },
128    .timestamp_frequency = 12500000,
129    .simulator_id = -1,
130    .cs_prefetch_size = 512,
131 };
132 
/* Device description for the "ilk" table (.ver = 5).  max_wm_threads is
 * spelled as 12 * 6, the same factors given for num_eu_per_subslice and
 * num_thread_per_eu.
 */
133 static const struct intel_device_info intel_device_info_ilk = {
134    .ver = 5,
135    .has_pln = true,
136    .has_compr4 = true,
137    .has_surface_tile_offset = true,
138    .num_slices = 1,
139    .num_subslices = { 1, },
140    .num_eu_per_subslice = 12,
141    .num_thread_per_eu = 6,
142    .max_vs_threads = 72,
143    .max_gs_threads = 32,
144    .max_wm_threads = 12 * 6,
145    .urb = {
146       .size = 1024,
147    },
148    .timestamp_frequency = 12500000,
149    .simulator_id = -1,
150    .cs_prefetch_size = 512,
151 };
152 
/* Device description for the "snb" GT1 table (.ver = 6, .gt = 1).  URB
 * entry limits are only listed for the vertex and geometry stages; the
 * remaining array elements are implicitly zero.
 */
153 static const struct intel_device_info intel_device_info_snb_gt1 = {
154    .ver = 6,
155    .gt = 1,
156    .has_hiz_and_separate_stencil = true,
157    .has_llc = true,
158    .has_pln = true,
159    .has_surface_tile_offset = true,
160    .needs_unlit_centroid_workaround = true,
161    .num_slices = 1,
162    .num_subslices = { 1, },
163    .num_eu_per_subslice = 6,
164    .num_thread_per_eu = 6, /* Not confirmed */
165    .max_vs_threads = 24,
166    .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
167    .max_wm_threads = 40,
168    .urb = {
169       .size = 32,
170       .min_entries = {
171          [MESA_SHADER_VERTEX]   = 24,
172       },
173       .max_entries = {
174          [MESA_SHADER_VERTEX]   = 256,
175          [MESA_SHADER_GEOMETRY] = 256,
176       },
177    },
178    .timestamp_frequency = 12500000,
179    .simulator_id = -1,
180    .cs_prefetch_size = 512,
181 };
182 
/* Device description for the "snb" GT2 table (.ver = 6, .gt = 2).  Compared
 * with the GT1 table it doubles num_eu_per_subslice and urb.size and raises
 * the per-stage thread limits; the URB entry limits are the same.
 */
183 static const struct intel_device_info intel_device_info_snb_gt2 = {
184    .ver = 6,
185    .gt = 2,
186    .has_hiz_and_separate_stencil = true,
187    .has_llc = true,
188    .has_pln = true,
189    .has_surface_tile_offset = true,
190    .needs_unlit_centroid_workaround = true,
191    .num_slices = 1,
192    .num_subslices = { 1, },
193    .num_eu_per_subslice = 12,
194    .num_thread_per_eu = 6, /* Not confirmed */
195    .max_vs_threads = 60,
196    .max_gs_threads = 60,
197    .max_wm_threads = 80,
198    .urb = {
199       .size = 64,
200       .min_entries = {
201          [MESA_SHADER_VERTEX]   = 24,
202       },
203       .max_entries = {
204          [MESA_SHADER_VERTEX]   = 256,
205          [MESA_SHADER_GEOMETRY] = 256,
206       },
207    },
208    .timestamp_frequency = 12500000,
209    .simulator_id = -1,
210    .cs_prefetch_size = 512,
211 };
212 
/* Designated initializers shared by the .ver = 7 tables (ivb, byt and, via
 * HSW_FEATURES, hsw).  The macro is expanded first in each initializer list,
 * so a later designator for the same member overrides the value given here
 * (byt does this for has_llc, hsw_gt3 for max_constant_urb_size_kb).
 * There is deliberately no trailing comma; the user supplies it.
 */
213 #define GFX7_FEATURES                               \
214    .ver = 7,                                        \
215    .has_hiz_and_separate_stencil = true,            \
216    .must_use_separate_stencil = true,               \
217    .has_llc = true,                                 \
218    .has_pln = true,                                 \
219    .has_64bit_float = true,                         \
220    .has_surface_tile_offset = true,                 \
221    .timestamp_frequency = 12500000,                 \
222    .max_constant_urb_size_kb = 16,                  \
223    .cs_prefetch_size = 512
224 
/* Device description for the is_ivybridge GT1 table: GFX7_FEATURES plus the
 * topology, per-stage thread limits and URB entry limits listed here.
 */
225 static const struct intel_device_info intel_device_info_ivb_gt1 = {
226    GFX7_FEATURES, .is_ivybridge = true, .gt = 1,
227    .num_slices = 1,
228    .num_subslices = { 1, },
229    .num_eu_per_subslice = 6,
230    .num_thread_per_eu = 6,
231    .l3_banks = 2,
232    .max_vs_threads = 36,
233    .max_tcs_threads = 36,
234    .max_tes_threads = 36,
235    .max_gs_threads = 36,
236    .max_wm_threads = 48,
237    .max_cs_threads = 36,
238    .urb = {
239       .min_entries = {
240          [MESA_SHADER_VERTEX]    = 32,
241          [MESA_SHADER_TESS_EVAL] = 10,
242       },
243       .max_entries = {
244          [MESA_SHADER_VERTEX]    = 512,
245          [MESA_SHADER_TESS_CTRL] = 32,
246          [MESA_SHADER_TESS_EVAL] = 288,
247          [MESA_SHADER_GEOMETRY]  = 192,
248       },
249    },
250    .simulator_id = 7,
251 };
252 
/* Device description for the is_ivybridge GT2 table: GFX7_FEATURES plus a
 * larger topology (12 EUs per subslice, 4 L3 banks) and higher thread and
 * URB entry limits than the GT1 table.
 */
253 static const struct intel_device_info intel_device_info_ivb_gt2 = {
254    GFX7_FEATURES, .is_ivybridge = true, .gt = 2,
255    .num_slices = 1,
256    .num_subslices = { 1, },
257    .num_eu_per_subslice = 12,
258    .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
259                             * @max_wm_threads ... */
260    .l3_banks = 4,
261    .max_vs_threads = 128,
262    .max_tcs_threads = 128,
263    .max_tes_threads = 128,
264    .max_gs_threads = 128,
265    .max_wm_threads = 172,
266    .max_cs_threads = 64,
267    .urb = {
268       .min_entries = {
269          [MESA_SHADER_VERTEX]    = 32,
270          [MESA_SHADER_TESS_EVAL] = 10,
271       },
272       .max_entries = {
273          [MESA_SHADER_VERTEX]    = 704,
274          [MESA_SHADER_TESS_CTRL] = 64,
275          [MESA_SHADER_TESS_EVAL] = 448,
276          [MESA_SHADER_GEOMETRY]  = 320,
277       },
278    },
279    .simulator_id = 7,
280 };
281 
/* Device description for the is_baytrail table: GFX7_FEATURES with has_llc
 * turned off and its own topology, thread limits and URB entry limits.
 */
282 static const struct intel_device_info intel_device_info_byt = {
283    GFX7_FEATURES, .is_baytrail = true, .gt = 1,
284    .num_slices = 1,
285    .num_subslices = { 1, },
286    .num_eu_per_subslice = 4,
287    .num_thread_per_eu = 8,
288    .l3_banks = 1,
   /* Overrides the has_llc = true supplied by GFX7_FEATURES above; the later
    * designator wins.
    */
289    .has_llc = false,
290    .max_vs_threads = 36,
291    .max_tcs_threads = 36,
292    .max_tes_threads = 36,
293    .max_gs_threads = 36,
294    .max_wm_threads = 48,
295    .max_cs_threads = 32,
296    .urb = {
297       .min_entries = {
298          [MESA_SHADER_VERTEX]    = 32,
299          [MESA_SHADER_TESS_EVAL] = 10,
300       },
301       .max_entries = {
302          [MESA_SHADER_VERTEX]    = 512,
303          [MESA_SHADER_TESS_CTRL] = 32,
304          [MESA_SHADER_TESS_EVAL] = 288,
305          [MESA_SHADER_GEOMETRY]  = 192,
306       },
307    },
308    .simulator_id = 10,
309 };
310 
/* Initializers shared by the hsw tables: everything in GFX7_FEATURES plus
 * is_haswell, .verx10 = 75 and supports_simd16_3src.  No trailing comma.
 */
311 #define HSW_FEATURES             \
312    GFX7_FEATURES,                \
313    .is_haswell = true,           \
314    .verx10 = 75,                 \
315    .supports_simd16_3src = true
316 
/* Device description for the hsw GT1 table: HSW_FEATURES with one slice,
 * one subslice, 10 EUs per subslice and 2 L3 banks.
 */
317 static const struct intel_device_info intel_device_info_hsw_gt1 = {
318    HSW_FEATURES, .gt = 1,
319    .num_slices = 1,
320    .num_subslices = { 1, },
321    .num_eu_per_subslice = 10,
322    .num_thread_per_eu = 7,
323    .l3_banks = 2,
324    .max_vs_threads = 70,
325    .max_tcs_threads = 70,
326    .max_tes_threads = 70,
327    .max_gs_threads = 70,
328    .max_wm_threads = 102,
329    .max_cs_threads = 70,
330    .urb = {
331       .min_entries = {
332          [MESA_SHADER_VERTEX]    = 32,
333          [MESA_SHADER_TESS_EVAL] = 10,
334       },
335       .max_entries = {
336          [MESA_SHADER_VERTEX]    = 640,
337          [MESA_SHADER_TESS_CTRL] = 64,
338          [MESA_SHADER_TESS_EVAL] = 384,
339          [MESA_SHADER_GEOMETRY]  = 256,
340       },
341    },
342    .simulator_id = 9,
343 };
344 
/* Device description for the hsw GT2 table: HSW_FEATURES with one slice of
 * two subslices and 4 L3 banks.
 */
345 static const struct intel_device_info intel_device_info_hsw_gt2 = {
346    HSW_FEATURES, .gt = 2,
347    .num_slices = 1,
348    .num_subslices = { 2, },
349    .num_eu_per_subslice = 10,
350    .num_thread_per_eu = 7,
351    .l3_banks = 4,
352    .max_vs_threads = 280,
353    .max_tcs_threads = 256,
354    .max_tes_threads = 280,
355    .max_gs_threads = 256,
356    .max_wm_threads = 204,
357    .max_cs_threads = 70,
358    .urb = {
359       .min_entries = {
360          [MESA_SHADER_VERTEX]    = 64,
361          [MESA_SHADER_TESS_EVAL] = 10,
362       },
363       .max_entries = {
364          [MESA_SHADER_VERTEX]    = 1664,
365          [MESA_SHADER_TESS_CTRL] = 128,
366          [MESA_SHADER_TESS_EVAL] = 960,
367          [MESA_SHADER_GEOMETRY]  = 640,
368       },
369    },
370    .simulator_id = 9,
371 };
372 
/* Device description for the hsw GT3 table: HSW_FEATURES with two slices of
 * two subslices each and 8 L3 banks.  Thread and URB entry limits match the
 * GT2 table except for max_wm_threads.
 */
373 static const struct intel_device_info intel_device_info_hsw_gt3 = {
374    HSW_FEATURES, .gt = 3,
375    .num_slices = 2,
376    .num_subslices = { 2, 2, },
377    .num_eu_per_subslice = 10,
378    .num_thread_per_eu = 7,
379    .l3_banks = 8,
380    .max_vs_threads = 280,
381    .max_tcs_threads = 256,
382    .max_tes_threads = 280,
383    .max_gs_threads = 256,
384    .max_wm_threads = 408,
385    .max_cs_threads = 70,
386    .urb = {
387       .min_entries = {
388          [MESA_SHADER_VERTEX]    = 64,
389          [MESA_SHADER_TESS_EVAL] = 10,
390       },
391       .max_entries = {
392          [MESA_SHADER_VERTEX]    = 1664,
393          [MESA_SHADER_TESS_CTRL] = 128,
394          [MESA_SHADER_TESS_EVAL] = 960,
395          [MESA_SHADER_GEOMETRY]  = 640,
396       },
397    },
   /* Overrides the 16 supplied through HSW_FEATURES / GFX7_FEATURES. */
398    .max_constant_urb_size_kb = 32,
399    .simulator_id = 9,
400 };
401 
402 /* It's unclear how well supported sampling from the hiz buffer is on GFX8,
403  * so keep things conservative for now and set has_sample_with_hiz = false.
404  */
/* Base designated initializers for the .ver = 8 tables.  The Gfx9, Gfx11
 * and Gfx12 feature macros below also start from this macro and then
 * override individual members (including .ver) with later designators.
 * No trailing comma.
 */
405 #define GFX8_FEATURES                               \
406    .ver = 8,                                        \
407    .has_hiz_and_separate_stencil = true,            \
408    .must_use_separate_stencil = true,               \
409    .has_llc = true,                                 \
410    .has_sample_with_hiz = false,                    \
411    .has_pln = true,                                 \
412    .has_integer_dword_mul = true,                   \
413    .has_64bit_float = true,                         \
414    .has_64bit_int = true,                           \
415    .supports_simd16_3src = true,                    \
416    .has_surface_tile_offset = true,                 \
417    .num_thread_per_eu = 7,                          \
418    .max_vs_threads = 504,                           \
419    .max_tcs_threads = 504,                          \
420    .max_tes_threads = 504,                          \
421    .max_gs_threads = 504,                           \
422    .max_wm_threads = 384,                           \
423    .timestamp_frequency = 12500000,                 \
424    .max_constant_urb_size_kb = 32,                  \
425    .cs_prefetch_size = 512
426 
/* Device description for the is_broadwell GT1 table: GFX8_FEATURES with one
 * slice of two subslices, 6 EUs per subslice and 2 L3 banks.  The geometry
 * URB entry limit is lower than in the GT2/GT3 tables (see inline note).
 */
427 static const struct intel_device_info intel_device_info_bdw_gt1 = {
428    GFX8_FEATURES, .gt = 1,
429    .is_broadwell = true,
430    .num_slices = 1,
431    .num_subslices = { 2, },
432    .num_eu_per_subslice = 6,
433    .l3_banks = 2,
434    .max_cs_threads = 42,
435    .urb = {
436       .min_entries = {
437          [MESA_SHADER_VERTEX]    = 64,
438          [MESA_SHADER_TESS_EVAL] = 34,
439       },
440       .max_entries = {
441          [MESA_SHADER_VERTEX]    = 2560,
442          [MESA_SHADER_TESS_CTRL] = 504,
443          [MESA_SHADER_TESS_EVAL] = 1536,
444          /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
445          [MESA_SHADER_GEOMETRY]  = 690,
446       },
447    },
448    .simulator_id = 11,
449 };
450 
/* Device description for the is_broadwell GT2 table: GFX8_FEATURES with one
 * slice of three subslices, 8 EUs per subslice and 4 L3 banks.
 */
451 static const struct intel_device_info intel_device_info_bdw_gt2 = {
452    GFX8_FEATURES, .gt = 2,
453    .is_broadwell = true,
454    .num_slices = 1,
455    .num_subslices = { 3, },
456    .num_eu_per_subslice = 8,
457    .l3_banks = 4,
458    .max_cs_threads = 56,
459    .urb = {
460       .min_entries = {
461          [MESA_SHADER_VERTEX]    = 64,
462          [MESA_SHADER_TESS_EVAL] = 34,
463       },
464       .max_entries = {
465          [MESA_SHADER_VERTEX]    = 2560,
466          [MESA_SHADER_TESS_CTRL] = 504,
467          [MESA_SHADER_TESS_EVAL] = 1536,
468          [MESA_SHADER_GEOMETRY]  = 960,
469       },
470    },
471    .simulator_id = 11,
472 };
473 
/* Device description for the is_broadwell GT3 table: GFX8_FEATURES with two
 * slices of three subslices each and 8 L3 banks.  Thread and URB entry
 * limits are the same as in the GT2 table.
 */
474 static const struct intel_device_info intel_device_info_bdw_gt3 = {
475    GFX8_FEATURES, .gt = 3,
476    .is_broadwell = true,
477    .num_slices = 2,
478    .num_subslices = { 3, 3, },
479    .num_eu_per_subslice = 8,
480    .l3_banks = 8,
481    .max_cs_threads = 56,
482    .urb = {
483       .min_entries = {
484          [MESA_SHADER_VERTEX]    = 64,
485          [MESA_SHADER_TESS_EVAL] = 34,
486       },
487       .max_entries = {
488          [MESA_SHADER_VERTEX]    = 2560,
489          [MESA_SHADER_TESS_CTRL] = 504,
490          [MESA_SHADER_TESS_EVAL] = 1536,
491          [MESA_SHADER_GEOMETRY]  = 960,
492       },
493    },
494    .simulator_id = 11,
495 };
496 
/* Device description for the is_cherryview table.  It starts from
 * GFX8_FEATURES and then overrides has_llc, has_integer_dword_mul and all
 * of the per-stage thread limits with the later designators below.
 */
497 static const struct intel_device_info intel_device_info_chv = {
498    GFX8_FEATURES, .is_cherryview = 1, .gt = 1,
499    .has_llc = false,
500    .has_integer_dword_mul = false,
501    .num_slices = 1,
502    .num_subslices = { 2, },
503    .num_eu_per_subslice = 8,
504    .l3_banks = 2,
505    .max_vs_threads = 80,
506    .max_tcs_threads = 80,
507    .max_tes_threads = 80,
508    .max_gs_threads = 80,
509    .max_wm_threads = 128,
510    .max_cs_threads = 6 * 7,
511    .urb = {
512       .min_entries = {
513          [MESA_SHADER_VERTEX]    = 34,
514          [MESA_SHADER_TESS_EVAL] = 34,
515       },
516       .max_entries = {
517          [MESA_SHADER_VERTEX]    = 640,
518          [MESA_SHADER_TESS_CTRL] = 80,
519          [MESA_SHADER_TESS_EVAL] = 384,
520          [MESA_SHADER_GEOMETRY]  = 256,
521       },
522    },
523    .simulator_id = 13,
524 };
525 
/* Values layered on top of GFX8_FEATURES by GFX9_FEATURES and
 * GFX9_LP_FEATURES: .ver = 9, thread limits, a 12 MHz timestamp_frequency
 * and the default URB entry limits.  Individual tables override single
 * entries with nested designators such as .urb.max_entries[...].
 */
526 #define GFX9_HW_INFO                                \
527    .ver = 9,                                        \
528    .max_vs_threads = 336,                           \
529    .max_gs_threads = 336,                           \
530    .max_tcs_threads = 336,                          \
531    .max_tes_threads = 336,                          \
532    .max_cs_threads = 56,                            \
533    .timestamp_frequency = 12000000,                 \
534    .cs_prefetch_size = 512,                         \
535    .urb = {                                         \
536       .min_entries = {                              \
537          [MESA_SHADER_VERTEX]    = 64,              \
538          [MESA_SHADER_TESS_EVAL] = 34,              \
539       },                                            \
540       .max_entries = {                              \
541          [MESA_SHADER_VERTEX]    = 1856,            \
542          [MESA_SHADER_TESS_CTRL] = 672,             \
543          [MESA_SHADER_TESS_EVAL] = 1120,            \
544          [MESA_SHADER_GEOMETRY]  = 640,             \
545       },                                            \
546    }
547 
/* Feature set used by the bxt and glk tables.  It expands GFX8_FEATURES and
 * GFX9_HW_INFO and then overrides several of their members: no LLC, no
 * integer dword multiply, 6 threads per EU, lower thread limits, a
 * 19.2 MHz timestamp_frequency, and a complete replacement of the .urb
 * initializer coming from GFX9_HW_INFO.
 */
548 #define GFX9_LP_FEATURES                           \
549    GFX8_FEATURES,                                  \
550    GFX9_HW_INFO,                                   \
551    .has_integer_dword_mul = false,                 \
552    .gt = 1,                                        \
553    .has_llc = false,                               \
554    .has_sample_with_hiz = true,                    \
555    .num_slices = 1,                                \
556    .num_thread_per_eu = 6,                         \
557    .max_vs_threads = 112,                          \
558    .max_tcs_threads = 112,                         \
559    .max_tes_threads = 112,                         \
560    .max_gs_threads = 112,                          \
561    .max_cs_threads = 6 * 6,                        \
562    .timestamp_frequency = 19200000,                \
563    .urb = {                                        \
564       .min_entries = {                             \
565          [MESA_SHADER_VERTEX]    = 34,             \
566          [MESA_SHADER_TESS_EVAL] = 34,             \
567       },                                           \
568       .max_entries = {                             \
569          [MESA_SHADER_VERTEX]    = 704,            \
570          [MESA_SHADER_TESS_CTRL] = 256,            \
571          [MESA_SHADER_TESS_EVAL] = 416,            \
572          [MESA_SHADER_GEOMETRY]  = 256,            \
573       },                                           \
574    }
575 
/* GFX9_LP_FEATURES with three subslices of 6 EUs each. */
576 #define GFX9_LP_FEATURES_3X6                       \
577    GFX9_LP_FEATURES,                               \
578    .num_subslices = { 3, },                        \
579    .num_eu_per_subslice = 6
580 
/* GFX9_LP_FEATURES with two subslices of 6 EUs each.  The VS/TCS/TES/GS
 * thread limits and the URB max_entries are half of the GFX9_LP_FEATURES
 * values; max_cs_threads is restated with the same 6 * 6.  The .urb
 * initializer here replaces the one from GFX9_LP_FEATURES as a whole.
 */
581 #define GFX9_LP_FEATURES_2X6                       \
582    GFX9_LP_FEATURES,                               \
583    .num_subslices = { 2, },                        \
584    .num_eu_per_subslice = 6,                       \
585    .max_vs_threads = 56,                           \
586    .max_tcs_threads = 56,                          \
587    .max_tes_threads = 56,                          \
588    .max_gs_threads = 56,                           \
589    .max_cs_threads = 6 * 6,                        \
590    .urb = {                                        \
591       .min_entries = {                             \
592          [MESA_SHADER_VERTEX]    = 34,             \
593          [MESA_SHADER_TESS_EVAL] = 34,             \
594       },                                           \
595       .max_entries = {                             \
596          [MESA_SHADER_VERTEX]    = 352,            \
597          [MESA_SHADER_TESS_CTRL] = 128,            \
598          [MESA_SHADER_TESS_EVAL] = 208,            \
599          [MESA_SHADER_GEOMETRY]  = 128,            \
600       },                                           \
601    }
602 
/* Feature set used by the skl, kbl and cfl tables: GFX8_FEATURES, then
 * GFX9_HW_INFO, then has_sample_with_hiz switched on (GFX8_FEATURES sets it
 * to false).
 */
603 #define GFX9_FEATURES                               \
604    GFX8_FEATURES,                                   \
605    GFX9_HW_INFO,                                    \
606    .has_sample_with_hiz = true
607 
/* Device description for the is_skylake GT1 table: GFX9_FEATURES with one
 * slice of two subslices, 6 EUs per subslice and 2 L3 banks.  The nested
 * designator below replaces only the vertex max_entries value (1856 in
 * GFX9_HW_INFO); the other URB entries keep their GFX9_HW_INFO values.
 */
608 static const struct intel_device_info intel_device_info_skl_gt1 = {
609    GFX9_FEATURES, .gt = 1,
610    .is_skylake = true,
611    .num_slices = 1,
612    .num_subslices = { 2, },
613    .num_eu_per_subslice = 6,
614    .l3_banks = 2,
615    /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
616     * leading to some vertices to go missing if we use too much URB.
617     */
618    .urb.max_entries[MESA_SHADER_VERTEX] = 928,
619    .simulator_id = 12,
620 };
621 
/* Device description for the is_skylake GT2 table: GFX9_FEATURES with one
 * slice of three subslices, 8 EUs per subslice and 4 L3 banks.
 */
622 static const struct intel_device_info intel_device_info_skl_gt2 = {
623    GFX9_FEATURES, .gt = 2,
624    .is_skylake = true,
625    .num_slices = 1,
626    .num_subslices = { 3, },
627    .num_eu_per_subslice = 8,
628    .l3_banks = 4,
629    .simulator_id = 12,
630 };
631 
/* Device description for the is_skylake GT3 table: GFX9_FEATURES with two
 * slices of three subslices each and 8 L3 banks.
 */
632 static const struct intel_device_info intel_device_info_skl_gt3 = {
633    GFX9_FEATURES, .gt = 3,
634    .is_skylake = true,
635    .num_slices = 2,
636    .num_subslices = { 3, 3, },
637    .num_eu_per_subslice = 8,
638    .l3_banks = 8,
639    .simulator_id = 12,
640 };
641 
/* Device description for the is_skylake GT4 table: GFX9_FEATURES with three
 * slices of three subslices each and 12 L3 banks.
 */
642 static const struct intel_device_info intel_device_info_skl_gt4 = {
643    GFX9_FEATURES, .gt = 4,
644    .is_skylake = true,
645    .num_slices = 3,
646    .num_subslices = { 3, 3, 3, },
647    .num_eu_per_subslice = 8,
648    .l3_banks = 12,
649    /* From the "L3 Allocation and Programming" documentation:
650     *
651     * "URB is limited to 1008KB due to programming restrictions.  This is not a
652     * restriction of the L3 implementation, but of the FF and other clients.
653     * Therefore, in a GT4 implementation it is possible for the programmed
654     * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
655     * only 1008KB of this will be used."
656     */
   /* NOTE(review): no member of this initializer encodes the 1008KB limit
    * quoted above; confirm where (or whether) it is applied.
    */
657    .simulator_id = 12,
658 };
659 
/* Device description for the is_broxton 3x6 table: GFX9_LP_FEATURES_3X6
 * with 2 L3 banks.
 */
660 static const struct intel_device_info intel_device_info_bxt = {
661    GFX9_LP_FEATURES_3X6,
662    .is_broxton = true,
663    .l3_banks = 2,
664    .simulator_id = 14,
665 };
666 
/* Device description for the is_broxton 2x6 table: GFX9_LP_FEATURES_2X6
 * with 1 L3 bank.
 */
667 static const struct intel_device_info intel_device_info_bxt_2x6 = {
668    GFX9_LP_FEATURES_2X6,
669    .is_broxton = true,
670    .l3_banks = 1,
671    .simulator_id = 14,
672 };
673 /*
674  * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
675  * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
676  */
677 
/* Device description for the is_kabylake GT1 table: GFX9_FEATURES with one
 * slice of two subslices and 6 EUs per subslice.  It overrides
 * max_cs_threads (56 in GFX9_HW_INFO) and, through nested designators, only
 * the vertex and geometry max_entries values of the GFX9_HW_INFO URB setup.
 */
678 static const struct intel_device_info intel_device_info_kbl_gt1 = {
679    GFX9_FEATURES,
680    .is_kabylake = true,
681    .gt = 1,
682 
683    .max_cs_threads = 7 * 6,
684    .num_slices = 1,
685    .num_subslices = { 2, },
686    .num_eu_per_subslice = 6,
687    .l3_banks = 2,
688    /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
689     * leading to some vertices to go missing if we use too much URB.
690     */
691    .urb.max_entries[MESA_SHADER_VERTEX] = 928,
692    .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
693    .simulator_id = 16,
694 };
695 
/* Device description for the is_kabylake "gt1_5" table: GFX9_FEATURES with
 * one slice of three subslices, 6 EUs per subslice and 4 L3 banks.
 * NOTE(review): .gt is 1 here even though the identifier says gt1_5;
 * presumably intentional because .gt cannot hold 1.5 -- confirm.
 */
696 static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
697    GFX9_FEATURES,
698    .is_kabylake = true,
699    .gt = 1,
700 
701    .max_cs_threads = 7 * 6,
702    .num_slices = 1,
703    .num_subslices = { 3, },
704    .num_eu_per_subslice = 6,
705    .l3_banks = 4,
706    .simulator_id = 16,
707 };
708 
/* Device description for the is_kabylake GT2 table: GFX9_FEATURES with one
 * slice of three subslices, 8 EUs per subslice and 4 L3 banks.
 */
709 static const struct intel_device_info intel_device_info_kbl_gt2 = {
710    GFX9_FEATURES,
711    .is_kabylake = true,
712    .gt = 2,
713 
714    .num_slices = 1,
715    .num_subslices = { 3, },
716    .num_eu_per_subslice = 8,
717    .l3_banks = 4,
718    .simulator_id = 16,
719 };
720 
/* Device description for the is_kabylake GT3 table: GFX9_FEATURES with two
 * slices of three subslices each and 8 L3 banks.
 */
721 static const struct intel_device_info intel_device_info_kbl_gt3 = {
722    GFX9_FEATURES,
723    .is_kabylake = true,
724    .gt = 3,
725 
726    .num_slices = 2,
727    .num_subslices = { 3, 3, },
728    .num_eu_per_subslice = 8,
729    .l3_banks = 8,
730    .simulator_id = 16,
731 };
732 
/* Device description for the is_kabylake GT4 table: GFX9_FEATURES with
 * three slices of three subslices each and 12 L3 banks.
 */
733 static const struct intel_device_info intel_device_info_kbl_gt4 = {
734    GFX9_FEATURES,
735    .is_kabylake = true,
736    .gt = 4,
737 
738    /*
739     * From the "L3 Allocation and Programming" documentation:
740     *
741     * "URB is limited to 1008KB due to programming restrictions.  This
742     *  is not a restriction of the L3 implementation, but of the FF and
743     *  other clients.  Therefore, in a GT4 implementation it is
744     *  possible for the programmed allocation of the L3 data array to
745     *  provide 3*384KB=1152KB for URB, but only 1008KB of this
746     *  will be used."
747     */
   /* NOTE(review): no member of this initializer encodes the 1008KB limit
    * quoted above; confirm where (or whether) it is applied.
    */
748    .num_slices = 3,
749    .num_subslices = { 3, 3, 3, },
750    .num_eu_per_subslice = 8,
751    .l3_banks = 12,
752    .simulator_id = 16,
753 };
754 
/* Device description for the is_geminilake 3x6 table: GFX9_LP_FEATURES_3X6
 * with 2 L3 banks.
 */
755 static const struct intel_device_info intel_device_info_glk = {
756    GFX9_LP_FEATURES_3X6,
757    .is_geminilake = true,
758    .l3_banks = 2,
759    .simulator_id = 17,
760 };
761 
/* Device description for the is_geminilake 2x6 table: GFX9_LP_FEATURES_2X6
 * with 2 L3 banks (the bxt 2x6 table uses 1).
 */
762 static const struct intel_device_info intel_device_info_glk_2x6 = {
763    GFX9_LP_FEATURES_2X6,
764    .is_geminilake = true,
765    .l3_banks = 2,
766    .simulator_id = 17,
767 };
768 
/* Device description for the is_coffeelake GT1 table: GFX9_FEATURES with
 * one slice of two subslices and 6 EUs per subslice.  The nested
 * designators replace only the vertex and geometry max_entries values of
 * the GFX9_HW_INFO URB setup.
 */
769 static const struct intel_device_info intel_device_info_cfl_gt1 = {
770    GFX9_FEATURES,
771    .is_coffeelake = true,
772    .gt = 1,
773 
774    .num_slices = 1,
775    .num_subslices = { 2, },
776    .num_eu_per_subslice = 6,
777    .l3_banks = 2,
778    /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
779     * leading to some vertices to go missing if we use too much URB.
780     */
781    .urb.max_entries[MESA_SHADER_VERTEX] = 928,
782    .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
783    .simulator_id = 24,
784 };
/* Device description for the is_coffeelake GT2 table: GFX9_FEATURES with
 * one slice of three subslices, 8 EUs per subslice and 4 L3 banks.
 */
785 static const struct intel_device_info intel_device_info_cfl_gt2 = {
786    GFX9_FEATURES,
787    .is_coffeelake = true,
788    .gt = 2,
789 
790    .num_slices = 1,
791    .num_subslices = { 3, },
792    .num_eu_per_subslice = 8,
793    .l3_banks = 4,
794    .simulator_id = 24,
795 };
796 
/* Device description for the is_coffeelake GT3 table: GFX9_FEATURES with
 * two slices of three subslices each and 8 L3 banks.
 */
797 static const struct intel_device_info intel_device_info_cfl_gt3 = {
798    GFX9_FEATURES,
799    .is_coffeelake = true,
800    .gt = 3,
801 
802    .num_slices = 2,
803    .num_subslices = { 3, 3, },
804    .num_eu_per_subslice = 8,
805    .l3_banks = 8,
806    .simulator_id = 24,
807 };
808 
/* Wraps its arguments in a brace-enclosed initializer so a comma-separated
 * list can be passed as a single argument to GFX11_FEATURES.  Uses the GNU
 * named-variadic form (args...).
 */
809 #define subslices(args...) { args, }
810 
/* Values layered on top of GFX8_FEATURES by GFX11_FEATURES: .ver = 11,
 * has_pln switched off, and the per-stage thread limits.  cs_prefetch_size
 * restates the value GFX8_FEATURES already supplies.
 */
811 #define GFX11_HW_INFO                               \
812    .ver = 11,                                       \
813    .has_pln = false,                                \
814    .max_vs_threads = 364,                           \
815    .max_gs_threads = 224,                           \
816    .max_tcs_threads = 224,                          \
817    .max_tes_threads = 364,                          \
818    .max_cs_threads = 56,                            \
819    .cs_prefetch_size = 512
820 
/* Feature set for the .ver = 11 tables.
 *
 *   _gt         value stored in .gt
 *   _slices     value stored in .num_slices
 *   _subslices  brace-enclosed initializer for .num_subslices (callers build
 *               it with subslices())
 *   _l3         value stored in .l3_banks
 *
 * Expands GFX8_FEATURES and GFX11_HW_INFO, then switches off the 64-bit
 * float/int, integer dword multiply and sample-with-hiz flags and sets
 * 8 EUs per subslice (individual tables may override that).
 */
821 #define GFX11_FEATURES(_gt, _slices, _subslices, _l3) \
822    GFX8_FEATURES,                                     \
823    GFX11_HW_INFO,                                     \
824    .has_64bit_float = false,                          \
825    .has_64bit_int = false,                            \
826    .has_integer_dword_mul = false,                    \
827    .has_sample_with_hiz = false,                      \
828    .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
829    .num_subslices = _subslices,                       \
830    .num_eu_per_subslice = 8
831 
/* min_entries / max_entries initializers meant to be expanded inside a
 * `.urb = { ... }` initializer, as the icl tables and GFX11_LP_FEATURES do.
 */
832 #define GFX11_URB_MIN_MAX_ENTRIES                     \
833    .min_entries = {                                   \
834       [MESA_SHADER_VERTEX]    = 64,                   \
835       [MESA_SHADER_TESS_EVAL] = 34,                   \
836    },                                                 \
837    .max_entries = {                                   \
838       [MESA_SHADER_VERTEX]    = 2384,                 \
839       [MESA_SHADER_TESS_CTRL] = 1032,                 \
840       [MESA_SHADER_TESS_EVAL] = 2384,                 \
841       [MESA_SHADER_GEOMETRY]  = 1032,                 \
842    }
843 
/* Device description for the icl GT2 table: .gt = 2, one slice, a
 * num_subslices initializer of { 8, } and 8 L3 banks.
 */
844 static const struct intel_device_info intel_device_info_icl_gt2 = {
845    GFX11_FEATURES(2, 1, subslices(8), 8),
846    .urb = {
847       GFX11_URB_MIN_MAX_ENTRIES,
848    },
849    .simulator_id = 19,
850 };
851 
/* Device description for the icl "gt1_5" table: .gt = 1, one slice, a
 * num_subslices initializer of { 6, } and 6 L3 banks.
 */
852 static const struct intel_device_info intel_device_info_icl_gt1_5 = {
853    GFX11_FEATURES(1, 1, subslices(6), 6),
854    .urb = {
855       GFX11_URB_MIN_MAX_ENTRIES,
856    },
857    .simulator_id = 19,
858 };
859 
/* Device description for the icl GT1 table: .gt = 1, one slice, a
 * num_subslices initializer of { 4, } and 6 L3 banks.
 */
860 static const struct intel_device_info intel_device_info_icl_gt1 = {
861    GFX11_FEATURES(1, 1, subslices(4), 6),
862    .urb = {
863       GFX11_URB_MIN_MAX_ENTRIES,
864    },
865    .simulator_id = 19,
866 };
867 
/* Device description for the icl "gt0_5" table: .gt = 1, one slice, a
 * num_subslices initializer of { 1, } and 6 L3 banks.
 */
868 static const struct intel_device_info intel_device_info_icl_gt0_5 = {
869    GFX11_FEATURES(1, 1, subslices(1), 6),
870    .urb = {
871       GFX11_URB_MIN_MAX_ENTRIES,
872    },
873    .simulator_id = 19,
874 };
875 
/* Members shared by all ehl tables, expanded after GFX11_FEATURES(...):
 * the is_elkhartlake flag, the Gfx11 URB entry limits, disable_ccs_repack
 * and simulator_id 28.
 */
876 #define GFX11_LP_FEATURES                           \
877    .is_elkhartlake = true,                          \
878    .urb = {                                         \
879       GFX11_URB_MIN_MAX_ENTRIES,                    \
880    },                                               \
881    .disable_ccs_repack = true,                      \
882    .simulator_id = 28
883 
/* ehl table with a num_subslices initializer of { 4, }; keeps the
 * num_eu_per_subslice = 8 default from GFX11_FEATURES.
 */
884 static const struct intel_device_info intel_device_info_ehl_4x8 = {
885    GFX11_FEATURES(1, 1, subslices(4), 4),
886    GFX11_LP_FEATURES,
887 };
888 
/* ehl table with a num_subslices initializer of { 4, }; overrides the
 * GFX11_FEATURES default of 8 EUs per subslice with 6.
 */
889 static const struct intel_device_info intel_device_info_ehl_4x6 = {
890    GFX11_FEATURES(1, 1, subslices(4), 4),
891    GFX11_LP_FEATURES,
892    .num_eu_per_subslice = 6,
893 };
894 
/* ehl table with a num_subslices initializer of { 4, }; overrides the
 * GFX11_FEATURES default of 8 EUs per subslice with 5.
 */
895 static const struct intel_device_info intel_device_info_ehl_4x5 = {
896    GFX11_FEATURES(1, 1, subslices(4), 4),
897    GFX11_LP_FEATURES,
898    .num_eu_per_subslice = 5,
899 };
900 
/* ehl table with a num_subslices initializer of { 4, }; overrides the
 * GFX11_FEATURES default of 8 EUs per subslice with 4.
 */
901 static const struct intel_device_info intel_device_info_ehl_4x4 = {
902    GFX11_FEATURES(1, 1, subslices(4), 4),
903    GFX11_LP_FEATURES,
904    .num_eu_per_subslice = 4,
905 };
906 
/* ehl table with a num_subslices initializer of { 2, }; keeps the
 * num_eu_per_subslice = 8 default from GFX11_FEATURES.
 */
907 static const struct intel_device_info intel_device_info_ehl_2x8 = {
908    GFX11_FEATURES(1, 1, subslices(2), 4),
909    GFX11_LP_FEATURES,
910 };
911 
/* ehl table with a num_subslices initializer of { 2, }; overrides the
 * GFX11_FEATURES default of 8 EUs per subslice with 4.
 */
912 static const struct intel_device_info intel_device_info_ehl_2x4 = {
913    GFX11_FEATURES(1, 1, subslices(2), 4),
914    GFX11_LP_FEATURES,
915    .num_eu_per_subslice =4,
916 };
917 
/* min_entries / max_entries initializers meant to be expanded inside a
 * `.urb = { ... }` initializer; GFX12_HW_INFO is the user visible here.
 * The geometry limit carries a workaround tag (Wa_14013840143).
 */
918 #define GFX12_URB_MIN_MAX_ENTRIES                   \
919    .min_entries = {                                 \
920       [MESA_SHADER_VERTEX]    = 64,                 \
921       [MESA_SHADER_TESS_EVAL] = 34,                 \
922    },                                               \
923    .max_entries = {                                 \
924       [MESA_SHADER_VERTEX]    = 3576,               \
925       [MESA_SHADER_TESS_CTRL] = 1548,               \
926       [MESA_SHADER_TESS_EVAL] = 3576,               \
927       /* Wa_14013840143 */                          \
928       [MESA_SHADER_GEOMETRY]  = 1536,               \
929    }
930 
/* Values layered on top of GFX8_FEATURES by GFX12_FEATURES: .ver = 12,
 * has_pln and has_sample_with_hiz off, has_aux_map on, the per-stage thread
 * limits and the Gfx12 URB entry limits.
 */
931 #define GFX12_HW_INFO                               \
932    .ver = 12,                                       \
933    .has_pln = false,                                \
934    .has_sample_with_hiz = false,                    \
935    .has_aux_map = true,                             \
936    .max_vs_threads = 546,                           \
937    .max_gs_threads = 336,                           \
938    .max_tcs_threads = 336,                          \
939    .max_tes_threads = 546,                          \
940    .max_cs_threads = 112, /* threads per DSS */     \
941    .urb = {                                         \
942       GFX12_URB_MIN_MAX_ENTRIES,                    \
943    }
944 
/* Base initializer list for a Gfx12 description.
 *
 * Starts from GFX8_FEATURES and GFX12_HW_INFO, then applies the Gfx12
 * overrides.  Later designated initializers win, so the order matters.
 *
 *   _gt     - value stored in .gt
 *   _slices - value stored in .num_slices
 *   _l3     - value stored in .l3_banks
 */
#define GFX12_FEATURES(_gt, _slices, _l3)              \
   GFX8_FEATURES,                                      \
   GFX12_HW_INFO,                                      \
   .has_64bit_float = false,                           \
   .has_64bit_int = false,                             \
   .has_integer_dword_mul = false,                     \
   .gt = _gt,                                          \
   .num_slices = _slices,                              \
   .l3_banks = _l3,                                    \
   .simulator_id = 22,                                 \
   .num_eu_per_subslice = 16,                          \
   .cs_prefetch_size = 512
955 
/* Brace-wraps its arguments so they can initialize .num_subslices. */
#define dual_subslices(...) { __VA_ARGS__, }

/* GT0.5 configuration: gt 1, one slice, 4 L3 banks, one dual subslice. */
#define GFX12_GT05_FEATURES                            \
   GFX12_FEATURES(1, 1, 4),                            \
   .num_subslices = dual_subslices(1)

/* GT1/GT2 configuration with one slice.  A _gt of 1 selects 4 L3 banks and
 * 2 dual subslices; any other value selects 8 L3 banks and 6 dual subslices.
 */
#define GFX12_GT_FEATURES(_gt)                         \
   GFX12_FEATURES(_gt, 1, (_gt) == 1 ? 4 : 8),         \
   .num_subslices = dual_subslices((_gt) == 1 ? 2 : 6)
965 
966 static const struct intel_device_info intel_device_info_tgl_gt1 = {
967    GFX12_GT_FEATURES(1),
968    .is_tigerlake = true,
969 };
970 
971 static const struct intel_device_info intel_device_info_tgl_gt2 = {
972    GFX12_GT_FEATURES(2),
973    .is_tigerlake = true,
974 };
975 
976 static const struct intel_device_info intel_device_info_rkl_gt05 = {
977    GFX12_GT05_FEATURES,
978    .is_rocketlake = true,
979 };
980 
981 static const struct intel_device_info intel_device_info_rkl_gt1 = {
982    GFX12_GT_FEATURES(1),
983    .is_rocketlake = true,
984 };
985 
986 static const struct intel_device_info intel_device_info_adl_gt05 = {
987    GFX12_GT05_FEATURES,
988    .is_alderlake = true,
989 };
990 
991 static const struct intel_device_info intel_device_info_adl_gt1 = {
992    GFX12_GT_FEATURES(1),
993    .is_alderlake = true,
994 };
995 
996 static const struct intel_device_info intel_device_info_adl_gt2 = {
997    GFX12_GT_FEATURES(2),
998    .is_alderlake = true,
999    .display_ver = 13,
1000 };
1001 
1002 #define GFX12_DG1_SG1_FEATURES                  \
1003    GFX12_GT_FEATURES(2),                        \
1004    .is_dg1 = true,                              \
1005    .has_llc = false,                            \
1006    .has_local_mem = true,                       \
1007    .urb.size = 768,                             \
1008    .simulator_id = 30
1009 
1010 static const struct intel_device_info intel_device_info_dg1 = {
1011    GFX12_DG1_SG1_FEATURES,
1012 };
1013 
1014 static const struct intel_device_info intel_device_info_sg1 = {
1015    GFX12_DG1_SG1_FEATURES,
1016 };
1017 
1018 static void
reset_masks(struct intel_device_info * devinfo)1019 reset_masks(struct intel_device_info *devinfo)
1020 {
1021    devinfo->subslice_slice_stride = 0;
1022    devinfo->eu_subslice_stride = 0;
1023    devinfo->eu_slice_stride = 0;
1024 
1025    devinfo->num_slices = 0;
1026    devinfo->num_eu_per_subslice = 0;
1027    memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1028 
1029    memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1030    memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1031    memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1032    memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1033 }
1034 
1035 static void
update_from_topology(struct intel_device_info * devinfo,const struct drm_i915_query_topology_info * topology)1036 update_from_topology(struct intel_device_info *devinfo,
1037                      const struct drm_i915_query_topology_info *topology)
1038 {
1039    reset_masks(devinfo);
1040 
1041    assert(topology->max_slices > 0);
1042    assert(topology->max_subslices > 0);
1043    assert(topology->max_eus_per_subslice > 0);
1044 
1045    devinfo->subslice_slice_stride = topology->subslice_stride;
1046 
1047    devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
1048    devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
1049 
1050    assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
1051    memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
1052    devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1053    devinfo->max_slices = topology->max_slices;
1054    devinfo->max_subslices_per_slice = topology->max_subslices;
1055    devinfo->max_eu_per_subslice = topology->max_eus_per_subslice;
1056 
1057    uint32_t subslice_mask_len =
1058       topology->max_slices * topology->subslice_stride;
1059    assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
1060    memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
1061           subslice_mask_len);
1062 
1063    uint32_t n_subslices = 0;
1064    for (int s = 0; s < topology->max_slices; s++) {
1065       if ((devinfo->slice_masks & (1 << s)) == 0)
1066          continue;
1067 
1068       for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1069          devinfo->num_subslices[s] +=
1070             __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1071       }
1072       n_subslices += devinfo->num_subslices[s];
1073    }
1074    assert(n_subslices > 0);
1075 
1076    if (devinfo->ver >= 11) {
1077       /* On current ICL+ hardware we only have one slice. */
1078       assert(devinfo->slice_masks == 1);
1079 
1080       /* Count the number of subslices on each pixel pipe. Assume that every
1081        * contiguous group of 4 subslices in the mask belong to the same pixel
1082        * pipe.  However note that on TGL the kernel returns a mask of enabled
1083        * *dual* subslices instead of actual subslices somewhat confusingly, so
1084        * each pixel pipe only takes 2 bits in the mask even though it's still
1085        * 4 subslices.
1086        */
1087       const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1088       for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1089          const unsigned ppipe_mask = BITFIELD_RANGE(p * ppipe_bits, ppipe_bits);
1090          devinfo->ppipe_subslices[p] =
1091             __builtin_popcount(devinfo->subslice_masks[0] & ppipe_mask);
1092       }
1093    }
1094 
1095    if (devinfo->ver == 12 && devinfo->num_slices == 1) {
1096       if (n_subslices >= 6) {
1097          assert(n_subslices == 6);
1098          devinfo->l3_banks = 8;
1099       } else if (n_subslices > 2) {
1100          devinfo->l3_banks = 6;
1101       } else {
1102          devinfo->l3_banks = 4;
1103       }
1104    }
1105 
1106    uint32_t eu_mask_len =
1107       topology->eu_stride * topology->max_subslices * topology->max_slices;
1108    assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
1109    memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
1110 
1111    uint32_t n_eus = 0;
1112    for (int b = 0; b < eu_mask_len; b++)
1113       n_eus += __builtin_popcount(devinfo->eu_masks[b]);
1114 
1115 #ifdef __DragonFly__
1116    /* XXX avoid SIGFPE on divzero */
1117    if (n_subslices == 0)
1118    devinfo->num_eu_per_subslice = 0;
1119    else
1120 #endif
1121    devinfo->num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1122 }
1123 
1124 /* Generate detailed mask from the I915_PARAM_SLICE_MASK,
1125  * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
1126  */
1127 static bool
update_from_masks(struct intel_device_info * devinfo,uint32_t slice_mask,uint32_t subslice_mask,uint32_t n_eus)1128 update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
1129                   uint32_t subslice_mask, uint32_t n_eus)
1130 {
1131    struct drm_i915_query_topology_info *topology;
1132 
1133    assert((slice_mask & 0xff) == slice_mask);
1134 
1135    size_t data_length = 100;
1136 
1137    topology = calloc(1, sizeof(*topology) + data_length);
1138    if (!topology)
1139       return false;
1140 
1141    topology->max_slices = util_last_bit(slice_mask);
1142    topology->max_subslices = util_last_bit(subslice_mask);
1143 
1144    topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
1145    topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
1146 
1147    uint32_t n_subslices = __builtin_popcount(slice_mask) *
1148       __builtin_popcount(subslice_mask);
1149    uint32_t num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1150    uint32_t eu_mask = (1U << num_eu_per_subslice) - 1;
1151 
1152    topology->max_eus_per_subslice = num_eu_per_subslice;
1153    topology->eu_offset = topology->subslice_offset +
1154       topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
1155    topology->eu_stride = DIV_ROUND_UP(num_eu_per_subslice, 8);
1156 
1157    /* Set slice mask in topology */
1158    for (int b = 0; b < topology->subslice_offset; b++)
1159       topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
1160 
1161    for (int s = 0; s < topology->max_slices; s++) {
1162 
1163       /* Set subslice mask in topology */
1164       for (int b = 0; b < topology->subslice_stride; b++) {
1165          int subslice_offset = topology->subslice_offset +
1166             s * topology->subslice_stride + b;
1167 
1168          topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
1169       }
1170 
1171       /* Set eu mask in topology */
1172       for (int ss = 0; ss < topology->max_subslices; ss++) {
1173          for (int b = 0; b < topology->eu_stride; b++) {
1174             int eu_offset = topology->eu_offset +
1175                (s * topology->max_subslices + ss) * topology->eu_stride + b;
1176 
1177             topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
1178          }
1179       }
1180    }
1181 
1182    update_from_topology(devinfo, topology);
1183    free(topology);
1184 
1185    return true;
1186 }
1187 
1188 /* Generate mask from the device data. */
1189 static void
fill_masks(struct intel_device_info * devinfo)1190 fill_masks(struct intel_device_info *devinfo)
1191 {
1192    /* All of our internal device descriptions assign the same number of
1193     * subslices for each slice. Just verify that this is true.
1194     */
1195    for (int s = 1; s < devinfo->num_slices; s++)
1196       assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1197 
1198    update_from_masks(devinfo,
1199                      (1U << devinfo->num_slices) - 1,
1200                      (1U << devinfo->num_subslices[0]) - 1,
1201                      devinfo->num_slices * devinfo->num_subslices[0] *
1202                      devinfo->num_eu_per_subslice);
1203 }
1204 
1205 static bool
getparam(int fd,uint32_t param,int * value)1206 getparam(int fd, uint32_t param, int *value)
1207 {
1208    int tmp;
1209 
1210    struct drm_i915_getparam gp = {
1211       .param = param,
1212       .value = &tmp,
1213    };
1214 
1215    int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
1216    if (ret != 0)
1217       return false;
1218 
1219    *value = tmp;
1220    return true;
1221 }
1222 
1223 static void
update_cs_workgroup_threads(struct intel_device_info * devinfo)1224 update_cs_workgroup_threads(struct intel_device_info *devinfo)
1225 {
1226    /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1227     * can program is 64 without going up to a rectangular group. This only
1228     * impacts Haswell and TGL which have higher thread counts.
1229     *
1230     * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1231     * is 10 bits so we have no such restrictions.
1232     */
1233    devinfo->max_cs_workgroup_threads =
1234       devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1235                                MIN2(devinfo->max_cs_threads, 64);
1236 }
1237 
1238 bool
intel_get_device_info_from_pci_id(int pci_id,struct intel_device_info * devinfo)1239 intel_get_device_info_from_pci_id(int pci_id,
1240                                   struct intel_device_info *devinfo)
1241 {
1242    switch (pci_id) {
1243 #undef CHIPSET
1244 #define CHIPSET(id, family, fam_str, name) \
1245       case id: *devinfo = intel_device_info_##family; break;
1246 #include "pci_ids/i965_pci_ids.h"
1247 #include "pci_ids/iris_pci_ids.h"
1248 
1249 #undef CHIPSET
1250 #define CHIPSET(id, fam_str, name) \
1251       case id: *devinfo = intel_device_info_gfx3; break;
1252 #include "pci_ids/i915_pci_ids.h"
1253 
1254    default:
1255       mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
1256       return false;
1257    }
1258 
1259    switch (pci_id) {
1260 #undef CHIPSET
1261 #define CHIPSET(_id, _family, _fam_str, _name) \
1262    case _id: \
1263       /* sizeof(str_literal) includes the null */ \
1264       STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
1265                     sizeof(devinfo->name)); \
1266       strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
1267       break;
1268 #include "pci_ids/i965_pci_ids.h"
1269 #include "pci_ids/iris_pci_ids.h"
1270    default:
1271       strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
1272    }
1273 
1274    fill_masks(devinfo);
1275 
1276    /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
1277     *
1278     * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
1279     *  allocate scratch space enough so that each slice has 4 slices allowed."
1280     *
1281     * The equivalent internal documentation says that this programming note
1282     * applies to all Gfx9+ platforms.
1283     *
1284     * The hardware typically calculates the scratch space pointer by taking
1285     * the base address, and adding per-thread-scratch-space * thread ID.
1286     * Extra padding can be necessary depending how the thread IDs are
1287     * calculated for a particular shader stage.
1288     */
1289 
1290    switch(devinfo->ver) {
1291    case 9:
1292       devinfo->max_wm_threads = 64 /* threads-per-PSD */
1293                               * devinfo->num_slices
1294                               * 4; /* effective subslices per slice */
1295       break;
1296    case 11:
1297    case 12:
1298       devinfo->max_wm_threads = 128 /* threads-per-PSD */
1299                               * devinfo->num_slices
1300                               * 8; /* subslices per slice */
1301       break;
1302    default:
1303       assert(devinfo->ver < 9);
1304       break;
1305    }
1306 
1307    assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));
1308 
1309    if (devinfo->verx10 == 0)
1310       devinfo->verx10 = devinfo->ver * 10;
1311 
1312    if (devinfo->display_ver == 0)
1313       devinfo->display_ver = devinfo->ver;
1314 
1315    update_cs_workgroup_threads(devinfo);
1316 
1317    devinfo->chipset_id = pci_id;
1318    return true;
1319 }
1320 
1321 /**
1322  * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
1323  * (kernel 4.13+)
1324  */
1325 static bool
getparam_topology(struct intel_device_info * devinfo,int fd)1326 getparam_topology(struct intel_device_info *devinfo, int fd)
1327 {
1328    int slice_mask = 0;
1329    if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
1330       goto maybe_warn;
1331 
1332    int n_eus;
1333    if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
1334       goto maybe_warn;
1335 
1336    int subslice_mask = 0;
1337    if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
1338       goto maybe_warn;
1339 
1340    return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
1341 
1342  maybe_warn:
1343    /* Only with Gfx8+ are we starting to see devices with fusing that can only
1344     * be detected at runtime.
1345     */
1346    if (devinfo->ver >= 8)
1347       mesa_logw("Kernel 4.1 required to properly query GPU properties.");
1348 
1349    return false;
1350 }
1351 
1352 /**
1353  * preferred API for updating the topology in devinfo (kernel 4.17+)
1354  */
1355 static bool
query_topology(struct intel_device_info * devinfo,int fd)1356 query_topology(struct intel_device_info *devinfo, int fd)
1357 {
1358    struct drm_i915_query_topology_info *topo_info =
1359       intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO);
1360    if (topo_info == NULL)
1361       return false;
1362 
1363    update_from_topology(devinfo, topo_info);
1364 
1365    free(topo_info);
1366 
1367    return true;
1368 
1369 }
1370 
1371 int
intel_get_aperture_size(int fd,uint64_t * size)1372 intel_get_aperture_size(int fd, uint64_t *size)
1373 {
1374    struct drm_i915_gem_get_aperture aperture = { 0 };
1375 
1376    int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
1377    if (ret == 0 && size)
1378       *size = aperture.aper_size;
1379 
1380    return ret;
1381 }
1382 
1383 static bool
has_get_tiling(int fd)1384 has_get_tiling(int fd)
1385 {
1386    int ret;
1387 
1388    struct drm_i915_gem_create gem_create = {
1389       .size = 4096,
1390    };
1391 
1392    if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1393       unreachable("Failed to create GEM BO");
1394       return false;
1395    }
1396 
1397    struct drm_i915_gem_get_tiling get_tiling = {
1398       .handle = gem_create.handle,
1399    };
1400    ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling);
1401 
1402    struct drm_gem_close close = {
1403       .handle = gem_create.handle,
1404    };
1405    intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1406 
1407    return ret == 0;
1408 }
1409 
1410 static void
fixup_chv_device_info(struct intel_device_info * devinfo)1411 fixup_chv_device_info(struct intel_device_info *devinfo)
1412 {
1413    assert(devinfo->is_cherryview);
1414 
1415    /* Cherryview is annoying.  The number of EUs is depending on fusing and
1416     * isn't determinable from the PCI ID alone.  We default to the minimum
1417     * available for that PCI ID and then compute the real value from the
1418     * subslice information we get from the kernel.
1419     */
1420    const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
1421    const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1422 
1423    /* Logical CS threads = EUs per subslice * num threads per EU */
1424    uint32_t max_cs_threads =
1425       eu_total / subslice_total * devinfo->num_thread_per_eu;
1426 
1427    /* Fuse configurations may give more threads than expected, never less. */
1428    if (max_cs_threads > devinfo->max_cs_threads)
1429       devinfo->max_cs_threads = max_cs_threads;
1430 
1431    update_cs_workgroup_threads(devinfo);
1432 
1433    /* Braswell is even more annoying.  Its marketing name isn't determinable
1434     * from the PCI ID and is also dependent on fusing.
1435     */
1436    if (devinfo->chipset_id != 0x22B1)
1437       return;
1438 
1439    char *bsw_model;
1440    switch (eu_total) {
1441    case 16: bsw_model = "405"; break;
1442    case 12: bsw_model = "400"; break;
1443    default: bsw_model = "   "; break;
1444    }
1445 
1446    char *needle = strstr(devinfo->name, "XXX");
1447    assert(needle);
1448    if (needle)
1449       memcpy(needle, bsw_model, 3);
1450 }
1451 
1452 static void
init_max_scratch_ids(struct intel_device_info * devinfo)1453 init_max_scratch_ids(struct intel_device_info *devinfo)
1454 {
1455    /* Determine the max number of subslices that potentially might be used in
1456     * scratch space ids.
1457     *
1458     * For, Gfx11+, scratch space allocation is based on the number of threads
1459     * in the base configuration.
1460     *
1461     * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1462     * we wish to view that there are 4 subslices per slice instead of the
1463     * actual number of subslices per slice. The documentation for 3DSTATE_PS
1464     * "Scratch Space Base Pointer" says:
1465     *
1466     *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
1467     *     must allocate scratch space enough so that each slice has 4
1468     *     slices allowed."
1469     *
1470     * According to the other driver team, this applies to compute shaders
1471     * as well.  This is not currently documented at all.
1472     *
1473     * For Gfx8 and older we user devinfo->subslice_total.
1474     */
1475    unsigned subslices;
1476    if (devinfo->verx10 == 125)
1477       subslices = 32;
1478    else if (devinfo->ver == 12)
1479       subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
1480    else if (devinfo->ver == 11)
1481       subslices = 8;
1482    else if (devinfo->ver >= 9 && devinfo->ver < 11)
1483       subslices = 4 * devinfo->num_slices;
1484    else
1485       subslices = devinfo->subslice_total;
1486    assert(subslices >= devinfo->subslice_total);
1487 
1488    unsigned scratch_ids_per_subslice;
1489    if (devinfo->ver >= 12) {
1490       /* Same as ICL below, but with 16 EUs. */
1491       scratch_ids_per_subslice = 16 * 8;
1492    } else if (devinfo->ver >= 11) {
1493       /* The MEDIA_VFE_STATE docs say:
1494        *
1495        *    "Starting with this configuration, the Maximum Number of
1496        *     Threads must be set to (#EU * 8) for GPGPU dispatches.
1497        *
1498        *     Although there are only 7 threads per EU in the configuration,
1499        *     the FFTID is calculated as if there are 8 threads per EU,
1500        *     which in turn requires a larger amount of Scratch Space to be
1501        *     allocated by the driver."
1502        */
1503       scratch_ids_per_subslice = 8 * 8;
1504    } else if (devinfo->is_haswell) {
1505       /* WaCSScratchSize:hsw
1506        *
1507        * Haswell's scratch space address calculation appears to be sparse
1508        * rather than tightly packed. The Thread ID has bits indicating
1509        * which subslice, EU within a subslice, and thread within an EU it
1510        * is. There's a maximum of two slices and two subslices, so these
1511        * can be stored with a single bit. Even though there are only 10 EUs
1512        * per subslice, this is stored in 4 bits, so there's an effective
1513        * maximum value of 16 EUs. Similarly, although there are only 7
1514        * threads per EU, this is stored in a 3 bit number, giving an
1515        * effective maximum value of 8 threads per EU.
1516        *
1517        * This means that we need to use 16 * 8 instead of 10 * 7 for the
1518        * number of threads per subslice.
1519        */
1520       scratch_ids_per_subslice = 16 * 8;
1521    } else if (devinfo->is_cherryview) {
1522       /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1523        * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1524        * as if it had 8 EUs.
1525        */
1526       scratch_ids_per_subslice = 8 * 7;
1527    } else {
1528       scratch_ids_per_subslice = devinfo->max_cs_threads;
1529    }
1530 
1531    unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1532 
1533    if (devinfo->verx10 >= 125) {
1534       /* On GFX version 12.5, scratch access changed to a surface-based model.
1535        * Instead of each shader type having its own layout based on IDs passed
1536        * from the relevant fixed-function unit, all scratch access is based on
1537        * thread IDs like it always has been for compute.
1538        */
1539       for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1540          devinfo->max_scratch_ids[i] = max_thread_ids;
1541    } else {
1542       unsigned max_scratch_ids[] = {
1543          [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
1544          [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1545          [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1546          [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
1547          [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
1548          [MESA_SHADER_COMPUTE]   = max_thread_ids,
1549       };
1550       STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1551       memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1552              sizeof(devinfo->max_scratch_ids));
1553    }
1554 }
1555 
1556 bool
intel_get_device_info_from_fd(int fd,struct intel_device_info * devinfo)1557 intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
1558 {
1559    int devid = 0;
1560 
1561    const char *devid_override = getenv("INTEL_DEVID_OVERRIDE");
1562    if (devid_override && strlen(devid_override) > 0) {
1563       if (geteuid() == getuid()) {
1564          devid = intel_device_name_to_pci_device_id(devid_override);
1565          /* Fallback to PCI ID. */
1566          if (devid <= 0)
1567             devid = strtol(devid_override, NULL, 0);
1568          if (devid <= 0) {
1569             mesa_loge("Invalid INTEL_DEVID_OVERRIDE=\"%s\". "
1570                     "Use a valid numeric PCI ID or one of the supported "
1571                     "platform names:", devid_override);
1572             for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++)
1573                mesa_loge("   %s", name_map[i].name);
1574             return false;
1575          }
1576       } else {
1577          mesa_logi("Ignoring INTEL_DEVID_OVERRIDE=\"%s\" because "
1578                    "real and effective user ID don't match.", devid_override);
1579       }
1580    }
1581 
1582    if (devid > 0) {
1583       if (!intel_get_device_info_from_pci_id(devid, devinfo))
1584          return false;
1585       devinfo->no_hw = true;
1586    } else {
1587       /* query the device id */
1588       if (!getparam(fd, I915_PARAM_CHIPSET_ID, &devid))
1589          return false;
1590       if (!intel_get_device_info_from_pci_id(devid, devinfo))
1591          return false;
1592       devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);
1593    }
1594 
1595    if (devinfo->ver == 10) {
1596       mesa_loge("Gfx10 support is redacted.");
1597       return false;
1598    }
1599 
1600    /* remaining initializion queries the kernel for device info */
1601    if (devinfo->no_hw)
1602       return true;
1603 
1604    int timestamp_frequency;
1605    if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
1606                 &timestamp_frequency))
1607       devinfo->timestamp_frequency = timestamp_frequency;
1608    else if (devinfo->ver >= 10) {
1609       mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
1610       return false;
1611    }
1612 
1613    if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
1614       devinfo->revision = 0;
1615 
1616    if (!query_topology(devinfo, fd)) {
1617       if (devinfo->ver >= 10) {
1618          /* topology uAPI required for CNL+ (kernel 4.17+) */
1619          return false;
1620       }
1621 
1622       /* else use the kernel 4.13+ api for gfx8+.  For older kernels, topology
1623        * will be wrong, affecting GPU metrics. In this case, fail silently.
1624        */
1625       getparam_topology(devinfo, fd);
1626    }
1627 
1628    if (devinfo->is_cherryview)
1629       fixup_chv_device_info(devinfo);
1630 
1631    intel_get_aperture_size(fd, &devinfo->aperture_bytes);
1632    devinfo->has_tiling_uapi = has_get_tiling(fd);
1633 
1634    devinfo->subslice_total = 0;
1635    for (uint32_t i = 0; i < devinfo->max_slices; i++)
1636       devinfo->subslice_total += __builtin_popcount(devinfo->subslice_masks[i]);
1637 
1638    /* Gfx7 and older do not support EU/Subslice info */
1639    assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
1640    devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
1641 
1642    init_max_scratch_ids(devinfo);
1643 
1644    return true;
1645 }
1646