/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef INTEL_DEVICE_INFO_H
#define INTEL_DEVICE_INFO_H

#include <stdbool.h>
#include <stdint.h>

#include "util/macros.h"
#include "compiler/shader_enums.h"

#ifdef __cplusplus
extern "C" {
#endif

struct drm_i915_query_topology_info;

#define INTEL_DEVICE_MAX_NAME_SIZE        64
#define INTEL_DEVICE_MAX_SLICES           8
#define INTEL_DEVICE_MAX_SUBSLICES        (8)  /* Maximum on gfx11 */
#define INTEL_DEVICE_MAX_EUS_PER_SUBSLICE (16) /* Maximum on gfx12 */
#define INTEL_DEVICE_MAX_PIXEL_PIPES      (16) /* Maximum on DG2 */

#define INTEL_PLATFORM_GROUP_START(group, new_enum) \
   new_enum, INTEL_PLATFORM_ ## group ## _START = new_enum
#define INTEL_PLATFORM_GROUP_END(group, new_enum) \
   new_enum, INTEL_PLATFORM_ ## group ## _END = new_enum

enum intel_platform {
   INTEL_PLATFORM_GFX3 = 1,
   INTEL_PLATFORM_I965,
   INTEL_PLATFORM_ILK,
   INTEL_PLATFORM_G4X,
   INTEL_PLATFORM_SNB,
   INTEL_PLATFORM_IVB,
   INTEL_PLATFORM_BYT,
   INTEL_PLATFORM_HSW,
   INTEL_PLATFORM_BDW,
   INTEL_PLATFORM_CHV,
   INTEL_PLATFORM_SKL,
   INTEL_PLATFORM_BXT,
   INTEL_PLATFORM_KBL,
   INTEL_PLATFORM_GLK,
   INTEL_PLATFORM_CFL,
   INTEL_PLATFORM_ICL,
   INTEL_PLATFORM_EHL,
   INTEL_PLATFORM_TGL,
   INTEL_PLATFORM_RKL,
   INTEL_PLATFORM_DG1,
   INTEL_PLATFORM_ADL,
   INTEL_PLATFORM_RPL,
   INTEL_PLATFORM_GROUP_START(DG2, INTEL_PLATFORM_DG2_G10),
   INTEL_PLATFORM_GROUP_END(DG2, INTEL_PLATFORM_DG2_G11),
};

#undef INTEL_PLATFORM_GROUP_START
#undef INTEL_PLATFORM_GROUP_END

#define intel_platform_in_range(platform, platform_range) \
   (((platform) >= INTEL_PLATFORM_ ## platform_range ## _START) && \
    ((platform) <= INTEL_PLATFORM_ ## platform_range ## _END))

#define intel_device_info_is_dg2(devinfo) \
   intel_platform_in_range((devinfo)->platform, DG2)

/**
 * Intel hardware information and quirks
 */
struct intel_device_info
{
   /* Driver internal numbers used to differentiate platforms. */
   int ver;
   int verx10;
   int display_ver;

   /**
    * This revision comes from the I915_PARAM_REVISION ioctl, unlike
    * pci_revision_id, which comes from the DRM device. The two values are
    * not always the same.
    */
   int revision;
   int gt;

   /* PCI info */
   uint16_t pci_domain;
   uint8_t pci_bus;
   uint8_t pci_dev;
   uint8_t pci_func;
   uint16_t pci_device_id;
   uint8_t pci_revision_id;

   enum intel_platform platform;

   bool has_hiz_and_separate_stencil;
   bool must_use_separate_stencil;
   bool has_sample_with_hiz;
   bool has_bit6_swizzle;
   bool has_llc;

   bool has_pln;
   bool has_64bit_float;
   bool has_64bit_int;
   bool has_integer_dword_mul;
   bool has_compr4;
   bool has_surface_tile_offset;
   bool supports_simd16_3src;
   bool disable_ccs_repack;

   /**
    * True if CCS uses a flat virtual address translation to a memory
    * carve-out, rather than aux map translations, or additional surfaces.
    */
   bool has_flat_ccs;
   bool has_aux_map;
   bool has_tiling_uapi;
   bool has_ray_tracing;
   bool has_ray_query;
   bool has_local_mem;
   bool has_lsc;
   bool has_mesh_shading;

   /**
    * \name Intel hardware quirks
    *  @{
    */
   bool has_negative_rhw_bug;

   /**
    * Whether this platform supports fragment shading rate controlled by a
    * primitive in geometry shaders and by a control buffer.
    */
   bool has_coarse_pixel_primitive_and_cb;

   /**
    * Some versions of Gen hardware don't do centroid interpolation correctly
    * on unlit pixels, causing incorrect values for derivatives near triangle
    * edges.  Enabling this flag causes the fragment shader to use
    * non-centroid interpolation for unlit pixels, at the expense of two extra
    * fragment shader instructions.
    */
   bool needs_unlit_centroid_workaround;
   /** @} */

   /**
    * \name GPU hardware limits
    *
    * In general, you can find shader thread maximums by looking at the "Maximum
    * Number of Threads" field in the Intel PRM description of the 3DSTATE_VS,
    * 3DSTATE_GS, 3DSTATE_HS, 3DSTATE_DS, and 3DSTATE_PS commands. URB entry
    * limits come from the "Number of URB Entries" field in the
    * 3DSTATE_URB_VS command and friends.
    *
    * These fields are used to calculate the scratch space to allocate.  The
    * amount of scratch space can be larger without being harmful on modern
    * GPUs; however, prior to Haswell, programming the maximum number of threads
    * to greater than the hardware maximum would cause GPU performance to tank.
    *
    *  @{
    */
   /**
    * Total number of slices present on the device whether or not they've been
    * fused off.
    *
    * XXX: CS thread counts are limited by the inability to do cross-subslice
    * communication. It is effectively the number of logical threads which
    * can be executed in a subslice. Fuse configurations may cause this number
    * to change, so we program @max_cs_threads as the lower maximum.
    */
   unsigned num_slices;

   /**
    * Maximum number of slices present on this device (can be more than
    * num_slices if some slices are fused).
    */
   unsigned max_slices;

   /**
    * Number of subslices for each slice (this was uniform across slices
    * until CNL).
    */
   unsigned num_subslices[INTEL_DEVICE_MAX_SLICES];

   /**
    * Maximum number of subslices per slice present on this device (can be
    * more than the maximum value in the num_subslices[] array if some
    * subslices are fused).
    */
   unsigned max_subslices_per_slice;

   /**
    * Number of subslices on each pixel pipe (ICL).
    */
   unsigned ppipe_subslices[INTEL_DEVICE_MAX_PIXEL_PIPES];

   /**
    * Maximum number of EUs per subslice (some EUs can be fused off).
    */
   unsigned max_eus_per_subslice;

   /**
    * Number of threads per EU; varies between 4 and 8 across generations.
    */
   unsigned num_thread_per_eu;

   /**
    * A bit mask of the slices available.
    */
   uint8_t slice_masks;

   /**
    * An array of bit masks of the available subslices; use
    * subslice_slice_stride to index this array.
    */
   uint8_t subslice_masks[INTEL_DEVICE_MAX_SLICES *
                          DIV_ROUND_UP(INTEL_DEVICE_MAX_SUBSLICES, 8)];

   /**
    * The number of enabled subslices (considering fusing). For exactly which
    * subslices are enabled, see subslice_masks[].
    */
   unsigned subslice_total;

   /**
    * An array of bit masks of the available EUs; use eu_slice_stride &
    * eu_subslice_stride to index this array.
    */
   uint8_t eu_masks[INTEL_DEVICE_MAX_SLICES *
                    INTEL_DEVICE_MAX_SUBSLICES *
                    DIV_ROUND_UP(INTEL_DEVICE_MAX_EUS_PER_SUBSLICE, 8)];

   /**
    * Stride to access subslice_masks[].
    */
   uint16_t subslice_slice_stride;

   /**
    * Strides to access eu_masks[].
    */
   uint16_t eu_slice_stride;
   uint16_t eu_subslice_stride;

   unsigned l3_banks;
   unsigned max_vs_threads;   /**< Maximum Vertex Shader threads */
   unsigned max_tcs_threads;  /**< Maximum Hull Shader threads */
   unsigned max_tes_threads;  /**< Maximum Domain Shader threads */
   unsigned max_gs_threads;   /**< Maximum Geometry Shader threads. */
   /**
    * Theoretical maximum number of Pixel Shader threads.
    *
    * PSD means Pixel Shader Dispatcher. On modern Intel GPUs, hardware will
    * automatically scale pixel shader thread count, based on a single value
    * programmed into 3DSTATE_PS.
    *
    * To calculate the maximum number of threads for Gfx8 and beyond (which
    * have multiple Pixel Shader Dispatchers):
    *
    * - Look up 3DSTATE_PS and find "Maximum Number of Threads Per PSD"
    * - Usually there's only one PSD per subslice, so use the number of
    *   subslices for the number of PSDs.
    * - For max_wm_threads, the total should be PSD threads * #PSDs.
    *
    * An illustrative sketch of this calculation appears after this struct.
    */
   unsigned max_wm_threads;

   unsigned max_threads_per_psd;

   /**
    * Maximum Compute Shader threads.
    *
    * Thread count * number of EUs per subslice.
    */
   unsigned max_cs_threads;

   /**
    * Maximum number of threads per workgroup supported by the GPGPU_WALKER or
    * COMPUTE_WALKER command.
    *
    * This may be smaller than max_cs_threads as it takes into account added
    * restrictions on the GPGPU/COMPUTE_WALKER commands.  While max_cs_threads
    * expresses the total parallelism of the GPU, this expresses the maximum
    * number of threads we can dispatch in a single workgroup.
    */
   unsigned max_cs_workgroup_threads;

   /**
    * The maximum number of potential scratch ids. Due to hardware
    * implementation details, the range of scratch ids may be larger than the
    * number of subslices.
    */
   unsigned max_scratch_ids[MESA_SHADER_STAGES];

   struct {
      /**
       * Fixed size of the URB.
       *
       * On Gfx6 and DG1, this is measured in KB.  Gfx4-5 instead measure
       * this in 512b blocks, as that's more convenient there.
       *
       * On most Gfx7+ platforms, the URB is a section of the L3 cache,
       * and can be resized based on the L3 programming.  For those platforms,
       * simply leave this field blank (zero) - it isn't used.
       */
      unsigned size;

      /**
       * The minimum number of URB entries.  See the 3DSTATE_URB_<XS> docs.
       */
      unsigned min_entries[4];

      /**
       * The maximum number of URB entries.  See the 3DSTATE_URB_<XS> docs.
       */
      unsigned max_entries[4];
   } urb;

   /* Maximum size in KB that can be allocated to constants in the URB; this
    * is usually divided among the stages for implementing push constants.
    * See 3DSTATE_PUSH_CONSTANT_ALLOC_*.
    */
   unsigned max_constant_urb_size_kb;

   /**
    * Size of the command streamer prefetch. This is important to know for
    * self-modifying batches.
    */
   unsigned cs_prefetch_size;

   /**
    * For the longest time the timestamp frequency for Gen's timestamp counter
    * could be assumed to be 12.5MHz, where the least significant bit neatly
    * corresponded to 80 nanoseconds.
    *
    * Since Gfx9 the numbers aren't so round, with a frequency of 12MHz for
    * SKL (or a scale factor of 83.33333333) and a frequency of 19200000Hz for
    * BXT.
    *
    * For simplicity, to fit with the current code that maps raw timestamps to
    * nanoseconds by scaling with a single constant, we now do the conversion
    * in floating point instead of integer arithmetic.
    *
    * In general it's probably worth noting that the documented constants we
    * have for the per-platform timestamp frequencies aren't perfect and
    * shouldn't be trusted for scaling and comparing timestamps with a large
    * delta.
    *
    * E.g. with crude testing on my system using the 'correct' scale factor I'm
    * seeing a drift of ~2 milliseconds per second.
    */
   uint64_t timestamp_frequency;

   uint64_t aperture_bytes;
   uint64_t gtt_size;

   /**
    * ID to put into the .aub files.
    */
   int simulator_id;

   /**
    * Holds the name of the device.
    */
   char name[INTEL_DEVICE_MAX_NAME_SIZE];

   /**
    * no_hw is true when the pci_device_id has been overridden.
    */
   bool no_hw;

   /**
    * apply_hwconfig is true when the platform should apply hwconfig values.
    */
   bool apply_hwconfig;
   /** @} */
};

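/*
 * Illustrative sketches only, not part of the driver API: a minimal example
 * of how the thread maximums documented above relate to the topology fields.
 * The function names are hypothetical, and one Pixel Shader Dispatcher per
 * subslice is assumed, with the enabled-subslice count standing in for the
 * number of PSDs.
 */
static inline unsigned
example_max_wm_threads(const struct intel_device_info *devinfo)
{
   /* "Maximum Number of Threads Per PSD" times the number of PSDs. */
   return devinfo->max_threads_per_psd * devinfo->subslice_total;
}

static inline unsigned
example_max_cs_threads(const struct intel_device_info *devinfo)
{
   /* Thread count per EU times the number of EUs in a subslice. */
   return devinfo->num_thread_per_eu * devinfo->max_eus_per_subslice;
}
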
#ifdef GFX_VER

#define intel_device_info_is_9lp(devinfo) \
   (GFX_VER == 9 && ((devinfo)->platform == INTEL_PLATFORM_BXT || \
                     (devinfo)->platform == INTEL_PLATFORM_GLK))

#else

#define intel_device_info_is_9lp(devinfo) \
   ((devinfo)->platform == INTEL_PLATFORM_BXT || \
    (devinfo)->platform == INTEL_PLATFORM_GLK)

#endif

static inline bool
intel_device_info_slice_available(const struct intel_device_info *devinfo,
                                  int slice)
{
   assert(slice < INTEL_DEVICE_MAX_SLICES);
   return (devinfo->slice_masks & (1U << slice)) != 0;
}

static inline bool
intel_device_info_subslice_available(const struct intel_device_info *devinfo,
                                     int slice, int subslice)
{
   return (devinfo->subslice_masks[slice * devinfo->subslice_slice_stride +
                                   subslice / 8] & (1U << (subslice % 8))) != 0;
}

static inline bool
intel_device_info_eu_available(const struct intel_device_info *devinfo,
                               int slice, int subslice, int eu)
{
   unsigned subslice_offset = slice * devinfo->eu_slice_stride +
      subslice * devinfo->eu_subslice_stride;

   return (devinfo->eu_masks[subslice_offset + eu / 8] & (1U << eu % 8)) != 0;
}

static inline uint32_t
intel_device_info_subslice_total(const struct intel_device_info *devinfo)
{
   uint32_t total = 0;

   for (size_t i = 0; i < ARRAY_SIZE(devinfo->subslice_masks); i++) {
      total += __builtin_popcount(devinfo->subslice_masks[i]);
   }

   return total;
}

static inline uint32_t
intel_device_info_eu_total(const struct intel_device_info *devinfo)
{
   uint32_t total = 0;

   for (size_t i = 0; i < ARRAY_SIZE(devinfo->eu_masks); i++)
      total += __builtin_popcount(devinfo->eu_masks[i]);

   return total;
}

static inline unsigned
intel_device_info_num_dual_subslices(UNUSED
                                     const struct intel_device_info *devinfo)
{
   unreachable("TODO");
}

int intel_device_name_to_pci_device_id(const char *name);

static inline uint64_t
intel_device_info_timebase_scale(const struct intel_device_info *devinfo,
                                 uint64_t gpu_timestamp)
{
   /* Try to avoid overflowing 64 bits when doing the scaling */
   uint64_t upper_ts = gpu_timestamp >> 32;
   uint64_t lower_ts = gpu_timestamp & 0xffffffff;
   uint64_t upper_scaled_ts = upper_ts * 1000000000ull / devinfo->timestamp_frequency;
   uint64_t lower_scaled_ts = lower_ts * 1000000000ull / devinfo->timestamp_frequency;
   return (upper_scaled_ts << 32) + lower_scaled_ts;
}

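/*
 * Usage sketch only, with hypothetical names: converting a pair of raw GPU
 * timestamps into a nanosecond delta with the helper above, scaling each
 * timestamp independently and then subtracting.
 */
static inline uint64_t
example_gpu_delta_to_ns(const struct intel_device_info *devinfo,
                        uint64_t begin_ticks, uint64_t end_ticks)
{
   return intel_device_info_timebase_scale(devinfo, end_ticks) -
          intel_device_info_timebase_scale(devinfo, begin_ticks);
}
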
bool intel_get_device_info_from_fd(int fh, struct intel_device_info *devinfo);
bool intel_get_device_info_from_pci_id(int pci_id,
                                       struct intel_device_info *devinfo);

#ifdef __cplusplus
}
#endif

#endif /* INTEL_DEVICE_INFO_H */