1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #pragma once
9 
10 // Uncomment this macro to build "empty" schedulers
11 //#define WA_DISABLE_SCHEDULERS 1
12 
// Host-side build only: __OPENCL_VERSION__ is predefined by OpenCL C compilers,
// where uint and ulong are built-in types. When it is not defined, supply
// fixed-width aliases with the same names so the declarations below compile.
#if !defined(__OPENCL_VERSION__)
#include <cstdint>

// 32-bit and 64-bit unsigned aliases standing in for the OpenCL C built-ins.
typedef uint32_t uint;
typedef uint64_t ulong;
#endif
19 
// Command lengths / offsets counted in DWORDs (one DWORD == sizeof(uint)).
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET (11)
#define OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET (3)

#define OCLRT_SIZEOF_MI_ARB_CHECK_DWORD (1)
#define OCLRT_SIZEOF_MSFLUSH_DWORD (2)
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD (9)

// Command lengths in bytes.
#define OCLRT_SIZEOF_MI_ARB_CHECK (sizeof(uint) * 1)
#define OCLRT_SIZEOF_MEDIA_STATE_FLUSH (sizeof(uint) * 2)
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD (sizeof(uint) * 4)
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD (sizeof(uint) * 4)
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_CMD (sizeof(uint) * 9)
#define OCLRT_SIZEOF_MI_ATOMIC_CMD (sizeof(uint) * 11)

// 0x05000000 == 83886080.
// NOTE(review): presumably the raw MI_BATCH_BUFFER_END command DWORD - confirm
// against the hardware command reference.
#define OCLRT_BATCH_BUFFER_END_CMD (0x05000000)
37 
// Constant buffer (curbe) parameter identifiers and shifts
#define COMPILER_DATA_PARAMETER_GLOBAL_SURFACE (49)

#define SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT (50)
#define SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT (51)
#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT (63)

// Twice the images curbe shift, i.e. 100.
#define SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE (SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT * 2)

// 512
#define CS_PREFETCH_SIZE (64 * 8)

// One DWORD is 32 bits wide; the mask below has all of them set.
#define DWORD_SIZE_IN_BITS (32)
#define ALL_BITS_SET_DWORD_MASK (0xFFFFFFFF)

// sRGB image channel order codes
#define CL_sRGB (0x10BF)
#define CL_sRGBX (0x10C0)
#define CL_sRGBA (0x10C1)
#define CL_sBGRA (0x10C2)
56 
// The scheduler can currently spawn up to 8 GPGPU_WALKERS between scheduler runs.
// Each HW group consists of 3 HW threads and is capable of scheduling 1 block, so
// 8 * 3 HW threads are needed for scheduling blocks, plus 1 HW thread to schedule
// the next scheduler.

// !!! Keep this value equal to MAX_NUMBER_OF_PARALLEL_GPGPU_WALKERS in DeviceEnqueueInternalTypes.h
#define PARALLEL_SCHEDULER_HW_GROUPS (8)

#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP (3)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 (3)

// Threads used for scheduling blocks: groups * threads-per-group (24).
#define PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS (PARALLEL_SCHEDULER_HW_GROUPS * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP)

// All scheduler threads: the block-scheduling threads plus one more (25).
#define PARALLEL_SCHEDULER_NUMBER_HW_THREADS (1 + PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS)
67 
//parallel scheduler 2.0 is compiled in simd8
#define PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (8)

// Evaluates to 4 when bit 0x10 (16) is set in COMPILATION_SIZE, otherwise to 3.
// The argument is parenthesized so that an expression argument (e.g. `a | b`)
// is tested as a whole instead of being split by the higher precedence of `&`.
#define HW_GROUP_ID_SHIFT(COMPILATION_SIZE) (((COMPILATION_SIZE) & 0x10) ? 4 : 3)
72 
#define GRF_SIZE (32)
// Three GRFs: 96
#define SIZEOF_3GRFS (GRF_SIZE * 3)

// Estimate of the dynamic payload size: SIZEOF_3GRFS for every scheduler HW thread.
#define SCHEDULER_DYNAMIC_PAYLOAD_SIZE (SIZEOF_3GRFS * PARALLEL_SCHEDULER_NUMBER_HW_THREADS)

// Assume the max DSH per walker is 9472B: registers can take up to 4KB and the
// max dynamic payload is around 96B * 56 (HW threads), i.e. 4096 + 5376 = 9472,
// so this should be sufficient.
#define MAX_DSH_SIZE_PER_ENQUEUE (9472)
81 
#define MAX_BINDING_TABLE_INDEX (253)
// Max SSH size that one kernel can use: 253 binding table entries multiplied
// by the Surface State size (64), i.e. 16192.
#define MAX_SSH_PER_KERNEL_SIZE (MAX_BINDING_TABLE_INDEX * 64)

// Object ids below MAX_SSH_PER_KERNEL_SIZE are image ids; sampler ids start at
// MAX_SSH_PER_KERNEL_SIZE and are formed by adding that value to the argument offset.
// The argument is parenthesized so that expression arguments with operators of
// lower precedence than `+` (e.g. `a & b`, `c ? x : y`) are evaluated first.
#define OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID(ArgOffset) ((ArgOffset) + MAX_SSH_PER_KERNEL_SIZE)
#define OCLRT_IMAGE_MAX_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE - 1)
#define OCLRT_SAMPLER_MIN_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE)
88 
// Type tags for debug data entries (see DebugDataInfo::m_dataType).
// Every enumerator except DBG_MAX carries an explicit value; DBG_MAX stays
// implicit so it always follows the last enumerator.
typedef enum tagDebugDataTypes {
    DBG_DEFAULT = 0,
    DBG_COMMAND_QUEUE = 1,
    DBG_EVENTS_UPDATE = 2,
    DBG_EVENTS_NUMBER = 3,
    DBG_STACK_UPDATE = 4,
    DBG_BEFORE_PATCH = 5,
    DBG_KERNELID = 6,
    DBG_DSHOFFSET = 7,
    DBG_IDOFFSET = 8,
    DBG_AFTER_PATCH = 9,
    DBG_UNSPECIFIED = 10,
    DBG_ENQUEUES_NUMBER = 11,
    DBG_LOCAL_ID = 12,
    DBG_WKG_ID = 13,
    DBG_SCHEDULER_END = 14,
    // Add new debug enums here, directly above DBG_MAX
    DBG_MAX
} DebugDataTypes;
// Struct for debugging kernels.
// Pairs a DebugDataTypes tag with a size value.
// NOTE(review): presumably one record per entry stored in DebugDataBuffer, with
// m_dataSize giving that entry's size - the unit is not visible here; confirm.
typedef struct
{
    DebugDataTypes m_dataType; // kind of data this record refers to
    uint m_dataSize;           // size associated with the data
} DebugDataInfo;
// Debug buffer header followed by a fixed 100-element uint data area.
// NOTE(review): the "DataInfo stack" mentioned below presumably holds
// DebugDataInfo records - confirm against the code that fills this buffer.
typedef struct
{
    // Flag values are distinct single bits (1, 2, 4).
    // NOTE(review): whether they are combined in ddbFlags or in m_flags is not
    // visible here - confirm.
    enum DDBFlags { DDB_HAS_DATA_INFO = 1,
                    DDB_SCHEDULER_PROFILING = 2,
                    DDB_COMMAND_QUEUE_RAW = 4 } ddbFlags;
    uint m_size;
    uint m_stackTop;    //index of data stack
    uint m_dataInfoTop; //index of the top of the DataInfo stack; this stack grows toward decreasing addresses
    uint m_stackBottom;
    uint m_dataInfoBottom; //index of the bottom of DataInfo
    uint m_dataInfoSize;
    uint m_flags;

    uint m_offset;    //current offset, indicates the next free place
    uint m_data[100]; //buffer
} DebugDataBuffer;
130 
131 #pragma pack(push)
132 #pragma pack(4)
133 #include "DeviceEnqueueInternalTypes.h"
134 #pragma pack(pop)
135