/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

// Uncomment this macro to build "empty" schedulers
//#define WA_DISABLE_SCHEDULERS 1

// When this header is not compiled as OpenCL C, provide the uint/ulong type names ourselves.
// <stdint.h> is used instead of <cstdint> because it guarantees the unqualified
// uint32_t/uint64_t names in both C and C++ (<cstdint> only guarantees the std:: ones).
#if !defined(__OPENCL_VERSION__)
#include <stdint.h>

typedef uint32_t uint;
typedef uint64_t ulong;
#endif

// Command sizes in bytes (dword count * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD (4 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD (4 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_STATE_FLUSH (2 * sizeof(uint))
#define OCLRT_SIZEOF_MI_ATOMIC_CMD (11 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_CMD (9 * sizeof(uint))
#define OCLRT_SIZEOF_MI_ARB_CHECK (1 * sizeof(uint))

// Plain dword counts (no sizeof scaling)
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET (11)
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET (3)

#define OCLRT_SIZEOF_MSFLUSH_DWORD (2)
#define OCLRT_SIZEOF_MI_ARB_CHECK_DWORD (1)
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD (9)

// 83886080 == 0x05000000
#define OCLRT_BATCH_BUFFER_END_CMD (83886080)

//Constant buffer stuff
#define COMPILER_DATA_PARAMETER_GLOBAL_SURFACE (49)

#define SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT (50)

#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT (63)
#define SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT (51)
// Twice the images CURBE shift, i.e. 100
#define SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE (2 * SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT)

#define CS_PREFETCH_SIZE (8 * 64)

#define ALL_BITS_SET_DWORD_MASK (0xffffffff)
#define DWORD_SIZE_IN_BITS (32)

#define CL_sRGB 0x10BF
#define CL_sRGBX 0x10C0
#define CL_sRGBA 0x10C1
#define CL_sBGRA 0x10C2

//scheduler currently can spawn up to 8 GPGPU_WALKERS between scheduler runs, so it needs 8 * 3 HW threads for scheduling blocks + 1 HW thread to schedule the next scheduler
//each HW group consists of 3 HW threads that are capable of scheduling 1 block

//!!! Make sure value of this define equals MAX_NUMBER_OF_PARALLEL_GPGPU_WALKERS in DeviceEnqueueInternalTypes.h
#define PARALLEL_SCHEDULER_HW_GROUPS (8)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP (3)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 (3)
// 3 * 8 = 24 HW threads across all HW groups
#define PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS (PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP * PARALLEL_SCHEDULER_HW_GROUPS)

// All HW group threads plus one extra HW thread (24 + 1 = 25)
#define PARALLEL_SCHEDULER_NUMBER_HW_THREADS (PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS + 1)

//parallel scheduler 2.0 is compiled in simd8
#define PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (8)

// Evaluates to 4 when bit 0x10 of COMPILATION_SIZE is set, otherwise to 3.
// The argument is parenthesized so that an expression argument containing operators
// of lower precedence than '&' (e.g. `a | b`) is masked as a whole.
#define HW_GROUP_ID_SHIFT(COMPILATION_SIZE) (((COMPILATION_SIZE) & 0x10) ? 4 : 3)

#define GRF_SIZE (32)
#define SIZEOF_3GRFS (3 * GRF_SIZE)

//estimation for dynamic payload size
#define SCHEDULER_DYNAMIC_PAYLOAD_SIZE (PARALLEL_SCHEDULER_NUMBER_HW_THREADS * SIZEOF_3GRFS)

//assume that max DSH per walker is 9472B: assuming registers can take up to 4KB and max dynamic payload is around 96B * 56 (HW threads), i.e. 4096 + 5376 = 9472, it should be fine.
#define MAX_DSH_SIZE_PER_ENQUEUE 9472

#define MAX_BINDING_TABLE_INDEX (253)
#define MAX_SSH_PER_KERNEL_SIZE (MAX_BINDING_TABLE_INDEX * 64) //max SSH that can be used by one kernel. It is 253 binding table entries multiplied by the Surface State size.
84 85 #define OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID(ArgOffset) (ArgOffset + MAX_SSH_PER_KERNEL_SIZE) 86 #define OCLRT_IMAGE_MAX_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE - 1) 87 #define OCLRT_SAMPLER_MIN_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE) 88 89 typedef enum tagDebugDataTypes { 90 DBG_DEFAULT = 0, 91 DBG_COMMAND_QUEUE = 1, 92 DBG_EVENTS_UPDATE = 2, 93 DBG_EVENTS_NUMBER = 3, 94 DBG_STACK_UPDATE = 4, 95 DBG_BEFORE_PATCH = 5, 96 DBG_KERNELID = 6, 97 DBG_DSHOFFSET = 7, 98 DBG_IDOFFSET = 8, 99 DBG_AFTER_PATCH = 9, 100 DBG_UNSPECIFIED = 10, 101 DBG_ENQUEUES_NUMBER = 11, 102 DBG_LOCAL_ID, 103 DBG_WKG_ID, 104 DBG_SCHEDULER_END, 105 // Add here new debug enums 106 DBG_MAX 107 } DebugDataTypes; 108 // Struct for debugging kernels 109 typedef struct 110 { 111 DebugDataTypes m_dataType; 112 uint m_dataSize; 113 } DebugDataInfo; 114 typedef struct 115 { 116 enum DDBFlags { DDB_HAS_DATA_INFO = 1, 117 DDB_SCHEDULER_PROFILING = 2, 118 DDB_COMMAND_QUEUE_RAW = 4 } ddbFlags; 119 uint m_size; 120 uint m_stackTop; //index of data stack 121 uint m_dataInfoTop; //index of the top of DataInfo stack, this stacks grows with decrementing address 122 uint m_stackBottom; 123 uint m_dataInfoBottom; //index of the bottom of DataInfo 124 uint m_dataInfoSize; 125 uint m_flags; 126 127 uint m_offset; //current offset indicates free place 128 uint m_data[100]; //buffer 129 } DebugDataBuffer; 130 131 #pragma pack(push) 132 #pragma pack(4) 133 #include "DeviceEnqueueInternalTypes.h" 134 #pragma pack(pop) 135