1 //
2 //  OpenCLRuntime.hpp
3 //  MNN
4 //
5 //  Created by MNN on 2019/01/31.
6 //  Copyright © 2018, Alibaba Group Holding Limited
7 //
8 
9 #ifndef OpenCLRuntime_hpp
10 #define OpenCLRuntime_hpp
11 
12 
13 #include <map>
14 #include <memory>
15 #include <mutex>
16 #include <set>
17 #include <string>
18 #include <vector>
19 
20 #include <sstream>
21 #include <string>
22 #include <vector>
23 #include "core/Macro.h"
24 #include "Type_generated.h"
25 #include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
26 #include "MNN/MNNForwardType.h"
27 
28 namespace MNN {
29 
// Numeric tokens carrying the _QCOM suffix (presumably Qualcomm OpenCL
// extension constants — confirm against the vendor extension headers).
// They are spelled out here so the file does not depend on those headers.

// Context "performance hint" property and its values.
#define CL_CONTEXT_PERF_HINT_QCOM     0x40C2
#define CL_PERF_HINT_HIGH_QCOM        0x40C3
#define CL_PERF_HINT_NORMAL_QCOM      0x40C4
#define CL_PERF_HINT_LOW_QCOM         0x40C5

// Context "priority hint" property and its values.
#define CL_CONTEXT_PRIORITY_HINT_QCOM 0x40C9
#define CL_PRIORITY_HINT_HIGH_QCOM    0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM  0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM     0x40CC

// Kernel query token for the wave size.
#define CL_KERNEL_WAVE_SIZE_QCOM      0xAA02
40 
// GPU family tag stored by the runtime (see OpenCLRuntime::getGpuType()).
// How a device is mapped onto these values is decided in the implementation
// file, which is not visible from this header.
enum GpuType {
    MALI   = 0,
    ADRENO = 1,
    RADEON = 2,
    OTHER  = 3
};
// Memory-object selection for the OpenCL backend.
// NOTE(review): presumably AUTO lets the runtime choose between buffer and
// image storage — confirm against the implementation.
enum GpuMemObject {
    AUTO   = 0,
    BUFFER = 1,
    IMAGE  = 2
};
// Kernel tuning level. The numeric values are fixed as listed; what each
// level does is defined by the implementation, not by this header.
enum CLTuneLevel {
    None   = 0,
    Heavy  = 1,
    Wide   = 2,
    Normal = 3,
    Fast   = 4
};
44 
45 class OpenCLRuntime {
46 public:
47     OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode);
48     ~OpenCLRuntime();
49     OpenCLRuntime(const OpenCLRuntime &) = delete;
50     OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
51 
52     bool isSupportedFP16() const;
53     bool isWeightCpuTransHalf() const;
54     bool isDeviceSupportedFP16() const;
55     bool isSupportedDotInt8() const;
56     bool isSupportedDotAccInt8() const;
57     ::cl::Context &context();
58     ::cl::CommandQueue &commandQueue();
59     uint64_t deviceGlobalMemeryCacheSize() const;
60     uint32_t deviceComputeUnits() const;
61     uint32_t maxFreq() const;
62     uint64_t getMaxWorkGroupSize(const ::cl::Kernel &kernel);
63     uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
64     std::vector<uint32_t> getMaxWorkItemSizes();
65     uint64_t getMaxLocalMem() const;
getGpuType()66     GpuType getGpuType() {
67         return mGpuType;
68     }
getGpuMemType()69     GpuMemObject getGpuMemType() {
70         return mMemType;
71     }
getCLTuneLevel()72     CLTuneLevel getCLTuneLevel() {
73         return mTuneLevel;
74     }
getDeviceName()75     std::string getDeviceName() {
76         return mDeviceName;
77     }
78     uint64_t maxAllocSize() const;
79     void setCommandQueueProfileEnable();
80     void setCommandQueueProfileDisable();
81 
82     unsigned int mQueueCount = 0;
83     unsigned int getQueueNum();
84 
85     unsigned int mKernelTime = 0;
86 
87     std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& tunedLwsMap();
88 
89     ::cl::Kernel buildKernel(const std::string &programName, const std::string &kernelName,
90                              const std::set<std::string> &buildOptions);
91 
92     std::vector<size_t> getMaxImage2DSize();
isCreateError() const93     bool isCreateError() const {
94         return mIsCreateError;
95     }
96 
flops() const97     float flops() const {
98         return mFlops;
99     }
100 
101     double getCostTime(const cl::Event *event);
102     double getQueuedTime(const cl::Event *event);
103     double getSubmitTime(const cl::Event *event);
104 
105     std::pair<const void*, size_t> makeCache();
106     bool setCache(std::pair<const void*, size_t> cache);
107 private:
108     bool loadProgram(const std::string &programName, cl::Program *program);
109     bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
110     bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);
111     void setGpuMode(const int cl_mode_num);
112 
113 private:
114     std::shared_ptr<::cl::Context> mContext;
115     std::shared_ptr<::cl::Device> mFirstGPUDevicePtr;
116     std::shared_ptr<::cl::CommandQueue> mCommandQueuePtr;
117     std::map<std::tuple<std::string, std::string, std::string>, ::cl::Program> mBuildProgramMap;
118     uint64_t mGPUGlobalMemeryCacheSize;
119     uint32_t mGPUComputeUnits;
120     uint32_t mMaxFreq;
121     uint32_t mMaxMemAllocSize;
122     uint64_t mMaxLocalMemSize;
123     bool mIsSupportedFP16     = false;
124     bool mIsDeviceSupportedFP16     = false;
125     bool mSupportDotInt8 = false;
126     bool mSupportDotAccInt8 = false;
127     GpuType mGpuType;
128     GpuMemObject mMemType = AUTO;
129     CLTuneLevel mTuneLevel = Wide;
130     std::string mDeviceName;
131     bool isSetWorkGroupAttribute = false;
132     std::string mDefaultBuildParams;
133     float mFlops = 4.0f;
134     bool mIsCreateError{false};
135 
136     double mStartNanos;
137     double mStopNanos;
138 
139     std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>,  uint32_t>> mTunedLws;
140     std::vector<uint8_t> mBuffer;
141     const void* mCacheOutside = nullptr;
142     size_t mCacheOutsideSize = 0;
143 };
144 
145 } // namespace MNN
146 #endif  /* OpenCLRuntime_hpp */
147