1 //
2 //  OpenCLRuntime.cpp
3 //  MNN
4 //
5 //  Created by MNN on 2019/02/28.
6 //  Copyright © 2018, Alibaba Group Holding Limited
7 //
8 
9 #include "backend/opencl/core/runtime/OpenCLRuntime.hpp"
10 #include <sys/stat.h>
11 #include <cstdlib>
12 #include <fstream>
13 #include <memory>
14 #include <string>
15 #include <utility>
16 #include <vector>
17 #include "core/Macro.h"
18 //#define MNN_OPEN_TIME_TRACE
19 #include <MNN/AutoTime.hpp>
20 #include "CLCache_generated.h"
21 using namespace CLCache;
22 namespace MNN {
23 
24 extern const std::map<std::string, std::vector<unsigned char>> OpenCLProgramMap;
25 
getDeviceSupportsExtension(const cl::Device & device,const char * extensionName)26 bool OpenCLRuntime::getDeviceSupportsExtension(const cl::Device &device, const char *extensionName) {
27     std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
28     auto pos               = extensions.find(extensionName);
29     return (pos != std::string::npos);
30 }
31 
OpenCLRuntime(const BackendConfig::PrecisionMode precision,const int cl_mode)32 OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode) {
33 #ifdef LOG_VERBOSE
34     MNN_PRINT("start OpenCLRuntime !\n");
35 #endif
36     mDefaultBuildParams = " -cl-mad-enable";
37     std::vector<cl::Platform> platforms;
38     cl_int res = cl::Platform::get(&platforms);
39     MNN_CHECK_CL_SUCCESS(res, "getPlatform");
40     if(platforms.size() > 0 && res == CL_SUCCESS){
41         cl::Platform::setDefault(platforms[0]);
42         std::vector<cl::Device> gpuDevices;
43         res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
44 
45         if(1 <= gpuDevices.size() && res == CL_SUCCESS){
46             mFirstGPUDevicePtr              = std::make_shared<cl::Device>(gpuDevices[0]);
47             const std::string deviceName    = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NAME>();
48             mDeviceName = deviceName;
49             const std::string deviceVersion = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VERSION>();
50             static std::map<std::string, float> gFlopsMap {
51                 {"Mali-T860", 6.83f},
52                 {"Mali-T880", 6.83f},
53                 {"Mali-G51", 6.83f},
54                 {"Mali-G52", 6.83f},
55                 {"Mali-G71", 31.61f},
56                 {"Mali-G72", 31.61f},
57                 {"Mali-G76", 31.61f},
58                 {"Adreno (TM) 505", 3.19f},
59                 {"Adreno (TM) 506", 4.74f},
60                 {"Adreno (TM) 512", 14.23f},
61                 {"Adreno (TM) 530", 25.40f},
62                 {"Adreno (TM) 540", 42.74f},
63                 {"Adreno (TM) 615", 16.77f},
64                 {"Adreno (TM) 616", 18.77f},
65                 {"Adreno (TM) 618", 18.77f},
66                 {"Adreno (TM) 630", 42.74f},
67                 {"Adreno (TM) 640", 42.74f},
68             };
69 
70             if (gFlopsMap.find(deviceName) != gFlopsMap.end()) {
71                 mFlops = gFlopsMap[deviceName];
72             }
73             const std::string deviceVendor  = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VENDOR>();
74             cl_command_queue_properties properties = 0;
75 
76         #ifdef ENABLE_OPENCL_TIME_PROFILER
77             properties |= CL_QUEUE_PROFILING_ENABLE;
78         #endif
79             cl_int res;
80             // if device is QUALCOMM's and version is 2.0 , set spacial optimized param
81 
82             if (deviceName == "QUALCOMM Adreno(TM)" && deviceVersion.substr(0, deviceVersion.find('2')) == "OpenCL ") {
83                 mGpuType = ADRENO;
84 
85                 //if Adreno version is less than Adreno512, donot set WorkGroupAttribute option
86                 std::string adrenoVersion = deviceVersion.substr(deviceVersion.size()-3);
87                 //printf("Adreno Version:%s\n", adrenoVersion.c_str());
88                 if(adrenoVersion >= "512") {
89                     isSetWorkGroupAttribute = true;
90                 }
91             } else if (deviceName.find("Mali") != std::string::npos) {
92                 mGpuType = MALI;
93             } else if (deviceVendor.find("Advanced Micro Devices") != std::string::npos) {
94                 // Radeon series GPU is main product of Advanced Micro Devices (AMD)
95                 mGpuType = RADEON;
96                 isSetWorkGroupAttribute = true;
97             } else {
98                 mGpuType = OTHER;
99             }
100             const std::string extensions = platforms[0].getInfo<CL_PLATFORM_EXTENSIONS>();
101             if(mGpuType == ADRENO && " " != extensions){
102                 std::vector<cl_context_properties> context_properties;
103                 context_properties.reserve(5);
104                 context_properties.push_back(CL_CONTEXT_PERF_HINT_QCOM);
105                 context_properties.push_back(CL_PERF_HINT_HIGH_QCOM);
106                 context_properties.push_back(CL_CONTEXT_PRIORITY_HINT_QCOM);
107                 context_properties.push_back(CL_PRIORITY_HINT_LOW_QCOM);
108                 context_properties.push_back(0);
109                 mContext = std::shared_ptr<cl::Context>(new cl::Context({*mFirstGPUDevicePtr}, context_properties.data(), nullptr, nullptr, &res));
110             }else{
111                 mContext = std::shared_ptr<cl::Context>(new cl::Context({*mFirstGPUDevicePtr}, nullptr, nullptr, nullptr, &res));
112             }
113 
114             MNN_CHECK_CL_SUCCESS(res, "context");
115 
116             mCommandQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, properties, &res);
117             MNN_CHECK_CL_SUCCESS(res, "commandQueue");
118 
119             mFirstGPUDevicePtr->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &mGPUGlobalMemeryCacheSize);
120             mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &mGPUComputeUnits);
121             mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &mMaxFreq);
122             cl_device_fp_config fpConfig;
123             auto success = mFirstGPUDevicePtr->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fpConfig);
124             mIsDeviceSupportedFP16     = CL_SUCCESS == success && fpConfig > 0;
125             auto permitFloat16 = false;
126             if(precision == BackendConfig::Precision_Low) {
127                 permitFloat16 = true;
128             }
129             mIsSupportedFP16     = mIsDeviceSupportedFP16 && permitFloat16;
130 
131             //set gpu mode, tuning level and memory object
132             setGpuMode(cl_mode);
133 
134             if(mMemType == AUTO) {
135                 if(mGpuType == MALI && precision != BackendConfig::Precision_Normal) {//buffer mode not support Normal Precision yet
136                     mMemType = BUFFER;
137                 } else {
138                     mMemType = IMAGE;
139                 }
140             }
141 
142             if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_int8")){
143                 mSupportDotInt8 = true;
144             }
145             if(getDeviceSupportsExtension(*(mFirstGPUDevicePtr.get()), "cl_arm_integer_dot_product_accumulate_int8")){
146                 mSupportDotAccInt8 = true;
147             }
148         }else{
149             mIsCreateError = true;
150             MNN_ASSERT(1 <= gpuDevices.size());
151         }
152     }else{
153         mIsCreateError = true;
154         MNN_ASSERT(platforms.size() > 0);
155     }
156 }
157 
setGpuMode(const int cl_mode_num)158 void OpenCLRuntime::setGpuMode(const int cl_mode_num) {
159     int totalSet = 0;
160     bool isSet = (cl_mode_num & MNN_GPU_MEMORY_BUFFER);
161     if(isSet) {
162         mMemType = BUFFER;
163         totalSet++;
164     }
165     isSet = (cl_mode_num & MNN_GPU_MEMORY_IMAGE);
166     if(isSet) {
167         mMemType = IMAGE;
168         totalSet++;
169     }
170     if(totalSet > 1) {
171         MNN_PRINT("set both BUFFER and IMAGE mode is not permitted, please check cl_mode:%x!\n", cl_mode_num);
172     }
173 
174     totalSet = 0;
175     isSet = (cl_mode_num & MNN_GPU_TUNING_NONE);
176     if(isSet) {
177         mTuneLevel = None;
178         totalSet++;
179     }
180 
181     isSet = (cl_mode_num & MNN_GPU_TUNING_FAST);
182     if(isSet) {
183         mTuneLevel = Fast;
184         totalSet++;
185     }
186 
187     isSet = (cl_mode_num & MNN_GPU_TUNING_NORMAL);
188     if(isSet) {
189         mTuneLevel = Normal;
190         totalSet++;
191     }
192 
193     isSet = (cl_mode_num & MNN_GPU_TUNING_HEAVY);
194     if(isSet) {
195         mTuneLevel = Heavy;
196         totalSet++;
197     }
198 
199     isSet = (cl_mode_num & MNN_GPU_TUNING_WIDE);
200     if(isSet) {
201         mTuneLevel = Wide;
202         totalSet++;
203     }
204 
205     if(totalSet != 1) {
206         MNN_PRINT("set multi tuning mode is not permitted, please check cl_mode:%x!\n", cl_mode_num);
207     }
208 }
209 
setCommandQueueProfileEnable()210 void OpenCLRuntime::setCommandQueueProfileEnable() {
211     mCommandQueuePtr->finish();
212     mCommandQueuePtr.reset();
213     cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
214 
215     cl_int res;
216     mCommandQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, properties, &res);
217     MNN_CHECK_CL_SUCCESS(res, "commandQueue");
218 }
219 
setCommandQueueProfileDisable()220 void OpenCLRuntime::setCommandQueueProfileDisable() {
221     mCommandQueuePtr->finish();
222     mCommandQueuePtr.reset();
223     cl_command_queue_properties properties = 0;
224 
225     cl_int res;
226     mCommandQueuePtr = std::make_shared<cl::CommandQueue>(*mContext, *mFirstGPUDevicePtr, properties, &res);
227     MNN_CHECK_CL_SUCCESS(res, "commandQueue");
228 }
229 
getQueueNum()230 unsigned int OpenCLRuntime::getQueueNum() {
231     mQueueCount++;
232     return mQueueCount;
233 }
234 
tunedLwsMap()235 std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& OpenCLRuntime::tunedLwsMap() {
236     return mTunedLws;
237 }
238 
~OpenCLRuntime()239 OpenCLRuntime::~OpenCLRuntime() {
240 #ifdef LOG_VERBOSE
241     MNN_PRINT("start ~OpenCLRuntime !\n");
242 #endif
243     mBuildProgramMap.clear();
244     mCommandQueuePtr.reset();
245     mContext.reset();
246     mFirstGPUDevicePtr.reset();
247 #ifdef LOG_VERBOSE
248     MNN_PRINT("end ~OpenCLRuntime !\n");
249 #endif
250 }
251 
getMaxImage2DSize()252 std::vector<size_t> OpenCLRuntime::getMaxImage2DSize() {
253     size_t max_height, max_width;
254     cl_int res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
255     MNN_CHECK_CL_SUCCESS(res, "image2Dsize");
256     res = mFirstGPUDevicePtr->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
257     MNN_CHECK_CL_SUCCESS(res, "image2Dsize");
258     return {max_height, max_width};
259 }
260 
isSupportedFP16() const261 bool OpenCLRuntime::isSupportedFP16() const {
262     return mIsSupportedFP16;
263 }
isWeightCpuTransHalf() const264 bool OpenCLRuntime::isWeightCpuTransHalf() const {
265 #ifdef USE_HALF_WEIGHT_MEMORY
266     return mIsSupportedFP16;
267 #else
268     return false;//most of time
269 #endif
270 }
271 
isDeviceSupportedFP16() const272 bool OpenCLRuntime::isDeviceSupportedFP16() const {
273     return mIsDeviceSupportedFP16;
274 }
275 
isSupportedDotInt8() const276 bool OpenCLRuntime::isSupportedDotInt8() const {
277     return mSupportDotInt8;
278 }
279 
isSupportedDotAccInt8() const280 bool OpenCLRuntime::isSupportedDotAccInt8() const {
281     return mSupportDotAccInt8;
282 }
283 
284 
context()285 cl::Context &OpenCLRuntime::context() {
286     return *mContext;
287 }
288 
commandQueue()289 cl::CommandQueue &OpenCLRuntime::commandQueue() {
290     return *mCommandQueuePtr;
291 }
292 
deviceGlobalMemeryCacheSize() const293 uint64_t OpenCLRuntime::deviceGlobalMemeryCacheSize() const {
294     return mGPUGlobalMemeryCacheSize;
295 }
296 
deviceComputeUnits() const297 uint32_t OpenCLRuntime::deviceComputeUnits() const {
298     return mGPUComputeUnits;
299 }
300 
maxFreq() const301 uint32_t OpenCLRuntime::maxFreq() const {
302     return mMaxFreq;
303 }
304 
maxAllocSize() const305 uint64_t OpenCLRuntime::maxAllocSize() const {
306     return mMaxMemAllocSize;
307 }
308 
loadProgram(const std::string & programName,cl::Program * program)309 bool OpenCLRuntime::loadProgram(const std::string &programName, cl::Program *program) {
310     auto it_source = OpenCLProgramMap.find(programName);
311     if (it_source != OpenCLProgramMap.end()) {
312         cl::Program::Sources sources;
313         std::string source(it_source->second.begin(), it_source->second.end());
314         sources.push_back(source);
315         *program = cl::Program(context(), sources);
316         return true;
317     } else {
318         MNN_PRINT("Can't find kernel source !\n");
319         return false;
320     }
321 }
322 
buildProgram(const std::string & buildOptionsStr,cl::Program * program)323 bool OpenCLRuntime::buildProgram(const std::string &buildOptionsStr, cl::Program *program) {
324     AUTOTIME;
325     cl_int ret = program->build({*mFirstGPUDevicePtr}, buildOptionsStr.c_str());
326     if (ret != CL_SUCCESS) {
327         if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*mFirstGPUDevicePtr) == CL_BUILD_ERROR) {
328             std::string buildLog = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(*mFirstGPUDevicePtr);
329             MNN_PRINT("Program build log: %s \n", buildLog.c_str());
330         }
331         MNN_PRINT("Build program failed, err:%d ! \n", ret);
332         return false;
333     }
334     return true;
335 }
336 
buildKernel(const std::string & programName,const std::string & kernelName,const std::set<std::string> & buildOptions)337 cl::Kernel OpenCLRuntime::buildKernel(const std::string &programName, const std::string &kernelName,
338                                       const std::set<std::string> &buildOptions) {
339     std::string buildOptionsStr;
340     if (mIsSupportedFP16) {
341         buildOptionsStr = "-DFLOAT=half -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT4=convert_half4 -DMNN_SUPPORT_FP16";
342     } else {
343         buildOptionsStr = "-DFLOAT=float -DFLOAT4=float4 -DFLOAT8=float8 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT4=convert_float4";
344     }
345 
346     if(isSetWorkGroupAttribute) {
347         buildOptionsStr += " -DSET_ATTRIBUTE=true";
348     } else {
349         buildOptionsStr += " -DSET_ATTRIBUTE=false";
350     }
351     for (auto &option : buildOptions) {
352         buildOptionsStr += " " + option;
353     }
354     buildOptionsStr += mDefaultBuildParams;
355     auto key = std::make_tuple(programName, kernelName, buildOptionsStr);
356 
357     auto buildProgramInter = mBuildProgramMap.find(key);
358     cl::Program program;
359     if (buildProgramInter != mBuildProgramMap.end()) {
360         program = buildProgramInter->second;
361     } else {
362         this->loadProgram(programName, &program);
363         auto status = this->buildProgram(buildOptionsStr, &program);
364         if (!status) {
365             FUNC_PRINT_ALL(programName.c_str(), s);
366         }
367         mBuildProgramMap.emplace(key, program);
368     }
369 
370     cl_int res;
371     cl::Kernel kernel = cl::Kernel(program, kernelName.c_str(), &res);
372     MNN_CHECK_CL_SUCCESS(res, "getKernel");
373     return kernel;
374 }
375 
getMaxWorkGroupSize(const cl::Kernel & kernel)376 uint64_t OpenCLRuntime::getMaxWorkGroupSize(const cl::Kernel &kernel) {
377     uint64_t maxWorkGroupSize = 0;
378     kernel.getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WORK_GROUP_SIZE, &maxWorkGroupSize);
379     return maxWorkGroupSize;
380 }
381 
GetKernelWaveSize(const cl::Kernel & kernel)382 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
383     uint64_t kernelWaveSize = 0;
384     kernel.getWorkGroupInfo(*mFirstGPUDevicePtr, CL_KERNEL_WAVE_SIZE_QCOM, &kernelWaveSize);
385     return kernelWaveSize;
386 }
387 
getMaxWorkItemSizes()388 std::vector<uint32_t> OpenCLRuntime::getMaxWorkItemSizes() {
389     cl::vector<cl::size_type> _workItems;
390     mFirstGPUDevicePtr->getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &_workItems);
391     std::vector<uint32_t> workItems;
392     for (int i = 0; i < _workItems.size(); ++i) {
393         workItems.push_back(_workItems[i]);
394     }
395     return workItems;
396 }
397 
getCostTime(const cl::Event * event)398 double OpenCLRuntime::getCostTime(const cl::Event *event){
399     //cl_int res = mCommandQueuePtr->finish();
400     cl_int res = event->wait();
401     MNN_CHECK_CL_SUCCESS(res, "clEvent");
402     mStartNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_START>();
403     mStopNanos = event->getProfilingInfo<CL_PROFILING_COMMAND_END>();
404     mKernelTime += (unsigned int)((mStopNanos - mStartNanos) / 1000.0);
405     return (mStopNanos - mStartNanos) / 1000.0;
406 }
407 
getQueuedTime(const cl::Event * event)408 double OpenCLRuntime::getQueuedTime(const cl::Event *event){
409     //cl_int res = mCommandQueuePtr->finish();
410     cl_int res = event->wait();
411     MNN_CHECK_CL_SUCCESS(res, "clEvent");
412     return (event->getProfilingInfo<CL_PROFILING_COMMAND_START>() - event->getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>()) / 1000.0;
413 }
414 
getSubmitTime(const cl::Event * event)415 double OpenCLRuntime::getSubmitTime(const cl::Event *event){
416     //cl_int res = mCommandQueuePtr->finish();
417     cl_int res = event->wait();
418     MNN_CHECK_CL_SUCCESS(res, "clEvent");
419     return (event->getProfilingInfo<CL_PROFILING_COMMAND_START>() - event->getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()) / 1000.0;
420 }
421 
422 
makeCache()423 std::pair<const void*, size_t> OpenCLRuntime::makeCache() {
424     if (nullptr != mCacheOutside) {
425         return std::make_pair(mCacheOutside, mCacheOutsideSize);
426     }
427     std::unique_ptr<CacheT> cache(new CacheT);
428     // Get All program's binary
429     for (auto& iter : mBuildProgramMap) {
430         std::unique_ptr<ShaderT> pro(new ShaderT);
431         auto program = iter.second;
432         auto devicesNumber = program.getInfo<CL_PROGRAM_NUM_DEVICES>();
433         auto devices = program.getInfo<CL_PROGRAM_DEVICES>();
434         auto binSizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
435         if (binSizes.empty() || devices.empty()) {
436             MNN_ERROR("Can't load binary, binarySize:%d, deviceSize:%d\n", binSizes.size(), devices.size());
437             continue;
438         }
439         // Only use first one
440         pro->program = std::get<0>(iter.first);
441         pro->kernel = std::get<1>(iter.first);
442         pro->buildInfo = std::get<2>(iter.first);
443 
444         //MNN_PRINT("%s - %s - %s\n", pro->program.c_str(), pro->kernel.c_str(), pro->buildInfo.c_str());
445 
446         pro->buffer.resize(binSizes[0]);
447         auto proRaw = program.get();
448         auto c = pro->buffer.data();
449         clGetProgramInfo(proRaw, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &c, nullptr);
450         cache->programs.emplace_back(std::move(pro));
451     }
452     // Get All Autotuning cache
453     for (auto& iter : mTunedLws) {
454         std::unique_ptr<AutotuningT> tuning(new AutotuningT);
455         tuning->gloablSize = iter.first.second;
456         tuning->localSize = iter.second.first;
457         tuning->timeCost = iter.second.second;
458         tuning->key = iter.first.first;
459         cache->tunings.emplace_back(std::move(tuning));
460     }
461 
462     flatbuffers::FlatBufferBuilder builder;
463     auto lastOffset = Cache::Pack(builder, cache.get());
464     builder.Finish(lastOffset);
465     mBuffer.resize(builder.GetSize());
466     ::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
467     return std::make_pair(mBuffer.data(), mBuffer.size());
468 }
469 
setCache(std::pair<const void *,size_t> cache)470 bool OpenCLRuntime::setCache(std::pair<const void*, size_t> cache) {
471     if (nullptr == cache.first) {
472         mCacheOutside = nullptr;
473         mCacheOutsideSize = 0;
474         mBuffer.clear();
475         return true;
476     }
477 
478     mCacheOutsideSize = cache.second;
479     mCacheOutside = cache.first;
480     auto cacheBuffer = GetCache(cache.first);
481 
482     if(nullptr == cacheBuffer->programs() || nullptr == cacheBuffer->tunings()) {
483         return false;
484     }
485 
486     // Load Program
487     if (nullptr != cacheBuffer->programs()) {
488         auto programs = cacheBuffer->programs();
489         for (int i=0; i<programs->size(); ++i) {
490             auto shaderInfo = programs->GetAs<Shader>(i);
491             if (nullptr == shaderInfo->program() || nullptr == shaderInfo->kernel() || nullptr == shaderInfo->buildInfo() || nullptr == shaderInfo->buffer()) {
492                 MNN_ERROR("Invalid Cache\n");
493                 return false;
494             }
495             auto program = shaderInfo->program()->str();
496             auto kernel = shaderInfo->kernel()->str();
497             // Builder Info
498             std::string buildinfo = shaderInfo->buildInfo()->str();
499 
500             auto buffer = shaderInfo->buffer()->data();
501             size_t bufferSize = shaderInfo->buffer()->size();
502             auto deviceId = mFirstGPUDevicePtr->get();
503             auto programRaw = clCreateProgramWithBinary(context().get(), 1, &deviceId, &bufferSize, (const unsigned char**)(&buffer), nullptr, nullptr);
504             if (!programRaw) {
505                 MNN_ERROR("Can't load %s - %s - %s load program\n", program.c_str(), kernel.c_str(), buildinfo.c_str());
506                 return false;
507             }
508             auto pro = cl::Program(programRaw);
509             auto res = buildProgram(buildinfo, &pro);
510             if (!res) {
511                 MNN_ERROR("Can't build %s - %s - %s load program\n", program.c_str(),  kernel.c_str(), buildinfo.c_str());
512                 return false;
513             }
514             mBuildProgramMap.insert(std::make_pair(std::make_tuple(program, kernel, buildinfo), pro));
515         }
516     }
517 
518     // Load Auto Tuning Info
519     if (nullptr != cacheBuffer->tunings()) {
520         auto tuningInfo = cacheBuffer->tunings();
521         for (int i=0; i<tuningInfo->size(); ++i) {
522             auto tun = tuningInfo->GetAs<Autotuning>(i);
523             if (nullptr == tun->gloablSize() || nullptr == tun->localSize() || nullptr == tun->key()) {
524                 MNN_ERROR("Error tunning info\n");
525                 return false;
526             }
527             std::vector<uint32_t> glo(tun->gloablSize()->size());
528             for (int v=0; v<glo.size(); ++v) {
529                 glo[v] = tun->gloablSize()->data()[v];
530             }
531             std::vector<uint32_t> loc(tun->localSize()->size());
532             for (int v=0; v<loc.size(); ++v) {
533                 loc[v] = tun->localSize()->data()[v];
534             }
535             uint32_t cost = tun->timeCost();
536             mTunedLws.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_pair(loc, cost)));
537         }
538     }
539     return true;
540 }
541 
542 } // namespace MNN
543