1 /******************************************************************************
2  * $Id: gdalwarpkernel_opencl.cpp 32b416eddf6e3fa3ea837f60ba5cf5e9f6982cc0 2021-09-01 11:25:43 +0200 Even Rouault $
3  *
4  * Project:  OpenCL Image Reprojector
5  * Purpose:  Implementation of the GDALWarpKernel reprojector in OpenCL.
6  * Author:   Seth Price, seth@pricepages.org
7  *
8  ******************************************************************************
9  * Copyright (c) 2010, Seth Price <seth@pricepages.org>
10  * Copyright (c) 2010-2012, Even Rouault <even dot rouault at spatialys.com>
11  *
12  * Permission is hereby granted, free of charge, to any person obtaining a
13  * copy of this software and associated documentation files (the "Software"),
14  * to deal in the Software without restriction, including without limitation
15  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
16  * and/or sell copies of the Software, and to permit persons to whom the
17  * Software is furnished to do so, subject to the following conditions:
18  *
19  * The above copyright notice and this permission notice shall be included
20  * in all copies or substantial portions of the Software.
21  *
22  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
28  * DEALINGS IN THE SOFTWARE.
29  ****************************************************************************/
30 
31 #if defined(HAVE_OPENCL)
32 
33 /* The following line may be uncommented for increased debugging traces and profiling */
34 /* #define DEBUG_OPENCL 1 */
35 
36 #include <assert.h>
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <limits.h>
40 #include <float.h>
41 #include <limits>
42 #include <vector>
43 #include "cpl_string.h"
44 #include "gdalwarpkernel_opencl.h"
45 
46 CPL_CVSID("$Id: gdalwarpkernel_opencl.cpp 32b416eddf6e3fa3ea837f60ba5cf5e9f6982cc0 2021-09-01 11:25:43 +0200 Even Rouault $")
47 
48 #define handleErr(err) if((err) != CL_SUCCESS) { \
49     CPLError(CE_Failure, CPLE_AppDefined, "Error at file %s line %d: %s", __FILE__, __LINE__, getCLErrorString(err)); \
50     return err; \
51 }
52 
53 #define handleErrRetNULL(err) if((err) != CL_SUCCESS) { \
54     (*clErr) = err; \
55     CPLError(CE_Failure, CPLE_AppDefined, "Error at file %s line %d: %s", __FILE__, __LINE__, getCLErrorString(err)); \
56     return nullptr; \
57 }
58 
59 #define handleErrGoto(err, goto_label) if((err) != CL_SUCCESS) { \
60     (*clErr) = err; \
61     CPLError(CE_Failure, CPLE_AppDefined, "Error at file %s line %d: %s", __FILE__, __LINE__, getCLErrorString(err)); \
62     goto goto_label; \
63 }
64 
65 #define freeCLMem(clMem, fallBackMem)  do { \
66     if ((clMem) != nullptr) { \
67         handleErr(err = clReleaseMemObject(clMem)); \
68         clMem = nullptr; \
69         fallBackMem = nullptr; \
70     } else if ((fallBackMem) != nullptr) { \
71         CPLFree(fallBackMem); \
72         fallBackMem = nullptr; \
73     } \
74 } while( false )
75 
getCLErrorString(cl_int err)76 static const char* getCLErrorString(cl_int err)
77 {
78     switch (err)
79     {
80         case CL_SUCCESS:
81             return("CL_SUCCESS");
82             break;
83         case CL_DEVICE_NOT_FOUND:
84             return("CL_DEVICE_NOT_FOUND");
85             break;
86         case CL_DEVICE_NOT_AVAILABLE:
87             return("CL_DEVICE_NOT_AVAILABLE");
88             break;
89         case CL_COMPILER_NOT_AVAILABLE:
90             return("CL_COMPILER_NOT_AVAILABLE");
91             break;
92         case CL_MEM_OBJECT_ALLOCATION_FAILURE:
93             return("CL_MEM_OBJECT_ALLOCATION_FAILURE");
94             break;
95         case CL_OUT_OF_RESOURCES:
96             return("CL_OUT_OF_RESOURCES");
97             break;
98         case CL_OUT_OF_HOST_MEMORY:
99             return("CL_OUT_OF_HOST_MEMORY");
100             break;
101         case CL_PROFILING_INFO_NOT_AVAILABLE:
102             return("CL_PROFILING_INFO_NOT_AVAILABLE");
103             break;
104         case CL_MEM_COPY_OVERLAP:
105             return("CL_MEM_COPY_OVERLAP");
106             break;
107         case CL_IMAGE_FORMAT_MISMATCH:
108             return("CL_IMAGE_FORMAT_MISMATCH");
109             break;
110         case CL_IMAGE_FORMAT_NOT_SUPPORTED:
111             return("CL_IMAGE_FORMAT_NOT_SUPPORTED");
112             break;
113         case CL_BUILD_PROGRAM_FAILURE:
114             return("CL_BUILD_PROGRAM_FAILURE");
115             break;
116         case CL_MAP_FAILURE:
117             return("CL_MAP_FAILURE");
118             break;
119         case CL_INVALID_VALUE:
120             return("CL_INVALID_VALUE");
121             break;
122         case CL_INVALID_DEVICE_TYPE:
123             return("CL_INVALID_DEVICE_TYPE");
124             break;
125         case CL_INVALID_PLATFORM:
126             return("CL_INVALID_PLATFORM");
127             break;
128         case CL_INVALID_DEVICE:
129             return("CL_INVALID_DEVICE");
130             break;
131         case CL_INVALID_CONTEXT:
132             return("CL_INVALID_CONTEXT");
133             break;
134         case CL_INVALID_QUEUE_PROPERTIES:
135             return("CL_INVALID_QUEUE_PROPERTIES");
136             break;
137         case CL_INVALID_COMMAND_QUEUE:
138             return("CL_INVALID_COMMAND_QUEUE");
139             break;
140         case CL_INVALID_HOST_PTR:
141             return("CL_INVALID_HOST_PTR");
142             break;
143         case CL_INVALID_MEM_OBJECT:
144             return("CL_INVALID_MEM_OBJECT");
145             break;
146         case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
147             return("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR");
148             break;
149         case CL_INVALID_IMAGE_SIZE:
150             return("CL_INVALID_IMAGE_SIZE");
151             break;
152         case CL_INVALID_SAMPLER:
153             return("CL_INVALID_SAMPLER");
154             break;
155         case CL_INVALID_BINARY:
156             return("CL_INVALID_BINARY");
157             break;
158         case CL_INVALID_BUILD_OPTIONS:
159             return("CL_INVALID_BUILD_OPTIONS");
160             break;
161         case CL_INVALID_PROGRAM:
162             return("CL_INVALID_PROGRAM");
163             break;
164         case CL_INVALID_PROGRAM_EXECUTABLE:
165             return("CL_INVALID_PROGRAM_EXECUTABLE");
166             break;
167         case CL_INVALID_KERNEL_NAME:
168             return("CL_INVALID_KERNEL_NAME");
169             break;
170         case CL_INVALID_KERNEL_DEFINITION:
171             return("CL_INVALID_KERNEL_DEFINITION");
172             break;
173         case CL_INVALID_KERNEL:
174             return("CL_INVALID_KERNEL");
175             break;
176         case CL_INVALID_ARG_INDEX:
177             return("CL_INVALID_ARG_INDEX");
178             break;
179         case CL_INVALID_ARG_VALUE:
180             return("CL_INVALID_ARG_VALUE");
181             break;
182         case CL_INVALID_ARG_SIZE:
183             return("CL_INVALID_ARG_SIZE");
184             break;
185         case CL_INVALID_KERNEL_ARGS:
186             return("CL_INVALID_KERNEL_ARGS");
187             break;
188         case CL_INVALID_WORK_DIMENSION:
189             return("CL_INVALID_WORK_DIMENSION");
190             break;
191         case CL_INVALID_WORK_GROUP_SIZE:
192             return("CL_INVALID_WORK_GROUP_SIZE");
193             break;
194         case CL_INVALID_WORK_ITEM_SIZE:
195             return("CL_INVALID_WORK_ITEM_SIZE");
196             break;
197         case CL_INVALID_GLOBAL_OFFSET:
198             return("CL_INVALID_GLOBAL_OFFSET");
199             break;
200         case CL_INVALID_EVENT_WAIT_LIST:
201             return("CL_INVALID_EVENT_WAIT_LIST");
202             break;
203         case CL_INVALID_EVENT:
204             return("CL_INVALID_EVENT");
205             break;
206         case CL_INVALID_OPERATION:
207             return("CL_INVALID_OPERATION");
208             break;
209         case CL_INVALID_GL_OBJECT:
210             return("CL_INVALID_GL_OBJECT");
211             break;
212         case CL_INVALID_BUFFER_SIZE:
213             return("CL_INVALID_BUFFER_SIZE");
214             break;
215         case CL_INVALID_MIP_LEVEL:
216             return("CL_INVALID_MIP_LEVEL");
217             break;
218         case CL_INVALID_GLOBAL_WORK_SIZE:
219             return("CL_INVALID_GLOBAL_WORK_SIZE");
220             break;
221     }
222 
223     return "unknown_error";
224 }
225 
getCLDataTypeString(cl_channel_type dataType)226 static const char* getCLDataTypeString( cl_channel_type dataType )
227 {
228     switch( dataType )
229     {
230         case CL_SNORM_INT8: return "CL_SNORM_INT8";
231         case CL_SNORM_INT16: return "CL_SNORM_INT16";
232         case CL_UNORM_INT8: return "CL_UNORM_INT8";
233         case CL_UNORM_INT16: return "CL_UNORM_INT16";
234 #if 0
235         case CL_UNORM_SHORT_565: return "CL_UNORM_SHORT_565";
236         case CL_UNORM_SHORT_555: return "CL_UNORM_SHORT_555";
237         case CL_UNORM_INT_101010: return "CL_UNORM_INT_101010";
238         case CL_SIGNED_INT8: return "CL_SIGNED_INT8";
239         case CL_SIGNED_INT16: return "CL_SIGNED_INT16";
240         case CL_SIGNED_INT32: return "CL_SIGNED_INT32";
241         case CL_UNSIGNED_INT8: return "CL_UNSIGNED_INT8";
242         case CL_UNSIGNED_INT16: return "CL_UNSIGNED_INT16";
243         case CL_UNSIGNED_INT32: return "CL_UNSIGNED_INT32";
244         case CL_HALF_FLOAT: return "CL_HALF_FLOAT";
245 #endif
246         case CL_FLOAT: return "CL_FLOAT";
247         default: return "unknown";
248     }
249 }
250 
251 /*
252  Finds an appropriate OpenCL device. For debugging, it's
253  always easier to use CL_DEVICE_TYPE_CPU because then */ /*ok*/ /*printf() can be called
254  from the kernel. If debugging is on, we can print the name and stats about the
255  device we're using.
256  */
get_device(OCLVendor * peVendor)257 static cl_device_id get_device(OCLVendor *peVendor)
258 {
259     cl_int err = 0;
260     size_t returned_size = 0;
261     cl_char vendor_name[1024] = {0};
262     cl_char device_name[1024] = {0};
263 
264     std::vector<cl_platform_id> platforms;
265     cl_uint num_platforms = 0;
266     cl_device_id preferred_device_id = nullptr;
267     int preferred_is_gpu = FALSE;
268 
269     static bool gbBuggyOpenCL = false;
270     if( gbBuggyOpenCL )
271         return nullptr;
272     try
273     {
274         err = clGetPlatformIDs( 0, nullptr, &num_platforms );
275         if( err != CL_SUCCESS || num_platforms == 0 )
276             return nullptr;
277 
278         platforms.resize(num_platforms);
279         err = clGetPlatformIDs( num_platforms, &platforms[0], nullptr );
280         if( err != CL_SUCCESS )
281             return nullptr;
282     }
283     catch( ... )
284     {
285         gbBuggyOpenCL = true;
286         CPLDebug("OpenCL", "clGetPlatformIDs() threw a C++ exception");
287         // This should normally not happen. But that does happen with
288         // intel-opencl 0r2.0-54426 when run under xvfb-run
289         return nullptr;
290     }
291 
292     bool bUseOpenCLCPU = CPLTestBool( CPLGetConfigOption("OPENCL_USE_CPU", "FALSE") );
293 
294     // In case we have several implementations, pick up the non Intel one by
295     // default, unless the PREFERRED_OPENCL_VENDOR config option is specified.
296     for( cl_uint i=0; i<num_platforms;i++)
297     {
298         cl_device_id device = nullptr;
299         const char* pszBlacklistedVendor;
300         const char* pszPreferredVendor;
301         int is_gpu;
302 
303         // Find the GPU CL device, this is what we really want
304         // If there is no GPU device is CL capable, fall back to CPU
305         if( bUseOpenCLCPU )
306             err = CL_DEVICE_NOT_FOUND;
307         else
308             err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1,
309                                  &device, nullptr);
310         is_gpu = (err == CL_SUCCESS);
311         if (err != CL_SUCCESS)
312         {
313             // Find the CPU CL device, as a fallback
314             err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1,
315                                  &device, nullptr);
316             if( err != CL_SUCCESS || device == nullptr )
317                 continue;
318         }
319 
320         // Get some information about the returned device
321         err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name),
322                             vendor_name, &returned_size);
323         err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
324                             device_name, &returned_size);
325         assert(err == CL_SUCCESS);
326 
327         if( num_platforms > 1 )
328             CPLDebug( "OpenCL", "Found vendor='%s' / device='%s' (%s implementation).",
329                       vendor_name, device_name, (is_gpu) ? "GPU" : "CPU");
330 
331         pszBlacklistedVendor = CPLGetConfigOption("BLACKLISTED_OPENCL_VENDOR", nullptr);
332         if( pszBlacklistedVendor &&
333             EQUAL( reinterpret_cast<const char*>(vendor_name), pszBlacklistedVendor ) )
334         {
335             CPLDebug("OpenCL", "Blacklisted vendor='%s' / device='%s' implementation skipped",
336                      vendor_name, device_name);
337             continue;
338         }
339 
340         if( preferred_device_id == nullptr || (is_gpu && !preferred_is_gpu) )
341         {
342             preferred_device_id = device;
343             preferred_is_gpu = is_gpu;
344         }
345 
346         pszPreferredVendor = CPLGetConfigOption("PREFERRED_OPENCL_VENDOR", nullptr);
347         if( pszPreferredVendor )
348         {
349             if( EQUAL( reinterpret_cast<const char*>(vendor_name), pszPreferredVendor ) )
350             {
351                 preferred_device_id = device;
352                 preferred_is_gpu = is_gpu;
353                 break;
354             }
355         }
356         else if( is_gpu && !STARTS_WITH(reinterpret_cast<const char*>(vendor_name), "Intel") )
357         {
358             preferred_device_id = device;
359             preferred_is_gpu = is_gpu;
360             break;
361         }
362     }
363     if( preferred_device_id == nullptr )
364     {
365         CPLDebug("OpenCL", "No implementation found");
366         return nullptr;
367     }
368 
369     err = clGetDeviceInfo(preferred_device_id, CL_DEVICE_VENDOR, sizeof(vendor_name),
370                             vendor_name, &returned_size);
371     err |= clGetDeviceInfo(preferred_device_id, CL_DEVICE_NAME, sizeof(device_name),
372                             device_name, &returned_size);
373     CPLDebug( "OpenCL", "Connected to vendor='%s' / device='%s' (%s implementation).",
374               vendor_name, device_name, (preferred_is_gpu) ? "GPU" : "CPU");
375 
376     if (STARTS_WITH(reinterpret_cast<const char*>(vendor_name), "Advanced Micro Devices"))
377         *peVendor = VENDOR_AMD;
378     else if (STARTS_WITH(reinterpret_cast<const char*>(vendor_name), "Intel"))
379         *peVendor = VENDOR_INTEL;
380     else
381         *peVendor = VENDOR_OTHER;
382 
383     return preferred_device_id;
384 }
385 
386 /*
387  Given that not all OpenCL devices support the same image formats, we need to
388  make do with what we have. This leads to wasted space, but as OpenCL matures
389  I hope it'll get better.
390  */
set_supported_formats(struct oclWarper * warper,cl_channel_order minOrderSize,cl_channel_order * chosenOrder,unsigned int * chosenSize,cl_channel_type dataType)391 static cl_int set_supported_formats(struct oclWarper *warper,
392                              cl_channel_order minOrderSize,
393                              cl_channel_order *chosenOrder,
394                              unsigned int *chosenSize,
395                              cl_channel_type dataType )
396 {
397     cl_image_format *fmtBuf = static_cast<cl_image_format *>(
398         calloc(256, sizeof(cl_image_format)));
399     cl_uint numRet;
400     cl_uint i;
401     cl_uint extraSpace = 9999;
402     cl_int err;
403     int bFound = FALSE;
404 
405     //Find what we *can* handle
406     handleErr(err = clGetSupportedImageFormats(warper->context,
407                                                CL_MEM_READ_ONLY,
408                                                CL_MEM_OBJECT_IMAGE2D,
409                                                256, fmtBuf, &numRet));
410     for (i = 0; i < numRet; ++i) {
411         cl_channel_order thisOrderSize = 0;
412         switch (fmtBuf[i].image_channel_order)
413         {
414             //Only support formats which use the channels in order (x,y,z,w)
415           case CL_R:
416           case CL_INTENSITY:
417           case CL_LUMINANCE:
418             thisOrderSize = 1;
419             break;
420           case CL_RG:
421             thisOrderSize = 2;
422             break;
423           case CL_RGB:
424             thisOrderSize = 3;
425             break;
426           case CL_RGBA:
427             thisOrderSize = 4;
428             break;
429         }
430 
431         //Choose an order with the least wasted space
432         if (fmtBuf[i].image_channel_data_type == dataType &&
433             minOrderSize <= thisOrderSize &&
434             extraSpace > thisOrderSize - minOrderSize ) {
435 
436             //Set the vector size, order, & remember wasted space
437             (*chosenSize) = thisOrderSize;
438             (*chosenOrder) = fmtBuf[i].image_channel_order;
439             extraSpace = thisOrderSize - minOrderSize;
440             bFound = TRUE;
441         }
442     }
443 
444     free(fmtBuf);
445 
446     if( !bFound )
447     {
448         CPLDebug("OpenCL",
449                  "Cannot find supported format for dataType = %s and minOrderSize = %d",
450                  getCLDataTypeString(dataType),
451                  static_cast<int>(minOrderSize));
452     }
453     return (bFound) ? CL_SUCCESS : CL_INVALID_OPERATION;
454 }
455 
456 /*
457  Allocate some pinned memory that we can use as an intermediate buffer. We're
458  using the pinned memory to assemble the data before transferring it to the
459  device. The reason we're using pinned RAM is because the transfer speed from
460  host RAM to device RAM is faster than non-pinned. The disadvantage is that
461  pinned RAM is a scarce OS resource. I'm making the assumption that the user
462  has as much pinned host RAM available as total device RAM because device RAM
463  tends to be similarly scarce. However, if the pinned memory fails we fall back
464  to using a regular memory allocation.
465 
466  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
467  */
468 static
alloc_pinned_mem(struct oclWarper * warper,int imgNum,size_t dataSz,void ** wrkPtr,cl_mem * wrkCL)469 cl_int alloc_pinned_mem(struct oclWarper *warper, int imgNum, size_t dataSz,
470                         void **wrkPtr, cl_mem *wrkCL)
471 {
472     cl_int err = CL_SUCCESS;
473     wrkCL[imgNum] = clCreateBuffer(warper->context,
474                                    CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
475                                    dataSz, nullptr, &err);
476 
477     if (err == CL_SUCCESS) {
478         wrkPtr[imgNum] = clEnqueueMapBuffer(warper->queue, wrkCL[imgNum],
479                                                     CL_FALSE, CL_MAP_WRITE,
480                                                     0, dataSz, 0, nullptr, nullptr, &err);
481         handleErr(err);
482     } else {
483         wrkCL[imgNum] = nullptr;
484 #ifdef DEBUG_OPENCL
485         CPLDebug("OpenCL", "Using fallback non-pinned memory!");
486 #endif
487         //Fallback to regular allocation
488         wrkPtr[imgNum] = VSI_MALLOC_VERBOSE(dataSz);
489 
490         if (wrkPtr[imgNum] == nullptr)
491             handleErr(err = CL_OUT_OF_HOST_MEMORY);
492     }
493 
494     return CL_SUCCESS;
495 }
496 
497 /*
498  Allocates the working host memory for all bands of the image in the warper
499  structure. This includes both the source image buffers and the destination
500  buffers. This memory is located on the host, so we can assemble the image.
501  Reasons for buffering it like this include reading each row from disk and
502  de-interleaving bands and parts of bands. Then they can be copied to the device
503  as a single operation fit for use as an OpenCL memory object.
504 
505  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
506  */
507 static
alloc_working_arr(struct oclWarper * warper,size_t ptrSz,size_t dataSz,CPL_UNUSED size_t * fmtSz)508 cl_int alloc_working_arr(struct oclWarper *warper,
509                          size_t ptrSz, size_t dataSz, CPL_UNUSED size_t *fmtSz)
510 {
511     cl_int err = CL_SUCCESS;
512     int i, b;
513     size_t srcDataSz1, dstDataSz1, srcDataSz4, dstDataSz4;
514     const int numBands = warper->numBands;
515 
516     //Find the best channel order for this format
517     err = set_supported_formats(warper, 1,
518                                 &(warper->imgChOrder1), &(warper->imgChSize1),
519                                 warper->imageFormat);
520     handleErr(err);
521     if(warper->useVec) {
522         err = set_supported_formats(warper, 4,
523                                     &(warper->imgChOrder4), &(warper->imgChSize4),
524                                     warper->imageFormat);
525         handleErr(err);
526     }
527 
528     //Alloc space for pointers to the main image data
529     warper->realWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
530     warper->dstRealWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
531     if (warper->realWork.v == nullptr || warper->dstRealWork.v == nullptr)
532         handleErr(err = CL_OUT_OF_HOST_MEMORY);
533 
534     if (warper->imagWorkCL != nullptr) {
535         //Alloc space for pointers to the extra channel, if it exists
536         warper->imagWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
537         warper->dstImagWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
538         if (warper->imagWork.v == nullptr || warper->dstImagWork.v == nullptr)
539             handleErr(err = CL_OUT_OF_HOST_MEMORY);
540     } else {
541         warper->imagWork.v = nullptr;
542         warper->dstImagWork.v = nullptr;
543     }
544 
545     //Calc the sizes we need
546     srcDataSz1 = dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize1;
547     dstDataSz1 = dataSz * warper->dstWidth * warper->dstHeight;
548     srcDataSz4 = dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize4;
549     dstDataSz4 = dataSz * warper->dstWidth * warper->dstHeight * 4;
550 
551     //Allocate pinned memory for each band's image
552     for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i) {
553         if(warper->useVec && b < numBands - numBands % 4) {
554             handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
555                                              warper->realWork.v,
556                                              warper->realWorkCL));
557 
558             handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
559                                              warper->dstRealWork.v,
560                                              warper->dstRealWorkCL));
561             b += 4;
562         } else {
563             handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
564                                              warper->realWork.v,
565                                              warper->realWorkCL));
566 
567             handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
568                                              warper->dstRealWork.v,
569                                              warper->dstRealWorkCL));
570             ++b;
571         }
572     }
573 
574     if (warper->imagWorkCL != nullptr) {
575         //Allocate pinned memory for each band's extra channel, if exists
576         for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i) {
577             if(warper->useVec && b < numBands - numBands % 4) {
578                 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
579                                                  warper->imagWork.v,
580                                                  warper->imagWorkCL));
581 
582                 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
583                                                  warper->dstImagWork.v,
584                                                  warper->dstImagWorkCL));
585                 b += 4;
586             } else {
587                 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
588                                                  warper->imagWork.v,
589                                                  warper->imagWorkCL));
590 
591                 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
592                                                  warper->dstImagWork.v,
593                                                  warper->dstImagWorkCL));
594                 ++b;
595             }
596         }
597     }
598 
599     return CL_SUCCESS;
600 }
601 
602 /*
603  Assemble and create the kernel. For optimization, portability, and
604  implementation limitation reasons, the program is actually assembled from
605  several strings, then compiled with as many invariants as possible defined by
606  the preprocessor. There is also quite a bit of error-catching code in here
607  because the kernel is where many bugs show up.
608 
609  Returns CL_SUCCESS on success and other CL_* errors in the error buffer when
610  something goes wrong.
611  */
612 static
get_kernel(struct oclWarper * warper,char useVec,double dfXScale,double dfYScale,double dfXFilter,double dfYFilter,int nXRadius,int nYRadius,int nFiltInitX,int nFiltInitY,cl_int * clErr)613 cl_kernel get_kernel(struct oclWarper *warper, char useVec,
614                      double dfXScale, double dfYScale, double dfXFilter, double dfYFilter,
615                      int nXRadius, int nYRadius, int nFiltInitX, int nFiltInitY,
616                      cl_int *clErr )
617 {
618     cl_program program;
619     cl_kernel kernel;
620     cl_int err = CL_SUCCESS;
621 #define PROGBUF_SIZE 128000
622     char *buffer = static_cast<char *>(CPLCalloc(PROGBUF_SIZE, sizeof(char)));
623     char *progBuf = static_cast<char *>(CPLCalloc(PROGBUF_SIZE, sizeof(char)));
624     float dstMinVal = 0.f, dstMaxVal = 0.0;
625 
626     const char *outType;
627     const char *dUseVec = "";
628     const char *dVecf = "float";
629     const char *kernGenFuncs = R""""(
630 // ********************* General Funcs ********************
631 void clampToDst(float fReal,
632                 __global outType *dstPtr,
633                 unsigned int iDstOffset,
634                 __constant float *fDstNoDataReal,
635                 int bandNum);
636 void setPixel(__global outType *dstReal,
637                 __global outType *dstImag,
638                 __global float *dstDensity,
639                 __global int *nDstValid,
640                 __constant float *fDstNoDataReal,
641                 const int bandNum,
642                 vecf fDensity, vecf fReal, vecf fImag);
643 int getPixel(__read_only image2d_t srcReal,
644                 __read_only image2d_t srcImag,
645                 __global float *fUnifiedSrcDensity,
646                 __global int *nUnifiedSrcValid,
647                 __constant char *useBandSrcValid,
648                 __global int *nBandSrcValid,
649                 const int2 iSrc,
650                 int bandNum,
651                 vecf *fDensity, vecf *fReal, vecf *fImag);
652 int isValid(__global float *fUnifiedSrcDensity,
653                 __global int *nUnifiedSrcValid,
654                 float2 fSrcCoords );
655 float2 getSrcCoords(__read_only image2d_t srcCoords);
656 
657 #ifdef USE_CLAMP_TO_DST_FLOAT
658 void clampToDst(float fReal,
659                 __global outType *dstPtr,
660                 unsigned int iDstOffset,
661                 __constant float *fDstNoDataReal,
662                 int bandNum)
663 {
664     dstPtr[iDstOffset] = fReal;
665 }
666 #else
667 void clampToDst(float fReal,
668                 __global outType *dstPtr,
669                 unsigned int iDstOffset,
670                 __constant float *fDstNoDataReal,
671                 int bandNum)
672 {
673     fReal *= dstMaxVal;
674 
675     if (fReal < dstMinVal)
676         dstPtr[iDstOffset] = (outType)dstMinVal;
677     else if (fReal > dstMaxVal)
678         dstPtr[iDstOffset] = (outType)dstMaxVal;
679     else
680         dstPtr[iDstOffset] = (dstMinVal < 0) ? (outType)floor(fReal + 0.5f) : (outType)(fReal + 0.5f);
681 
682     if (useDstNoDataReal && bandNum >= 0 &&
683         fDstNoDataReal[bandNum] == dstPtr[iDstOffset])
684     {
685         if (dstPtr[iDstOffset] == dstMinVal)
686             dstPtr[iDstOffset] = dstMinVal + 1;
687         else
688             dstPtr[iDstOffset] = dstPtr[iDstOffset] - 1;
689     }
690 }
691 #endif
692 
693 void setPixel(__global outType *dstReal,
694               __global outType *dstImag,
695               __global float *dstDensity,
696               __global int *nDstValid,
697               __constant float *fDstNoDataReal,
698               const int bandNum,
699               vecf fDensity, vecf fReal, vecf fImag)
700 {
701     unsigned int iDstOffset = get_global_id(1)*iDstWidth + get_global_id(0);
702 
703 #ifdef USE_VEC
704     if (fDensity.x < 0.00001f && fDensity.y < 0.00001f &&
705         fDensity.z < 0.00001f && fDensity.w < 0.00001f ) {
706 
707         fReal = 0.0f;
708         fImag = 0.0f;
709 
710     } else if (fDensity.x < 0.9999f || fDensity.y < 0.9999f ||
711                fDensity.z < 0.9999f || fDensity.w < 0.9999f ) {
712         vecf fDstReal, fDstImag;
713         float fDstDensity;
714 
715         fDstReal.x = dstReal[iDstOffset];
716         fDstReal.y = dstReal[iDstOffset+iDstHeight*iDstWidth];
717         fDstReal.z = dstReal[iDstOffset+iDstHeight*iDstWidth*2];
718         fDstReal.w = dstReal[iDstOffset+iDstHeight*iDstWidth*3];
719         if (useImag) {
720             fDstImag.x = dstImag[iDstOffset];
721             fDstImag.y = dstImag[iDstOffset+iDstHeight*iDstWidth];
722             fDstImag.z = dstImag[iDstOffset+iDstHeight*iDstWidth*2];
723             fDstImag.w = dstImag[iDstOffset+iDstHeight*iDstWidth*3];
724         }
725 #else
726     if (fDensity < 0.00001f) {
727 
728         fReal = 0.0f;
729         fImag = 0.0f;
730 
731     } else if (fDensity < 0.9999f) {
732         vecf fDstReal, fDstImag;
733         float fDstDensity;
734 
735         fDstReal = dstReal[iDstOffset];
736         if (useImag)
737             fDstImag = dstImag[iDstOffset];
738 #endif
739 
740         if (useDstDensity)
741             fDstDensity = dstDensity[iDstOffset];
742         else if (useDstValid &&
743                  !((nDstValid[iDstOffset>>5] & (0x01 << (iDstOffset & 0x1f))) ))
744             fDstDensity = 0.0f;
745         else
746             fDstDensity = 1.0f;
747 
748         vecf fDstInfluence = (1.0f - fDensity) * fDstDensity;
749 
750         // Density should be checked for <= 0.0 & handled by the calling function
751         fReal = (fReal * fDensity + fDstReal * fDstInfluence) / (fDensity + fDstInfluence);
752         if (useImag)
753             fImag = (fImag * fDensity + fDstImag * fDstInfluence) / (fDensity + fDstInfluence);
754     }
755 
756 #ifdef USE_VEC
757     clampToDst(fReal.x, dstReal, iDstOffset, fDstNoDataReal, bandNum);
758     clampToDst(fReal.y, dstReal, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, bandNum);
759     clampToDst(fReal.z, dstReal, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, bandNum);
760     clampToDst(fReal.w, dstReal, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, bandNum);
761     if (useImag) {
762         clampToDst(fImag.x, dstImag, iDstOffset, fDstNoDataReal, -1);
763         clampToDst(fImag.y, dstImag, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, -1);
764         clampToDst(fImag.z, dstImag, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, -1);
765         clampToDst(fImag.w, dstImag, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, -1);
766     }
767 #else
768     clampToDst(fReal, dstReal, iDstOffset, fDstNoDataReal, bandNum);
769     if (useImag)
770         clampToDst(fImag, dstImag, iDstOffset, fDstNoDataReal, -1);
771 #endif
772 }
773 
774 int getPixel(__read_only image2d_t srcReal,
775              __read_only image2d_t srcImag,
776              __global float *fUnifiedSrcDensity,
777              __global int *nUnifiedSrcValid,
778              __constant char *useBandSrcValid,
779              __global int *nBandSrcValid,
780              const int2 iSrc,
781              int bandNum,
782              vecf *fDensity, vecf *fReal, vecf *fImag)
783 {
784     int iSrcOffset = 0, iBandValidLen = 0, iSrcOffsetMask = 0;
785     int bHasValid = FALSE;
786 
787     // Clamp the src offset values if needed
788     if(useUnifiedSrcDensity | useUnifiedSrcValid | useUseBandSrcValid){
789         int iSrcX = iSrc.x;
790         int iSrcY = iSrc.y;
791 
792         // Needed because the offset isn't clamped in OpenCL hardware
793         if(iSrcX < 0)
794             iSrcX = 0;
795         else if(iSrcX >= iSrcWidth)
796             iSrcX = iSrcWidth - 1;
797 
798         if(iSrcY < 0)
799             iSrcY = 0;
800         else if(iSrcY >= iSrcHeight)
801             iSrcY = iSrcHeight - 1;
802 
803         iSrcOffset = iSrcY*iSrcWidth + iSrcX;
804         iBandValidLen = 1 + ((iSrcWidth*iSrcHeight)>>5);
805         iSrcOffsetMask = (0x01 << (iSrcOffset & 0x1f));
806     }
807 
808     if (useUnifiedSrcValid &&
809         !((nUnifiedSrcValid[iSrcOffset>>5] & iSrcOffsetMask) ) )
810         return FALSE;
811 
812 #ifdef USE_VEC
813     if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
814         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum    ] & iSrcOffsetMask)) )
815         bHasValid = TRUE;
816 
817     if (!useUseBandSrcValid || !useBandSrcValid[bandNum+1] ||
818         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(1+bandNum)] & iSrcOffsetMask)) )
819         bHasValid = TRUE;
820 
821     if (!useUseBandSrcValid || !useBandSrcValid[bandNum+2] ||
822         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(2+bandNum)] & iSrcOffsetMask)) )
823         bHasValid = TRUE;
824 
825     if (!useUseBandSrcValid || !useBandSrcValid[bandNum+3] ||
826         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(3+bandNum)] & iSrcOffsetMask)) )
827         bHasValid = TRUE;
828 #else
829     if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
830         ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum    ] & iSrcOffsetMask)) )
831         bHasValid = TRUE;
832 #endif
833 
834     if (!bHasValid)
835         return FALSE;
836 
837     const sampler_t samp =  CLK_NORMALIZED_COORDS_FALSE |
838                             CLK_ADDRESS_CLAMP_TO_EDGE |
839                             CLK_FILTER_NEAREST;
840 
841 #ifdef USE_VEC
842     (*fReal) = read_imagef(srcReal, samp, iSrc);
843     if (useImag)
844         (*fImag) = read_imagef(srcImag, samp, iSrc);
845 #else
846     (*fReal) = read_imagef(srcReal, samp, iSrc).x;
847     if (useImag)
848         (*fImag) = read_imagef(srcImag, samp, iSrc).x;
849 #endif
850 
851     if (useUnifiedSrcDensity) {
852         (*fDensity) = fUnifiedSrcDensity[iSrcOffset];
853     } else {
854         (*fDensity) = 1.0f;
855         return TRUE;
856     }
857 
858 #ifdef USE_VEC
859     return  (*fDensity).x > 0.0000001f || (*fDensity).y > 0.0000001f ||
860             (*fDensity).z > 0.0000001f || (*fDensity).w > 0.0000001f;
861 #else
862     return (*fDensity) > 0.0000001f;
863 #endif
864 }
865 
866 int isValid(__global float *fUnifiedSrcDensity,
867             __global int *nUnifiedSrcValid,
868             float2 fSrcCoords )
869 {
870     if (fSrcCoords.x < 0.0f || fSrcCoords.y < 0.0f)
871         return FALSE;
872 
873     int iSrcX = (int) (fSrcCoords.x - 0.5f);
874     int iSrcY = (int) (fSrcCoords.y - 0.5f);
875 
876     if( iSrcX < 0 || iSrcX >= iSrcWidth || iSrcY < 0 || iSrcY >= iSrcHeight )
877         return FALSE;
878 
879     int iSrcOffset = iSrcX + iSrcY * iSrcWidth;
880 
881     if (useUnifiedSrcDensity && fUnifiedSrcDensity[iSrcOffset] < 0.00001f)
882         return FALSE;
883 
884     if (useUnifiedSrcValid &&
885         !(nUnifiedSrcValid[iSrcOffset>>5] & (0x01 << (iSrcOffset & 0x1f))) )
886         return FALSE;
887 
888     return TRUE;
889 }
890 
891 float2 getSrcCoords(__read_only image2d_t srcCoords)
892 {
893     // Find an appropriate place to sample the coordinates so we're still
894     // accurate after linear interpolation.
895     int nDstX = get_global_id(0);
896     int nDstY = get_global_id(1);
897     float2  fDst = (float2)((0.5f * (float)iCoordMult + nDstX) /
898                                 (float)((ceil((iDstWidth  - 1) / (float)iCoordMult) + 1) * iCoordMult),
899                             (0.5f * (float)iCoordMult + nDstY) /
900                                 (float)((ceil((iDstHeight - 1) / (float)iCoordMult) + 1) * iCoordMult));
901 
902     // Check & return when the thread group overruns the image size
903     if (nDstX >= iDstWidth || nDstY >= iDstHeight)
904         return (float2)(-99.0f, -99.0f);
905 
906     const sampler_t samp =  CLK_NORMALIZED_COORDS_TRUE |
907                             CLK_ADDRESS_CLAMP_TO_EDGE |
908                             CLK_FILTER_LINEAR;
909 
910     float4  fSrcCoords = read_imagef(srcCoords,samp,fDst);
911 
912     return (float2)(fSrcCoords.x, fSrcCoords.y);
913 }
914 )"""";
915 
916     const char *kernBilinear = R""""(
917 // ************************ Bilinear ************************
918 __kernel void resamp(__read_only image2d_t srcCoords,
919                     __read_only image2d_t srcReal,
920                     __read_only image2d_t srcImag,
921                     __global float *fUnifiedSrcDensity,
922                     __global int *nUnifiedSrcValid,
923                     __constant char *useBandSrcValid,
924                     __global int *nBandSrcValid,
925                     __global outType *dstReal,
926                     __global outType *dstImag,
927                     __constant float *fDstNoDataReal,
928                     __global float *dstDensity,
929                     __global int *nDstValid,
930                     const int bandNum)
931 {
932     float2  fSrc = getSrcCoords(srcCoords);
933     if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
934         return;
935 
936     int     iSrcX = (int) floor(fSrc.x - 0.5f);
937     int     iSrcY = (int) floor(fSrc.y - 0.5f);
938     float   fRatioX = 1.5f - (fSrc.x - iSrcX);
939     float   fRatioY = 1.5f - (fSrc.y - iSrcY);
940     vecf    fReal, fImag, fDens;
941     vecf    fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
942     vecf    fAccumulatorDensity = 0.0f;
943     float   fAccumulatorDivisor = 0.0f;
944 
945     if ( iSrcY >= 0 && iSrcY < iSrcHeight ) {
946         float fMult1 = fRatioX * fRatioY;
947         float fMult2 = (1.0f-fRatioX) * fRatioY;
948 
949                 // Upper Left Pixel
950                 if ( iSrcX >= 0 && iSrcX < iSrcWidth
951                          && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
952                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY),
953                                                 bandNum, &fDens, &fReal, &fImag))
954                 {
955                         fAccumulatorDivisor += fMult1;
956                         fAccumulatorReal += fReal * fMult1;
957                         fAccumulatorImag += fImag * fMult1;
958                         fAccumulatorDensity += fDens * fMult1;
959                 }
960 
961                 // Upper Right Pixel
962                 if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
963                         && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
964                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY),
965                                                 bandNum, &fDens, &fReal, &fImag))
966                 {
967                         fAccumulatorDivisor += fMult2;
968                         fAccumulatorReal += fReal * fMult2;
969                         fAccumulatorImag += fImag * fMult2;
970                         fAccumulatorDensity += fDens * fMult2;
971                 }
972     }
973 
974     if ( iSrcY+1 >= 0 && iSrcY+1 < iSrcHeight ) {
975         float fMult1 = fRatioX * (1.0f-fRatioY);
976         float fMult2 = (1.0f-fRatioX) * (1.0f-fRatioY);
977 
978         // Lower Left Pixel
979                 if ( iSrcX >= 0 && iSrcX < iSrcWidth
980                         && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
981                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY+1),
982                                                 bandNum, &fDens, &fReal, &fImag))
983                 {
984                         fAccumulatorDivisor += fMult1;
985                         fAccumulatorReal += fReal * fMult1;
986                         fAccumulatorImag += fImag * fMult1;
987                         fAccumulatorDensity += fDens * fMult1;
988                 }
989 
990                 // Lower Right Pixel
991                 if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
992                         && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
993                                                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+1),
994                                                 bandNum, &fDens, &fReal, &fImag))
995                 {
996                         fAccumulatorDivisor += fMult2;
997                         fAccumulatorReal += fReal * fMult2;
998                         fAccumulatorImag += fImag * fMult2;
999                         fAccumulatorDensity += fDens * fMult2;
1000                 }
1001     }
1002 
1003     // Compute and save final pixel
1004     if ( fAccumulatorDivisor < 0.00001f ) {
1005         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1006                 0.0f, 0.0f, 0.0f );
1007     } else if ( fAccumulatorDivisor < 0.99999f || fAccumulatorDivisor > 1.00001f ) {
1008         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1009                 fAccumulatorDensity / fAccumulatorDivisor,
1010                 fAccumulatorReal / fAccumulatorDivisor,
1011 #if useImag != 0
1012                 fAccumulatorImag / fAccumulatorDivisor );
1013 #else
1014                 0.0f );
1015 #endif
1016     } else {
1017         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1018                 fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag );
1019     }
1020 }
1021 )"""";
1022 
1023     const char *kernCubic = R""""(
1024 // ************************ Cubic ************************
1025 vecf cubicConvolution(float dist1, float dist2, float dist3,
1026                         vecf f0, vecf f1, vecf f2, vecf f3);
1027 
1028 vecf cubicConvolution(float dist1, float dist2, float dist3,
1029                        vecf f0, vecf f1, vecf f2, vecf f3)
1030 {
1031    return (  f1
1032        + 0.5f * (dist1*(f2 - f0)
1033                + dist2*(2.0f*f0 - 5.0f*f1 + 4.0f*f2 - f3)
1034                + dist3*(3.0f*(f1 - f2) + f3 - f0)));
1035 }
1036 
1037 // ************************ Cubic ************************
1038 __kernel void resamp(__read_only image2d_t srcCoords,
1039                      __read_only image2d_t srcReal,
1040                      __read_only image2d_t srcImag,
1041                      __global float *fUnifiedSrcDensity,
1042                      __global int *nUnifiedSrcValid,
1043                      __constant char *useBandSrcValid,
1044                      __global int *nBandSrcValid,
1045                      __global outType *dstReal,
1046                      __global outType *dstImag,
1047                      __constant float *fDstNoDataReal,
1048                      __global float *dstDensity,
1049                      __global int *nDstValid,
1050                      const int bandNum)
1051 {
1052     int i;
1053     float2  fSrc = getSrcCoords(srcCoords);
1054 
1055     if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1056         return;
1057 
1058     int     iSrcX = (int) floor( fSrc.x - 0.5f );
1059     int     iSrcY = (int) floor( fSrc.y - 0.5f );
1060     float   fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1061     float   fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1062     float   fDeltaX2 = fDeltaX * fDeltaX;
1063     float   fDeltaY2 = fDeltaY * fDeltaY;
1064     float   fDeltaX3 = fDeltaX2 * fDeltaX;
1065     float   fDeltaY3 = fDeltaY2 * fDeltaY;
1066     vecf    afReal[4], afDens[4];
1067 #if useImag != 0
1068     vecf    afImag[4];
1069 #else
1070     vecf    fImag = 0.0f;
1071 #endif
1072 
1073     // Loop over rows
1074     for (i = -1; i < 3; ++i)
1075     {
1076         vecf    fReal1 = 0.0f, fReal2 = 0.0f, fReal3 = 0.0f, fReal4 = 0.0f;
1077         vecf    fDens1 = 0.0f, fDens2 = 0.0f, fDens3 = 0.0f, fDens4 = 0.0f;
1078         int hasPx;
1079 #if useImag != 0
1080         vecf    fImag1 = 0.0f, fImag2 = 0.0f, fImag3 = 0.0f, fImag4 = 0.0f;
1081 
1082         //Get all the pixels for this row
1083         hasPx  = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1084                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1085                         bandNum, &fDens1, &fReal1, &fImag1);
1086 
1087         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1088                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX  , iSrcY+i),
1089                         bandNum, &fDens2, &fReal2, &fImag2);
1090 
1091         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1092                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1093                         bandNum, &fDens3, &fReal3, &fImag3);
1094 
1095         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1096                         useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1097                         bandNum, &fDens4, &fReal4, &fImag4);
1098 #else
1099         //Get all the pixels for this row
1100         hasPx  = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1101                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1102                 bandNum, &fDens1, &fReal1, &fImag);
1103 
1104         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1105                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX  , iSrcY+i),
1106                 bandNum, &fDens2, &fReal2, &fImag);
1107 
1108         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1109                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1110                 bandNum, &fDens3, &fReal3, &fImag);
1111 
1112         hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1113                 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1114                 bandNum, &fDens4, &fReal4, &fImag);
1115 #endif
1116 
1117         // Shortcut if no px
1118         if (!hasPx) {
1119             afDens[i+1] = 0.0f;
1120             afReal[i+1] = 0.0f;
1121 #if useImag != 0
1122             afImag[i+1] = 0.0f;
1123 #endif
1124             continue;
1125         }
1126 
1127         // Process this row
1128         afDens[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fDens1, fDens2, fDens3, fDens4);
1129         afReal[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fReal1, fReal2, fReal3, fReal4);
1130 #if useImag != 0
1131         afImag[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fImag1, fImag2, fImag3, fImag4);
1132 #endif
1133     }
1134 
1135     // Compute and save final pixel
1136     setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1137              cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afDens[0], afDens[1], afDens[2], afDens[3]),
1138              cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afReal[0], afReal[1], afReal[2], afReal[3]),
1139 #if useImag != 0
1140              cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afImag[0], afImag[1], afImag[2], afImag[3]) );
1141 #else
1142              fImag );
1143 #endif
1144 }
1145 )"""";
1146 
1147 
1148     const char *kernResampler = R""""(
1149 // ************************ LanczosSinc ************************
1150 
1151 float lanczosSinc( float fX, float fR );
1152 float bSpline( float x );
1153 
1154 float lanczosSinc( float fX, float fR )
1155 {
1156     if ( fX > fR || fX < -fR)
1157         return 0.0f;
1158     if ( fX == 0.0f )
1159         return 1.0f;
1160 
1161     float fPIX = PI * fX;
1162     return ( sin(fPIX) / fPIX ) * ( sin(fPIX / fR) * fR / fPIX );
1163 }
1164 
1165 // ************************ Bicubic Spline ************************
1166 
1167 float bSpline( float x )
1168 {
1169     float xp2 = x + 2.0f;
1170     float xp1 = x + 1.0f;
1171     float xm1 = x - 1.0f;
1172     float xp2c = xp2 * xp2 * xp2;
1173 
1174     return (((xp2 > 0.0f)?((xp1 > 0.0f)?((x > 0.0f)?((xm1 > 0.0f)?
1175                                                      -4.0f * xm1*xm1*xm1:0.0f) +
1176                                          6.0f * x*x*x:0.0f) +
1177                            -4.0f * xp1*xp1*xp1:0.0f) +
1178              xp2c:0.0f) ) * 0.166666666666666666666f;
1179 }
1180 
1181 // ************************ General Resampler ************************
1182 
1183 __kernel void resamp(__read_only image2d_t srcCoords,
1184                      __read_only image2d_t srcReal,
1185                      __read_only image2d_t srcImag,
1186                      __global float *fUnifiedSrcDensity,
1187                      __global int *nUnifiedSrcValid,
1188                      __constant char *useBandSrcValid,
1189                      __global int *nBandSrcValid,
1190                      __global outType *dstReal,
1191                      __global outType *dstImag,
1192                      __constant float *fDstNoDataReal,
1193                      __global float *dstDensity,
1194                      __global int *nDstValid,
1195                      const int bandNum)
1196 {
1197     float2  fSrc = getSrcCoords(srcCoords);
1198 
1199     if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1200         return;
1201 
1202     int     iSrcX = floor( fSrc.x - 0.5f );
1203     int     iSrcY = floor( fSrc.y - 0.5f );
1204     float   fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1205     float   fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1206 
1207     vecf  fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
1208     vecf  fAccumulatorDensity = 0.0f;
1209     float fAccumulatorWeight = 0.0f;
1210     int   i, j;
1211 
1212     // Loop over pixel rows in the kernel
1213     for ( j = nFiltInitY; j <= nYRadius; ++j )
1214     {
1215         float   fWeight1;
1216         int2 iSrc = (int2)(0, iSrcY + j);
1217 
1218         // Skip sampling over edge of image
1219         if ( iSrc.y < 0 || iSrc.y >= iSrcHeight )
1220             continue;
1221 
1222         // Select the resampling algorithm
1223         if ( doCubicSpline )
1224             // Calculate the Y weight
1225             fWeight1 = ( fYScale < 1.0f ) ?
1226                 bSpline(((float)j) * fYScale) * fYScale :
1227                 bSpline(((float)j) - fDeltaY);
1228         else
1229             fWeight1 = ( fYScale < 1.0f ) ?
1230                 lanczosSinc(j * fYScale, fYFilter) * fYScale :
1231                 lanczosSinc(j - fDeltaY, fYFilter);
1232 
1233         // Iterate over pixels in row
1234         for ( i = nFiltInitX; i <= nXRadius; ++i )
1235         {
1236             float fWeight2;
1237             vecf fDensity = 0.0f, fReal, fImag;
1238             iSrc.x = iSrcX + i;
1239 
1240             // Skip sampling at edge of image
1241             // Skip sampling when invalid pixel
1242             if ( iSrc.x < 0 || iSrc.x >= iSrcWidth ||
1243                   !getPixel(srcReal, srcImag, fUnifiedSrcDensity,
1244                             nUnifiedSrcValid, useBandSrcValid, nBandSrcValid,
1245                             iSrc, bandNum, &fDensity, &fReal, &fImag) )
1246                 continue;
1247 
1248             // Choose among possible algorithms
1249             if ( doCubicSpline )
1250                 // Calculate & save the X weight
1251                 fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1252                     bSpline((float)i * fXScale) * fXScale :
1253                     bSpline(fDeltaX - (float)i));
1254             else
1255                 // Calculate & save the X weight
1256                 fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1257                     lanczosSinc(i * fXScale, fXFilter) * fXScale :
1258                     lanczosSinc(i - fDeltaX, fXFilter));
1259 
1260             // Accumulate!
1261             fAccumulatorReal += fReal * fWeight2;
1262             fAccumulatorImag += fImag * fWeight2;
1263             fAccumulatorDensity += fDensity * fWeight2;
1264             fAccumulatorWeight += fWeight2;
1265         }
1266     }
1267 
1268     if ( fAccumulatorWeight < 0.000001f ) {
1269         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1270                  0.0f, 0.0f, 0.0f);
1271     } else if ( fAccumulatorWeight < 0.99999f || fAccumulatorWeight > 1.00001f ) {
1272         // Calculate the output taking into account weighting
1273         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1274                  fAccumulatorDensity / fAccumulatorWeight,
1275                  fAccumulatorReal / fAccumulatorWeight,
1276 #if useImag != 0
1277                  fAccumulatorImag / fAccumulatorWeight );
1278 #else
1279                  0.0f );
1280 #endif
1281     } else {
1282         setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1283                  fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag);
1284     }
1285 }
1286 )"""";
1287 
1288     //Defines based on image format
1289     switch (warper->imageFormat) {
1290         case CL_FLOAT:
1291             dstMinVal = std::numeric_limits<float>::lowest();
1292             dstMaxVal = std::numeric_limits<float>::max();
1293             outType = "float";
1294             break;
1295         case CL_SNORM_INT8:
1296             dstMinVal = -128.0;
1297             dstMaxVal = 127.0;
1298             outType = "char";
1299             break;
1300         case CL_UNORM_INT8:
1301             dstMinVal = 0.0;
1302             dstMaxVal = 255.0;
1303             outType = "uchar";
1304             break;
1305         case CL_SNORM_INT16:
1306             dstMinVal = -32768.0;
1307             dstMaxVal = 32767.0;
1308             outType = "short";
1309             break;
1310         case CL_UNORM_INT16:
1311             dstMinVal = 0.0;
1312             dstMaxVal = 65535.0;
1313             outType = "ushort";
1314             break;
1315     }
1316 
1317     //Use vector format?
1318     if (useVec) {
1319         dUseVec = "-D USE_VEC";
1320         dVecf = "float4";
1321     }
1322 
1323     //Assemble the kernel from parts. The compiler is unable to handle multiple
1324     //kernels in one string with more than a few __constant modifiers each.
1325     if (warper->resampAlg == OCL_Bilinear)
1326         snprintf(progBuf, PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernBilinear);
1327     else if (warper->resampAlg == OCL_Cubic)
1328         snprintf(progBuf, PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernCubic);
1329     else
1330         snprintf(progBuf, PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernResampler);
1331 
1332     //Actually make the program from assembled source
1333     const char* pszProgBuf = progBuf;
1334     program = clCreateProgramWithSource(warper->context, 1,
1335                                         &pszProgBuf,
1336                                         nullptr, &err);
1337     handleErrGoto(err, error_final);
1338 
1339     //Assemble the compiler arg string for speed. All invariants should be defined here.
1340     snprintf(buffer, PROGBUF_SIZE,
1341              "-cl-fast-relaxed-math -Werror -D FALSE=0 -D TRUE=1 "
1342             "%s"
1343             "-D iSrcWidth=%d -D iSrcHeight=%d -D iDstWidth=%d -D iDstHeight=%d "
1344             "-D useUnifiedSrcDensity=%d -D useUnifiedSrcValid=%d "
1345             "-D useDstDensity=%d -D useDstValid=%d -D useImag=%d "
1346             "-D fXScale=%015.15lff -D fYScale=%015.15lff -D fXFilter=%015.15lff -D fYFilter=%015.15lff "
1347             "-D nXRadius=%d -D nYRadius=%d -D nFiltInitX=%d -D nFiltInitY=%d "
1348             "-D PI=%015.15lff -D outType=%s -D dstMinVal=%015.15lff -D dstMaxVal=%015.15lff "
1349             "-D useDstNoDataReal=%d -D vecf=%s %s -D doCubicSpline=%d "
1350             "-D useUseBandSrcValid=%d -D iCoordMult=%d ",
1351             (warper->imageFormat == CL_FLOAT) ? "-D USE_CLAMP_TO_DST_FLOAT=1 " : "",
1352             warper->srcWidth, warper->srcHeight, warper->dstWidth, warper->dstHeight,
1353             warper->useUnifiedSrcDensity, warper->useUnifiedSrcValid,
1354             warper->useDstDensity, warper->useDstValid, warper->imagWorkCL != nullptr,
1355             dfXScale, dfYScale, dfXFilter, dfYFilter,
1356             nXRadius, nYRadius, nFiltInitX, nFiltInitY,
1357             M_PI, outType, dstMinVal, dstMaxVal, warper->fDstNoDataRealCL != nullptr,
1358             dVecf, dUseVec, warper->resampAlg == OCL_CubicSpline,
1359             warper->nBandSrcValidCL != nullptr, warper->coordMult);
1360 
1361     (*clErr) = err = clBuildProgram(program, 1, &(warper->dev), buffer, nullptr, nullptr);
1362 
1363     //Detailed debugging info
1364     if (err != CL_SUCCESS)
1365     {
1366         const char* pszStatus = "unknown_status";
1367         err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_LOG,
1368                                     128000*sizeof(char), buffer, nullptr);
1369         handleErrGoto(err, error_free_program);
1370 
1371         CPLError(CE_Failure, CPLE_AppDefined, "Error: Failed to build program executable!\nBuild Log:\n%s", buffer);
1372 
1373         err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_STATUS,
1374                                     128000*sizeof(char), buffer, nullptr);
1375         handleErrGoto(err, error_free_program);
1376 
1377         if(buffer[0] == CL_BUILD_NONE)
1378             pszStatus = "CL_BUILD_NONE";
1379         else if(buffer[0] == CL_BUILD_ERROR)
1380             pszStatus = "CL_BUILD_ERROR";
1381         else if(buffer[0] == CL_BUILD_SUCCESS)
1382             pszStatus = "CL_BUILD_SUCCESS";
1383         else if(buffer[0] == CL_BUILD_IN_PROGRESS)
1384             pszStatus = "CL_BUILD_IN_PROGRESS";
1385 
1386         CPLDebug("OpenCL", "Build Status: %s\nProgram Source:\n%s", pszStatus, progBuf);
1387         goto error_free_program;
1388     }
1389 
1390     kernel = clCreateKernel(program, "resamp", &err);
1391     handleErrGoto(err, error_free_program);
1392 
1393     err = clReleaseProgram(program);
1394     handleErrGoto(err, error_final);
1395 
1396     CPLFree(buffer);
1397     CPLFree(progBuf);
1398     return kernel;
1399 
1400 error_free_program:
1401     err = clReleaseProgram(program);
1402 
1403 error_final:
1404     CPLFree(buffer);
1405     CPLFree(progBuf);
1406     return nullptr;
1407 }
1408 
1409 /*
1410  Alloc & copy the coordinate data from host working memory to the device. The
1411  working memory should be a pinned, linear, array of floats. This allows us to
1412  allocate and copy all data in one step. The pointer to the device memory is
1413  saved and set as the appropriate argument number.
1414 
1415  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1416  */
1417 static
1418 cl_int set_coord_data (struct oclWarper *warper, cl_mem *xy)
1419 {
1420     cl_int err = CL_SUCCESS;
1421     cl_image_format imgFmt;
1422 
1423     //Copy coord data to the device
1424     imgFmt.image_channel_order = warper->xyChOrder;
1425     imgFmt.image_channel_data_type = CL_FLOAT;
1426 
1427 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1428 #pragma GCC diagnostic push
1429 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1430 #endif
1431     (*xy) = clCreateImage2D(warper->context,
1432                             CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1433                             static_cast<size_t>(warper->xyWidth),
1434                             static_cast<size_t>(warper->xyHeight),
1435                             sizeof(float) * warper->xyChSize * warper->xyWidth,
1436                             warper->xyWork, &err);
1437 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1438 #pragma GCC diagnostic pop
1439 #endif
1440     handleErr(err);
1441 
1442     //Free the source memory, now that it's copied we don't need it
1443     freeCLMem(warper->xyWorkCL, warper->xyWork);
1444 
1445     //Set up argument
1446     if (warper->kern1 != nullptr) {
1447         handleErr(err = clSetKernelArg(warper->kern1, 0, sizeof(cl_mem), xy));
1448     }
1449     if (warper->kern4 != nullptr) {
1450         handleErr(err = clSetKernelArg(warper->kern4, 0, sizeof(cl_mem), xy));
1451     }
1452 
1453     return CL_SUCCESS;
1454 }
1455 
1456 /*
1457  Sets the unified density & valid data structures. These are optional structures
1458  from GDAL, and as such if they are NULL a small placeholder memory segment is
1459  defined. This is because the spec is unclear on if a NULL value can be passed
1460  as a kernel argument in place of memory. If it's not NULL, the data is copied
1461  from the working memory to the device memory. After that, we check if we are
1462  using the per-band validity mask, and set that as appropriate. At the end, the
1463  CL mem is passed as the kernel arguments.
1464 
1465  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1466  */
1467 static
1468 cl_int set_unified_data(struct oclWarper *warper,
1469                         cl_mem *unifiedSrcDensityCL, cl_mem *unifiedSrcValidCL,
1470                         float *unifiedSrcDensity, unsigned int *unifiedSrcValid,
1471                         cl_mem *useBandSrcValidCL, cl_mem *nBandSrcValidCL)
1472 {
1473     cl_int err = CL_SUCCESS;
1474     size_t sz = warper->srcWidth * warper->srcHeight;
1475     int useValid = warper->nBandSrcValidCL != nullptr;
1476     //32 bits in the mask
1477     int validSz = static_cast<int>(sizeof(int) * ((31 + sz) >> 5));
1478 
1479     //Copy unifiedSrcDensity if it exists
1480     if (unifiedSrcDensity == nullptr) {
1481         //Alloc dummy device RAM
1482         (*unifiedSrcDensityCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1483         handleErr(err);
1484     } else {
1485         //Alloc & copy all density data
1486         (*unifiedSrcDensityCL) = clCreateBuffer(warper->context,
1487                                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1488                                                 sizeof(float) * sz, unifiedSrcDensity, &err);
1489         handleErr(err);
1490     }
1491 
1492     //Copy unifiedSrcValid if it exists
1493     if (unifiedSrcValid == nullptr) {
1494         //Alloc dummy device RAM
1495         (*unifiedSrcValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1496         handleErr(err);
1497     } else {
1498         //Alloc & copy all validity data
1499         (*unifiedSrcValidCL) = clCreateBuffer(warper->context,
1500                                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1501                                               validSz, unifiedSrcValid, &err);
1502         handleErr(err);
1503     }
1504 
1505     // Set the band validity usage
1506     if(useValid) {
1507         (*useBandSrcValidCL) = clCreateBuffer(warper->context,
1508                                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1509                                               sizeof(char) * warper->numBands,
1510                                               warper->useBandSrcValid, &err);
1511         handleErr(err);
1512     } else {
1513         //Make a fake image so we don't have a NULL pointer
1514         (*useBandSrcValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1515         handleErr(err);
1516     }
1517 
1518     //Do a more thorough check for validity
1519     if (useValid) {
1520         int i;
1521         useValid = FALSE;
1522         for (i = 0; i < warper->numBands; ++i)
1523             if (warper->useBandSrcValid[i])
1524                 useValid = TRUE;
1525     }
1526 
1527     //And the validity mask if needed
1528     if (useValid) {
1529         (*nBandSrcValidCL) = clCreateBuffer(warper->context,
1530                                             CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1531                                             warper->numBands * validSz,
1532                                             warper->nBandSrcValid, &err);
1533         handleErr(err);
1534     } else {
1535         //Make a fake image so we don't have a NULL pointer
1536         (*nBandSrcValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1537         handleErr(err);
1538     }
1539 
1540     //Set up arguments
1541     if (warper->kern1 != nullptr) {
1542         handleErr(err = clSetKernelArg(warper->kern1, 3, sizeof(cl_mem), unifiedSrcDensityCL));
1543         handleErr(err = clSetKernelArg(warper->kern1, 4, sizeof(cl_mem), unifiedSrcValidCL));
1544         handleErr(err = clSetKernelArg(warper->kern1, 5, sizeof(cl_mem), useBandSrcValidCL));
1545         handleErr(err = clSetKernelArg(warper->kern1, 6, sizeof(cl_mem), nBandSrcValidCL));
1546     }
1547     if (warper->kern4 != nullptr) {
1548         handleErr(err = clSetKernelArg(warper->kern4, 3, sizeof(cl_mem), unifiedSrcDensityCL));
1549         handleErr(err = clSetKernelArg(warper->kern4, 4, sizeof(cl_mem), unifiedSrcValidCL));
1550         handleErr(err = clSetKernelArg(warper->kern4, 5, sizeof(cl_mem), useBandSrcValidCL));
1551         handleErr(err = clSetKernelArg(warper->kern4, 6, sizeof(cl_mem), nBandSrcValidCL));
1552     }
1553 
1554     return CL_SUCCESS;
1555 }
1556 
1557 /*
1558  Here we set the per-band raster data. First priority is the real raster data,
1559  of course. Then, if applicable, we set the additional image channel. Once this
1560  data is copied to the device, it can be freed on the host, so that is done
1561  here. Finally the appropriate kernel arguments are set.
1562 
1563  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1564  */
1565 static
1566 cl_int set_src_rast_data (struct oclWarper *warper, int iNum, size_t sz,
1567                           cl_channel_order chOrder, cl_mem *srcReal, cl_mem *srcImag)
1568 {
1569     cl_image_format imgFmt;
1570     cl_int err = CL_SUCCESS;
1571     int useImagWork = warper->imagWork.v != nullptr && warper->imagWork.v[iNum] != nullptr;
1572 
1573     //Set up image vars
1574     imgFmt.image_channel_order = chOrder;
1575     imgFmt.image_channel_data_type = warper->imageFormat;
1576 
1577     //Create & copy the source image
1578 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1579 #pragma GCC diagnostic push
1580 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1581 #endif
1582 
1583     (*srcReal) = clCreateImage2D(warper->context,
1584                                  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1585                                  static_cast<size_t>(warper->srcWidth),
1586                                  static_cast<size_t>(warper->srcHeight),
1587                                  sz * warper->srcWidth, warper->realWork.v[iNum], &err);
1588     handleErr(err);
1589 
1590     //And the source image parts if needed
1591     if (useImagWork) {
1592         (*srcImag) = clCreateImage2D(warper->context,
1593                                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1594                                      static_cast<size_t>(warper->srcWidth),
1595                                      static_cast<size_t>(warper->srcHeight),
1596                                      sz * warper->srcWidth, warper->imagWork.v[iNum], &err);
1597         handleErr(err);
1598     } else {
1599         //Make a fake image so we don't have a NULL pointer
1600 
1601         char dummyImageData[16];
1602         (*srcImag) = clCreateImage2D(warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1603                                     1, 1, sz, dummyImageData, &err);
1604 
1605         handleErr(err);
1606     }
1607 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1608 #pragma GCC diagnostic pop
1609 #endif
1610 
1611     //Free the source memory, now that it's copied we don't need it
1612     freeCLMem(warper->realWorkCL[iNum], warper->realWork.v[iNum]);
1613     if (warper->imagWork.v != nullptr) {
1614         freeCLMem(warper->imagWorkCL[iNum], warper->imagWork.v[iNum]);
1615     }
1616 
1617     //Set up per-band arguments
1618     if (warper->kern1 != nullptr) {
1619         handleErr(err = clSetKernelArg(warper->kern1, 1, sizeof(cl_mem), srcReal));
1620         handleErr(err = clSetKernelArg(warper->kern1, 2, sizeof(cl_mem), srcImag));
1621     }
1622     if (warper->kern4 != nullptr) {
1623         handleErr(err = clSetKernelArg(warper->kern4, 1, sizeof(cl_mem), srcReal));
1624         handleErr(err = clSetKernelArg(warper->kern4, 2, sizeof(cl_mem), srcImag));
1625     }
1626 
1627     return CL_SUCCESS;
1628 }
1629 
1630 /*
1631  Set the destination data for the raster. Although it's the output, it still
1632  is copied to the device because some blending is done there. First the real
1633  data is allocated and copied, then the imag data is allocated and copied if
1634  needed. They are then set as the appropriate arguments to the kernel.
1635 
1636  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1637  */
1638 static
1639 cl_int set_dst_rast_data(struct oclWarper *warper, int iImg, size_t sz,
1640                          cl_mem *dstReal, cl_mem *dstImag)
1641 {
1642     cl_int err = CL_SUCCESS;
1643     sz *= warper->dstWidth * warper->dstHeight;
1644 
1645     //Copy the dst real data
1646     (*dstReal) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1647                                 sz, warper->dstRealWork.v[iImg], &err);
1648     handleErr(err);
1649 
1650     //Copy the dst imag data if exists
1651     if (warper->dstImagWork.v != nullptr && warper->dstImagWork.v[iImg] != nullptr) {
1652         (*dstImag) = clCreateBuffer(warper->context,
1653                                     CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1654                                     sz, warper->dstImagWork.v[iImg], &err);
1655         handleErr(err);
1656     } else {
1657         (*dstImag) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE, 1, nullptr, &err);
1658         handleErr(err);
1659     }
1660 
1661     //Set up per-band arguments
1662     if (warper->kern1 != nullptr) {
1663         handleErr(err = clSetKernelArg(warper->kern1, 7, sizeof(cl_mem), dstReal));
1664         handleErr(err = clSetKernelArg(warper->kern1, 8, sizeof(cl_mem), dstImag));
1665     }
1666     if (warper->kern4 != nullptr) {
1667         handleErr(err = clSetKernelArg(warper->kern4, 7, sizeof(cl_mem), dstReal));
1668         handleErr(err = clSetKernelArg(warper->kern4, 8, sizeof(cl_mem), dstImag));
1669     }
1670 
1671     return CL_SUCCESS;
1672 }
1673 
1674 /*
1675  Read the final raster data back from the graphics card to working memory. This
1676  copies both the real memory and the imag memory if appropriate.
1677 
1678  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1679  */
1680 static
1681 cl_int get_dst_rast_data(struct oclWarper *warper, int iImg, size_t wordSz,
1682                          cl_mem dstReal, cl_mem dstImag)
1683 {
1684     cl_int err = CL_SUCCESS;
1685     size_t sz = warper->dstWidth * warper->dstHeight * wordSz;
1686 
1687     //Copy from dev into working memory
1688     handleErr(err = clEnqueueReadBuffer(warper->queue, dstReal,
1689                                         CL_FALSE, 0, sz, warper->dstRealWork.v[iImg],
1690                                         0, nullptr, nullptr));
1691 
1692     //If we are expecting the imag channel, then copy it back also
1693     if (warper->dstImagWork.v != nullptr && warper->dstImagWork.v[iImg] != nullptr) {
1694         handleErr(err = clEnqueueReadBuffer(warper->queue, dstImag,
1695                                             CL_FALSE, 0, sz, warper->dstImagWork.v[iImg],
1696                                             0, nullptr, nullptr));
1697     }
1698 
1699     //The copy requests were non-blocking, so we'll need to make sure they finish.
1700     handleErr(err = clFinish(warper->queue));
1701 
1702     return CL_SUCCESS;
1703 }
1704 
1705 /*
1706  Set the destination image density & validity mask on the device. This is used
1707  to blend the final output image with the existing buffer. This handles the
1708  unified structures that apply to all bands. After the buffers are created and
1709  copied, they are set as kernel arguments.
1710 
1711  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1712  */
1713 static
1714 cl_int set_dst_data(struct oclWarper *warper,
1715                     cl_mem *dstDensityCL, cl_mem *dstValidCL, cl_mem *dstNoDataRealCL,
1716                     float *dstDensity, unsigned int *dstValid, float *dstNoDataReal)
1717 {
1718     cl_int err = CL_SUCCESS;
1719     size_t sz = warper->dstWidth * warper->dstHeight;
1720 
1721     //Copy the no-data value(s)
1722     if (dstNoDataReal == nullptr) {
1723         (*dstNoDataRealCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1724         handleErr(err);
1725     } else {
1726         (*dstNoDataRealCL) = clCreateBuffer(warper->context,
1727                                          CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1728                                          sizeof(float) * warper->numBands, dstNoDataReal, &err);
1729         handleErr(err);
1730     }
1731 
1732     //Copy unifiedSrcDensity if it exists
1733     if (dstDensity == nullptr) {
1734         (*dstDensityCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1735         handleErr(err);
1736     } else {
1737         (*dstDensityCL) = clCreateBuffer(warper->context,
1738                                          CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1739                                          sizeof(float) * sz, dstDensity, &err);
1740         handleErr(err);
1741     }
1742 
1743     //Copy unifiedSrcValid if it exists
1744     if (dstValid == nullptr) {
1745         (*dstValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1746         handleErr(err);
1747     } else {
1748         (*dstValidCL) = clCreateBuffer(warper->context,
1749                                        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1750                                        sizeof(int) * ((31 + sz) >> 5), dstValid, &err);
1751         handleErr(err);
1752     }
1753 
1754     //Set up arguments
1755     if (warper->kern1 != nullptr) {
1756         handleErr(err = clSetKernelArg(warper->kern1,  9, sizeof(cl_mem), dstNoDataRealCL));
1757         handleErr(err = clSetKernelArg(warper->kern1, 10, sizeof(cl_mem), dstDensityCL));
1758         handleErr(err = clSetKernelArg(warper->kern1, 11, sizeof(cl_mem), dstValidCL));
1759     }
1760     if (warper->kern4 != nullptr) {
1761         handleErr(err = clSetKernelArg(warper->kern4,  9, sizeof(cl_mem), dstNoDataRealCL));
1762         handleErr(err = clSetKernelArg(warper->kern4, 10, sizeof(cl_mem), dstDensityCL));
1763         handleErr(err = clSetKernelArg(warper->kern4, 11, sizeof(cl_mem), dstValidCL));
1764     }
1765 
1766     return CL_SUCCESS;
1767 }
1768 
1769 /*
1770  Go ahead and execute the kernel. This handles some housekeeping stuff like the
1771  run dimensions. When running in debug mode, it times the kernel call and prints
1772  the execution time.
1773 
1774  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1775  */
1776 static
1777 cl_int execute_kern(struct oclWarper *warper, cl_kernel kern, size_t loc_size)
1778 {
1779     cl_int err = CL_SUCCESS;
1780     cl_event ev;
1781     size_t ceil_runs[2];
1782     size_t group_size[2];
1783 #ifdef DEBUG_OPENCL
1784     size_t start_time = 0;
1785     size_t end_time;
1786     const char *vecTxt = "";
1787 #endif
1788 
1789     // Use a likely X-dimension which is a power of 2
1790     if (loc_size >= 512)
1791         group_size[0] = 32;
1792     else if (loc_size >= 64)
1793         group_size[0] = 16;
1794     else if (loc_size > 8)
1795         group_size[0] = 8;
1796     else
1797         group_size[0] = 1;
1798 
1799     if (group_size[0] > loc_size)
1800         group_size[1] = group_size[0]/loc_size;
1801     else
1802         group_size[1] = 1;
1803 
1804     //Round up num_runs to find the dim of the block of pixels we'll be processing
1805     if(warper->dstWidth % group_size[0])
1806         ceil_runs[0] = warper->dstWidth + group_size[0] - warper->dstWidth % group_size[0];
1807     else
1808         ceil_runs[0] = warper->dstWidth;
1809 
1810     if(warper->dstHeight % group_size[1])
1811         ceil_runs[1] = warper->dstHeight + group_size[1] - warper->dstHeight % group_size[1];
1812     else
1813         ceil_runs[1] = warper->dstHeight;
1814 
1815 #ifdef DEBUG_OPENCL
1816     handleErr(err = clSetCommandQueueProperty(warper->queue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, nullptr));
1817 #endif
1818 
1819     // Run the calculation by enqueuing it and forcing the
1820     // command queue to complete the task
1821     handleErr(err = clEnqueueNDRangeKernel(warper->queue, kern, 2, nullptr,
1822                                            ceil_runs, group_size, 0, nullptr, &ev));
1823     handleErr(err = clFinish(warper->queue));
1824 
1825 #ifdef DEBUG_OPENCL
1826     handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
1827                                             sizeof(size_t), &start_time, nullptr));
1828     handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
1829                                             sizeof(size_t), &end_time, nullptr));
1830     assert(end_time != 0);
1831     assert(start_time != 0);
1832     if (kern == warper->kern4)
1833         vecTxt = "(vec)";
1834 
1835     CPLDebug("OpenCL", "Kernel Time: %6s %10lu", vecTxt, static_cast<long int>((end_time-start_time)/100000));
1836 #endif
1837 
1838     handleErr(err = clReleaseEvent(ev));
1839     return CL_SUCCESS;
1840 }
1841 
1842 /*
1843  Copy data from a raw source to the warper's working memory. If the imag
1844  channel is expected, then the data will be de-interlaced into component blocks
1845  of memory.
1846 
1847  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1848  */
1849 static
1850 cl_int set_img_data(struct oclWarper *warper, void *srcImgData,
1851                     unsigned int width, unsigned int height, int isSrc,
1852                     unsigned int bandNum, void **dstRealImgs, void **dstImagImgs)
1853 {
1854     unsigned int imgChSize = warper->imgChSize1;
1855     unsigned int iSrcY, i;
1856     unsigned int vecOff = 0;
1857     unsigned int imgNum = bandNum;
1858     void *dstReal = nullptr;
1859     void *dstImag = nullptr;
1860 
1861     // Handle vector if needed
1862     if (warper->useVec && static_cast<int>(bandNum) < warper->numBands - warper->numBands % 4) {
1863         imgChSize = warper->imgChSize4;
1864         vecOff = bandNum % 4;
1865         imgNum = bandNum / 4;
1866     } else if(warper->useVec) {
1867         imgNum = bandNum / 4 + bandNum % 4;
1868     }
1869 
1870     // Set the images as needed
1871     dstReal = dstRealImgs[imgNum];
1872     if(dstImagImgs == nullptr)
1873         dstImag = nullptr;
1874     else
1875         dstImag = dstImagImgs[imgNum];
1876 
1877     // Set stuff for dst imgs
1878     if (!isSrc) {
1879         vecOff *= height * width;
1880         imgChSize = 1;
1881     }
1882 
1883     // Copy values as needed
1884     if (warper->imagWorkCL == nullptr && !(warper->useVec && isSrc)) {
1885         //Set memory size & location depending on the data type
1886         //This is the ideal code path for speed
1887         switch (warper->imageFormat) {
1888             case CL_UNORM_INT8:
1889             {
1890                 unsigned char *realDst = &((static_cast<unsigned char *>(dstReal))[vecOff]);
1891                 memcpy(realDst, srcImgData, width*height*sizeof(unsigned char));
1892                 break;
1893             }
1894             case CL_SNORM_INT8:
1895             {
1896                 char *realDst = &((static_cast<char *>(dstReal))[vecOff]);
1897                 memcpy(realDst, srcImgData, width*height*sizeof(char));
1898                 break;
1899             }
1900             case CL_UNORM_INT16:
1901             {
1902                 unsigned short *realDst = &((static_cast<unsigned short *>(dstReal))[vecOff]);
1903                 memcpy(realDst, srcImgData, width*height*sizeof(unsigned short));
1904                 break;
1905             }
1906             case CL_SNORM_INT16:
1907             {
1908                 short *realDst = &((static_cast<short *>(dstReal))[vecOff]);
1909                 memcpy(realDst, srcImgData, width*height*sizeof(short));
1910                 break;
1911             }
1912             case CL_FLOAT:
1913             {
1914                 float *realDst = &((static_cast<float *>(dstReal))[vecOff]);
1915                 memcpy(realDst, srcImgData, width*height*sizeof(float));
1916                 break;
1917             }
1918         }
1919     } else if (warper->imagWorkCL == nullptr) {
1920         //We need to space the values due to OpenCL implementation reasons
1921         for( iSrcY = 0; iSrcY < height; iSrcY++ )
1922         {
1923             int pxOff = width*iSrcY;
1924             int imgOff = imgChSize*pxOff + vecOff;
1925             //Copy & deinterleave interleaved data
1926             switch (warper->imageFormat) {
1927                 case CL_UNORM_INT8:
1928                 {
1929                     unsigned char *realDst = &((static_cast<unsigned char *>(dstReal))[imgOff]);
1930                     unsigned char *dataSrc = &((static_cast<unsigned char *>(srcImgData))[pxOff]);
1931                     for (i = 0; i < width; ++i)
1932                         realDst[imgChSize*i] = dataSrc[i];
1933                 }
1934                     break;
1935                 case CL_SNORM_INT8:
1936                 {
1937                     char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
1938                     char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
1939                     for (i = 0; i < width; ++i)
1940                         realDst[imgChSize*i] = dataSrc[i];
1941                 }
1942                     break;
1943                 case CL_UNORM_INT16:
1944                 {
1945                     unsigned short *realDst = &((static_cast<unsigned short *>(dstReal))[imgOff]);
1946                     unsigned short *dataSrc = &((static_cast<unsigned short *>(srcImgData))[pxOff]);
1947                     for (i = 0; i < width; ++i)
1948                         realDst[imgChSize*i] = dataSrc[i];
1949                 }
1950                     break;
1951                 case CL_SNORM_INT16:
1952                 {
1953                     short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
1954                     short *dataSrc = &((static_cast<short *>(srcImgData))[pxOff]);
1955                     for (i = 0; i < width; ++i)
1956                         realDst[imgChSize*i] = dataSrc[i];
1957                 }
1958                     break;
1959                 case CL_FLOAT:
1960                 {
1961                     float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
1962                     float *dataSrc = &((static_cast<float *>(srcImgData))[pxOff]);
1963                     for (i = 0; i < width; ++i)
1964                         realDst[imgChSize*i] = dataSrc[i];
1965                 }
1966                     break;
1967             }
1968         }
1969     } else {
1970         //Copy, deinterleave, & space interleaved data
1971         for( iSrcY = 0; iSrcY < height; iSrcY++ )
1972         {
1973             int pxOff = width*iSrcY;
1974             int imgOff = imgChSize*pxOff + vecOff;
1975             //Copy & deinterleave interleaved data
1976             switch (warper->imageFormat) {
1977                 case CL_FLOAT:
1978                 {
1979                     float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
1980                     float *imagDst = &((static_cast<float *>(dstImag))[imgOff]);
1981                     float *dataSrc = &((static_cast<float *>(srcImgData))[pxOff]);
1982                     for (i = 0; i < width; ++i) {
1983                         realDst[imgChSize*i] = dataSrc[i*2  ];
1984                         imagDst[imgChSize*i] = dataSrc[i*2+1];
1985                     }
1986                 }
1987                     break;
1988                 case CL_SNORM_INT8:
1989                 {
1990                     char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
1991                     char *imagDst = &((static_cast<char *>(dstImag))[imgOff]);
1992                     char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
1993                     for (i = 0; i < width; ++i) {
1994                         realDst[imgChSize*i] = dataSrc[i*2  ];
1995                         imagDst[imgChSize*i] = dataSrc[i*2+1];
1996                     }
1997                 }
1998                     break;
1999                 case CL_UNORM_INT8:
2000                 {
2001                     unsigned char *realDst = &((static_cast<unsigned char *>(dstReal))[imgOff]);
2002                     unsigned char *imagDst = &((static_cast<unsigned char *>(dstImag))[imgOff]);
2003                     unsigned char *dataSrc = &((static_cast<unsigned char *>(srcImgData))[pxOff]);
2004                     for (i = 0; i < width; ++i) {
2005                         realDst[imgChSize*i] = dataSrc[i*2  ];
2006                         imagDst[imgChSize*i] = dataSrc[i*2+1];
2007                     }
2008                 }
2009                     break;
2010                 case CL_SNORM_INT16:
2011                 {
2012                     short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
2013                     short *imagDst = &((static_cast<short *>(dstImag))[imgOff]);
2014                     short *dataSrc = &((static_cast<short *>(srcImgData))[pxOff]);
2015                     for (i = 0; i < width; ++i) {
2016                         realDst[imgChSize*i] = dataSrc[i*2  ];
2017                         imagDst[imgChSize*i] = dataSrc[i*2+1];
2018                     }
2019                 }
2020                     break;
2021                 case CL_UNORM_INT16:
2022                 {
2023                     unsigned short *realDst = &((static_cast<unsigned short *>(dstReal))[imgOff]);
2024                     unsigned short *imagDst = &((static_cast<unsigned short *>(dstImag))[imgOff]);
2025                     unsigned short *dataSrc = &((static_cast<unsigned short *>(srcImgData))[pxOff]);
2026                     for (i = 0; i < width; ++i) {
2027                         realDst[imgChSize*i] = dataSrc[i*2  ];
2028                         imagDst[imgChSize*i] = dataSrc[i*2+1];
2029                     }
2030                 }
2031                     break;
2032             }
2033         }
2034     }
2035 
2036     return CL_SUCCESS;
2037 }
2038 
2039 /*
2040  Creates the struct which inits & contains the OpenCL context & environment.
2041  Inits wired(?) space to buffer the image in host RAM. Chooses the OpenCL
2042  device, perhaps the user can choose it later? This would also choose the
2043  appropriate OpenCL image format (R, RG, RGBA, or multiples thereof). Space
2044  for metadata can be allocated as required, though.
2045 
2046  Supported image formats are:
2047  CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT16
2048  32-bit int formats won't keep precision when converted to floats internally
2049  and doubles are generally not supported on the GPU image formats.
2050  */
2051 struct oclWarper* GDALWarpKernelOpenCL_createEnv(int srcWidth, int srcHeight,
2052                                                  int dstWidth, int dstHeight,
2053                                                  cl_channel_type imageFormat,
2054                                                  int numBands, int coordMult,
2055                                                  int useImag, int useBandSrcValid,
2056                                                  CPL_UNUSED float *fDstDensity,
2057                                                  double *dfDstNoDataReal,
2058                                                  OCLResampAlg resampAlg, cl_int *clErr)
2059 {
2060     struct oclWarper *warper;
2061     int i;
2062     size_t maxWidth = 0, maxHeight = 0;
2063     cl_int err = CL_SUCCESS;
2064     size_t fmtSize, sz;
2065     cl_device_id device;
2066     cl_bool bool_flag;
2067     OCLVendor eCLVendor = VENDOR_OTHER;
2068 
2069     // Do we have a suitable OpenCL device?
2070     device = get_device(&eCLVendor);
2071     if( device == nullptr )
2072         return nullptr;
2073 
2074     err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
2075                           sizeof(cl_bool), &bool_flag, &sz);
2076     if( err != CL_SUCCESS || !bool_flag )
2077     {
2078         CPLDebug( "OpenCL", "No image support on selected device." );
2079         return nullptr;
2080     }
2081 
2082     // Set up warper environment.
2083     warper = static_cast<struct oclWarper *>(CPLCalloc(1, sizeof(struct oclWarper)));
2084 
2085     warper->eCLVendor = eCLVendor;
2086 
2087     //Init passed vars
2088     warper->srcWidth = srcWidth;
2089     warper->srcHeight = srcHeight;
2090     warper->dstWidth = dstWidth;
2091     warper->dstHeight = dstHeight;
2092 
2093     warper->coordMult = coordMult;
2094     warper->numBands = numBands;
2095     warper->imageFormat = imageFormat;
2096     warper->resampAlg = resampAlg;
2097 
2098     warper->useUnifiedSrcDensity = FALSE;
2099     warper->useUnifiedSrcValid = FALSE;
2100     warper->useDstDensity = FALSE;
2101     warper->useDstValid = FALSE;
2102 
2103     warper->imagWorkCL = nullptr;
2104     warper->dstImagWorkCL = nullptr;
2105     warper->useBandSrcValidCL = nullptr;
2106     warper->useBandSrcValid = nullptr;
2107     warper->nBandSrcValidCL = nullptr;
2108     warper->nBandSrcValid = nullptr;
2109     warper->fDstNoDataRealCL = nullptr;
2110     warper->fDstNoDataReal = nullptr;
2111     warper->kern1 = nullptr;
2112     warper->kern4 = nullptr;
2113 
2114     warper->dev = device;
2115 
2116     warper->context = clCreateContext(nullptr, 1, &(warper->dev), nullptr, nullptr, &err);
2117     handleErrGoto(err, error_label);
2118 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2119 #pragma GCC diagnostic push
2120 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2121 #endif
2122     warper->queue = clCreateCommandQueue(warper->context, warper->dev, 0, &err);
2123 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2124 #pragma GCC diagnostic pop
2125 #endif
2126     handleErrGoto(err, error_label);
2127 
2128     //Ensure that we hand handle imagery of these dimensions
2129     err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &maxWidth, &sz);
2130     handleErrGoto(err, error_label);
2131     err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &maxHeight, &sz);
2132     handleErrGoto(err, error_label);
2133     if (maxWidth < static_cast<size_t>(srcWidth) ||
2134         maxHeight < static_cast<size_t>(srcHeight)) {
2135         err = CL_INVALID_IMAGE_SIZE;
2136         handleErrGoto(err, error_label);
2137     }
2138 
2139     // Split bands into sets of four when possible
2140     // Cubic runs slower as vector, so don't use it (probably register pressure)
2141     // Feel free to do more testing and come up with more precise case statements
2142     if(numBands < 4 || resampAlg == OCL_Cubic) {
2143         warper->numImages = numBands;
2144         warper->useVec = FALSE;
2145     } else {
2146         warper->numImages = numBands/4 + numBands % 4;
2147         warper->useVec = TRUE;
2148     }
2149 
2150     //Make the pointer space for the real images
2151     warper->realWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2152     warper->dstRealWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2153 
2154     //Make space for the per-channel Imag data (if exists)
2155     if (useImag) {
2156         warper->imagWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2157         warper->dstImagWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2158     }
2159 
2160     //Make space for the per-band BandSrcValid data (if exists)
2161     if (useBandSrcValid) {
2162         //32 bits in the mask
2163         sz = warper->numBands * ((31 + warper->srcWidth * warper->srcHeight) >> 5);
2164 
2165         //Allocate some space for the validity of the validity mask
2166         err = alloc_pinned_mem(warper, 0, warper->numBands*sizeof(char),
2167                                reinterpret_cast<void **>(&(warper->useBandSrcValid)),
2168                                &(warper->useBandSrcValidCL));
2169         handleErrGoto(err, error_label);
2170 
2171         for (i = 0; i < warper->numBands; ++i)
2172             warper->useBandSrcValid[i] = FALSE;
2173 
2174         // Allocate one array for all the band validity masks.
2175         // Remember that the masks don't use much memory (they're bitwise).
2176         err = alloc_pinned_mem(warper, 0, sz * sizeof(int),
2177                                reinterpret_cast<void **>(&(warper->nBandSrcValid)),
2178                                &(warper->nBandSrcValidCL));
2179         handleErrGoto(err, error_label);
2180     }
2181 
2182     //Make space for the per-band
2183     if (dfDstNoDataReal != nullptr) {
2184         alloc_pinned_mem(warper, 0, warper->numBands,
2185                          reinterpret_cast<void **>(&(warper->fDstNoDataReal)),
2186                          &(warper->fDstNoDataRealCL));
2187 
2188         //Copy over values
2189         for (i = 0; i < warper->numBands; ++i)
2190             warper->fDstNoDataReal[i] = static_cast<float>(dfDstNoDataReal[i]);
2191     }
2192 
2193     //Alloc working host image memory
2194     //We'll be copying into these buffers soon
2195     switch (imageFormat) {
2196       case CL_FLOAT:
2197         err = alloc_working_arr(warper, sizeof(float *), sizeof(float), &fmtSize);
2198         break;
2199       case CL_SNORM_INT8:
2200         err = alloc_working_arr(warper, sizeof(char *), sizeof(char), &fmtSize);
2201         break;
2202       case CL_UNORM_INT8:
2203         err = alloc_working_arr(warper, sizeof(unsigned char *), sizeof(unsigned char), &fmtSize);
2204         break;
2205       case CL_SNORM_INT16:
2206         err = alloc_working_arr(warper, sizeof(short *), sizeof(short), &fmtSize);
2207         break;
2208       case CL_UNORM_INT16:
2209         err = alloc_working_arr(warper, sizeof(unsigned short *), sizeof(unsigned short), &fmtSize);
2210         break;
2211     }
2212     handleErrGoto(err, error_label);
2213 
2214     // Find a good & compatible image channel order for the Lat/Long array.
2215     err = set_supported_formats(warper, 2,
2216                                 &(warper->xyChOrder), &(warper->xyChSize),
2217                                 CL_FLOAT);
2218     handleErrGoto(err, error_label);
2219 
2220     //Set coordinate image dimensions
2221     warper->xyWidth  = static_cast<int>(ceil((static_cast<float>(warper->dstWidth)  + static_cast<float>(warper->coordMult)-1)/static_cast<float>(warper->coordMult)));
2222     warper->xyHeight = static_cast<int>(ceil((static_cast<float>(warper->dstHeight) + static_cast<float>(warper->coordMult)-1)/static_cast<float>(warper->coordMult)));
2223 
2224     //Alloc coord memory
2225     sz = sizeof(float) * warper->xyChSize * warper->xyWidth * warper->xyHeight;
2226     err = alloc_pinned_mem(warper, 0, sz, reinterpret_cast<void **>(&(warper->xyWork)),
2227                            &(warper->xyWorkCL));
2228     handleErrGoto(err, error_label);
2229 
2230     //Ensure everything is finished allocating, copying, & mapping
2231     err = clFinish(warper->queue);
2232     handleErrGoto(err, error_label);
2233 
2234     (*clErr) = CL_SUCCESS;
2235     return warper;
2236 
2237 error_label:
2238     GDALWarpKernelOpenCL_deleteEnv(warper);
2239     return nullptr;
2240 }
2241 
2242 /*
2243  Copy the validity mask for an image band to the warper.
2244 
2245  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2246  */
2247 cl_int GDALWarpKernelOpenCL_setSrcValid(struct oclWarper *warper,
2248                                         int *bandSrcValid, int bandNum)
2249 {
2250     //32 bits in the mask
2251     int stride = (31 + warper->srcWidth * warper->srcHeight) >> 5;
2252 
2253     //Copy bandSrcValid
2254     assert(warper->nBandSrcValid != nullptr);
2255     memcpy(&(warper->nBandSrcValid[bandNum*stride]), bandSrcValid, sizeof(int) * stride);
2256     warper->useBandSrcValid[bandNum] = TRUE;
2257 
2258     return CL_SUCCESS;
2259 }
2260 
2261 /*
2262  Sets the source image real & imag into the host memory so that it is
2263  permuted (ex. RGBA) for better graphics card access.
2264 
2265  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2266  */
2267 cl_int GDALWarpKernelOpenCL_setSrcImg(struct oclWarper *warper, void *imgData,
2268                                       int bandNum)
2269 {
2270     void **imagWorkPtr = nullptr;
2271 
2272     if (warper->imagWorkCL != nullptr)
2273         imagWorkPtr = warper->imagWork.v;
2274 
2275     return set_img_data(warper, imgData, warper->srcWidth, warper->srcHeight,
2276                         TRUE, bandNum, warper->realWork.v, imagWorkPtr);
2277 }
2278 
2279 /*
2280  Sets the destination image real & imag into the host memory so that it is
2281  permuted (ex. RGBA) for better graphics card access.
2282 
2283  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2284  */
2285 cl_int GDALWarpKernelOpenCL_setDstImg(struct oclWarper *warper, void *imgData,
2286                                       int bandNum)
2287 {
2288     void **dstImagWorkPtr = nullptr;
2289 
2290     if (warper->dstImagWorkCL != nullptr)
2291         dstImagWorkPtr = warper->dstImagWork.v;
2292 
2293     return set_img_data(warper, imgData, warper->dstWidth, warper->dstHeight,
2294                         FALSE, bandNum, warper->dstRealWork.v, dstImagWorkPtr);
2295 }
2296 
2297 /*
2298  Inputs the source coordinates for a row of the destination pixels. Invalid
2299  coordinates are set as -99.0, which should be out of the image bounds. Sets
2300  the coordinates as ready to be used in OpenCL image memory: interleaved and
2301  minus the offset. By using image memory, we can use a smaller texture for
2302  coordinates and use OpenCL's built-in interpolation to save memory.
2303 
2304  What it does: generates a smaller matrix of X/Y coordinate transformation
2305  values from an original matrix. When bilinearly sampled in the GPU hardware,
2306  the generated values are as close as possible to the original matrix.
2307 
2308  Complication: matrices have arbitrary dimensions and the sub-sampling factor
2309  is an arbitrary integer greater than zero. Getting the edge cases right is
2310  difficult.
2311 
2312  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2313  */
2314 cl_int GDALWarpKernelOpenCL_setCoordRow(struct oclWarper *warper,
2315                                         double *rowSrcX, double *rowSrcY,
2316                                         double srcXOff, double srcYOff,
2317                                         int *success, int rowNum)
2318 {
2319     int coordMult = warper->coordMult;
2320     int width = warper->dstWidth;
2321     int height = warper->dstHeight;
2322     int xyWidth = warper->xyWidth;
2323     int i;
2324     int xyChSize = warper->xyChSize;
2325     float *xyPtr, *xyPrevPtr = nullptr;
2326     int lastRow = rowNum == height - 1;
2327     double dstHeightMod = 1.0, dstWidthMod = 1.0;
2328 
2329     //Return if we're at an off row
2330     if(!lastRow && rowNum % coordMult != 0)
2331         return CL_SUCCESS;
2332 
2333     //Standard row, adjusted for the skipped rows
2334     xyPtr = &(warper->xyWork[xyWidth * xyChSize * rowNum / coordMult]);
2335 
2336     //Find our row
2337     if(lastRow){
2338         //Setup for the final row
2339         xyPtr     = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 1)]);
2340         xyPrevPtr = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 2)]);
2341 
2342         if((height-1) % coordMult)
2343             dstHeightMod = static_cast<double>(coordMult) / static_cast<double>((height-1) % coordMult);
2344     }
2345 
2346     //Copy selected coordinates
2347     for (i = 0; i < width; i += coordMult) {
2348         if (success[i]) {
2349             xyPtr[0] = static_cast<float>(rowSrcX[i] - srcXOff);
2350             xyPtr[1] = static_cast<float>(rowSrcY[i] - srcYOff);
2351 
2352             if(lastRow) {
2353                 //Adjust bottom row so interpolator returns correct value
2354                 xyPtr[0] = static_cast<float>(dstHeightMod * (xyPtr[0] - xyPrevPtr[0]) + xyPrevPtr[0]);
2355                 xyPtr[1] = static_cast<float>(dstHeightMod * (xyPtr[1] - xyPrevPtr[1]) + xyPrevPtr[1]);
2356             }
2357         } else {
2358             xyPtr[0] = -99.0f;
2359             xyPtr[1] = -99.0f;
2360         }
2361 
2362         xyPtr += xyChSize;
2363         xyPrevPtr += xyChSize;
2364     }
2365 
2366     //Copy remaining coordinate
2367     if((width-1) % coordMult){
2368         dstWidthMod = static_cast<double>(coordMult) / static_cast<double>((width-1) % coordMult);
2369         xyPtr -= xyChSize;
2370         xyPrevPtr -= xyChSize;
2371     } else {
2372         xyPtr -= xyChSize*2;
2373         xyPrevPtr -= xyChSize*2;
2374     }
2375 
2376     if(lastRow) {
2377         double origX = rowSrcX[width-1] - srcXOff;
2378         double origY = rowSrcY[width-1] - srcYOff;
2379         double a = 1.0, b = 1.0;
2380 
2381         // Calculate the needed x/y values using an equation from the OpenCL Spec
2382         // section 8.2, solving for Ti1j1
2383         if((width -1) % coordMult)
2384             a = ((width -1) % coordMult)/static_cast<double>(coordMult);
2385 
2386         if((height-1) % coordMult)
2387             b = ((height-1) % coordMult)/static_cast<double>(coordMult);
2388 
2389         xyPtr[xyChSize  ] = static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[0]
2390                               + a * (1.0 - b) * xyPrevPtr[xyChSize]
2391                               + (1.0 - a) * b * xyPtr[0]) - origX)/(-a * b));
2392 
2393         xyPtr[xyChSize+1] = static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[1]
2394                               + a * (1.0 - b) * xyPrevPtr[xyChSize+1]
2395                               + (1.0 - a) * b * xyPtr[1]) - origY)/(-a * b));
2396     } else {
2397         //Adjust last coordinate so interpolator returns correct value
2398         xyPtr[xyChSize  ] = static_cast<float>(dstWidthMod * (rowSrcX[width-1] - srcXOff - xyPtr[0]) + xyPtr[0]);
2399         xyPtr[xyChSize+1] = static_cast<float>(dstWidthMod * (rowSrcY[width-1] - srcYOff - xyPtr[1]) + xyPtr[1]);
2400     }
2401 
2402     return CL_SUCCESS;
2403 }
2404 
2405 /*
2406  Copies all data to the device RAM, frees the host RAM, runs the
2407  appropriate resampling kernel, mallocs output space, & copies the data
2408  back from the device RAM for each band. Also check to make sure that
2409  setRow*() was called the appropriate number of times to init all image
2410  data.
2411 
2412  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2413  */
2414 cl_int GDALWarpKernelOpenCL_runResamp(struct oclWarper *warper,
2415                                       float *unifiedSrcDensity,
2416                                       unsigned int *unifiedSrcValid,
2417                                       float *dstDensity,
2418                                       unsigned int *dstValid,
2419                                       double dfXScale, double dfYScale,
2420                                       double dfXFilter, double dfYFilter,
2421                                       int nXRadius, int nYRadius,
2422                                       int nFiltInitX, int nFiltInitY)
2423 {
2424     int i, nextBandNum = 0, chSize = 1;
2425     cl_int err = CL_SUCCESS;
2426     cl_mem xy, unifiedSrcDensityCL, unifiedSrcValidCL;
2427     cl_mem dstDensityCL, dstValidCL, dstNoDataRealCL;
2428     cl_mem useBandSrcValidCL, nBandSrcValidCL;
2429     size_t groupSize, wordSize = 0;
2430     cl_kernel kern = nullptr;
2431     cl_channel_order chOrder;
2432 
2433     warper->useUnifiedSrcDensity = unifiedSrcDensity != nullptr;
2434     warper->useUnifiedSrcValid = unifiedSrcValid != nullptr;
2435 
2436     //Check the word size
2437     switch (warper->imageFormat) {
2438         case CL_FLOAT:
2439             wordSize = sizeof(float);
2440             break;
2441         case CL_SNORM_INT8:
2442             wordSize = sizeof(char);
2443             break;
2444         case CL_UNORM_INT8:
2445             wordSize = sizeof(unsigned char);
2446             break;
2447         case CL_SNORM_INT16:
2448             wordSize = sizeof(short);
2449             break;
2450         case CL_UNORM_INT16:
2451             wordSize = sizeof(unsigned short);
2452             break;
2453     }
2454 
2455     //Compile the kernel; the invariants are being compiled into the code
2456     if (!warper->useVec || warper->numBands % 4) {
2457         warper->kern1 = get_kernel(warper, FALSE,
2458                                    dfXScale, dfYScale, dfXFilter, dfYFilter,
2459                                    nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2460         handleErr(err);
2461     }
2462     if (warper->useVec){
2463         warper->kern4 = get_kernel(warper, TRUE,
2464                                    dfXScale, dfYScale, dfXFilter, dfYFilter,
2465                                    nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2466         handleErr(err);
2467     }
2468 
2469     //Copy coord data to the device
2470     handleErr(err = set_coord_data(warper, &xy));
2471 
2472     //Copy unified density & valid data
2473     handleErr(err = set_unified_data(warper, &unifiedSrcDensityCL, &unifiedSrcValidCL,
2474                                      unifiedSrcDensity, unifiedSrcValid,
2475                                      &useBandSrcValidCL, &nBandSrcValidCL));
2476 
2477     //Copy output density & valid data
2478     handleErr(set_dst_data(warper, &dstDensityCL, &dstValidCL, &dstNoDataRealCL,
2479                            dstDensity, dstValid, warper->fDstNoDataReal));
2480 
2481     //What's the recommended group size?
2482     if (warper->useVec) {
2483         // Start with the vector kernel
2484         handleErr(clGetKernelWorkGroupInfo(warper->kern4, warper->dev,
2485                                            CL_KERNEL_WORK_GROUP_SIZE,
2486                                            sizeof(size_t), &groupSize, nullptr));
2487         kern = warper->kern4;
2488         chSize = warper->imgChSize4;
2489         chOrder = warper->imgChOrder4;
2490     } else {
2491         // We're only using the float kernel
2492         handleErr(clGetKernelWorkGroupInfo(warper->kern1, warper->dev,
2493                                            CL_KERNEL_WORK_GROUP_SIZE,
2494                                            sizeof(size_t), &groupSize, nullptr));
2495         kern = warper->kern1;
2496         chSize = warper->imgChSize1;
2497         chOrder = warper->imgChOrder1;
2498     }
2499 
2500     //Loop over each image
2501     for (i = 0; i < warper->numImages; ++i)
2502     {
2503         cl_mem srcImag, srcReal;
2504         cl_mem dstReal, dstImag;
2505         int bandNum = nextBandNum;
2506 
2507         //Switch kernels if needed
2508         if (warper->useVec && nextBandNum < warper->numBands - warper->numBands % 4) {
2509             nextBandNum += 4;
2510         } else {
2511             if (kern == warper->kern4) {
2512                 handleErr(clGetKernelWorkGroupInfo(warper->kern1, warper->dev,
2513                                                    CL_KERNEL_WORK_GROUP_SIZE,
2514                                                    sizeof(size_t), &groupSize, nullptr));
2515                 kern = warper->kern1;
2516                 chSize = warper->imgChSize1;
2517                 chOrder = warper->imgChOrder1;
2518             }
2519             ++nextBandNum;
2520         }
2521 
2522         //Create & copy the source image
2523         handleErr(err = set_src_rast_data(warper, i, chSize*wordSize, chOrder,
2524                                           &srcReal, &srcImag));
2525 
2526         //Create & copy the output image
2527         if (kern == warper->kern1) {
2528             handleErr(err = set_dst_rast_data(warper, i, wordSize, &dstReal, &dstImag));
2529         } else {
2530             handleErr(err = set_dst_rast_data(warper, i, wordSize*4, &dstReal, &dstImag));
2531         }
2532 
2533         //Set the bandNum
2534         handleErr(err = clSetKernelArg(kern, 12, sizeof(int), &bandNum));
2535 
2536         //Run the kernel
2537         handleErr(err = execute_kern(warper, kern, groupSize));
2538 
2539         //Free loop CL mem
2540         handleErr(err = clReleaseMemObject(srcReal));
2541         handleErr(err = clReleaseMemObject(srcImag));
2542 
2543         //Copy the back output results
2544         if (kern == warper->kern1) {
2545             handleErr(err = get_dst_rast_data(warper, i, wordSize, dstReal, dstImag));
2546         } else {
2547             handleErr(err = get_dst_rast_data(warper, i, wordSize*4, dstReal, dstImag));
2548         }
2549 
2550         //Free remaining CL mem
2551         handleErr(err = clReleaseMemObject(dstReal));
2552         handleErr(err = clReleaseMemObject(dstImag));
2553     }
2554 
2555     //Free remaining CL mem
2556     handleErr(err = clReleaseMemObject(xy));
2557     handleErr(err = clReleaseMemObject(unifiedSrcDensityCL));
2558     handleErr(err = clReleaseMemObject(unifiedSrcValidCL));
2559     handleErr(err = clReleaseMemObject(useBandSrcValidCL));
2560     handleErr(err = clReleaseMemObject(nBandSrcValidCL));
2561     handleErr(err = clReleaseMemObject(dstDensityCL));
2562     handleErr(err = clReleaseMemObject(dstValidCL));
2563     handleErr(err = clReleaseMemObject(dstNoDataRealCL));
2564 
2565     return CL_SUCCESS;
2566 }
2567 
2568 /*
2569  Sets pointers to the floating point data in the warper. The pointers
2570  are internal to the warper structure, so don't free() them. If the imag
2571  channel is in use, it will receive a pointer. Otherwise it'll be set to NULL.
2572  These are pointers to floating point data, so the caller will need to
2573  manipulate the output as appropriate before saving the data.
2574 
2575  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2576  */
2577 cl_int GDALWarpKernelOpenCL_getRow(struct oclWarper *warper,
2578                                    void **rowReal, void **rowImag,
2579                                    int rowNum, int bandNum)
2580 {
2581     int memOff = rowNum * warper->dstWidth;
2582     int imgNum = bandNum;
2583 
2584     if (warper->useVec && bandNum < warper->numBands - warper->numBands % 4) {
2585         memOff += warper->dstWidth * warper->dstHeight * (bandNum % 4);
2586         imgNum = bandNum / 4;
2587     } else if(warper->useVec) {
2588         imgNum = bandNum / 4 + bandNum % 4;
2589     }
2590 
2591     //Return pointers into the warper's data
2592     switch (warper->imageFormat) {
2593         case CL_FLOAT:
2594             (*rowReal) = &(warper->dstRealWork.f[imgNum][memOff]);
2595             break;
2596         case CL_SNORM_INT8:
2597             (*rowReal) = &(warper->dstRealWork.c[imgNum][memOff]);
2598             break;
2599         case CL_UNORM_INT8:
2600             (*rowReal) = &(warper->dstRealWork.uc[imgNum][memOff]);
2601             break;
2602         case CL_SNORM_INT16:
2603             (*rowReal) = &(warper->dstRealWork.s[imgNum][memOff]);
2604             break;
2605         case CL_UNORM_INT16:
2606             (*rowReal) = &(warper->dstRealWork.us[imgNum][memOff]);
2607             break;
2608     }
2609 
2610     if (warper->dstImagWorkCL == nullptr) {
2611         (*rowImag) = nullptr;
2612     } else {
2613         switch (warper->imageFormat) {
2614             case CL_FLOAT:
2615                 (*rowImag) = &(warper->dstImagWork.f[imgNum][memOff]);
2616                 break;
2617             case CL_SNORM_INT8:
2618                 (*rowImag) = &(warper->dstImagWork.c[imgNum][memOff]);
2619                 break;
2620             case CL_UNORM_INT8:
2621                 (*rowImag) = &(warper->dstImagWork.uc[imgNum][memOff]);
2622                 break;
2623             case CL_SNORM_INT16:
2624                 (*rowImag) = &(warper->dstImagWork.s[imgNum][memOff]);
2625                 break;
2626             case CL_UNORM_INT16:
2627                 (*rowImag) = &(warper->dstImagWork.us[imgNum][memOff]);
2628                 break;
2629         }
2630     }
2631 
2632     return CL_SUCCESS;
2633 }
2634 
2635 /*
2636  Free the OpenCL warper environment. It should check everything for NULL, so
2637  be sure to mark free()ed pointers as NULL or it'll be double free()ed.
2638 
2639  Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2640  */
2641 cl_int GDALWarpKernelOpenCL_deleteEnv(struct oclWarper *warper)
2642 {
2643     int i;
2644     cl_int err = CL_SUCCESS;
2645 
2646     for (i = 0; i < warper->numImages; ++i) {
2647         // Run free!!
2648         void* dummy = nullptr;
2649         if( warper->realWork.v )
2650             freeCLMem(warper->realWorkCL[i], warper->realWork.v[i]);
2651         else
2652             freeCLMem(warper->realWorkCL[i], dummy);
2653         if( warper->realWork.v )
2654             freeCLMem(warper->dstRealWorkCL[i], warper->dstRealWork.v[i]);
2655         else
2656             freeCLMem(warper->dstRealWorkCL[i], dummy);
2657 
2658         //(As applicable)
2659         if(warper->imagWorkCL != nullptr && warper->imagWork.v != nullptr && warper->imagWork.v[i] != nullptr) {
2660             freeCLMem(warper->imagWorkCL[i], warper->imagWork.v[i]);
2661         }
2662         if(warper->dstImagWorkCL != nullptr && warper->dstImagWork.v != nullptr && warper->dstImagWork.v[i] != nullptr) {
2663             freeCLMem(warper->dstImagWorkCL[i], warper->dstImagWork.v[i]);
2664         }
2665     }
2666 
2667     //Free cl_mem
2668     freeCLMem(warper->useBandSrcValidCL, warper->useBandSrcValid);
2669     freeCLMem(warper->nBandSrcValidCL, warper->nBandSrcValid);
2670     freeCLMem(warper->xyWorkCL, warper->xyWork);
2671     freeCLMem(warper->fDstNoDataRealCL, warper->fDstNoDataReal);
2672 
2673     //Free pointers to cl_mem*
2674     if (warper->realWorkCL != nullptr)
2675         CPLFree(warper->realWorkCL);
2676     if (warper->dstRealWorkCL != nullptr)
2677         CPLFree(warper->dstRealWorkCL);
2678 
2679     if (warper->imagWorkCL != nullptr)
2680         CPLFree(warper->imagWorkCL);
2681     if (warper->dstImagWorkCL != nullptr)
2682         CPLFree(warper->dstImagWorkCL);
2683 
2684     if (warper->realWork.v != nullptr)
2685         CPLFree(warper->realWork.v);
2686     if (warper->dstRealWork.v != nullptr)
2687         CPLFree(warper->dstRealWork.v);
2688 
2689     if (warper->imagWork.v != nullptr)
2690         CPLFree(warper->imagWork.v);
2691     if (warper->dstImagWork.v != nullptr)
2692         CPLFree(warper->dstImagWork.v);
2693 
2694     //Free OpenCL structures
2695     if (warper->kern1 != nullptr)
2696         clReleaseKernel(warper->kern1);
2697     if (warper->kern4 != nullptr)
2698         clReleaseKernel(warper->kern4);
2699     if (warper->queue != nullptr)
2700         clReleaseCommandQueue(warper->queue);
2701     if (warper->context != nullptr)
2702         clReleaseContext(warper->context);
2703 
2704     CPLFree(warper);
2705 
2706     return CL_SUCCESS;
2707 }
2708 
2709 #endif /* defined(HAVE_OPENCL) */
2710