1 /******************************************************************************
2 * $Id: gdalwarpkernel_opencl.cpp 32b416eddf6e3fa3ea837f60ba5cf5e9f6982cc0 2021-09-01 11:25:43 +0200 Even Rouault $
3 *
4 * Project: OpenCL Image Reprojector
5 * Purpose: Implementation of the GDALWarpKernel reprojector in OpenCL.
6 * Author: Seth Price, seth@pricepages.org
7 *
8 ******************************************************************************
9 * Copyright (c) 2010, Seth Price <seth@pricepages.org>
10 * Copyright (c) 2010-2012, Even Rouault <even dot rouault at spatialys.com>
11 *
12 * Permission is hereby granted, free of charge, to any person obtaining a
13 * copy of this software and associated documentation files (the "Software"),
14 * to deal in the Software without restriction, including without limitation
15 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
16 * and/or sell copies of the Software, and to permit persons to whom the
17 * Software is furnished to do so, subject to the following conditions:
18 *
19 * The above copyright notice and this permission notice shall be included
20 * in all copies or substantial portions of the Software.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
28 * DEALINGS IN THE SOFTWARE.
29 ****************************************************************************/
30
31 #if defined(HAVE_OPENCL)
32
33 /* The following line may be uncommented for increased debugging traces and profiling */
34 /* #define DEBUG_OPENCL 1 */
35
36 #include <assert.h>
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <limits.h>
40 #include <float.h>
41 #include <limits>
42 #include <vector>
43 #include "cpl_string.h"
44 #include "gdalwarpkernel_opencl.h"
45
46 CPL_CVSID("$Id: gdalwarpkernel_opencl.cpp 32b416eddf6e3fa3ea837f60ba5cf5e9f6982cc0 2021-09-01 11:25:43 +0200 Even Rouault $")
47
48 #define handleErr(err) if((err) != CL_SUCCESS) { \
49 CPLError(CE_Failure, CPLE_AppDefined, "Error at file %s line %d: %s", __FILE__, __LINE__, getCLErrorString(err)); \
50 return err; \
51 }
52
53 #define handleErrRetNULL(err) if((err) != CL_SUCCESS) { \
54 (*clErr) = err; \
55 CPLError(CE_Failure, CPLE_AppDefined, "Error at file %s line %d: %s", __FILE__, __LINE__, getCLErrorString(err)); \
56 return nullptr; \
57 }
58
59 #define handleErrGoto(err, goto_label) if((err) != CL_SUCCESS) { \
60 (*clErr) = err; \
61 CPLError(CE_Failure, CPLE_AppDefined, "Error at file %s line %d: %s", __FILE__, __LINE__, getCLErrorString(err)); \
62 goto goto_label; \
63 }
64
65 #define freeCLMem(clMem, fallBackMem) do { \
66 if ((clMem) != nullptr) { \
67 handleErr(err = clReleaseMemObject(clMem)); \
68 clMem = nullptr; \
69 fallBackMem = nullptr; \
70 } else if ((fallBackMem) != nullptr) { \
71 CPLFree(fallBackMem); \
72 fallBackMem = nullptr; \
73 } \
74 } while( false )
75
getCLErrorString(cl_int err)76 static const char* getCLErrorString(cl_int err)
77 {
78 switch (err)
79 {
80 case CL_SUCCESS:
81 return("CL_SUCCESS");
82 break;
83 case CL_DEVICE_NOT_FOUND:
84 return("CL_DEVICE_NOT_FOUND");
85 break;
86 case CL_DEVICE_NOT_AVAILABLE:
87 return("CL_DEVICE_NOT_AVAILABLE");
88 break;
89 case CL_COMPILER_NOT_AVAILABLE:
90 return("CL_COMPILER_NOT_AVAILABLE");
91 break;
92 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
93 return("CL_MEM_OBJECT_ALLOCATION_FAILURE");
94 break;
95 case CL_OUT_OF_RESOURCES:
96 return("CL_OUT_OF_RESOURCES");
97 break;
98 case CL_OUT_OF_HOST_MEMORY:
99 return("CL_OUT_OF_HOST_MEMORY");
100 break;
101 case CL_PROFILING_INFO_NOT_AVAILABLE:
102 return("CL_PROFILING_INFO_NOT_AVAILABLE");
103 break;
104 case CL_MEM_COPY_OVERLAP:
105 return("CL_MEM_COPY_OVERLAP");
106 break;
107 case CL_IMAGE_FORMAT_MISMATCH:
108 return("CL_IMAGE_FORMAT_MISMATCH");
109 break;
110 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
111 return("CL_IMAGE_FORMAT_NOT_SUPPORTED");
112 break;
113 case CL_BUILD_PROGRAM_FAILURE:
114 return("CL_BUILD_PROGRAM_FAILURE");
115 break;
116 case CL_MAP_FAILURE:
117 return("CL_MAP_FAILURE");
118 break;
119 case CL_INVALID_VALUE:
120 return("CL_INVALID_VALUE");
121 break;
122 case CL_INVALID_DEVICE_TYPE:
123 return("CL_INVALID_DEVICE_TYPE");
124 break;
125 case CL_INVALID_PLATFORM:
126 return("CL_INVALID_PLATFORM");
127 break;
128 case CL_INVALID_DEVICE:
129 return("CL_INVALID_DEVICE");
130 break;
131 case CL_INVALID_CONTEXT:
132 return("CL_INVALID_CONTEXT");
133 break;
134 case CL_INVALID_QUEUE_PROPERTIES:
135 return("CL_INVALID_QUEUE_PROPERTIES");
136 break;
137 case CL_INVALID_COMMAND_QUEUE:
138 return("CL_INVALID_COMMAND_QUEUE");
139 break;
140 case CL_INVALID_HOST_PTR:
141 return("CL_INVALID_HOST_PTR");
142 break;
143 case CL_INVALID_MEM_OBJECT:
144 return("CL_INVALID_MEM_OBJECT");
145 break;
146 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
147 return("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR");
148 break;
149 case CL_INVALID_IMAGE_SIZE:
150 return("CL_INVALID_IMAGE_SIZE");
151 break;
152 case CL_INVALID_SAMPLER:
153 return("CL_INVALID_SAMPLER");
154 break;
155 case CL_INVALID_BINARY:
156 return("CL_INVALID_BINARY");
157 break;
158 case CL_INVALID_BUILD_OPTIONS:
159 return("CL_INVALID_BUILD_OPTIONS");
160 break;
161 case CL_INVALID_PROGRAM:
162 return("CL_INVALID_PROGRAM");
163 break;
164 case CL_INVALID_PROGRAM_EXECUTABLE:
165 return("CL_INVALID_PROGRAM_EXECUTABLE");
166 break;
167 case CL_INVALID_KERNEL_NAME:
168 return("CL_INVALID_KERNEL_NAME");
169 break;
170 case CL_INVALID_KERNEL_DEFINITION:
171 return("CL_INVALID_KERNEL_DEFINITION");
172 break;
173 case CL_INVALID_KERNEL:
174 return("CL_INVALID_KERNEL");
175 break;
176 case CL_INVALID_ARG_INDEX:
177 return("CL_INVALID_ARG_INDEX");
178 break;
179 case CL_INVALID_ARG_VALUE:
180 return("CL_INVALID_ARG_VALUE");
181 break;
182 case CL_INVALID_ARG_SIZE:
183 return("CL_INVALID_ARG_SIZE");
184 break;
185 case CL_INVALID_KERNEL_ARGS:
186 return("CL_INVALID_KERNEL_ARGS");
187 break;
188 case CL_INVALID_WORK_DIMENSION:
189 return("CL_INVALID_WORK_DIMENSION");
190 break;
191 case CL_INVALID_WORK_GROUP_SIZE:
192 return("CL_INVALID_WORK_GROUP_SIZE");
193 break;
194 case CL_INVALID_WORK_ITEM_SIZE:
195 return("CL_INVALID_WORK_ITEM_SIZE");
196 break;
197 case CL_INVALID_GLOBAL_OFFSET:
198 return("CL_INVALID_GLOBAL_OFFSET");
199 break;
200 case CL_INVALID_EVENT_WAIT_LIST:
201 return("CL_INVALID_EVENT_WAIT_LIST");
202 break;
203 case CL_INVALID_EVENT:
204 return("CL_INVALID_EVENT");
205 break;
206 case CL_INVALID_OPERATION:
207 return("CL_INVALID_OPERATION");
208 break;
209 case CL_INVALID_GL_OBJECT:
210 return("CL_INVALID_GL_OBJECT");
211 break;
212 case CL_INVALID_BUFFER_SIZE:
213 return("CL_INVALID_BUFFER_SIZE");
214 break;
215 case CL_INVALID_MIP_LEVEL:
216 return("CL_INVALID_MIP_LEVEL");
217 break;
218 case CL_INVALID_GLOBAL_WORK_SIZE:
219 return("CL_INVALID_GLOBAL_WORK_SIZE");
220 break;
221 }
222
223 return "unknown_error";
224 }
225
getCLDataTypeString(cl_channel_type dataType)226 static const char* getCLDataTypeString( cl_channel_type dataType )
227 {
228 switch( dataType )
229 {
230 case CL_SNORM_INT8: return "CL_SNORM_INT8";
231 case CL_SNORM_INT16: return "CL_SNORM_INT16";
232 case CL_UNORM_INT8: return "CL_UNORM_INT8";
233 case CL_UNORM_INT16: return "CL_UNORM_INT16";
234 #if 0
235 case CL_UNORM_SHORT_565: return "CL_UNORM_SHORT_565";
236 case CL_UNORM_SHORT_555: return "CL_UNORM_SHORT_555";
237 case CL_UNORM_INT_101010: return "CL_UNORM_INT_101010";
238 case CL_SIGNED_INT8: return "CL_SIGNED_INT8";
239 case CL_SIGNED_INT16: return "CL_SIGNED_INT16";
240 case CL_SIGNED_INT32: return "CL_SIGNED_INT32";
241 case CL_UNSIGNED_INT8: return "CL_UNSIGNED_INT8";
242 case CL_UNSIGNED_INT16: return "CL_UNSIGNED_INT16";
243 case CL_UNSIGNED_INT32: return "CL_UNSIGNED_INT32";
244 case CL_HALF_FLOAT: return "CL_HALF_FLOAT";
245 #endif
246 case CL_FLOAT: return "CL_FLOAT";
247 default: return "unknown";
248 }
249 }
250
251 /*
252 Finds an appropriate OpenCL device. For debugging, it's
253 always easier to use CL_DEVICE_TYPE_CPU because then */ /*ok*/ /*printf() can be called
254 from the kernel. If debugging is on, we can print the name and stats about the
255 device we're using.
256 */
get_device(OCLVendor * peVendor)257 static cl_device_id get_device(OCLVendor *peVendor)
258 {
259 cl_int err = 0;
260 size_t returned_size = 0;
261 cl_char vendor_name[1024] = {0};
262 cl_char device_name[1024] = {0};
263
264 std::vector<cl_platform_id> platforms;
265 cl_uint num_platforms = 0;
266 cl_device_id preferred_device_id = nullptr;
267 int preferred_is_gpu = FALSE;
268
269 static bool gbBuggyOpenCL = false;
270 if( gbBuggyOpenCL )
271 return nullptr;
272 try
273 {
274 err = clGetPlatformIDs( 0, nullptr, &num_platforms );
275 if( err != CL_SUCCESS || num_platforms == 0 )
276 return nullptr;
277
278 platforms.resize(num_platforms);
279 err = clGetPlatformIDs( num_platforms, &platforms[0], nullptr );
280 if( err != CL_SUCCESS )
281 return nullptr;
282 }
283 catch( ... )
284 {
285 gbBuggyOpenCL = true;
286 CPLDebug("OpenCL", "clGetPlatformIDs() threw a C++ exception");
287 // This should normally not happen. But that does happen with
288 // intel-opencl 0r2.0-54426 when run under xvfb-run
289 return nullptr;
290 }
291
292 bool bUseOpenCLCPU = CPLTestBool( CPLGetConfigOption("OPENCL_USE_CPU", "FALSE") );
293
294 // In case we have several implementations, pick up the non Intel one by
295 // default, unless the PREFERRED_OPENCL_VENDOR config option is specified.
296 for( cl_uint i=0; i<num_platforms;i++)
297 {
298 cl_device_id device = nullptr;
299 const char* pszBlacklistedVendor;
300 const char* pszPreferredVendor;
301 int is_gpu;
302
303 // Find the GPU CL device, this is what we really want
304 // If there is no GPU device is CL capable, fall back to CPU
305 if( bUseOpenCLCPU )
306 err = CL_DEVICE_NOT_FOUND;
307 else
308 err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1,
309 &device, nullptr);
310 is_gpu = (err == CL_SUCCESS);
311 if (err != CL_SUCCESS)
312 {
313 // Find the CPU CL device, as a fallback
314 err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1,
315 &device, nullptr);
316 if( err != CL_SUCCESS || device == nullptr )
317 continue;
318 }
319
320 // Get some information about the returned device
321 err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name),
322 vendor_name, &returned_size);
323 err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
324 device_name, &returned_size);
325 assert(err == CL_SUCCESS);
326
327 if( num_platforms > 1 )
328 CPLDebug( "OpenCL", "Found vendor='%s' / device='%s' (%s implementation).",
329 vendor_name, device_name, (is_gpu) ? "GPU" : "CPU");
330
331 pszBlacklistedVendor = CPLGetConfigOption("BLACKLISTED_OPENCL_VENDOR", nullptr);
332 if( pszBlacklistedVendor &&
333 EQUAL( reinterpret_cast<const char*>(vendor_name), pszBlacklistedVendor ) )
334 {
335 CPLDebug("OpenCL", "Blacklisted vendor='%s' / device='%s' implementation skipped",
336 vendor_name, device_name);
337 continue;
338 }
339
340 if( preferred_device_id == nullptr || (is_gpu && !preferred_is_gpu) )
341 {
342 preferred_device_id = device;
343 preferred_is_gpu = is_gpu;
344 }
345
346 pszPreferredVendor = CPLGetConfigOption("PREFERRED_OPENCL_VENDOR", nullptr);
347 if( pszPreferredVendor )
348 {
349 if( EQUAL( reinterpret_cast<const char*>(vendor_name), pszPreferredVendor ) )
350 {
351 preferred_device_id = device;
352 preferred_is_gpu = is_gpu;
353 break;
354 }
355 }
356 else if( is_gpu && !STARTS_WITH(reinterpret_cast<const char*>(vendor_name), "Intel") )
357 {
358 preferred_device_id = device;
359 preferred_is_gpu = is_gpu;
360 break;
361 }
362 }
363 if( preferred_device_id == nullptr )
364 {
365 CPLDebug("OpenCL", "No implementation found");
366 return nullptr;
367 }
368
369 err = clGetDeviceInfo(preferred_device_id, CL_DEVICE_VENDOR, sizeof(vendor_name),
370 vendor_name, &returned_size);
371 err |= clGetDeviceInfo(preferred_device_id, CL_DEVICE_NAME, sizeof(device_name),
372 device_name, &returned_size);
373 CPLDebug( "OpenCL", "Connected to vendor='%s' / device='%s' (%s implementation).",
374 vendor_name, device_name, (preferred_is_gpu) ? "GPU" : "CPU");
375
376 if (STARTS_WITH(reinterpret_cast<const char*>(vendor_name), "Advanced Micro Devices"))
377 *peVendor = VENDOR_AMD;
378 else if (STARTS_WITH(reinterpret_cast<const char*>(vendor_name), "Intel"))
379 *peVendor = VENDOR_INTEL;
380 else
381 *peVendor = VENDOR_OTHER;
382
383 return preferred_device_id;
384 }
385
386 /*
387 Given that not all OpenCL devices support the same image formats, we need to
388 make do with what we have. This leads to wasted space, but as OpenCL matures
389 I hope it'll get better.
390 */
set_supported_formats(struct oclWarper * warper,cl_channel_order minOrderSize,cl_channel_order * chosenOrder,unsigned int * chosenSize,cl_channel_type dataType)391 static cl_int set_supported_formats(struct oclWarper *warper,
392 cl_channel_order minOrderSize,
393 cl_channel_order *chosenOrder,
394 unsigned int *chosenSize,
395 cl_channel_type dataType )
396 {
397 cl_image_format *fmtBuf = static_cast<cl_image_format *>(
398 calloc(256, sizeof(cl_image_format)));
399 cl_uint numRet;
400 cl_uint i;
401 cl_uint extraSpace = 9999;
402 cl_int err;
403 int bFound = FALSE;
404
405 //Find what we *can* handle
406 handleErr(err = clGetSupportedImageFormats(warper->context,
407 CL_MEM_READ_ONLY,
408 CL_MEM_OBJECT_IMAGE2D,
409 256, fmtBuf, &numRet));
410 for (i = 0; i < numRet; ++i) {
411 cl_channel_order thisOrderSize = 0;
412 switch (fmtBuf[i].image_channel_order)
413 {
414 //Only support formats which use the channels in order (x,y,z,w)
415 case CL_R:
416 case CL_INTENSITY:
417 case CL_LUMINANCE:
418 thisOrderSize = 1;
419 break;
420 case CL_RG:
421 thisOrderSize = 2;
422 break;
423 case CL_RGB:
424 thisOrderSize = 3;
425 break;
426 case CL_RGBA:
427 thisOrderSize = 4;
428 break;
429 }
430
431 //Choose an order with the least wasted space
432 if (fmtBuf[i].image_channel_data_type == dataType &&
433 minOrderSize <= thisOrderSize &&
434 extraSpace > thisOrderSize - minOrderSize ) {
435
436 //Set the vector size, order, & remember wasted space
437 (*chosenSize) = thisOrderSize;
438 (*chosenOrder) = fmtBuf[i].image_channel_order;
439 extraSpace = thisOrderSize - minOrderSize;
440 bFound = TRUE;
441 }
442 }
443
444 free(fmtBuf);
445
446 if( !bFound )
447 {
448 CPLDebug("OpenCL",
449 "Cannot find supported format for dataType = %s and minOrderSize = %d",
450 getCLDataTypeString(dataType),
451 static_cast<int>(minOrderSize));
452 }
453 return (bFound) ? CL_SUCCESS : CL_INVALID_OPERATION;
454 }
455
456 /*
457 Allocate some pinned memory that we can use as an intermediate buffer. We're
458 using the pinned memory to assemble the data before transferring it to the
459 device. The reason we're using pinned RAM is because the transfer speed from
460 host RAM to device RAM is faster than non-pinned. The disadvantage is that
461 pinned RAM is a scarce OS resource. I'm making the assumption that the user
462 has as much pinned host RAM available as total device RAM because device RAM
463 tends to be similarly scarce. However, if the pinned memory fails we fall back
464 to using a regular memory allocation.
465
466 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
467 */
468 static
alloc_pinned_mem(struct oclWarper * warper,int imgNum,size_t dataSz,void ** wrkPtr,cl_mem * wrkCL)469 cl_int alloc_pinned_mem(struct oclWarper *warper, int imgNum, size_t dataSz,
470 void **wrkPtr, cl_mem *wrkCL)
471 {
472 cl_int err = CL_SUCCESS;
473 wrkCL[imgNum] = clCreateBuffer(warper->context,
474 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
475 dataSz, nullptr, &err);
476
477 if (err == CL_SUCCESS) {
478 wrkPtr[imgNum] = clEnqueueMapBuffer(warper->queue, wrkCL[imgNum],
479 CL_FALSE, CL_MAP_WRITE,
480 0, dataSz, 0, nullptr, nullptr, &err);
481 handleErr(err);
482 } else {
483 wrkCL[imgNum] = nullptr;
484 #ifdef DEBUG_OPENCL
485 CPLDebug("OpenCL", "Using fallback non-pinned memory!");
486 #endif
487 //Fallback to regular allocation
488 wrkPtr[imgNum] = VSI_MALLOC_VERBOSE(dataSz);
489
490 if (wrkPtr[imgNum] == nullptr)
491 handleErr(err = CL_OUT_OF_HOST_MEMORY);
492 }
493
494 return CL_SUCCESS;
495 }
496
497 /*
498 Allocates the working host memory for all bands of the image in the warper
499 structure. This includes both the source image buffers and the destination
500 buffers. This memory is located on the host, so we can assemble the image.
501 Reasons for buffering it like this include reading each row from disk and
502 de-interleaving bands and parts of bands. Then they can be copied to the device
503 as a single operation fit for use as an OpenCL memory object.
504
505 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
506 */
507 static
alloc_working_arr(struct oclWarper * warper,size_t ptrSz,size_t dataSz,CPL_UNUSED size_t * fmtSz)508 cl_int alloc_working_arr(struct oclWarper *warper,
509 size_t ptrSz, size_t dataSz, CPL_UNUSED size_t *fmtSz)
510 {
511 cl_int err = CL_SUCCESS;
512 int i, b;
513 size_t srcDataSz1, dstDataSz1, srcDataSz4, dstDataSz4;
514 const int numBands = warper->numBands;
515
516 //Find the best channel order for this format
517 err = set_supported_formats(warper, 1,
518 &(warper->imgChOrder1), &(warper->imgChSize1),
519 warper->imageFormat);
520 handleErr(err);
521 if(warper->useVec) {
522 err = set_supported_formats(warper, 4,
523 &(warper->imgChOrder4), &(warper->imgChSize4),
524 warper->imageFormat);
525 handleErr(err);
526 }
527
528 //Alloc space for pointers to the main image data
529 warper->realWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
530 warper->dstRealWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
531 if (warper->realWork.v == nullptr || warper->dstRealWork.v == nullptr)
532 handleErr(err = CL_OUT_OF_HOST_MEMORY);
533
534 if (warper->imagWorkCL != nullptr) {
535 //Alloc space for pointers to the extra channel, if it exists
536 warper->imagWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
537 warper->dstImagWork.v = static_cast<void **>(VSI_CALLOC_VERBOSE(ptrSz, warper->numImages));
538 if (warper->imagWork.v == nullptr || warper->dstImagWork.v == nullptr)
539 handleErr(err = CL_OUT_OF_HOST_MEMORY);
540 } else {
541 warper->imagWork.v = nullptr;
542 warper->dstImagWork.v = nullptr;
543 }
544
545 //Calc the sizes we need
546 srcDataSz1 = dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize1;
547 dstDataSz1 = dataSz * warper->dstWidth * warper->dstHeight;
548 srcDataSz4 = dataSz * warper->srcWidth * warper->srcHeight * warper->imgChSize4;
549 dstDataSz4 = dataSz * warper->dstWidth * warper->dstHeight * 4;
550
551 //Allocate pinned memory for each band's image
552 for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i) {
553 if(warper->useVec && b < numBands - numBands % 4) {
554 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
555 warper->realWork.v,
556 warper->realWorkCL));
557
558 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
559 warper->dstRealWork.v,
560 warper->dstRealWorkCL));
561 b += 4;
562 } else {
563 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
564 warper->realWork.v,
565 warper->realWorkCL));
566
567 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
568 warper->dstRealWork.v,
569 warper->dstRealWorkCL));
570 ++b;
571 }
572 }
573
574 if (warper->imagWorkCL != nullptr) {
575 //Allocate pinned memory for each band's extra channel, if exists
576 for (b = 0, i = 0; b < numBands && i < warper->numImages; ++i) {
577 if(warper->useVec && b < numBands - numBands % 4) {
578 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz4,
579 warper->imagWork.v,
580 warper->imagWorkCL));
581
582 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz4,
583 warper->dstImagWork.v,
584 warper->dstImagWorkCL));
585 b += 4;
586 } else {
587 handleErr(err = alloc_pinned_mem(warper, i, srcDataSz1,
588 warper->imagWork.v,
589 warper->imagWorkCL));
590
591 handleErr(err = alloc_pinned_mem(warper, i, dstDataSz1,
592 warper->dstImagWork.v,
593 warper->dstImagWorkCL));
594 ++b;
595 }
596 }
597 }
598
599 return CL_SUCCESS;
600 }
601
602 /*
603 Assemble and create the kernel. For optimization, portability, and
604 implementation limitation reasons, the program is actually assembled from
605 several strings, then compiled with as many invariants as possible defined by
606 the preprocessor. There is also quite a bit of error-catching code in here
607 because the kernel is where many bugs show up.
608
609 Returns CL_SUCCESS on success and other CL_* errors in the error buffer when
610 something goes wrong.
611 */
612 static
get_kernel(struct oclWarper * warper,char useVec,double dfXScale,double dfYScale,double dfXFilter,double dfYFilter,int nXRadius,int nYRadius,int nFiltInitX,int nFiltInitY,cl_int * clErr)613 cl_kernel get_kernel(struct oclWarper *warper, char useVec,
614 double dfXScale, double dfYScale, double dfXFilter, double dfYFilter,
615 int nXRadius, int nYRadius, int nFiltInitX, int nFiltInitY,
616 cl_int *clErr )
617 {
618 cl_program program;
619 cl_kernel kernel;
620 cl_int err = CL_SUCCESS;
621 #define PROGBUF_SIZE 128000
622 char *buffer = static_cast<char *>(CPLCalloc(PROGBUF_SIZE, sizeof(char)));
623 char *progBuf = static_cast<char *>(CPLCalloc(PROGBUF_SIZE, sizeof(char)));
624 float dstMinVal = 0.f, dstMaxVal = 0.0;
625
626 const char *outType;
627 const char *dUseVec = "";
628 const char *dVecf = "float";
629 const char *kernGenFuncs = R""""(
630 // ********************* General Funcs ********************
631 void clampToDst(float fReal,
632 __global outType *dstPtr,
633 unsigned int iDstOffset,
634 __constant float *fDstNoDataReal,
635 int bandNum);
636 void setPixel(__global outType *dstReal,
637 __global outType *dstImag,
638 __global float *dstDensity,
639 __global int *nDstValid,
640 __constant float *fDstNoDataReal,
641 const int bandNum,
642 vecf fDensity, vecf fReal, vecf fImag);
643 int getPixel(__read_only image2d_t srcReal,
644 __read_only image2d_t srcImag,
645 __global float *fUnifiedSrcDensity,
646 __global int *nUnifiedSrcValid,
647 __constant char *useBandSrcValid,
648 __global int *nBandSrcValid,
649 const int2 iSrc,
650 int bandNum,
651 vecf *fDensity, vecf *fReal, vecf *fImag);
652 int isValid(__global float *fUnifiedSrcDensity,
653 __global int *nUnifiedSrcValid,
654 float2 fSrcCoords );
655 float2 getSrcCoords(__read_only image2d_t srcCoords);
656
657 #ifdef USE_CLAMP_TO_DST_FLOAT
658 void clampToDst(float fReal,
659 __global outType *dstPtr,
660 unsigned int iDstOffset,
661 __constant float *fDstNoDataReal,
662 int bandNum)
663 {
664 dstPtr[iDstOffset] = fReal;
665 }
666 #else
667 void clampToDst(float fReal,
668 __global outType *dstPtr,
669 unsigned int iDstOffset,
670 __constant float *fDstNoDataReal,
671 int bandNum)
672 {
673 fReal *= dstMaxVal;
674
675 if (fReal < dstMinVal)
676 dstPtr[iDstOffset] = (outType)dstMinVal;
677 else if (fReal > dstMaxVal)
678 dstPtr[iDstOffset] = (outType)dstMaxVal;
679 else
680 dstPtr[iDstOffset] = (dstMinVal < 0) ? (outType)floor(fReal + 0.5f) : (outType)(fReal + 0.5f);
681
682 if (useDstNoDataReal && bandNum >= 0 &&
683 fDstNoDataReal[bandNum] == dstPtr[iDstOffset])
684 {
685 if (dstPtr[iDstOffset] == dstMinVal)
686 dstPtr[iDstOffset] = dstMinVal + 1;
687 else
688 dstPtr[iDstOffset] = dstPtr[iDstOffset] - 1;
689 }
690 }
691 #endif
692
693 void setPixel(__global outType *dstReal,
694 __global outType *dstImag,
695 __global float *dstDensity,
696 __global int *nDstValid,
697 __constant float *fDstNoDataReal,
698 const int bandNum,
699 vecf fDensity, vecf fReal, vecf fImag)
700 {
701 unsigned int iDstOffset = get_global_id(1)*iDstWidth + get_global_id(0);
702
703 #ifdef USE_VEC
704 if (fDensity.x < 0.00001f && fDensity.y < 0.00001f &&
705 fDensity.z < 0.00001f && fDensity.w < 0.00001f ) {
706
707 fReal = 0.0f;
708 fImag = 0.0f;
709
710 } else if (fDensity.x < 0.9999f || fDensity.y < 0.9999f ||
711 fDensity.z < 0.9999f || fDensity.w < 0.9999f ) {
712 vecf fDstReal, fDstImag;
713 float fDstDensity;
714
715 fDstReal.x = dstReal[iDstOffset];
716 fDstReal.y = dstReal[iDstOffset+iDstHeight*iDstWidth];
717 fDstReal.z = dstReal[iDstOffset+iDstHeight*iDstWidth*2];
718 fDstReal.w = dstReal[iDstOffset+iDstHeight*iDstWidth*3];
719 if (useImag) {
720 fDstImag.x = dstImag[iDstOffset];
721 fDstImag.y = dstImag[iDstOffset+iDstHeight*iDstWidth];
722 fDstImag.z = dstImag[iDstOffset+iDstHeight*iDstWidth*2];
723 fDstImag.w = dstImag[iDstOffset+iDstHeight*iDstWidth*3];
724 }
725 #else
726 if (fDensity < 0.00001f) {
727
728 fReal = 0.0f;
729 fImag = 0.0f;
730
731 } else if (fDensity < 0.9999f) {
732 vecf fDstReal, fDstImag;
733 float fDstDensity;
734
735 fDstReal = dstReal[iDstOffset];
736 if (useImag)
737 fDstImag = dstImag[iDstOffset];
738 #endif
739
740 if (useDstDensity)
741 fDstDensity = dstDensity[iDstOffset];
742 else if (useDstValid &&
743 !((nDstValid[iDstOffset>>5] & (0x01 << (iDstOffset & 0x1f))) ))
744 fDstDensity = 0.0f;
745 else
746 fDstDensity = 1.0f;
747
748 vecf fDstInfluence = (1.0f - fDensity) * fDstDensity;
749
750 // Density should be checked for <= 0.0 & handled by the calling function
751 fReal = (fReal * fDensity + fDstReal * fDstInfluence) / (fDensity + fDstInfluence);
752 if (useImag)
753 fImag = (fImag * fDensity + fDstImag * fDstInfluence) / (fDensity + fDstInfluence);
754 }
755
756 #ifdef USE_VEC
757 clampToDst(fReal.x, dstReal, iDstOffset, fDstNoDataReal, bandNum);
758 clampToDst(fReal.y, dstReal, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, bandNum);
759 clampToDst(fReal.z, dstReal, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, bandNum);
760 clampToDst(fReal.w, dstReal, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, bandNum);
761 if (useImag) {
762 clampToDst(fImag.x, dstImag, iDstOffset, fDstNoDataReal, -1);
763 clampToDst(fImag.y, dstImag, iDstOffset+iDstHeight*iDstWidth, fDstNoDataReal, -1);
764 clampToDst(fImag.z, dstImag, iDstOffset+iDstHeight*iDstWidth*2, fDstNoDataReal, -1);
765 clampToDst(fImag.w, dstImag, iDstOffset+iDstHeight*iDstWidth*3, fDstNoDataReal, -1);
766 }
767 #else
768 clampToDst(fReal, dstReal, iDstOffset, fDstNoDataReal, bandNum);
769 if (useImag)
770 clampToDst(fImag, dstImag, iDstOffset, fDstNoDataReal, -1);
771 #endif
772 }
773
774 int getPixel(__read_only image2d_t srcReal,
775 __read_only image2d_t srcImag,
776 __global float *fUnifiedSrcDensity,
777 __global int *nUnifiedSrcValid,
778 __constant char *useBandSrcValid,
779 __global int *nBandSrcValid,
780 const int2 iSrc,
781 int bandNum,
782 vecf *fDensity, vecf *fReal, vecf *fImag)
783 {
784 int iSrcOffset = 0, iBandValidLen = 0, iSrcOffsetMask = 0;
785 int bHasValid = FALSE;
786
787 // Clamp the src offset values if needed
788 if(useUnifiedSrcDensity | useUnifiedSrcValid | useUseBandSrcValid){
789 int iSrcX = iSrc.x;
790 int iSrcY = iSrc.y;
791
792 // Needed because the offset isn't clamped in OpenCL hardware
793 if(iSrcX < 0)
794 iSrcX = 0;
795 else if(iSrcX >= iSrcWidth)
796 iSrcX = iSrcWidth - 1;
797
798 if(iSrcY < 0)
799 iSrcY = 0;
800 else if(iSrcY >= iSrcHeight)
801 iSrcY = iSrcHeight - 1;
802
803 iSrcOffset = iSrcY*iSrcWidth + iSrcX;
804 iBandValidLen = 1 + ((iSrcWidth*iSrcHeight)>>5);
805 iSrcOffsetMask = (0x01 << (iSrcOffset & 0x1f));
806 }
807
808 if (useUnifiedSrcValid &&
809 !((nUnifiedSrcValid[iSrcOffset>>5] & iSrcOffsetMask) ) )
810 return FALSE;
811
812 #ifdef USE_VEC
813 if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
814 ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum ] & iSrcOffsetMask)) )
815 bHasValid = TRUE;
816
817 if (!useUseBandSrcValid || !useBandSrcValid[bandNum+1] ||
818 ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(1+bandNum)] & iSrcOffsetMask)) )
819 bHasValid = TRUE;
820
821 if (!useUseBandSrcValid || !useBandSrcValid[bandNum+2] ||
822 ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(2+bandNum)] & iSrcOffsetMask)) )
823 bHasValid = TRUE;
824
825 if (!useUseBandSrcValid || !useBandSrcValid[bandNum+3] ||
826 ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*(3+bandNum)] & iSrcOffsetMask)) )
827 bHasValid = TRUE;
828 #else
829 if (!useUseBandSrcValid || !useBandSrcValid[bandNum] ||
830 ((nBandSrcValid[(iSrcOffset>>5)+iBandValidLen*bandNum ] & iSrcOffsetMask)) )
831 bHasValid = TRUE;
832 #endif
833
834 if (!bHasValid)
835 return FALSE;
836
837 const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE |
838 CLK_ADDRESS_CLAMP_TO_EDGE |
839 CLK_FILTER_NEAREST;
840
841 #ifdef USE_VEC
842 (*fReal) = read_imagef(srcReal, samp, iSrc);
843 if (useImag)
844 (*fImag) = read_imagef(srcImag, samp, iSrc);
845 #else
846 (*fReal) = read_imagef(srcReal, samp, iSrc).x;
847 if (useImag)
848 (*fImag) = read_imagef(srcImag, samp, iSrc).x;
849 #endif
850
851 if (useUnifiedSrcDensity) {
852 (*fDensity) = fUnifiedSrcDensity[iSrcOffset];
853 } else {
854 (*fDensity) = 1.0f;
855 return TRUE;
856 }
857
858 #ifdef USE_VEC
859 return (*fDensity).x > 0.0000001f || (*fDensity).y > 0.0000001f ||
860 (*fDensity).z > 0.0000001f || (*fDensity).w > 0.0000001f;
861 #else
862 return (*fDensity) > 0.0000001f;
863 #endif
864 }
865
866 int isValid(__global float *fUnifiedSrcDensity,
867 __global int *nUnifiedSrcValid,
868 float2 fSrcCoords )
869 {
870 if (fSrcCoords.x < 0.0f || fSrcCoords.y < 0.0f)
871 return FALSE;
872
873 int iSrcX = (int) (fSrcCoords.x - 0.5f);
874 int iSrcY = (int) (fSrcCoords.y - 0.5f);
875
876 if( iSrcX < 0 || iSrcX >= iSrcWidth || iSrcY < 0 || iSrcY >= iSrcHeight )
877 return FALSE;
878
879 int iSrcOffset = iSrcX + iSrcY * iSrcWidth;
880
881 if (useUnifiedSrcDensity && fUnifiedSrcDensity[iSrcOffset] < 0.00001f)
882 return FALSE;
883
884 if (useUnifiedSrcValid &&
885 !(nUnifiedSrcValid[iSrcOffset>>5] & (0x01 << (iSrcOffset & 0x1f))) )
886 return FALSE;
887
888 return TRUE;
889 }
890
891 float2 getSrcCoords(__read_only image2d_t srcCoords)
892 {
893 // Find an appropriate place to sample the coordinates so we're still
894 // accurate after linear interpolation.
895 int nDstX = get_global_id(0);
896 int nDstY = get_global_id(1);
897 float2 fDst = (float2)((0.5f * (float)iCoordMult + nDstX) /
898 (float)((ceil((iDstWidth - 1) / (float)iCoordMult) + 1) * iCoordMult),
899 (0.5f * (float)iCoordMult + nDstY) /
900 (float)((ceil((iDstHeight - 1) / (float)iCoordMult) + 1) * iCoordMult));
901
902 // Check & return when the thread group overruns the image size
903 if (nDstX >= iDstWidth || nDstY >= iDstHeight)
904 return (float2)(-99.0f, -99.0f);
905
906 const sampler_t samp = CLK_NORMALIZED_COORDS_TRUE |
907 CLK_ADDRESS_CLAMP_TO_EDGE |
908 CLK_FILTER_LINEAR;
909
910 float4 fSrcCoords = read_imagef(srcCoords,samp,fDst);
911
912 return (float2)(fSrcCoords.x, fSrcCoords.y);
913 }
914 )"""";
915
916 const char *kernBilinear = R""""(
917 // ************************ Bilinear ************************
918 __kernel void resamp(__read_only image2d_t srcCoords,
919 __read_only image2d_t srcReal,
920 __read_only image2d_t srcImag,
921 __global float *fUnifiedSrcDensity,
922 __global int *nUnifiedSrcValid,
923 __constant char *useBandSrcValid,
924 __global int *nBandSrcValid,
925 __global outType *dstReal,
926 __global outType *dstImag,
927 __constant float *fDstNoDataReal,
928 __global float *dstDensity,
929 __global int *nDstValid,
930 const int bandNum)
931 {
932 float2 fSrc = getSrcCoords(srcCoords);
933 if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
934 return;
935
936 int iSrcX = (int) floor(fSrc.x - 0.5f);
937 int iSrcY = (int) floor(fSrc.y - 0.5f);
938 float fRatioX = 1.5f - (fSrc.x - iSrcX);
939 float fRatioY = 1.5f - (fSrc.y - iSrcY);
940 vecf fReal, fImag, fDens;
941 vecf fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
942 vecf fAccumulatorDensity = 0.0f;
943 float fAccumulatorDivisor = 0.0f;
944
945 if ( iSrcY >= 0 && iSrcY < iSrcHeight ) {
946 float fMult1 = fRatioX * fRatioY;
947 float fMult2 = (1.0f-fRatioX) * fRatioY;
948
949 // Upper Left Pixel
950 if ( iSrcX >= 0 && iSrcX < iSrcWidth
951 && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
952 useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY),
953 bandNum, &fDens, &fReal, &fImag))
954 {
955 fAccumulatorDivisor += fMult1;
956 fAccumulatorReal += fReal * fMult1;
957 fAccumulatorImag += fImag * fMult1;
958 fAccumulatorDensity += fDens * fMult1;
959 }
960
961 // Upper Right Pixel
962 if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
963 && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
964 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY),
965 bandNum, &fDens, &fReal, &fImag))
966 {
967 fAccumulatorDivisor += fMult2;
968 fAccumulatorReal += fReal * fMult2;
969 fAccumulatorImag += fImag * fMult2;
970 fAccumulatorDensity += fDens * fMult2;
971 }
972 }
973
974 if ( iSrcY+1 >= 0 && iSrcY+1 < iSrcHeight ) {
975 float fMult1 = fRatioX * (1.0f-fRatioY);
976 float fMult2 = (1.0f-fRatioX) * (1.0f-fRatioY);
977
978 // Lower Left Pixel
979 if ( iSrcX >= 0 && iSrcX < iSrcWidth
980 && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
981 useBandSrcValid, nBandSrcValid, (int2)(iSrcX, iSrcY+1),
982 bandNum, &fDens, &fReal, &fImag))
983 {
984 fAccumulatorDivisor += fMult1;
985 fAccumulatorReal += fReal * fMult1;
986 fAccumulatorImag += fImag * fMult1;
987 fAccumulatorDensity += fDens * fMult1;
988 }
989
990 // Lower Right Pixel
991 if ( iSrcX+1 >= 0 && iSrcX+1 < iSrcWidth
992 && getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
993 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+1),
994 bandNum, &fDens, &fReal, &fImag))
995 {
996 fAccumulatorDivisor += fMult2;
997 fAccumulatorReal += fReal * fMult2;
998 fAccumulatorImag += fImag * fMult2;
999 fAccumulatorDensity += fDens * fMult2;
1000 }
1001 }
1002
1003 // Compute and save final pixel
1004 if ( fAccumulatorDivisor < 0.00001f ) {
1005 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1006 0.0f, 0.0f, 0.0f );
1007 } else if ( fAccumulatorDivisor < 0.99999f || fAccumulatorDivisor > 1.00001f ) {
1008 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1009 fAccumulatorDensity / fAccumulatorDivisor,
1010 fAccumulatorReal / fAccumulatorDivisor,
1011 #if useImag != 0
1012 fAccumulatorImag / fAccumulatorDivisor );
1013 #else
1014 0.0f );
1015 #endif
1016 } else {
1017 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1018 fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag );
1019 }
1020 }
1021 )"""";
1022
1023 const char *kernCubic = R""""(
1024 // ************************ Cubic ************************
1025 vecf cubicConvolution(float dist1, float dist2, float dist3,
1026 vecf f0, vecf f1, vecf f2, vecf f3);
1027
1028 vecf cubicConvolution(float dist1, float dist2, float dist3,
1029 vecf f0, vecf f1, vecf f2, vecf f3)
1030 {
1031 return ( f1
1032 + 0.5f * (dist1*(f2 - f0)
1033 + dist2*(2.0f*f0 - 5.0f*f1 + 4.0f*f2 - f3)
1034 + dist3*(3.0f*(f1 - f2) + f3 - f0)));
1035 }
1036
1037 // ************************ Cubic ************************
1038 __kernel void resamp(__read_only image2d_t srcCoords,
1039 __read_only image2d_t srcReal,
1040 __read_only image2d_t srcImag,
1041 __global float *fUnifiedSrcDensity,
1042 __global int *nUnifiedSrcValid,
1043 __constant char *useBandSrcValid,
1044 __global int *nBandSrcValid,
1045 __global outType *dstReal,
1046 __global outType *dstImag,
1047 __constant float *fDstNoDataReal,
1048 __global float *dstDensity,
1049 __global int *nDstValid,
1050 const int bandNum)
1051 {
1052 int i;
1053 float2 fSrc = getSrcCoords(srcCoords);
1054
1055 if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1056 return;
1057
1058 int iSrcX = (int) floor( fSrc.x - 0.5f );
1059 int iSrcY = (int) floor( fSrc.y - 0.5f );
1060 float fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1061 float fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1062 float fDeltaX2 = fDeltaX * fDeltaX;
1063 float fDeltaY2 = fDeltaY * fDeltaY;
1064 float fDeltaX3 = fDeltaX2 * fDeltaX;
1065 float fDeltaY3 = fDeltaY2 * fDeltaY;
1066 vecf afReal[4], afDens[4];
1067 #if useImag != 0
1068 vecf afImag[4];
1069 #else
1070 vecf fImag = 0.0f;
1071 #endif
1072
1073 // Loop over rows
1074 for (i = -1; i < 3; ++i)
1075 {
1076 vecf fReal1 = 0.0f, fReal2 = 0.0f, fReal3 = 0.0f, fReal4 = 0.0f;
1077 vecf fDens1 = 0.0f, fDens2 = 0.0f, fDens3 = 0.0f, fDens4 = 0.0f;
1078 int hasPx;
1079 #if useImag != 0
1080 vecf fImag1 = 0.0f, fImag2 = 0.0f, fImag3 = 0.0f, fImag4 = 0.0f;
1081
1082 //Get all the pixels for this row
1083 hasPx = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1084 useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1085 bandNum, &fDens1, &fReal1, &fImag1);
1086
1087 hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1088 useBandSrcValid, nBandSrcValid, (int2)(iSrcX , iSrcY+i),
1089 bandNum, &fDens2, &fReal2, &fImag2);
1090
1091 hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1092 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1093 bandNum, &fDens3, &fReal3, &fImag3);
1094
1095 hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1096 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1097 bandNum, &fDens4, &fReal4, &fImag4);
1098 #else
1099 //Get all the pixels for this row
1100 hasPx = getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1101 useBandSrcValid, nBandSrcValid, (int2)(iSrcX-1, iSrcY+i),
1102 bandNum, &fDens1, &fReal1, &fImag);
1103
1104 hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1105 useBandSrcValid, nBandSrcValid, (int2)(iSrcX , iSrcY+i),
1106 bandNum, &fDens2, &fReal2, &fImag);
1107
1108 hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1109 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+1, iSrcY+i),
1110 bandNum, &fDens3, &fReal3, &fImag);
1111
1112 hasPx |= getPixel(srcReal, srcImag, fUnifiedSrcDensity, nUnifiedSrcValid,
1113 useBandSrcValid, nBandSrcValid, (int2)(iSrcX+2, iSrcY+i),
1114 bandNum, &fDens4, &fReal4, &fImag);
1115 #endif
1116
1117 // Shortcut if no px
1118 if (!hasPx) {
1119 afDens[i+1] = 0.0f;
1120 afReal[i+1] = 0.0f;
1121 #if useImag != 0
1122 afImag[i+1] = 0.0f;
1123 #endif
1124 continue;
1125 }
1126
1127 // Process this row
1128 afDens[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fDens1, fDens2, fDens3, fDens4);
1129 afReal[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fReal1, fReal2, fReal3, fReal4);
1130 #if useImag != 0
1131 afImag[i+1] = cubicConvolution(fDeltaX, fDeltaX2, fDeltaX3, fImag1, fImag2, fImag3, fImag4);
1132 #endif
1133 }
1134
1135 // Compute and save final pixel
1136 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1137 cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afDens[0], afDens[1], afDens[2], afDens[3]),
1138 cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afReal[0], afReal[1], afReal[2], afReal[3]),
1139 #if useImag != 0
1140 cubicConvolution(fDeltaY, fDeltaY2, fDeltaY3, afImag[0], afImag[1], afImag[2], afImag[3]) );
1141 #else
1142 fImag );
1143 #endif
1144 }
1145 )"""";
1146
1147
1148 const char *kernResampler = R""""(
1149 // ************************ LanczosSinc ************************
1150
1151 float lanczosSinc( float fX, float fR );
1152 float bSpline( float x );
1153
1154 float lanczosSinc( float fX, float fR )
1155 {
1156 if ( fX > fR || fX < -fR)
1157 return 0.0f;
1158 if ( fX == 0.0f )
1159 return 1.0f;
1160
1161 float fPIX = PI * fX;
1162 return ( sin(fPIX) / fPIX ) * ( sin(fPIX / fR) * fR / fPIX );
1163 }
1164
1165 // ************************ Bicubic Spline ************************
1166
1167 float bSpline( float x )
1168 {
1169 float xp2 = x + 2.0f;
1170 float xp1 = x + 1.0f;
1171 float xm1 = x - 1.0f;
1172 float xp2c = xp2 * xp2 * xp2;
1173
1174 return (((xp2 > 0.0f)?((xp1 > 0.0f)?((x > 0.0f)?((xm1 > 0.0f)?
1175 -4.0f * xm1*xm1*xm1:0.0f) +
1176 6.0f * x*x*x:0.0f) +
1177 -4.0f * xp1*xp1*xp1:0.0f) +
1178 xp2c:0.0f) ) * 0.166666666666666666666f;
1179 }
1180
1181 // ************************ General Resampler ************************
1182
1183 __kernel void resamp(__read_only image2d_t srcCoords,
1184 __read_only image2d_t srcReal,
1185 __read_only image2d_t srcImag,
1186 __global float *fUnifiedSrcDensity,
1187 __global int *nUnifiedSrcValid,
1188 __constant char *useBandSrcValid,
1189 __global int *nBandSrcValid,
1190 __global outType *dstReal,
1191 __global outType *dstImag,
1192 __constant float *fDstNoDataReal,
1193 __global float *dstDensity,
1194 __global int *nDstValid,
1195 const int bandNum)
1196 {
1197 float2 fSrc = getSrcCoords(srcCoords);
1198
1199 if (!isValid(fUnifiedSrcDensity, nUnifiedSrcValid, fSrc))
1200 return;
1201
1202 int iSrcX = floor( fSrc.x - 0.5f );
1203 int iSrcY = floor( fSrc.y - 0.5f );
1204 float fDeltaX = fSrc.x - 0.5f - (float)iSrcX;
1205 float fDeltaY = fSrc.y - 0.5f - (float)iSrcY;
1206
1207 vecf fAccumulatorReal = 0.0f, fAccumulatorImag = 0.0f;
1208 vecf fAccumulatorDensity = 0.0f;
1209 float fAccumulatorWeight = 0.0f;
1210 int i, j;
1211
1212 // Loop over pixel rows in the kernel
1213 for ( j = nFiltInitY; j <= nYRadius; ++j )
1214 {
1215 float fWeight1;
1216 int2 iSrc = (int2)(0, iSrcY + j);
1217
1218 // Skip sampling over edge of image
1219 if ( iSrc.y < 0 || iSrc.y >= iSrcHeight )
1220 continue;
1221
1222 // Select the resampling algorithm
1223 if ( doCubicSpline )
1224 // Calculate the Y weight
1225 fWeight1 = ( fYScale < 1.0f ) ?
1226 bSpline(((float)j) * fYScale) * fYScale :
1227 bSpline(((float)j) - fDeltaY);
1228 else
1229 fWeight1 = ( fYScale < 1.0f ) ?
1230 lanczosSinc(j * fYScale, fYFilter) * fYScale :
1231 lanczosSinc(j - fDeltaY, fYFilter);
1232
1233 // Iterate over pixels in row
1234 for ( i = nFiltInitX; i <= nXRadius; ++i )
1235 {
1236 float fWeight2;
1237 vecf fDensity = 0.0f, fReal, fImag;
1238 iSrc.x = iSrcX + i;
1239
1240 // Skip sampling at edge of image
1241 // Skip sampling when invalid pixel
1242 if ( iSrc.x < 0 || iSrc.x >= iSrcWidth ||
1243 !getPixel(srcReal, srcImag, fUnifiedSrcDensity,
1244 nUnifiedSrcValid, useBandSrcValid, nBandSrcValid,
1245 iSrc, bandNum, &fDensity, &fReal, &fImag) )
1246 continue;
1247
1248 // Choose among possible algorithms
1249 if ( doCubicSpline )
1250 // Calculate & save the X weight
1251 fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1252 bSpline((float)i * fXScale) * fXScale :
1253 bSpline(fDeltaX - (float)i));
1254 else
1255 // Calculate & save the X weight
1256 fWeight2 = fWeight1 * ((fXScale < 1.0f ) ?
1257 lanczosSinc(i * fXScale, fXFilter) * fXScale :
1258 lanczosSinc(i - fDeltaX, fXFilter));
1259
1260 // Accumulate!
1261 fAccumulatorReal += fReal * fWeight2;
1262 fAccumulatorImag += fImag * fWeight2;
1263 fAccumulatorDensity += fDensity * fWeight2;
1264 fAccumulatorWeight += fWeight2;
1265 }
1266 }
1267
1268 if ( fAccumulatorWeight < 0.000001f ) {
1269 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1270 0.0f, 0.0f, 0.0f);
1271 } else if ( fAccumulatorWeight < 0.99999f || fAccumulatorWeight > 1.00001f ) {
1272 // Calculate the output taking into account weighting
1273 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1274 fAccumulatorDensity / fAccumulatorWeight,
1275 fAccumulatorReal / fAccumulatorWeight,
1276 #if useImag != 0
1277 fAccumulatorImag / fAccumulatorWeight );
1278 #else
1279 0.0f );
1280 #endif
1281 } else {
1282 setPixel(dstReal, dstImag, dstDensity, nDstValid, fDstNoDataReal, bandNum,
1283 fAccumulatorDensity, fAccumulatorReal, fAccumulatorImag);
1284 }
1285 }
1286 )"""";
1287
1288 //Defines based on image format
1289 switch (warper->imageFormat) {
1290 case CL_FLOAT:
1291 dstMinVal = std::numeric_limits<float>::lowest();
1292 dstMaxVal = std::numeric_limits<float>::max();
1293 outType = "float";
1294 break;
1295 case CL_SNORM_INT8:
1296 dstMinVal = -128.0;
1297 dstMaxVal = 127.0;
1298 outType = "char";
1299 break;
1300 case CL_UNORM_INT8:
1301 dstMinVal = 0.0;
1302 dstMaxVal = 255.0;
1303 outType = "uchar";
1304 break;
1305 case CL_SNORM_INT16:
1306 dstMinVal = -32768.0;
1307 dstMaxVal = 32767.0;
1308 outType = "short";
1309 break;
1310 case CL_UNORM_INT16:
1311 dstMinVal = 0.0;
1312 dstMaxVal = 65535.0;
1313 outType = "ushort";
1314 break;
1315 }
1316
1317 //Use vector format?
1318 if (useVec) {
1319 dUseVec = "-D USE_VEC";
1320 dVecf = "float4";
1321 }
1322
1323 //Assemble the kernel from parts. The compiler is unable to handle multiple
1324 //kernels in one string with more than a few __constant modifiers each.
1325 if (warper->resampAlg == OCL_Bilinear)
1326 snprintf(progBuf, PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernBilinear);
1327 else if (warper->resampAlg == OCL_Cubic)
1328 snprintf(progBuf, PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernCubic);
1329 else
1330 snprintf(progBuf, PROGBUF_SIZE, "%s\n%s", kernGenFuncs, kernResampler);
1331
1332 //Actually make the program from assembled source
1333 const char* pszProgBuf = progBuf;
1334 program = clCreateProgramWithSource(warper->context, 1,
1335 &pszProgBuf,
1336 nullptr, &err);
1337 handleErrGoto(err, error_final);
1338
1339 //Assemble the compiler arg string for speed. All invariants should be defined here.
1340 snprintf(buffer, PROGBUF_SIZE,
1341 "-cl-fast-relaxed-math -Werror -D FALSE=0 -D TRUE=1 "
1342 "%s"
1343 "-D iSrcWidth=%d -D iSrcHeight=%d -D iDstWidth=%d -D iDstHeight=%d "
1344 "-D useUnifiedSrcDensity=%d -D useUnifiedSrcValid=%d "
1345 "-D useDstDensity=%d -D useDstValid=%d -D useImag=%d "
1346 "-D fXScale=%015.15lff -D fYScale=%015.15lff -D fXFilter=%015.15lff -D fYFilter=%015.15lff "
1347 "-D nXRadius=%d -D nYRadius=%d -D nFiltInitX=%d -D nFiltInitY=%d "
1348 "-D PI=%015.15lff -D outType=%s -D dstMinVal=%015.15lff -D dstMaxVal=%015.15lff "
1349 "-D useDstNoDataReal=%d -D vecf=%s %s -D doCubicSpline=%d "
1350 "-D useUseBandSrcValid=%d -D iCoordMult=%d ",
1351 (warper->imageFormat == CL_FLOAT) ? "-D USE_CLAMP_TO_DST_FLOAT=1 " : "",
1352 warper->srcWidth, warper->srcHeight, warper->dstWidth, warper->dstHeight,
1353 warper->useUnifiedSrcDensity, warper->useUnifiedSrcValid,
1354 warper->useDstDensity, warper->useDstValid, warper->imagWorkCL != nullptr,
1355 dfXScale, dfYScale, dfXFilter, dfYFilter,
1356 nXRadius, nYRadius, nFiltInitX, nFiltInitY,
1357 M_PI, outType, dstMinVal, dstMaxVal, warper->fDstNoDataRealCL != nullptr,
1358 dVecf, dUseVec, warper->resampAlg == OCL_CubicSpline,
1359 warper->nBandSrcValidCL != nullptr, warper->coordMult);
1360
1361 (*clErr) = err = clBuildProgram(program, 1, &(warper->dev), buffer, nullptr, nullptr);
1362
1363 //Detailed debugging info
1364 if (err != CL_SUCCESS)
1365 {
1366 const char* pszStatus = "unknown_status";
1367 err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_LOG,
1368 128000*sizeof(char), buffer, nullptr);
1369 handleErrGoto(err, error_free_program);
1370
1371 CPLError(CE_Failure, CPLE_AppDefined, "Error: Failed to build program executable!\nBuild Log:\n%s", buffer);
1372
1373 err = clGetProgramBuildInfo(program, warper->dev, CL_PROGRAM_BUILD_STATUS,
1374 128000*sizeof(char), buffer, nullptr);
1375 handleErrGoto(err, error_free_program);
1376
1377 if(buffer[0] == CL_BUILD_NONE)
1378 pszStatus = "CL_BUILD_NONE";
1379 else if(buffer[0] == CL_BUILD_ERROR)
1380 pszStatus = "CL_BUILD_ERROR";
1381 else if(buffer[0] == CL_BUILD_SUCCESS)
1382 pszStatus = "CL_BUILD_SUCCESS";
1383 else if(buffer[0] == CL_BUILD_IN_PROGRESS)
1384 pszStatus = "CL_BUILD_IN_PROGRESS";
1385
1386 CPLDebug("OpenCL", "Build Status: %s\nProgram Source:\n%s", pszStatus, progBuf);
1387 goto error_free_program;
1388 }
1389
1390 kernel = clCreateKernel(program, "resamp", &err);
1391 handleErrGoto(err, error_free_program);
1392
1393 err = clReleaseProgram(program);
1394 handleErrGoto(err, error_final);
1395
1396 CPLFree(buffer);
1397 CPLFree(progBuf);
1398 return kernel;
1399
1400 error_free_program:
1401 err = clReleaseProgram(program);
1402
1403 error_final:
1404 CPLFree(buffer);
1405 CPLFree(progBuf);
1406 return nullptr;
1407 }
1408
1409 /*
1410 Alloc & copy the coordinate data from host working memory to the device. The
1411 working memory should be a pinned, linear, array of floats. This allows us to
1412 allocate and copy all data in one step. The pointer to the device memory is
1413 saved and set as the appropriate argument number.
1414
1415 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1416 */
1417 static
1418 cl_int set_coord_data (struct oclWarper *warper, cl_mem *xy)
1419 {
1420 cl_int err = CL_SUCCESS;
1421 cl_image_format imgFmt;
1422
1423 //Copy coord data to the device
1424 imgFmt.image_channel_order = warper->xyChOrder;
1425 imgFmt.image_channel_data_type = CL_FLOAT;
1426
1427 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1428 #pragma GCC diagnostic push
1429 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1430 #endif
1431 (*xy) = clCreateImage2D(warper->context,
1432 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1433 static_cast<size_t>(warper->xyWidth),
1434 static_cast<size_t>(warper->xyHeight),
1435 sizeof(float) * warper->xyChSize * warper->xyWidth,
1436 warper->xyWork, &err);
1437 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1438 #pragma GCC diagnostic pop
1439 #endif
1440 handleErr(err);
1441
1442 //Free the source memory, now that it's copied we don't need it
1443 freeCLMem(warper->xyWorkCL, warper->xyWork);
1444
1445 //Set up argument
1446 if (warper->kern1 != nullptr) {
1447 handleErr(err = clSetKernelArg(warper->kern1, 0, sizeof(cl_mem), xy));
1448 }
1449 if (warper->kern4 != nullptr) {
1450 handleErr(err = clSetKernelArg(warper->kern4, 0, sizeof(cl_mem), xy));
1451 }
1452
1453 return CL_SUCCESS;
1454 }
1455
1456 /*
1457 Sets the unified density & valid data structures. These are optional structures
1458 from GDAL, and as such if they are NULL a small placeholder memory segment is
1459 defined. This is because the spec is unclear on if a NULL value can be passed
1460 as a kernel argument in place of memory. If it's not NULL, the data is copied
1461 from the working memory to the device memory. After that, we check if we are
1462 using the per-band validity mask, and set that as appropriate. At the end, the
1463 CL mem is passed as the kernel arguments.
1464
1465 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1466 */
1467 static
1468 cl_int set_unified_data(struct oclWarper *warper,
1469 cl_mem *unifiedSrcDensityCL, cl_mem *unifiedSrcValidCL,
1470 float *unifiedSrcDensity, unsigned int *unifiedSrcValid,
1471 cl_mem *useBandSrcValidCL, cl_mem *nBandSrcValidCL)
1472 {
1473 cl_int err = CL_SUCCESS;
1474 size_t sz = warper->srcWidth * warper->srcHeight;
1475 int useValid = warper->nBandSrcValidCL != nullptr;
1476 //32 bits in the mask
1477 int validSz = static_cast<int>(sizeof(int) * ((31 + sz) >> 5));
1478
1479 //Copy unifiedSrcDensity if it exists
1480 if (unifiedSrcDensity == nullptr) {
1481 //Alloc dummy device RAM
1482 (*unifiedSrcDensityCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1483 handleErr(err);
1484 } else {
1485 //Alloc & copy all density data
1486 (*unifiedSrcDensityCL) = clCreateBuffer(warper->context,
1487 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1488 sizeof(float) * sz, unifiedSrcDensity, &err);
1489 handleErr(err);
1490 }
1491
1492 //Copy unifiedSrcValid if it exists
1493 if (unifiedSrcValid == nullptr) {
1494 //Alloc dummy device RAM
1495 (*unifiedSrcValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1496 handleErr(err);
1497 } else {
1498 //Alloc & copy all validity data
1499 (*unifiedSrcValidCL) = clCreateBuffer(warper->context,
1500 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1501 validSz, unifiedSrcValid, &err);
1502 handleErr(err);
1503 }
1504
1505 // Set the band validity usage
1506 if(useValid) {
1507 (*useBandSrcValidCL) = clCreateBuffer(warper->context,
1508 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1509 sizeof(char) * warper->numBands,
1510 warper->useBandSrcValid, &err);
1511 handleErr(err);
1512 } else {
1513 //Make a fake image so we don't have a NULL pointer
1514 (*useBandSrcValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1515 handleErr(err);
1516 }
1517
1518 //Do a more thorough check for validity
1519 if (useValid) {
1520 int i;
1521 useValid = FALSE;
1522 for (i = 0; i < warper->numBands; ++i)
1523 if (warper->useBandSrcValid[i])
1524 useValid = TRUE;
1525 }
1526
1527 //And the validity mask if needed
1528 if (useValid) {
1529 (*nBandSrcValidCL) = clCreateBuffer(warper->context,
1530 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1531 warper->numBands * validSz,
1532 warper->nBandSrcValid, &err);
1533 handleErr(err);
1534 } else {
1535 //Make a fake image so we don't have a NULL pointer
1536 (*nBandSrcValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1537 handleErr(err);
1538 }
1539
1540 //Set up arguments
1541 if (warper->kern1 != nullptr) {
1542 handleErr(err = clSetKernelArg(warper->kern1, 3, sizeof(cl_mem), unifiedSrcDensityCL));
1543 handleErr(err = clSetKernelArg(warper->kern1, 4, sizeof(cl_mem), unifiedSrcValidCL));
1544 handleErr(err = clSetKernelArg(warper->kern1, 5, sizeof(cl_mem), useBandSrcValidCL));
1545 handleErr(err = clSetKernelArg(warper->kern1, 6, sizeof(cl_mem), nBandSrcValidCL));
1546 }
1547 if (warper->kern4 != nullptr) {
1548 handleErr(err = clSetKernelArg(warper->kern4, 3, sizeof(cl_mem), unifiedSrcDensityCL));
1549 handleErr(err = clSetKernelArg(warper->kern4, 4, sizeof(cl_mem), unifiedSrcValidCL));
1550 handleErr(err = clSetKernelArg(warper->kern4, 5, sizeof(cl_mem), useBandSrcValidCL));
1551 handleErr(err = clSetKernelArg(warper->kern4, 6, sizeof(cl_mem), nBandSrcValidCL));
1552 }
1553
1554 return CL_SUCCESS;
1555 }
1556
1557 /*
1558 Here we set the per-band raster data. First priority is the real raster data,
1559 of course. Then, if applicable, we set the additional image channel. Once this
1560 data is copied to the device, it can be freed on the host, so that is done
1561 here. Finally the appropriate kernel arguments are set.
1562
1563 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1564 */
1565 static
1566 cl_int set_src_rast_data (struct oclWarper *warper, int iNum, size_t sz,
1567 cl_channel_order chOrder, cl_mem *srcReal, cl_mem *srcImag)
1568 {
1569 cl_image_format imgFmt;
1570 cl_int err = CL_SUCCESS;
1571 int useImagWork = warper->imagWork.v != nullptr && warper->imagWork.v[iNum] != nullptr;
1572
1573 //Set up image vars
1574 imgFmt.image_channel_order = chOrder;
1575 imgFmt.image_channel_data_type = warper->imageFormat;
1576
1577 //Create & copy the source image
1578 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1579 #pragma GCC diagnostic push
1580 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1581 #endif
1582
1583 (*srcReal) = clCreateImage2D(warper->context,
1584 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1585 static_cast<size_t>(warper->srcWidth),
1586 static_cast<size_t>(warper->srcHeight),
1587 sz * warper->srcWidth, warper->realWork.v[iNum], &err);
1588 handleErr(err);
1589
1590 //And the source image parts if needed
1591 if (useImagWork) {
1592 (*srcImag) = clCreateImage2D(warper->context,
1593 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1594 static_cast<size_t>(warper->srcWidth),
1595 static_cast<size_t>(warper->srcHeight),
1596 sz * warper->srcWidth, warper->imagWork.v[iNum], &err);
1597 handleErr(err);
1598 } else {
1599 //Make a fake image so we don't have a NULL pointer
1600
1601 char dummyImageData[16];
1602 (*srcImag) = clCreateImage2D(warper->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,
1603 1, 1, sz, dummyImageData, &err);
1604
1605 handleErr(err);
1606 }
1607 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
1608 #pragma GCC diagnostic pop
1609 #endif
1610
1611 //Free the source memory, now that it's copied we don't need it
1612 freeCLMem(warper->realWorkCL[iNum], warper->realWork.v[iNum]);
1613 if (warper->imagWork.v != nullptr) {
1614 freeCLMem(warper->imagWorkCL[iNum], warper->imagWork.v[iNum]);
1615 }
1616
1617 //Set up per-band arguments
1618 if (warper->kern1 != nullptr) {
1619 handleErr(err = clSetKernelArg(warper->kern1, 1, sizeof(cl_mem), srcReal));
1620 handleErr(err = clSetKernelArg(warper->kern1, 2, sizeof(cl_mem), srcImag));
1621 }
1622 if (warper->kern4 != nullptr) {
1623 handleErr(err = clSetKernelArg(warper->kern4, 1, sizeof(cl_mem), srcReal));
1624 handleErr(err = clSetKernelArg(warper->kern4, 2, sizeof(cl_mem), srcImag));
1625 }
1626
1627 return CL_SUCCESS;
1628 }
1629
1630 /*
1631 Set the destination data for the raster. Although it's the output, it still
1632 is copied to the device because some blending is done there. First the real
1633 data is allocated and copied, then the imag data is allocated and copied if
1634 needed. They are then set as the appropriate arguments to the kernel.
1635
1636 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1637 */
1638 static
1639 cl_int set_dst_rast_data(struct oclWarper *warper, int iImg, size_t sz,
1640 cl_mem *dstReal, cl_mem *dstImag)
1641 {
1642 cl_int err = CL_SUCCESS;
1643 sz *= warper->dstWidth * warper->dstHeight;
1644
1645 //Copy the dst real data
1646 (*dstReal) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1647 sz, warper->dstRealWork.v[iImg], &err);
1648 handleErr(err);
1649
1650 //Copy the dst imag data if exists
1651 if (warper->dstImagWork.v != nullptr && warper->dstImagWork.v[iImg] != nullptr) {
1652 (*dstImag) = clCreateBuffer(warper->context,
1653 CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1654 sz, warper->dstImagWork.v[iImg], &err);
1655 handleErr(err);
1656 } else {
1657 (*dstImag) = clCreateBuffer(warper->context, CL_MEM_READ_WRITE, 1, nullptr, &err);
1658 handleErr(err);
1659 }
1660
1661 //Set up per-band arguments
1662 if (warper->kern1 != nullptr) {
1663 handleErr(err = clSetKernelArg(warper->kern1, 7, sizeof(cl_mem), dstReal));
1664 handleErr(err = clSetKernelArg(warper->kern1, 8, sizeof(cl_mem), dstImag));
1665 }
1666 if (warper->kern4 != nullptr) {
1667 handleErr(err = clSetKernelArg(warper->kern4, 7, sizeof(cl_mem), dstReal));
1668 handleErr(err = clSetKernelArg(warper->kern4, 8, sizeof(cl_mem), dstImag));
1669 }
1670
1671 return CL_SUCCESS;
1672 }
1673
1674 /*
1675 Read the final raster data back from the graphics card to working memory. This
1676 copies both the real memory and the imag memory if appropriate.
1677
1678 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1679 */
1680 static
1681 cl_int get_dst_rast_data(struct oclWarper *warper, int iImg, size_t wordSz,
1682 cl_mem dstReal, cl_mem dstImag)
1683 {
1684 cl_int err = CL_SUCCESS;
1685 size_t sz = warper->dstWidth * warper->dstHeight * wordSz;
1686
1687 //Copy from dev into working memory
1688 handleErr(err = clEnqueueReadBuffer(warper->queue, dstReal,
1689 CL_FALSE, 0, sz, warper->dstRealWork.v[iImg],
1690 0, nullptr, nullptr));
1691
1692 //If we are expecting the imag channel, then copy it back also
1693 if (warper->dstImagWork.v != nullptr && warper->dstImagWork.v[iImg] != nullptr) {
1694 handleErr(err = clEnqueueReadBuffer(warper->queue, dstImag,
1695 CL_FALSE, 0, sz, warper->dstImagWork.v[iImg],
1696 0, nullptr, nullptr));
1697 }
1698
1699 //The copy requests were non-blocking, so we'll need to make sure they finish.
1700 handleErr(err = clFinish(warper->queue));
1701
1702 return CL_SUCCESS;
1703 }
1704
1705 /*
1706 Set the destination image density & validity mask on the device. This is used
1707 to blend the final output image with the existing buffer. This handles the
1708 unified structures that apply to all bands. After the buffers are created and
1709 copied, they are set as kernel arguments.
1710
1711 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1712 */
1713 static
1714 cl_int set_dst_data(struct oclWarper *warper,
1715 cl_mem *dstDensityCL, cl_mem *dstValidCL, cl_mem *dstNoDataRealCL,
1716 float *dstDensity, unsigned int *dstValid, float *dstNoDataReal)
1717 {
1718 cl_int err = CL_SUCCESS;
1719 size_t sz = warper->dstWidth * warper->dstHeight;
1720
1721 //Copy the no-data value(s)
1722 if (dstNoDataReal == nullptr) {
1723 (*dstNoDataRealCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1724 handleErr(err);
1725 } else {
1726 (*dstNoDataRealCL) = clCreateBuffer(warper->context,
1727 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1728 sizeof(float) * warper->numBands, dstNoDataReal, &err);
1729 handleErr(err);
1730 }
1731
1732 //Copy unifiedSrcDensity if it exists
1733 if (dstDensity == nullptr) {
1734 (*dstDensityCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1735 handleErr(err);
1736 } else {
1737 (*dstDensityCL) = clCreateBuffer(warper->context,
1738 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1739 sizeof(float) * sz, dstDensity, &err);
1740 handleErr(err);
1741 }
1742
1743 //Copy unifiedSrcValid if it exists
1744 if (dstValid == nullptr) {
1745 (*dstValidCL) = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, nullptr, &err);
1746 handleErr(err);
1747 } else {
1748 (*dstValidCL) = clCreateBuffer(warper->context,
1749 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
1750 sizeof(int) * ((31 + sz) >> 5), dstValid, &err);
1751 handleErr(err);
1752 }
1753
1754 //Set up arguments
1755 if (warper->kern1 != nullptr) {
1756 handleErr(err = clSetKernelArg(warper->kern1, 9, sizeof(cl_mem), dstNoDataRealCL));
1757 handleErr(err = clSetKernelArg(warper->kern1, 10, sizeof(cl_mem), dstDensityCL));
1758 handleErr(err = clSetKernelArg(warper->kern1, 11, sizeof(cl_mem), dstValidCL));
1759 }
1760 if (warper->kern4 != nullptr) {
1761 handleErr(err = clSetKernelArg(warper->kern4, 9, sizeof(cl_mem), dstNoDataRealCL));
1762 handleErr(err = clSetKernelArg(warper->kern4, 10, sizeof(cl_mem), dstDensityCL));
1763 handleErr(err = clSetKernelArg(warper->kern4, 11, sizeof(cl_mem), dstValidCL));
1764 }
1765
1766 return CL_SUCCESS;
1767 }
1768
1769 /*
1770 Go ahead and execute the kernel. This handles some housekeeping stuff like the
1771 run dimensions. When running in debug mode, it times the kernel call and prints
1772 the execution time.
1773
1774 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1775 */
1776 static
1777 cl_int execute_kern(struct oclWarper *warper, cl_kernel kern, size_t loc_size)
1778 {
1779 cl_int err = CL_SUCCESS;
1780 cl_event ev;
1781 size_t ceil_runs[2];
1782 size_t group_size[2];
1783 #ifdef DEBUG_OPENCL
1784 size_t start_time = 0;
1785 size_t end_time;
1786 const char *vecTxt = "";
1787 #endif
1788
1789 // Use a likely X-dimension which is a power of 2
1790 if (loc_size >= 512)
1791 group_size[0] = 32;
1792 else if (loc_size >= 64)
1793 group_size[0] = 16;
1794 else if (loc_size > 8)
1795 group_size[0] = 8;
1796 else
1797 group_size[0] = 1;
1798
1799 if (group_size[0] > loc_size)
1800 group_size[1] = group_size[0]/loc_size;
1801 else
1802 group_size[1] = 1;
1803
1804 //Round up num_runs to find the dim of the block of pixels we'll be processing
1805 if(warper->dstWidth % group_size[0])
1806 ceil_runs[0] = warper->dstWidth + group_size[0] - warper->dstWidth % group_size[0];
1807 else
1808 ceil_runs[0] = warper->dstWidth;
1809
1810 if(warper->dstHeight % group_size[1])
1811 ceil_runs[1] = warper->dstHeight + group_size[1] - warper->dstHeight % group_size[1];
1812 else
1813 ceil_runs[1] = warper->dstHeight;
1814
1815 #ifdef DEBUG_OPENCL
1816 handleErr(err = clSetCommandQueueProperty(warper->queue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, nullptr));
1817 #endif
1818
1819 // Run the calculation by enqueuing it and forcing the
1820 // command queue to complete the task
1821 handleErr(err = clEnqueueNDRangeKernel(warper->queue, kern, 2, nullptr,
1822 ceil_runs, group_size, 0, nullptr, &ev));
1823 handleErr(err = clFinish(warper->queue));
1824
1825 #ifdef DEBUG_OPENCL
1826 handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
1827 sizeof(size_t), &start_time, nullptr));
1828 handleErr(err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
1829 sizeof(size_t), &end_time, nullptr));
1830 assert(end_time != 0);
1831 assert(start_time != 0);
1832 if (kern == warper->kern4)
1833 vecTxt = "(vec)";
1834
1835 CPLDebug("OpenCL", "Kernel Time: %6s %10lu", vecTxt, static_cast<long int>((end_time-start_time)/100000));
1836 #endif
1837
1838 handleErr(err = clReleaseEvent(ev));
1839 return CL_SUCCESS;
1840 }
1841
1842 /*
1843 Copy data from a raw source to the warper's working memory. If the imag
1844 channel is expected, then the data will be de-interlaced into component blocks
1845 of memory.
1846
1847 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
1848 */
1849 static
1850 cl_int set_img_data(struct oclWarper *warper, void *srcImgData,
1851 unsigned int width, unsigned int height, int isSrc,
1852 unsigned int bandNum, void **dstRealImgs, void **dstImagImgs)
1853 {
1854 unsigned int imgChSize = warper->imgChSize1;
1855 unsigned int iSrcY, i;
1856 unsigned int vecOff = 0;
1857 unsigned int imgNum = bandNum;
1858 void *dstReal = nullptr;
1859 void *dstImag = nullptr;
1860
1861 // Handle vector if needed
1862 if (warper->useVec && static_cast<int>(bandNum) < warper->numBands - warper->numBands % 4) {
1863 imgChSize = warper->imgChSize4;
1864 vecOff = bandNum % 4;
1865 imgNum = bandNum / 4;
1866 } else if(warper->useVec) {
1867 imgNum = bandNum / 4 + bandNum % 4;
1868 }
1869
1870 // Set the images as needed
1871 dstReal = dstRealImgs[imgNum];
1872 if(dstImagImgs == nullptr)
1873 dstImag = nullptr;
1874 else
1875 dstImag = dstImagImgs[imgNum];
1876
1877 // Set stuff for dst imgs
1878 if (!isSrc) {
1879 vecOff *= height * width;
1880 imgChSize = 1;
1881 }
1882
1883 // Copy values as needed
1884 if (warper->imagWorkCL == nullptr && !(warper->useVec && isSrc)) {
1885 //Set memory size & location depending on the data type
1886 //This is the ideal code path for speed
1887 switch (warper->imageFormat) {
1888 case CL_UNORM_INT8:
1889 {
1890 unsigned char *realDst = &((static_cast<unsigned char *>(dstReal))[vecOff]);
1891 memcpy(realDst, srcImgData, width*height*sizeof(unsigned char));
1892 break;
1893 }
1894 case CL_SNORM_INT8:
1895 {
1896 char *realDst = &((static_cast<char *>(dstReal))[vecOff]);
1897 memcpy(realDst, srcImgData, width*height*sizeof(char));
1898 break;
1899 }
1900 case CL_UNORM_INT16:
1901 {
1902 unsigned short *realDst = &((static_cast<unsigned short *>(dstReal))[vecOff]);
1903 memcpy(realDst, srcImgData, width*height*sizeof(unsigned short));
1904 break;
1905 }
1906 case CL_SNORM_INT16:
1907 {
1908 short *realDst = &((static_cast<short *>(dstReal))[vecOff]);
1909 memcpy(realDst, srcImgData, width*height*sizeof(short));
1910 break;
1911 }
1912 case CL_FLOAT:
1913 {
1914 float *realDst = &((static_cast<float *>(dstReal))[vecOff]);
1915 memcpy(realDst, srcImgData, width*height*sizeof(float));
1916 break;
1917 }
1918 }
1919 } else if (warper->imagWorkCL == nullptr) {
1920 //We need to space the values due to OpenCL implementation reasons
1921 for( iSrcY = 0; iSrcY < height; iSrcY++ )
1922 {
1923 int pxOff = width*iSrcY;
1924 int imgOff = imgChSize*pxOff + vecOff;
1925 //Copy & deinterleave interleaved data
1926 switch (warper->imageFormat) {
1927 case CL_UNORM_INT8:
1928 {
1929 unsigned char *realDst = &((static_cast<unsigned char *>(dstReal))[imgOff]);
1930 unsigned char *dataSrc = &((static_cast<unsigned char *>(srcImgData))[pxOff]);
1931 for (i = 0; i < width; ++i)
1932 realDst[imgChSize*i] = dataSrc[i];
1933 }
1934 break;
1935 case CL_SNORM_INT8:
1936 {
1937 char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
1938 char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
1939 for (i = 0; i < width; ++i)
1940 realDst[imgChSize*i] = dataSrc[i];
1941 }
1942 break;
1943 case CL_UNORM_INT16:
1944 {
1945 unsigned short *realDst = &((static_cast<unsigned short *>(dstReal))[imgOff]);
1946 unsigned short *dataSrc = &((static_cast<unsigned short *>(srcImgData))[pxOff]);
1947 for (i = 0; i < width; ++i)
1948 realDst[imgChSize*i] = dataSrc[i];
1949 }
1950 break;
1951 case CL_SNORM_INT16:
1952 {
1953 short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
1954 short *dataSrc = &((static_cast<short *>(srcImgData))[pxOff]);
1955 for (i = 0; i < width; ++i)
1956 realDst[imgChSize*i] = dataSrc[i];
1957 }
1958 break;
1959 case CL_FLOAT:
1960 {
1961 float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
1962 float *dataSrc = &((static_cast<float *>(srcImgData))[pxOff]);
1963 for (i = 0; i < width; ++i)
1964 realDst[imgChSize*i] = dataSrc[i];
1965 }
1966 break;
1967 }
1968 }
1969 } else {
1970 //Copy, deinterleave, & space interleaved data
1971 for( iSrcY = 0; iSrcY < height; iSrcY++ )
1972 {
1973 int pxOff = width*iSrcY;
1974 int imgOff = imgChSize*pxOff + vecOff;
1975 //Copy & deinterleave interleaved data
1976 switch (warper->imageFormat) {
1977 case CL_FLOAT:
1978 {
1979 float *realDst = &((static_cast<float *>(dstReal))[imgOff]);
1980 float *imagDst = &((static_cast<float *>(dstImag))[imgOff]);
1981 float *dataSrc = &((static_cast<float *>(srcImgData))[pxOff]);
1982 for (i = 0; i < width; ++i) {
1983 realDst[imgChSize*i] = dataSrc[i*2 ];
1984 imagDst[imgChSize*i] = dataSrc[i*2+1];
1985 }
1986 }
1987 break;
1988 case CL_SNORM_INT8:
1989 {
1990 char *realDst = &((static_cast<char *>(dstReal))[imgOff]);
1991 char *imagDst = &((static_cast<char *>(dstImag))[imgOff]);
1992 char *dataSrc = &((static_cast<char *>(srcImgData))[pxOff]);
1993 for (i = 0; i < width; ++i) {
1994 realDst[imgChSize*i] = dataSrc[i*2 ];
1995 imagDst[imgChSize*i] = dataSrc[i*2+1];
1996 }
1997 }
1998 break;
1999 case CL_UNORM_INT8:
2000 {
2001 unsigned char *realDst = &((static_cast<unsigned char *>(dstReal))[imgOff]);
2002 unsigned char *imagDst = &((static_cast<unsigned char *>(dstImag))[imgOff]);
2003 unsigned char *dataSrc = &((static_cast<unsigned char *>(srcImgData))[pxOff]);
2004 for (i = 0; i < width; ++i) {
2005 realDst[imgChSize*i] = dataSrc[i*2 ];
2006 imagDst[imgChSize*i] = dataSrc[i*2+1];
2007 }
2008 }
2009 break;
2010 case CL_SNORM_INT16:
2011 {
2012 short *realDst = &((static_cast<short *>(dstReal))[imgOff]);
2013 short *imagDst = &((static_cast<short *>(dstImag))[imgOff]);
2014 short *dataSrc = &((static_cast<short *>(srcImgData))[pxOff]);
2015 for (i = 0; i < width; ++i) {
2016 realDst[imgChSize*i] = dataSrc[i*2 ];
2017 imagDst[imgChSize*i] = dataSrc[i*2+1];
2018 }
2019 }
2020 break;
2021 case CL_UNORM_INT16:
2022 {
2023 unsigned short *realDst = &((static_cast<unsigned short *>(dstReal))[imgOff]);
2024 unsigned short *imagDst = &((static_cast<unsigned short *>(dstImag))[imgOff]);
2025 unsigned short *dataSrc = &((static_cast<unsigned short *>(srcImgData))[pxOff]);
2026 for (i = 0; i < width; ++i) {
2027 realDst[imgChSize*i] = dataSrc[i*2 ];
2028 imagDst[imgChSize*i] = dataSrc[i*2+1];
2029 }
2030 }
2031 break;
2032 }
2033 }
2034 }
2035
2036 return CL_SUCCESS;
2037 }
2038
2039 /*
2040 Creates the struct which inits & contains the OpenCL context & environment.
2041 Inits wired(?) space to buffer the image in host RAM. Chooses the OpenCL
2042 device, perhaps the user can choose it later? This would also choose the
2043 appropriate OpenCL image format (R, RG, RGBA, or multiples thereof). Space
2044 for metadata can be allocated as required, though.
2045
2046 Supported image formats are:
2047 CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT16
2048 32-bit int formats won't keep precision when converted to floats internally
2049 and doubles are generally not supported on the GPU image formats.
2050 */
2051 struct oclWarper* GDALWarpKernelOpenCL_createEnv(int srcWidth, int srcHeight,
2052 int dstWidth, int dstHeight,
2053 cl_channel_type imageFormat,
2054 int numBands, int coordMult,
2055 int useImag, int useBandSrcValid,
2056 CPL_UNUSED float *fDstDensity,
2057 double *dfDstNoDataReal,
2058 OCLResampAlg resampAlg, cl_int *clErr)
2059 {
2060 struct oclWarper *warper;
2061 int i;
2062 size_t maxWidth = 0, maxHeight = 0;
2063 cl_int err = CL_SUCCESS;
2064 size_t fmtSize, sz;
2065 cl_device_id device;
2066 cl_bool bool_flag;
2067 OCLVendor eCLVendor = VENDOR_OTHER;
2068
2069 // Do we have a suitable OpenCL device?
2070 device = get_device(&eCLVendor);
2071 if( device == nullptr )
2072 return nullptr;
2073
2074 err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
2075 sizeof(cl_bool), &bool_flag, &sz);
2076 if( err != CL_SUCCESS || !bool_flag )
2077 {
2078 CPLDebug( "OpenCL", "No image support on selected device." );
2079 return nullptr;
2080 }
2081
2082 // Set up warper environment.
2083 warper = static_cast<struct oclWarper *>(CPLCalloc(1, sizeof(struct oclWarper)));
2084
2085 warper->eCLVendor = eCLVendor;
2086
2087 //Init passed vars
2088 warper->srcWidth = srcWidth;
2089 warper->srcHeight = srcHeight;
2090 warper->dstWidth = dstWidth;
2091 warper->dstHeight = dstHeight;
2092
2093 warper->coordMult = coordMult;
2094 warper->numBands = numBands;
2095 warper->imageFormat = imageFormat;
2096 warper->resampAlg = resampAlg;
2097
2098 warper->useUnifiedSrcDensity = FALSE;
2099 warper->useUnifiedSrcValid = FALSE;
2100 warper->useDstDensity = FALSE;
2101 warper->useDstValid = FALSE;
2102
2103 warper->imagWorkCL = nullptr;
2104 warper->dstImagWorkCL = nullptr;
2105 warper->useBandSrcValidCL = nullptr;
2106 warper->useBandSrcValid = nullptr;
2107 warper->nBandSrcValidCL = nullptr;
2108 warper->nBandSrcValid = nullptr;
2109 warper->fDstNoDataRealCL = nullptr;
2110 warper->fDstNoDataReal = nullptr;
2111 warper->kern1 = nullptr;
2112 warper->kern4 = nullptr;
2113
2114 warper->dev = device;
2115
2116 warper->context = clCreateContext(nullptr, 1, &(warper->dev), nullptr, nullptr, &err);
2117 handleErrGoto(err, error_label);
2118 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2119 #pragma GCC diagnostic push
2120 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2121 #endif
2122 warper->queue = clCreateCommandQueue(warper->context, warper->dev, 0, &err);
2123 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || defined(__clang__)
2124 #pragma GCC diagnostic pop
2125 #endif
2126 handleErrGoto(err, error_label);
2127
2128 //Ensure that we hand handle imagery of these dimensions
2129 err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &maxWidth, &sz);
2130 handleErrGoto(err, error_label);
2131 err = clGetDeviceInfo(warper->dev, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &maxHeight, &sz);
2132 handleErrGoto(err, error_label);
2133 if (maxWidth < static_cast<size_t>(srcWidth) ||
2134 maxHeight < static_cast<size_t>(srcHeight)) {
2135 err = CL_INVALID_IMAGE_SIZE;
2136 handleErrGoto(err, error_label);
2137 }
2138
2139 // Split bands into sets of four when possible
2140 // Cubic runs slower as vector, so don't use it (probably register pressure)
2141 // Feel free to do more testing and come up with more precise case statements
2142 if(numBands < 4 || resampAlg == OCL_Cubic) {
2143 warper->numImages = numBands;
2144 warper->useVec = FALSE;
2145 } else {
2146 warper->numImages = numBands/4 + numBands % 4;
2147 warper->useVec = TRUE;
2148 }
2149
2150 //Make the pointer space for the real images
2151 warper->realWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2152 warper->dstRealWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2153
2154 //Make space for the per-channel Imag data (if exists)
2155 if (useImag) {
2156 warper->imagWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2157 warper->dstImagWorkCL = static_cast<cl_mem*>(CPLCalloc(sizeof(cl_mem), warper->numImages));
2158 }
2159
2160 //Make space for the per-band BandSrcValid data (if exists)
2161 if (useBandSrcValid) {
2162 //32 bits in the mask
2163 sz = warper->numBands * ((31 + warper->srcWidth * warper->srcHeight) >> 5);
2164
2165 //Allocate some space for the validity of the validity mask
2166 err = alloc_pinned_mem(warper, 0, warper->numBands*sizeof(char),
2167 reinterpret_cast<void **>(&(warper->useBandSrcValid)),
2168 &(warper->useBandSrcValidCL));
2169 handleErrGoto(err, error_label);
2170
2171 for (i = 0; i < warper->numBands; ++i)
2172 warper->useBandSrcValid[i] = FALSE;
2173
2174 // Allocate one array for all the band validity masks.
2175 // Remember that the masks don't use much memory (they're bitwise).
2176 err = alloc_pinned_mem(warper, 0, sz * sizeof(int),
2177 reinterpret_cast<void **>(&(warper->nBandSrcValid)),
2178 &(warper->nBandSrcValidCL));
2179 handleErrGoto(err, error_label);
2180 }
2181
2182 //Make space for the per-band
2183 if (dfDstNoDataReal != nullptr) {
2184 alloc_pinned_mem(warper, 0, warper->numBands,
2185 reinterpret_cast<void **>(&(warper->fDstNoDataReal)),
2186 &(warper->fDstNoDataRealCL));
2187
2188 //Copy over values
2189 for (i = 0; i < warper->numBands; ++i)
2190 warper->fDstNoDataReal[i] = static_cast<float>(dfDstNoDataReal[i]);
2191 }
2192
2193 //Alloc working host image memory
2194 //We'll be copying into these buffers soon
2195 switch (imageFormat) {
2196 case CL_FLOAT:
2197 err = alloc_working_arr(warper, sizeof(float *), sizeof(float), &fmtSize);
2198 break;
2199 case CL_SNORM_INT8:
2200 err = alloc_working_arr(warper, sizeof(char *), sizeof(char), &fmtSize);
2201 break;
2202 case CL_UNORM_INT8:
2203 err = alloc_working_arr(warper, sizeof(unsigned char *), sizeof(unsigned char), &fmtSize);
2204 break;
2205 case CL_SNORM_INT16:
2206 err = alloc_working_arr(warper, sizeof(short *), sizeof(short), &fmtSize);
2207 break;
2208 case CL_UNORM_INT16:
2209 err = alloc_working_arr(warper, sizeof(unsigned short *), sizeof(unsigned short), &fmtSize);
2210 break;
2211 }
2212 handleErrGoto(err, error_label);
2213
2214 // Find a good & compatible image channel order for the Lat/Long array.
2215 err = set_supported_formats(warper, 2,
2216 &(warper->xyChOrder), &(warper->xyChSize),
2217 CL_FLOAT);
2218 handleErrGoto(err, error_label);
2219
2220 //Set coordinate image dimensions
2221 warper->xyWidth = static_cast<int>(ceil((static_cast<float>(warper->dstWidth) + static_cast<float>(warper->coordMult)-1)/static_cast<float>(warper->coordMult)));
2222 warper->xyHeight = static_cast<int>(ceil((static_cast<float>(warper->dstHeight) + static_cast<float>(warper->coordMult)-1)/static_cast<float>(warper->coordMult)));
2223
2224 //Alloc coord memory
2225 sz = sizeof(float) * warper->xyChSize * warper->xyWidth * warper->xyHeight;
2226 err = alloc_pinned_mem(warper, 0, sz, reinterpret_cast<void **>(&(warper->xyWork)),
2227 &(warper->xyWorkCL));
2228 handleErrGoto(err, error_label);
2229
2230 //Ensure everything is finished allocating, copying, & mapping
2231 err = clFinish(warper->queue);
2232 handleErrGoto(err, error_label);
2233
2234 (*clErr) = CL_SUCCESS;
2235 return warper;
2236
2237 error_label:
2238 GDALWarpKernelOpenCL_deleteEnv(warper);
2239 return nullptr;
2240 }
2241
2242 /*
2243 Copy the validity mask for an image band to the warper.
2244
2245 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2246 */
2247 cl_int GDALWarpKernelOpenCL_setSrcValid(struct oclWarper *warper,
2248 int *bandSrcValid, int bandNum)
2249 {
2250 //32 bits in the mask
2251 int stride = (31 + warper->srcWidth * warper->srcHeight) >> 5;
2252
2253 //Copy bandSrcValid
2254 assert(warper->nBandSrcValid != nullptr);
2255 memcpy(&(warper->nBandSrcValid[bandNum*stride]), bandSrcValid, sizeof(int) * stride);
2256 warper->useBandSrcValid[bandNum] = TRUE;
2257
2258 return CL_SUCCESS;
2259 }
2260
2261 /*
2262 Sets the source image real & imag into the host memory so that it is
2263 permuted (ex. RGBA) for better graphics card access.
2264
2265 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2266 */
2267 cl_int GDALWarpKernelOpenCL_setSrcImg(struct oclWarper *warper, void *imgData,
2268 int bandNum)
2269 {
2270 void **imagWorkPtr = nullptr;
2271
2272 if (warper->imagWorkCL != nullptr)
2273 imagWorkPtr = warper->imagWork.v;
2274
2275 return set_img_data(warper, imgData, warper->srcWidth, warper->srcHeight,
2276 TRUE, bandNum, warper->realWork.v, imagWorkPtr);
2277 }
2278
2279 /*
2280 Sets the destination image real & imag into the host memory so that it is
2281 permuted (ex. RGBA) for better graphics card access.
2282
2283 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2284 */
2285 cl_int GDALWarpKernelOpenCL_setDstImg(struct oclWarper *warper, void *imgData,
2286 int bandNum)
2287 {
2288 void **dstImagWorkPtr = nullptr;
2289
2290 if (warper->dstImagWorkCL != nullptr)
2291 dstImagWorkPtr = warper->dstImagWork.v;
2292
2293 return set_img_data(warper, imgData, warper->dstWidth, warper->dstHeight,
2294 FALSE, bandNum, warper->dstRealWork.v, dstImagWorkPtr);
2295 }
2296
2297 /*
2298 Inputs the source coordinates for a row of the destination pixels. Invalid
2299 coordinates are set as -99.0, which should be out of the image bounds. Sets
2300 the coordinates as ready to be used in OpenCL image memory: interleaved and
2301 minus the offset. By using image memory, we can use a smaller texture for
2302 coordinates and use OpenCL's built-in interpolation to save memory.
2303
2304 What it does: generates a smaller matrix of X/Y coordinate transformation
2305 values from an original matrix. When bilinearly sampled in the GPU hardware,
2306 the generated values are as close as possible to the original matrix.
2307
2308 Complication: matrices have arbitrary dimensions and the sub-sampling factor
2309 is an arbitrary integer greater than zero. Getting the edge cases right is
2310 difficult.
2311
2312 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2313 */
2314 cl_int GDALWarpKernelOpenCL_setCoordRow(struct oclWarper *warper,
2315 double *rowSrcX, double *rowSrcY,
2316 double srcXOff, double srcYOff,
2317 int *success, int rowNum)
2318 {
2319 int coordMult = warper->coordMult;
2320 int width = warper->dstWidth;
2321 int height = warper->dstHeight;
2322 int xyWidth = warper->xyWidth;
2323 int i;
2324 int xyChSize = warper->xyChSize;
2325 float *xyPtr, *xyPrevPtr = nullptr;
2326 int lastRow = rowNum == height - 1;
2327 double dstHeightMod = 1.0, dstWidthMod = 1.0;
2328
2329 //Return if we're at an off row
2330 if(!lastRow && rowNum % coordMult != 0)
2331 return CL_SUCCESS;
2332
2333 //Standard row, adjusted for the skipped rows
2334 xyPtr = &(warper->xyWork[xyWidth * xyChSize * rowNum / coordMult]);
2335
2336 //Find our row
2337 if(lastRow){
2338 //Setup for the final row
2339 xyPtr = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 1)]);
2340 xyPrevPtr = &(warper->xyWork[xyWidth * xyChSize * (warper->xyHeight - 2)]);
2341
2342 if((height-1) % coordMult)
2343 dstHeightMod = static_cast<double>(coordMult) / static_cast<double>((height-1) % coordMult);
2344 }
2345
2346 //Copy selected coordinates
2347 for (i = 0; i < width; i += coordMult) {
2348 if (success[i]) {
2349 xyPtr[0] = static_cast<float>(rowSrcX[i] - srcXOff);
2350 xyPtr[1] = static_cast<float>(rowSrcY[i] - srcYOff);
2351
2352 if(lastRow) {
2353 //Adjust bottom row so interpolator returns correct value
2354 xyPtr[0] = static_cast<float>(dstHeightMod * (xyPtr[0] - xyPrevPtr[0]) + xyPrevPtr[0]);
2355 xyPtr[1] = static_cast<float>(dstHeightMod * (xyPtr[1] - xyPrevPtr[1]) + xyPrevPtr[1]);
2356 }
2357 } else {
2358 xyPtr[0] = -99.0f;
2359 xyPtr[1] = -99.0f;
2360 }
2361
2362 xyPtr += xyChSize;
2363 xyPrevPtr += xyChSize;
2364 }
2365
2366 //Copy remaining coordinate
2367 if((width-1) % coordMult){
2368 dstWidthMod = static_cast<double>(coordMult) / static_cast<double>((width-1) % coordMult);
2369 xyPtr -= xyChSize;
2370 xyPrevPtr -= xyChSize;
2371 } else {
2372 xyPtr -= xyChSize*2;
2373 xyPrevPtr -= xyChSize*2;
2374 }
2375
2376 if(lastRow) {
2377 double origX = rowSrcX[width-1] - srcXOff;
2378 double origY = rowSrcY[width-1] - srcYOff;
2379 double a = 1.0, b = 1.0;
2380
2381 // Calculate the needed x/y values using an equation from the OpenCL Spec
2382 // section 8.2, solving for Ti1j1
2383 if((width -1) % coordMult)
2384 a = ((width -1) % coordMult)/static_cast<double>(coordMult);
2385
2386 if((height-1) % coordMult)
2387 b = ((height-1) % coordMult)/static_cast<double>(coordMult);
2388
2389 xyPtr[xyChSize ] = static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[0]
2390 + a * (1.0 - b) * xyPrevPtr[xyChSize]
2391 + (1.0 - a) * b * xyPtr[0]) - origX)/(-a * b));
2392
2393 xyPtr[xyChSize+1] = static_cast<float>((((1.0 - a) * (1.0 - b) * xyPrevPtr[1]
2394 + a * (1.0 - b) * xyPrevPtr[xyChSize+1]
2395 + (1.0 - a) * b * xyPtr[1]) - origY)/(-a * b));
2396 } else {
2397 //Adjust last coordinate so interpolator returns correct value
2398 xyPtr[xyChSize ] = static_cast<float>(dstWidthMod * (rowSrcX[width-1] - srcXOff - xyPtr[0]) + xyPtr[0]);
2399 xyPtr[xyChSize+1] = static_cast<float>(dstWidthMod * (rowSrcY[width-1] - srcYOff - xyPtr[1]) + xyPtr[1]);
2400 }
2401
2402 return CL_SUCCESS;
2403 }
2404
2405 /*
2406 Copies all data to the device RAM, frees the host RAM, runs the
2407 appropriate resampling kernel, mallocs output space, & copies the data
2408 back from the device RAM for each band. Also check to make sure that
2409 setRow*() was called the appropriate number of times to init all image
2410 data.
2411
2412 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2413 */
2414 cl_int GDALWarpKernelOpenCL_runResamp(struct oclWarper *warper,
2415 float *unifiedSrcDensity,
2416 unsigned int *unifiedSrcValid,
2417 float *dstDensity,
2418 unsigned int *dstValid,
2419 double dfXScale, double dfYScale,
2420 double dfXFilter, double dfYFilter,
2421 int nXRadius, int nYRadius,
2422 int nFiltInitX, int nFiltInitY)
2423 {
2424 int i, nextBandNum = 0, chSize = 1;
2425 cl_int err = CL_SUCCESS;
2426 cl_mem xy, unifiedSrcDensityCL, unifiedSrcValidCL;
2427 cl_mem dstDensityCL, dstValidCL, dstNoDataRealCL;
2428 cl_mem useBandSrcValidCL, nBandSrcValidCL;
2429 size_t groupSize, wordSize = 0;
2430 cl_kernel kern = nullptr;
2431 cl_channel_order chOrder;
2432
2433 warper->useUnifiedSrcDensity = unifiedSrcDensity != nullptr;
2434 warper->useUnifiedSrcValid = unifiedSrcValid != nullptr;
2435
2436 //Check the word size
2437 switch (warper->imageFormat) {
2438 case CL_FLOAT:
2439 wordSize = sizeof(float);
2440 break;
2441 case CL_SNORM_INT8:
2442 wordSize = sizeof(char);
2443 break;
2444 case CL_UNORM_INT8:
2445 wordSize = sizeof(unsigned char);
2446 break;
2447 case CL_SNORM_INT16:
2448 wordSize = sizeof(short);
2449 break;
2450 case CL_UNORM_INT16:
2451 wordSize = sizeof(unsigned short);
2452 break;
2453 }
2454
2455 //Compile the kernel; the invariants are being compiled into the code
2456 if (!warper->useVec || warper->numBands % 4) {
2457 warper->kern1 = get_kernel(warper, FALSE,
2458 dfXScale, dfYScale, dfXFilter, dfYFilter,
2459 nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2460 handleErr(err);
2461 }
2462 if (warper->useVec){
2463 warper->kern4 = get_kernel(warper, TRUE,
2464 dfXScale, dfYScale, dfXFilter, dfYFilter,
2465 nXRadius, nYRadius, nFiltInitX, nFiltInitY, &err);
2466 handleErr(err);
2467 }
2468
2469 //Copy coord data to the device
2470 handleErr(err = set_coord_data(warper, &xy));
2471
2472 //Copy unified density & valid data
2473 handleErr(err = set_unified_data(warper, &unifiedSrcDensityCL, &unifiedSrcValidCL,
2474 unifiedSrcDensity, unifiedSrcValid,
2475 &useBandSrcValidCL, &nBandSrcValidCL));
2476
2477 //Copy output density & valid data
2478 handleErr(set_dst_data(warper, &dstDensityCL, &dstValidCL, &dstNoDataRealCL,
2479 dstDensity, dstValid, warper->fDstNoDataReal));
2480
2481 //What's the recommended group size?
2482 if (warper->useVec) {
2483 // Start with the vector kernel
2484 handleErr(clGetKernelWorkGroupInfo(warper->kern4, warper->dev,
2485 CL_KERNEL_WORK_GROUP_SIZE,
2486 sizeof(size_t), &groupSize, nullptr));
2487 kern = warper->kern4;
2488 chSize = warper->imgChSize4;
2489 chOrder = warper->imgChOrder4;
2490 } else {
2491 // We're only using the float kernel
2492 handleErr(clGetKernelWorkGroupInfo(warper->kern1, warper->dev,
2493 CL_KERNEL_WORK_GROUP_SIZE,
2494 sizeof(size_t), &groupSize, nullptr));
2495 kern = warper->kern1;
2496 chSize = warper->imgChSize1;
2497 chOrder = warper->imgChOrder1;
2498 }
2499
2500 //Loop over each image
2501 for (i = 0; i < warper->numImages; ++i)
2502 {
2503 cl_mem srcImag, srcReal;
2504 cl_mem dstReal, dstImag;
2505 int bandNum = nextBandNum;
2506
2507 //Switch kernels if needed
2508 if (warper->useVec && nextBandNum < warper->numBands - warper->numBands % 4) {
2509 nextBandNum += 4;
2510 } else {
2511 if (kern == warper->kern4) {
2512 handleErr(clGetKernelWorkGroupInfo(warper->kern1, warper->dev,
2513 CL_KERNEL_WORK_GROUP_SIZE,
2514 sizeof(size_t), &groupSize, nullptr));
2515 kern = warper->kern1;
2516 chSize = warper->imgChSize1;
2517 chOrder = warper->imgChOrder1;
2518 }
2519 ++nextBandNum;
2520 }
2521
2522 //Create & copy the source image
2523 handleErr(err = set_src_rast_data(warper, i, chSize*wordSize, chOrder,
2524 &srcReal, &srcImag));
2525
2526 //Create & copy the output image
2527 if (kern == warper->kern1) {
2528 handleErr(err = set_dst_rast_data(warper, i, wordSize, &dstReal, &dstImag));
2529 } else {
2530 handleErr(err = set_dst_rast_data(warper, i, wordSize*4, &dstReal, &dstImag));
2531 }
2532
2533 //Set the bandNum
2534 handleErr(err = clSetKernelArg(kern, 12, sizeof(int), &bandNum));
2535
2536 //Run the kernel
2537 handleErr(err = execute_kern(warper, kern, groupSize));
2538
2539 //Free loop CL mem
2540 handleErr(err = clReleaseMemObject(srcReal));
2541 handleErr(err = clReleaseMemObject(srcImag));
2542
2543 //Copy the back output results
2544 if (kern == warper->kern1) {
2545 handleErr(err = get_dst_rast_data(warper, i, wordSize, dstReal, dstImag));
2546 } else {
2547 handleErr(err = get_dst_rast_data(warper, i, wordSize*4, dstReal, dstImag));
2548 }
2549
2550 //Free remaining CL mem
2551 handleErr(err = clReleaseMemObject(dstReal));
2552 handleErr(err = clReleaseMemObject(dstImag));
2553 }
2554
2555 //Free remaining CL mem
2556 handleErr(err = clReleaseMemObject(xy));
2557 handleErr(err = clReleaseMemObject(unifiedSrcDensityCL));
2558 handleErr(err = clReleaseMemObject(unifiedSrcValidCL));
2559 handleErr(err = clReleaseMemObject(useBandSrcValidCL));
2560 handleErr(err = clReleaseMemObject(nBandSrcValidCL));
2561 handleErr(err = clReleaseMemObject(dstDensityCL));
2562 handleErr(err = clReleaseMemObject(dstValidCL));
2563 handleErr(err = clReleaseMemObject(dstNoDataRealCL));
2564
2565 return CL_SUCCESS;
2566 }
2567
2568 /*
2569 Sets pointers to the floating point data in the warper. The pointers
2570 are internal to the warper structure, so don't free() them. If the imag
2571 channel is in use, it will receive a pointer. Otherwise it'll be set to NULL.
2572 These are pointers to floating point data, so the caller will need to
2573 manipulate the output as appropriate before saving the data.
2574
2575 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2576 */
2577 cl_int GDALWarpKernelOpenCL_getRow(struct oclWarper *warper,
2578 void **rowReal, void **rowImag,
2579 int rowNum, int bandNum)
2580 {
2581 int memOff = rowNum * warper->dstWidth;
2582 int imgNum = bandNum;
2583
2584 if (warper->useVec && bandNum < warper->numBands - warper->numBands % 4) {
2585 memOff += warper->dstWidth * warper->dstHeight * (bandNum % 4);
2586 imgNum = bandNum / 4;
2587 } else if(warper->useVec) {
2588 imgNum = bandNum / 4 + bandNum % 4;
2589 }
2590
2591 //Return pointers into the warper's data
2592 switch (warper->imageFormat) {
2593 case CL_FLOAT:
2594 (*rowReal) = &(warper->dstRealWork.f[imgNum][memOff]);
2595 break;
2596 case CL_SNORM_INT8:
2597 (*rowReal) = &(warper->dstRealWork.c[imgNum][memOff]);
2598 break;
2599 case CL_UNORM_INT8:
2600 (*rowReal) = &(warper->dstRealWork.uc[imgNum][memOff]);
2601 break;
2602 case CL_SNORM_INT16:
2603 (*rowReal) = &(warper->dstRealWork.s[imgNum][memOff]);
2604 break;
2605 case CL_UNORM_INT16:
2606 (*rowReal) = &(warper->dstRealWork.us[imgNum][memOff]);
2607 break;
2608 }
2609
2610 if (warper->dstImagWorkCL == nullptr) {
2611 (*rowImag) = nullptr;
2612 } else {
2613 switch (warper->imageFormat) {
2614 case CL_FLOAT:
2615 (*rowImag) = &(warper->dstImagWork.f[imgNum][memOff]);
2616 break;
2617 case CL_SNORM_INT8:
2618 (*rowImag) = &(warper->dstImagWork.c[imgNum][memOff]);
2619 break;
2620 case CL_UNORM_INT8:
2621 (*rowImag) = &(warper->dstImagWork.uc[imgNum][memOff]);
2622 break;
2623 case CL_SNORM_INT16:
2624 (*rowImag) = &(warper->dstImagWork.s[imgNum][memOff]);
2625 break;
2626 case CL_UNORM_INT16:
2627 (*rowImag) = &(warper->dstImagWork.us[imgNum][memOff]);
2628 break;
2629 }
2630 }
2631
2632 return CL_SUCCESS;
2633 }
2634
2635 /*
2636 Free the OpenCL warper environment. It should check everything for NULL, so
2637 be sure to mark free()ed pointers as NULL or it'll be double free()ed.
2638
2639 Returns CL_SUCCESS on success and other CL_* errors when something goes wrong.
2640 */
2641 cl_int GDALWarpKernelOpenCL_deleteEnv(struct oclWarper *warper)
2642 {
2643 int i;
2644 cl_int err = CL_SUCCESS;
2645
2646 for (i = 0; i < warper->numImages; ++i) {
2647 // Run free!!
2648 void* dummy = nullptr;
2649 if( warper->realWork.v )
2650 freeCLMem(warper->realWorkCL[i], warper->realWork.v[i]);
2651 else
2652 freeCLMem(warper->realWorkCL[i], dummy);
2653 if( warper->realWork.v )
2654 freeCLMem(warper->dstRealWorkCL[i], warper->dstRealWork.v[i]);
2655 else
2656 freeCLMem(warper->dstRealWorkCL[i], dummy);
2657
2658 //(As applicable)
2659 if(warper->imagWorkCL != nullptr && warper->imagWork.v != nullptr && warper->imagWork.v[i] != nullptr) {
2660 freeCLMem(warper->imagWorkCL[i], warper->imagWork.v[i]);
2661 }
2662 if(warper->dstImagWorkCL != nullptr && warper->dstImagWork.v != nullptr && warper->dstImagWork.v[i] != nullptr) {
2663 freeCLMem(warper->dstImagWorkCL[i], warper->dstImagWork.v[i]);
2664 }
2665 }
2666
2667 //Free cl_mem
2668 freeCLMem(warper->useBandSrcValidCL, warper->useBandSrcValid);
2669 freeCLMem(warper->nBandSrcValidCL, warper->nBandSrcValid);
2670 freeCLMem(warper->xyWorkCL, warper->xyWork);
2671 freeCLMem(warper->fDstNoDataRealCL, warper->fDstNoDataReal);
2672
2673 //Free pointers to cl_mem*
2674 if (warper->realWorkCL != nullptr)
2675 CPLFree(warper->realWorkCL);
2676 if (warper->dstRealWorkCL != nullptr)
2677 CPLFree(warper->dstRealWorkCL);
2678
2679 if (warper->imagWorkCL != nullptr)
2680 CPLFree(warper->imagWorkCL);
2681 if (warper->dstImagWorkCL != nullptr)
2682 CPLFree(warper->dstImagWorkCL);
2683
2684 if (warper->realWork.v != nullptr)
2685 CPLFree(warper->realWork.v);
2686 if (warper->dstRealWork.v != nullptr)
2687 CPLFree(warper->dstRealWork.v);
2688
2689 if (warper->imagWork.v != nullptr)
2690 CPLFree(warper->imagWork.v);
2691 if (warper->dstImagWork.v != nullptr)
2692 CPLFree(warper->dstImagWork.v);
2693
2694 //Free OpenCL structures
2695 if (warper->kern1 != nullptr)
2696 clReleaseKernel(warper->kern1);
2697 if (warper->kern4 != nullptr)
2698 clReleaseKernel(warper->kern4);
2699 if (warper->queue != nullptr)
2700 clReleaseCommandQueue(warper->queue);
2701 if (warper->context != nullptr)
2702 clReleaseContext(warper->context);
2703
2704 CPLFree(warper);
2705
2706 return CL_SUCCESS;
2707 }
2708
2709 #endif /* defined(HAVE_OPENCL) */
2710