1 /******************************************************************************
2  *
3  * Project:  High Performance Image Reprojector
4  * Purpose:  Implementation of the GDALWarpKernel class.  Implements the actual
5  *           image warping for a "chunk" of input and output imagery already
6  *           loaded into memory.
7  * Author:   Frank Warmerdam, warmerdam@pobox.com
8  *
9  ******************************************************************************
10  * Copyright (c) 2003, Frank Warmerdam <warmerdam@pobox.com>
11  * Copyright (c) 2008-2013, Even Rouault <even dot rouault at spatialys.com>
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a
14  * copy of this software and associated documentation files (the "Software"),
15  * to deal in the Software without restriction, including without limitation
16  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17  * and/or sell copies of the Software, and to permit persons to whom the
18  * Software is furnished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included
21  * in all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29  * DEALINGS IN THE SOFTWARE.
30  ****************************************************************************/
31 
32 #include "cpl_port.h"
33 #include "gdalwarper.h"
34 
35 #include <cfloat>
36 #include <cmath>
37 #include <cstddef>
38 #include <cstdlib>
39 #include <cstring>
40 
41 #include <algorithm>
42 #include <limits>
43 #include <new>
44 #include <vector>
45 
46 #include "cpl_atomic_ops.h"
47 #include "cpl_conv.h"
48 #include "cpl_error.h"
49 #include "cpl_multiproc.h"
50 #include "cpl_progress.h"
51 #include "cpl_string.h"
52 #include "cpl_vsi.h"
53 #include "cpl_worker_thread_pool.h"
54 #include "gdal.h"
55 #include "gdal_alg.h"
56 #include "gdal_alg_priv.h"
57 #include "gdal_thread_pool.h"
58 #include "gdalwarpkernel_opencl.h"
59 
60 // We restrict to 64bit processors because they are guaranteed to have SSE2.
61 // Could possibly be used too on 32bit, but we would need to check at runtime.
62 #if defined(__x86_64) || defined(_M_X64)
63 #include "gdalsse_priv.h"
64 
65 #if __SSE4_1__
66 #include <smmintrin.h>
67 #endif
68 
69 #if __SSE3__
70 #include <pmmintrin.h>
71 #endif
72 
73 #endif
74 
75 CPL_CVSID("$Id: gdalwarpkernel.cpp 5cf9a59b0286589b642bb9c229bbb0b4966a88c8 2021-08-13 18:39:08 +0200 Even Rouault $")
76 
77 constexpr double BAND_DENSITY_THRESHOLD = 0.0000000001;
78 constexpr float SRC_DENSITY_THRESHOLD =  0.000000001f;
79 
80 // #define INSTANTIATE_FLOAT64_SSE2_IMPL
81 
82 static const int anGWKFilterRadius[] =
83 {
84     0,  // Nearest neighbour
85     1,  // Bilinear
86     2,  // Cubic Convolution (Catmull-Rom)
87     2,  // Cubic B-Spline
88     3,  // Lanczos windowed sinc
89     0,  // Average
90     0,  // Mode
91     0,  // Reserved GRA_Gauss=7
92     0,  // Max
93     0,  // Min
94     0,  // Med
95     0,  // Q1
96     0,  // Q3
97     0,  // Sum
98     0,  // RMS
99 };
100 
101 static double GWKBilinear(double dfX);
102 static double GWKCubic(double dfX);
103 static double GWKBSpline(double dfX);
104 static double GWKLanczosSinc(double dfX);
105 
106 static const FilterFuncType apfGWKFilter[] =
107 {
108     nullptr,            // Nearest neighbour
109     GWKBilinear,     // Bilinear
110     GWKCubic,        // Cubic Convolution (Catmull-Rom)
111     GWKBSpline,      // Cubic B-Spline
112     GWKLanczosSinc,  // Lanczos windowed sinc
113     nullptr,  // Average
114     nullptr,  // Mode
115     nullptr,  // Reserved GRA_Gauss=7
116     nullptr,  // Max
117     nullptr,  // Min
118     nullptr,  // Med
119     nullptr,  // Q1
120     nullptr,  // Q3
121     nullptr,  // Sum
122     nullptr,  // RMS
123 };
124 
125 // TODO(schwehr): Can we make these functions have a const * const arg?
126 static double GWKBilinear4Values(double* padfVals);
127 static double GWKCubic4Values(double* padfVals);
128 static double GWKBSpline4Values(double* padfVals);
129 static double GWKLanczosSinc4Values(double* padfVals);
130 
131 static const FilterFunc4ValuesType apfGWKFilter4Values[] =
132 {
133     nullptr,  // Nearest neighbour
134     GWKBilinear4Values,     // Bilinear
135     GWKCubic4Values,        // Cubic Convolution (Catmull-Rom)
136     GWKBSpline4Values,      // Cubic B-Spline
137     GWKLanczosSinc4Values,  // Lanczos windowed sinc
138     nullptr,  // Average
139     nullptr,  // Mode
140     nullptr,  // Reserved GRA_Gauss=7
141     nullptr,  // Max
142     nullptr,  // Min
143     nullptr,  // Med
144     nullptr,  // Q1
145     nullptr,  // Q3
146     nullptr,  // Sum
147     nullptr,  // RMS
148 };
149 
150 int GWKGetFilterRadius(GDALResampleAlg eResampleAlg)
151 {
152     static_assert( CPL_ARRAYSIZE(anGWKFilterRadius) == GRA_LAST_VALUE + 1, "Bad size of anGWKFilterRadius" );
153     return anGWKFilterRadius[eResampleAlg];
154 }
155 
156 FilterFuncType GWKGetFilterFunc(GDALResampleAlg eResampleAlg)
157 {
158     static_assert( CPL_ARRAYSIZE(apfGWKFilter) == GRA_LAST_VALUE + 1, "Bad size of apfGWKFilter" );
159     return apfGWKFilter[eResampleAlg];
160 }
161 
162 FilterFunc4ValuesType GWKGetFilterFunc4Values(GDALResampleAlg eResampleAlg)
163 {
164     static_assert( CPL_ARRAYSIZE(apfGWKFilter4Values) == GRA_LAST_VALUE + 1, "Bad size of apfGWKFilter4Values" );
165     return apfGWKFilter4Values[eResampleAlg];
166 }
167 
168 #ifdef HAVE_OPENCL
169 static CPLErr GWKOpenCLCase( GDALWarpKernel * );
170 #endif
171 
172 static CPLErr GWKGeneralCase( GDALWarpKernel * );
173 static CPLErr GWKRealCase( GDALWarpKernel *poWK );
174 static CPLErr GWKNearestNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK );
175 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK );
176 static CPLErr GWKCubicNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK );
177 static CPLErr GWKCubicNoMasksOrDstDensityOnlyFloat( GDALWarpKernel *poWK );
178 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
179 static CPLErr GWKCubicNoMasksOrDstDensityOnlyDouble( GDALWarpKernel *poWK );
180 #endif
181 static CPLErr GWKCubicSplineNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK );
182 static CPLErr GWKNearestByte( GDALWarpKernel *poWK );
183 static CPLErr GWKNearestNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK );
184 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK );
185 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyFloat( GDALWarpKernel *poWK );
186 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
187 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyDouble( GDALWarpKernel *poWK );
188 #endif
189 static CPLErr GWKCubicNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK );
190 static CPLErr GWKCubicSplineNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK );
191 static CPLErr GWKNearestShort( GDALWarpKernel *poWK );
192 static CPLErr GWKNearestNoMasksOrDstDensityOnlyFloat( GDALWarpKernel *poWK );
193 static CPLErr GWKNearestFloat( GDALWarpKernel *poWK );
194 static CPLErr GWKAverageOrMode( GDALWarpKernel * );
195 static CPLErr GWKCubicNoMasksOrDstDensityOnlyUShort( GDALWarpKernel * );
196 static CPLErr GWKCubicSplineNoMasksOrDstDensityOnlyUShort( GDALWarpKernel * );
197 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyUShort( GDALWarpKernel * );
198 
199 /************************************************************************/
200 /*                           GWKJobStruct                               */
201 /************************************************************************/
202 
203 typedef struct _GWKJobStruct GWKJobStruct;
204 
205 struct _GWKJobStruct
206 {
207     GDALWarpKernel *poWK;
208     int             iYMin;
209     int             iYMax;
210     volatile int   *pnCounter;
211     volatile int   *pbStop;
212     CPLCond        *hCond;
213     CPLMutex       *hCondMutex;
214     int           (*pfnProgress)(GWKJobStruct* psJob);
215     void           *pTransformerArg;
216 
217     void           (*pfnFunc)(void*); // used by GWKRun() to assign the proper pTransformerArg
218 } ;
219 
220 struct GWKThreadData
221 {
222     std::unique_ptr<CPLJobQueue> poJobQueue{};
223     GWKJobStruct* pasThreadJob = nullptr;
224     int nThreads = 0;
225     CPLCond* hCond = nullptr;
226     CPLMutex* hCondMutex = nullptr;
227     bool bTransformerArgInputAssignedToThread = false;
228     void* pTransformerArgInput = nullptr; // owned by calling layer. Not to be destroyed
229     std::map<GIntBig, void*> mapThreadToTransformerArg{};
230 };
231 
232 /************************************************************************/
233 /*                        GWKProgressThread()                           */
234 /************************************************************************/
235 
236 // Return TRUE if the computation must be interrupted.
237 static int GWKProgressThread( GWKJobStruct* psJob )
238 {
239     CPLAcquireMutex(psJob->hCondMutex, 1.0);
240     (*(psJob->pnCounter))++;
241     CPLCondSignal(psJob->hCond);
242     int bStop = *(psJob->pbStop);
243     CPLReleaseMutex(psJob->hCondMutex);
244 
245     return bStop;
246 }
247 
248 /************************************************************************/
249 /*                      GWKProgressMonoThread()                         */
250 /************************************************************************/
251 
252 // Return TRUE if the computation must be interrupted.
253 static int GWKProgressMonoThread( GWKJobStruct* psJob )
254 {
255     GDALWarpKernel *poWK = psJob->poWK;
256     int nCounter = ++(*(psJob->pnCounter));
257     if( !poWK->pfnProgress( poWK->dfProgressBase + poWK->dfProgressScale *
258                             (nCounter / static_cast<double>(psJob->iYMax)),
259                             "", poWK->pProgress ) )
260     {
261         CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
262         *(psJob->pbStop) = TRUE;
263         return TRUE;
264     }
265     return FALSE;
266 }
267 
268 /************************************************************************/
269 /*                       GWKGenericMonoThread()                         */
270 /************************************************************************/
271 
272 static CPLErr GWKGenericMonoThread( GDALWarpKernel *poWK,
273                                     void (*pfnFunc) (void *pUserData) )
274 {
275     volatile int bStop = FALSE;
276     volatile int nCounter = 0;
277 
278     GWKJobStruct sThreadJob;
279     sThreadJob.poWK = poWK;
280     sThreadJob.pnCounter = &nCounter;
281     sThreadJob.iYMin = 0;
282     sThreadJob.iYMax = poWK->nDstYSize;
283     sThreadJob.pbStop = &bStop;
284     sThreadJob.hCond = nullptr;
285     sThreadJob.hCondMutex = nullptr;
286     sThreadJob.pfnProgress = GWKProgressMonoThread;
287     sThreadJob.pTransformerArg = poWK->pTransformerArg;
288 
289     pfnFunc(&sThreadJob);
290 
291     return !bStop ? CE_None : CE_Failure;
292 }
293 
294 /************************************************************************/
295 /*                          GWKThreadsCreate()                          */
296 /************************************************************************/
297 
298 void* GWKThreadsCreate( char** papszWarpOptions,
299                         GDALTransformerFunc /* pfnTransformer */,
300                         void* pTransformerArg )
301 {
302     const char* pszWarpThreads =
303         CSLFetchNameValue(papszWarpOptions, "NUM_THREADS");
304     if( pszWarpThreads == nullptr )
305         pszWarpThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
306 
307     int nThreads = 0;
308     if( EQUAL(pszWarpThreads, "ALL_CPUS") )
309         nThreads = CPLGetNumCPUs();
310     else
311         nThreads = atoi(pszWarpThreads);
312     if( nThreads <= 1 )
313         nThreads = 0;
314     if( nThreads > 128 )
315         nThreads = 128;
316 
317     GWKThreadData* psThreadData = new GWKThreadData();
318     CPLCond* hCond = nullptr;
319     if( nThreads )
320         hCond = CPLCreateCond();
321     auto poThreadPool = nThreads > 0 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
322     if( nThreads && hCond && poThreadPool )
323     {
324         psThreadData->nThreads = nThreads;
325         psThreadData->hCond = hCond;
326         psThreadData->pasThreadJob = static_cast<GWKJobStruct *>(
327             VSI_CALLOC_VERBOSE(sizeof(GWKJobStruct), nThreads));
328         if( psThreadData->pasThreadJob == nullptr )
329         {
330             GWKThreadsEnd(psThreadData);
331             return nullptr;
332         }
333 
334         psThreadData->hCondMutex = CPLCreateMutex();
335         if( psThreadData->hCondMutex == nullptr )
336         {
337             GWKThreadsEnd(psThreadData);
338             return nullptr;
339         }
340         CPLReleaseMutex(psThreadData->hCondMutex);
341 
342         for( int i = 0; i < nThreads; i++ )
343         {
344             psThreadData->pasThreadJob[i].hCond = psThreadData->hCond;
345             psThreadData->pasThreadJob[i].hCondMutex = psThreadData->hCondMutex;
346         }
347 
348         psThreadData->poJobQueue = poThreadPool->CreateJobQueue();
349         psThreadData->pTransformerArgInput = pTransformerArg;
350     }
351     else if( hCond )
352         CPLDestroyCond(hCond);
353 
354     return psThreadData;
355 }
356 
357 /************************************************************************/
358 /*                             GWKThreadsEnd()                          */
359 /************************************************************************/
360 
361 void GWKThreadsEnd( void* psThreadDataIn )
362 {
363     if( psThreadDataIn == nullptr )
364         return;
365 
366     GWKThreadData* psThreadData = static_cast<GWKThreadData *>(psThreadDataIn);
367     if( psThreadData->poJobQueue )
368     {
369         for( auto& pair: psThreadData->mapThreadToTransformerArg )
370         {
371             if( pair.second != psThreadData->pTransformerArgInput )
372                 GDALDestroyTransformer(pair.second);
373         }
374         psThreadData->poJobQueue.reset();
375     }
376     CPLFree(psThreadData->pasThreadJob);
377     if( psThreadData->hCond )
378         CPLDestroyCond(psThreadData->hCond);
379     if( psThreadData->hCondMutex )
380         CPLDestroyMutex(psThreadData->hCondMutex);
381     delete psThreadData;
382 }
383 
384 /************************************************************************/
385 /*                         ThreadFuncAdapter()                          */
386 /************************************************************************/
387 
388 static void ThreadFuncAdapter(void* pData)
389 {
390     GWKJobStruct* psJob = static_cast<GWKJobStruct *>(pData);
391     GWKThreadData* psThreadData =
392         static_cast<GWKThreadData*>(psJob->poWK->psThreadData);
393 
394     // Look if we have already a per-thread transformer
395     void* pTransformerArg = nullptr;
396     const GIntBig nThreadId = CPLGetPID();
397     CPLAcquireMutex(psThreadData->hCondMutex, 1.0);
398     auto oIter = psThreadData->mapThreadToTransformerArg.find(nThreadId);
399     if (oIter != psThreadData->mapThreadToTransformerArg.end())
400     {
401         pTransformerArg = oIter->second;
402     }
403     else if( !psThreadData->bTransformerArgInputAssignedToThread )
404     {
405         // Borrow the original transformer, as it has not already been done
406         psThreadData->bTransformerArgInputAssignedToThread = true;
407         pTransformerArg = psThreadData->pTransformerArgInput;
408         psThreadData->mapThreadToTransformerArg[nThreadId] = pTransformerArg;
409     }
410     CPLReleaseMutex(psThreadData->hCondMutex);
411 
412     // If no transformer assigned to current thread, instantiate one
413     if( pTransformerArg == nullptr )
414     {
415         // This somehow assumes that GDALCloneTransformer() is thread-safe
416         // which should normally be the case.
417         pTransformerArg =
418             GDALCloneTransformer(psThreadData->pTransformerArgInput);
419         if( !pTransformerArg )
420         {
421             *(psJob->pbStop) = TRUE;
422             return;
423         }
424 
425         // register in map
426         CPLAcquireMutex(psThreadData->hCondMutex, 1.0);
427         psThreadData->mapThreadToTransformerArg[nThreadId] = pTransformerArg;
428         CPLReleaseMutex(psThreadData->hCondMutex);
429     }
430 
431     psJob->pTransformerArg = pTransformerArg;
432     psJob->pfnFunc(pData);
433 }
434 
435 
436 /************************************************************************/
437 /*                                GWKRun()                              */
438 /************************************************************************/
439 
440 static CPLErr GWKRun( GDALWarpKernel *poWK,
441                       const char* pszFuncName,
442                       void (*pfnFunc) (void *pUserData) )
443 
444 {
445     const int nDstYSize = poWK->nDstYSize;
446 
447     CPLDebug("GDAL", "GDALWarpKernel()::%s() "
448              "Src=%d,%d,%dx%d Dst=%d,%d,%dx%d",
449              pszFuncName,
450              poWK->nSrcXOff, poWK->nSrcYOff,
451              poWK->nSrcXSize, poWK->nSrcYSize,
452              poWK->nDstXOff, poWK->nDstYOff,
453              poWK->nDstXSize, poWK->nDstYSize);
454 
455     if( !poWK->pfnProgress( poWK->dfProgressBase, "", poWK->pProgress ) )
456     {
457         CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
458         return CE_Failure;
459     }
460 
461     GWKThreadData* psThreadData =
462         static_cast<GWKThreadData*>(poWK->psThreadData);
463     if( psThreadData == nullptr || psThreadData->poJobQueue == nullptr )
464     {
465         return GWKGenericMonoThread(poWK, pfnFunc);
466     }
467 
468     int nThreads = std::min(psThreadData->nThreads, nDstYSize / 2);
469     // Config option mostly useful for tests to be able to test multithreading
470     // with small rasters
471     const int nWarpChunkSize = atoi(
472         CPLGetConfigOption("WARP_THREAD_CHUNK_SIZE", "65536"));
473     if( nWarpChunkSize > 0 )
474     {
475         GIntBig nChunks =
476             static_cast<GIntBig>(nDstYSize) * poWK->nDstXSize / nWarpChunkSize;
477         if( nThreads > nChunks )
478             nThreads = static_cast<int>(nChunks);
479     }
480     if( nThreads <= 0 )
481         nThreads = 1;
482 
483     CPLDebug("WARP", "Using %d threads", nThreads);
484 
485     volatile int bStop = FALSE;
486     volatile int nCounter = 0;
487 
488     CPLAcquireMutex(psThreadData->hCondMutex, 1000);
489 
490 /* -------------------------------------------------------------------- */
491 /*      Submit jobs                                                     */
492 /* -------------------------------------------------------------------- */
493     for( int i = 0; i < nThreads; i++ )
494     {
495         psThreadData->pasThreadJob[i].poWK = poWK;
496         psThreadData->pasThreadJob[i].pnCounter = &nCounter;
497         psThreadData->pasThreadJob[i].iYMin =
498             static_cast<int>((static_cast<GIntBig>(i)) * nDstYSize / nThreads);
499         psThreadData->pasThreadJob[i].iYMax =
500             static_cast<int>((static_cast<GIntBig>(i + 1)) *
501                              nDstYSize / nThreads);
502         psThreadData->pasThreadJob[i].pbStop = &bStop;
503         if( poWK->pfnProgress != GDALDummyProgress )
504             psThreadData->pasThreadJob[i].pfnProgress = GWKProgressThread;
505         else
506             psThreadData->pasThreadJob[i].pfnProgress = nullptr;
507         psThreadData->pasThreadJob[i].pfnFunc = pfnFunc;
508         psThreadData->poJobQueue->SubmitJob( ThreadFuncAdapter,
509                             static_cast<void*>(&psThreadData->pasThreadJob[i]) );
510     }
511 
512 /* -------------------------------------------------------------------- */
513 /*      Report progress.                                                */
514 /* -------------------------------------------------------------------- */
515     if( poWK->pfnProgress != GDALDummyProgress )
516     {
517         while( nCounter < nDstYSize )
518         {
519             CPLCondWait(psThreadData->hCond, psThreadData->hCondMutex);
520 
521             if( !poWK->pfnProgress(
522                     poWK->dfProgressBase + poWK->dfProgressScale *
523                     (nCounter / static_cast<double>(nDstYSize)),
524                     "", poWK->pProgress ) )
525             {
526                 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
527                 bStop = TRUE;
528                 break;
529             }
530         }
531     }
532 
533     /* Release mutex before joining threads, otherwise they will dead-lock */
534     /* forever in GWKProgressThread() */
535     CPLReleaseMutex(psThreadData->hCondMutex);
536 
537 /* -------------------------------------------------------------------- */
538 /*      Wait for all jobs to complete.                                  */
539 /* -------------------------------------------------------------------- */
540     psThreadData->poJobQueue->WaitCompletion();
541 
542     return !bStop ? CE_None : CE_Failure;
543 }
544 
545 /************************************************************************/
546 /* ==================================================================== */
547 /*                            GDALWarpKernel                            */
548 /* ==================================================================== */
549 /************************************************************************/
550 
551 /**
552  * \class GDALWarpKernel "gdalwarper.h"
553  *
554  * Low level image warping class.
555  *
556  * This class is responsible for low level image warping for one
557  * "chunk" of imagery.  The class is essentially a structure with all
558  * data members public - primarily so that new special-case functions
559  * can be added without changing the class declaration.
560  *
561  * Applications are normally intended to interactive with warping facilities
562  * through the GDALWarpOperation class, though the GDALWarpKernel can in
563  * theory be used directly if great care is taken in setting up the
564  * control data.
565  *
566  * <h3>Design Issues</h3>
567  *
568  * The intention is that PerformWarp() would analyze the setup in terms
569  * of the datatype, resampling type, and validity/density mask usage and
570  * pick one of many specific implementations of the warping algorithm over
571  * a continuum of optimization vs. generality.  At one end there will be a
572  * reference general purpose implementation of the algorithm that supports
573  * any data type (working internally in double precision complex), all three
574  * resampling types, and any or all of the validity/density masks.  At the
575  * other end would be highly optimized algorithms for common cases like
576  * nearest neighbour resampling on GDT_Byte data with no masks.
577  *
578  * The full set of optimized versions have not been decided but we should
579  * expect to have at least:
580  *  - One for each resampling algorithm for 8bit data with no masks.
581  *  - One for each resampling algorithm for float data with no masks.
582  *  - One for each resampling algorithm for float data with any/all masks
583  *    (essentially the generic case for just float data).
584  *  - One for each resampling algorithm for 8bit data with support for
585  *    input validity masks (per band or per pixel).  This handles the common
586  *    case of nodata masking.
587  *  - One for each resampling algorithm for float data with support for
588  *    input validity masks (per band or per pixel).  This handles the common
589  *    case of nodata masking.
590  *
591  * Some of the specializations would operate on all bands in one pass
592  * (especially the ones without masking would do this), while others might
593  * process each band individually to reduce code complexity.
594  *
595  * <h3>Masking Semantics</h3>
596  *
597  * A detailed explanation of the semantics of the validity and density masks,
598  * and their effects on resampling kernels is needed here.
599  */
600 
601 /************************************************************************/
602 /*                     GDALWarpKernel Data Members                      */
603 /************************************************************************/
604 
605 /**
606  * \var GDALResampleAlg GDALWarpKernel::eResample;
607  *
608  * Resampling algorithm.
609  *
610  * The resampling algorithm to use.  One of GRA_NearestNeighbour, GRA_Bilinear,
611  * GRA_Cubic, GRA_CubicSpline, GRA_Lanczos, GRA_Average, GRA_RMS,
612  * GRA_Mode or GRA_Sum.
613  *
614  * This field is required. GDT_NearestNeighbour may be used as a default
615  * value.
616  */
617 
618 /**
619  * \var GDALDataType GDALWarpKernel::eWorkingDataType;
620  *
621  * Working pixel data type.
622  *
623  * The datatype of pixels in the source image (papabySrcimage) and
624  * destination image (papabyDstImage) buffers.  Note that operations on
625  * some data types (such as GDT_Byte) may be much better optimized than other
626  * less common cases.
627  *
628  * This field is required.  It may not be GDT_Unknown.
629  */
630 
631 /**
632  * \var int GDALWarpKernel::nBands;
633  *
634  * Number of bands.
635  *
636  * The number of bands (layers) of imagery being warped.  Determines the
637  * number of entries in the papabySrcImage, papanBandSrcValid,
638  * and papabyDstImage arrays.
639  *
640  * This field is required.
641  */
642 
643 /**
644  * \var int GDALWarpKernel::nSrcXSize;
645  *
646  * Source image width in pixels.
647  *
648  * This field is required.
649  */
650 
651 /**
652  * \var int GDALWarpKernel::nSrcYSize;
653  *
654  * Source image height in pixels.
655  *
656  * This field is required.
657  */
658 
659 /**
660  * \var double GDALWarpKernel::dfSrcXExtraSize;
661  *
662  * Number of pixels included in nSrcXSize that are present on the edges of
663  * the area of interest to take into account the width of the kernel.
664  *
665  * This field is required.
666  */
667 
668 /**
669  * \var double GDALWarpKernel::dfSrcYExtraSize;
670  *
671  * Number of pixels included in nSrcYExtraSize that are present on the edges of
672  * the area of interest to take into account the height of the kernel.
673  *
674  * This field is required.
675  */
676 
677 /**
678  * \var int GDALWarpKernel::papabySrcImage;
679  *
680  * Array of source image band data.
681  *
682  * This is an array of pointers (of size GDALWarpKernel::nBands) pointers
683  * to image data.  Each individual band of image data is organized as a single
684  * block of image data in left to right, then bottom to top order.  The actual
685  * type of the image data is determined by GDALWarpKernel::eWorkingDataType.
686  *
687  * To access the pixel value for the (x=3, y=4) pixel (zero based) of
688  * the second band with eWorkingDataType set to GDT_Float32 use code like
689  * this:
690  *
691  * \code
692  *   float dfPixelValue;
693  *   int   nBand = 2-1;  // Band indexes are zero based.
694  *   int   nPixel = 3; // Zero based.
695  *   int   nLine = 4;  // Zero based.
696  *
697  *   assert( nPixel >= 0 && nPixel < poKern->nSrcXSize );
698  *   assert( nLine >= 0 && nLine < poKern->nSrcYSize );
699  *   assert( nBand >= 0 && nBand < poKern->nBands );
700  *   dfPixelValue = ((float *) poKern->papabySrcImage[nBand])
701  *                                  [nPixel + nLine * poKern->nSrcXSize];
702  * \endcode
703  *
704  * This field is required.
705  */
706 
707 /**
708  * \var GUInt32 **GDALWarpKernel::papanBandSrcValid;
709  *
710  * Per band validity mask for source pixels.
711  *
712  * Array of pixel validity mask layers for each source band.   Each of
713  * the mask layers is the same size (in pixels) as the source image with
714  * one bit per pixel.  Note that it is legal (and common) for this to be
715  * NULL indicating that none of the pixels are invalidated, or for some
716  * band validity masks to be NULL in which case all pixels of the band are
717  * valid.  The following code can be used to test the validity of a particular
718  * pixel.
719  *
720  * \code
721  *   int   bIsValid = TRUE;
722  *   int   nBand = 2-1;  // Band indexes are zero based.
723  *   int   nPixel = 3; // Zero based.
724  *   int   nLine = 4;  // Zero based.
725  *
726  *   assert( nPixel >= 0 && nPixel < poKern->nSrcXSize );
727  *   assert( nLine >= 0 && nLine < poKern->nSrcYSize );
728  *   assert( nBand >= 0 && nBand < poKern->nBands );
729  *
730  *   if( poKern->papanBandSrcValid != NULL
731  *       && poKern->papanBandSrcValid[nBand] != NULL )
732  *   {
733  *       GUInt32 *panBandMask = poKern->papanBandSrcValid[nBand];
734  *       int    iPixelOffset = nPixel + nLine * poKern->nSrcXSize;
735  *
736  *       bIsValid = panBandMask[iPixelOffset>>5]
737  *                  & (0x01 << (iPixelOffset & 0x1f));
738  *   }
739  * \endcode
740  */
741 
742 /**
743  * \var GUInt32 *GDALWarpKernel::panUnifiedSrcValid;
744  *
745  * Per pixel validity mask for source pixels.
746  *
747  * A single validity mask layer that applies to the pixels of all source
748  * bands.  It is accessed similarly to papanBandSrcValid, but without the
749  * extra level of band indirection.
750  *
751  * This pointer may be NULL indicating that all pixels are valid.
752  *
753  * Note that if both panUnifiedSrcValid, and papanBandSrcValid are available,
754  * the pixel isn't considered to be valid unless both arrays indicate it is
755  * valid.
756  */
757 
758 /**
759  * \var float *GDALWarpKernel::pafUnifiedSrcDensity;
760  *
761  * Per pixel density mask for source pixels.
762  *
763  * A single density mask layer that applies to the pixels of all source
764  * bands.  It contains values between 0.0 and 1.0 indicating the degree to
765  * which this pixel should be allowed to contribute to the output result.
766  *
767  * This pointer may be NULL indicating that all pixels have a density of 1.0.
768  *
769  * The density for a pixel may be accessed like this:
770  *
771  * \code
772  *   float fDensity = 1.0;
773  *   int nPixel = 3;  // Zero based.
774  *   int nLine = 4;   // Zero based.
775  *
776  *   assert( nPixel >= 0 && nPixel < poKern->nSrcXSize );
777  *   assert( nLine >= 0 && nLine < poKern->nSrcYSize );
778  *   if( poKern->pafUnifiedSrcDensity != NULL )
779  *     fDensity = poKern->pafUnifiedSrcDensity
780  *                                  [nPixel + nLine * poKern->nSrcXSize];
781  * \endcode
782  */
783 
784 /**
785  * \var int GDALWarpKernel::nDstXSize;
786  *
787  * Width of destination image in pixels.
788  *
789  * This field is required.
790  */
791 
792 /**
793  * \var int GDALWarpKernel::nDstYSize;
794  *
795  * Height of destination image in pixels.
796  *
797  * This field is required.
798  */
799 
800 /**
801  * \var GByte **GDALWarpKernel::papabyDstImage;
802  *
803  * Array of destination image band data.
804  *
805  * This is an array of pointers (of size GDALWarpKernel::nBands) pointers
806  * to image data.  Each individual band of image data is organized as a single
807  * block of image data in left to right, then bottom to top order.  The actual
808  * type of the image data is determined by GDALWarpKernel::eWorkingDataType.
809  *
810  * To access the pixel value for the (x=3, y=4) pixel (zero based) of
811  * the second band with eWorkingDataType set to GDT_Float32 use code like
812  * this:
813  *
814  * \code
815  *   float dfPixelValue;
816  *   int   nBand = 2-1;  // Band indexes are zero based.
817  *   int   nPixel = 3; // Zero based.
818  *   int   nLine = 4;  // Zero based.
819  *
820  *   assert( nPixel >= 0 && nPixel < poKern->nDstXSize );
821  *   assert( nLine >= 0 && nLine < poKern->nDstYSize );
822  *   assert( nBand >= 0 && nBand < poKern->nBands );
823  *   dfPixelValue = ((float *) poKern->papabyDstImage[nBand])
824  *                                  [nPixel + nLine * poKern->nSrcYSize];
825  * \endcode
826  *
827  * This field is required.
828  */
829 
830 /**
831  * \var GUInt32 *GDALWarpKernel::panDstValid;
832  *
833  * Per pixel validity mask for destination pixels.
834  *
835  * A single validity mask layer that applies to the pixels of all destination
836  * bands.  It is accessed similarly to papanUnitifiedSrcValid, but based
837  * on the size of the destination image.
838  *
839  * This pointer may be NULL indicating that all pixels are valid.
840  */
841 
842 /**
843  * \var float *GDALWarpKernel::pafDstDensity;
844  *
845  * Per pixel density mask for destination pixels.
846  *
847  * A single density mask layer that applies to the pixels of all destination
848  * bands.  It contains values between 0.0 and 1.0.
849  *
850  * This pointer may be NULL indicating that all pixels have a density of 1.0.
851  *
852  * The density for a pixel may be accessed like this:
853  *
854  * \code
855  *   float fDensity = 1.0;
856  *   int   nPixel = 3; // Zero based.
857  *   int   nLine = 4;  // Zero based.
858  *
859  *   assert( nPixel >= 0 && nPixel < poKern->nDstXSize );
860  *   assert( nLine >= 0 && nLine < poKern->nDstYSize );
861  *   if( poKern->pafDstDensity != NULL )
862  *     fDensity = poKern->pafDstDensity[nPixel + nLine * poKern->nDstXSize];
863  * \endcode
864  */
865 
866 /**
867  * \var int GDALWarpKernel::nSrcXOff;
868  *
869  * X offset to source pixel coordinates for transformation.
870  *
871  * See pfnTransformer.
872  *
873  * This field is required.
874  */
875 
876 /**
877  * \var int GDALWarpKernel::nSrcYOff;
878  *
879  * Y offset to source pixel coordinates for transformation.
880  *
881  * See pfnTransformer.
882  *
883  * This field is required.
884  */
885 
886 /**
887  * \var int GDALWarpKernel::nDstXOff;
888  *
889  * X offset to destination pixel coordinates for transformation.
890  *
891  * See pfnTransformer.
892  *
893  * This field is required.
894  */
895 
896 /**
897  * \var int GDALWarpKernel::nDstYOff;
898  *
899  * Y offset to destination pixel coordinates for transformation.
900  *
901  * See pfnTransformer.
902  *
903  * This field is required.
904  */
905 
906 /**
907  * \var GDALTransformerFunc GDALWarpKernel::pfnTransformer;
908  *
909  * Source/destination location transformer.
910  *
911  * The function to call to transform coordinates between source image
912  * pixel/line coordinates and destination image pixel/line coordinates.
913  * See GDALTransformerFunc() for details of the semantics of this function.
914  *
915  * The GDALWarpKern algorithm will only ever use this transformer in
916  * "destination to source" mode (bDstToSrc=TRUE), and will always pass
917  * partial or complete scanlines of points in the destination image as
918  * input.  This means, among other things, that it is safe to the the
919  * approximating transform GDALApproxTransform() as the transformation
920  * function.
921  *
922  * Source and destination images may be subsets of a larger overall image.
923  * The transformation algorithms will expect and return pixel/line coordinates
924  * in terms of this larger image, so coordinates need to be offset by
925  * the offsets specified in nSrcXOff, nSrcYOff, nDstXOff, and nDstYOff before
926  * passing to pfnTransformer, and after return from it.
927  *
928  * The GDALWarpKernel::pfnTransformerArg value will be passed as the callback
929  * data to this function when it is called.
930  *
931  * This field is required.
932  */
933 
934 /**
935  * \var void *GDALWarpKernel::pTransformerArg;
936  *
937  * Callback data for pfnTransformer.
938  *
939  * This field may be NULL if not required for the pfnTransformer being used.
940  */
941 
942 /**
943  * \var GDALProgressFunc GDALWarpKernel::pfnProgress;
944  *
945  * The function to call to report progress of the algorithm, and to check
946  * for a requested termination of the operation.  It operates according to
947  * GDALProgressFunc() semantics.
948  *
949  * Generally speaking the progress function will be invoked for each
950  * scanline of the destination buffer that has been processed.
951  *
952  * This field may be NULL (internally set to GDALDummyProgress()).
953  */
954 
955 /**
956  * \var void *GDALWarpKernel::pProgress;
957  *
958  * Callback data for pfnProgress.
959  *
960  * This field may be NULL if not required for the pfnProgress being used.
961  */
962 
963 /************************************************************************/
964 /*                           GDALWarpKernel()                           */
965 /************************************************************************/
966 
967 GDALWarpKernel::GDALWarpKernel() :
968     papszWarpOptions(nullptr),
969     eResample(GRA_NearestNeighbour),
970     eWorkingDataType(GDT_Unknown),
971     nBands(0),
972     nSrcXSize(0),
973     nSrcYSize(0),
974     dfSrcXExtraSize(0.0),
975     dfSrcYExtraSize(0.0),
976     papabySrcImage(nullptr),
977     papanBandSrcValid(nullptr),
978     panUnifiedSrcValid(nullptr),
979     pafUnifiedSrcDensity(nullptr),
980     nDstXSize(0),
981     nDstYSize(0),
982     papabyDstImage(nullptr),
983     panDstValid(nullptr),
984     pafDstDensity(nullptr),
985     dfXScale(1.0),
986     dfYScale(1.0),
987     dfXFilter(0.0),
988     dfYFilter(0.0),
989     nXRadius(0),
990     nYRadius(0),
991     nFiltInitX(0),
992     nFiltInitY(0),
993     nSrcXOff(0),
994     nSrcYOff(0),
995     nDstXOff(0),
996     nDstYOff(0),
997     pfnTransformer(nullptr),
998     pTransformerArg(nullptr),
999     pfnProgress(GDALDummyProgress),
1000     pProgress(nullptr),
1001     dfProgressBase(0.0),
1002     dfProgressScale(1.0),
1003     padfDstNoDataReal(nullptr),
1004     psThreadData(nullptr)
1005 {}
1006 
1007 /************************************************************************/
1008 /*                          ~GDALWarpKernel()                           */
1009 /************************************************************************/
1010 
1011 GDALWarpKernel::~GDALWarpKernel() {}
1012 
1013 /************************************************************************/
1014 /*                            PerformWarp()                             */
1015 /************************************************************************/
1016 
1017 /**
1018  * \fn CPLErr GDALWarpKernel::PerformWarp();
1019  *
1020  * This method performs the warp described in the GDALWarpKernel.
1021  *
1022  * @return CE_None on success or CE_Failure if an error occurs.
1023  */
1024 
1025 CPLErr GDALWarpKernel::PerformWarp()
1026 
1027 {
1028     const CPLErr eErr = Validate();
1029 
1030     if( eErr != CE_None )
1031         return eErr;
1032 
1033     // See #2445 and #3079.
1034     if( nSrcXSize <= 0 || nSrcYSize <= 0 )
1035     {
1036         if( !pfnProgress( dfProgressBase + dfProgressScale,
1037                           "", pProgress ) )
1038         {
1039             CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
1040             return CE_Failure;
1041         }
1042         return CE_None;
1043     }
1044 
1045 /* -------------------------------------------------------------------- */
1046 /*      Pre-calculate resampling scales and window sizes for filtering. */
1047 /* -------------------------------------------------------------------- */
1048 
1049     dfXScale = static_cast<double>(nDstXSize) / (nSrcXSize - dfSrcXExtraSize);
1050     dfYScale = static_cast<double>(nDstYSize) / (nSrcYSize - dfSrcYExtraSize);
1051     if( nSrcXSize >= nDstXSize && nSrcXSize <= nDstXSize + dfSrcXExtraSize )
1052         dfXScale = 1.0;
1053     if( nSrcYSize >= nDstYSize && nSrcYSize <= nDstYSize + dfSrcYExtraSize )
1054         dfYScale = 1.0;
1055     if( dfXScale < 1.0 )
1056     {
1057         double dfXReciprocalScale =  1.0 / dfXScale;
1058         const int nXReciprocalScale =
1059             static_cast<int>(dfXReciprocalScale + 0.5);
1060         if( fabs(dfXReciprocalScale-nXReciprocalScale) < 0.05 )
1061             dfXScale = 1.0 / nXReciprocalScale;
1062     }
1063     if( dfYScale < 1.0 )
1064     {
1065         double dfYReciprocalScale =  1.0 / dfYScale;
1066         const int nYReciprocalScale =
1067             static_cast<int>(dfYReciprocalScale + 0.5);
1068         if( fabs(dfYReciprocalScale-nYReciprocalScale) < 0.05 )
1069             dfYScale = 1.0 / nYReciprocalScale;
1070     }
1071 
1072     // XSCALE and YSCALE undocumented for now. Can help in some cases.
1073     // Best would probably be a per-pixel scale computation.
1074     const char* pszXScale = CSLFetchNameValue(papszWarpOptions, "XSCALE");
1075     if( pszXScale != nullptr && !EQUAL(pszXScale, "FROM_GRID_SAMPLING") )
1076         dfXScale = CPLAtof(pszXScale);
1077     const char* pszYScale = CSLFetchNameValue(papszWarpOptions, "YSCALE");
1078     if( pszYScale != nullptr )
1079         dfYScale = CPLAtof(pszYScale);
1080 
1081     // If the xscale is significantly lower than the yscale, this is highly
1082     // suspicious of a situation of wrapping a very large virtual file in
1083     // geographic coordinates with left and right parts being close to the
1084     // antimeridian. In that situation, the xscale computed by the above method
1085     // is completely wrong. Prefer doing an average of a few sample points
1086     // instead
1087     if( (dfYScale / dfXScale > 100 ||
1088          (pszXScale != nullptr && EQUAL(pszXScale, "FROM_GRID_SAMPLING"))) )
1089     {
1090         // Sample points along a grid
1091         const int nPointsX = std::min(10, nDstXSize);
1092         const int nPointsY = std::min(10, nDstYSize);
1093         const int nPoints = 3 * nPointsX * nPointsY;
1094         std::vector<double> padfX;
1095         std::vector<double> padfY;
1096         std::vector<double> padfZ(nPoints);
1097         std::vector<int> pabSuccess(nPoints);
1098         for(int iY = 0; iY < nPointsY; iY++)
1099         {
1100             for(int iX = 0; iX < nPointsX; iX++)
1101             {
1102                 const double dfX = nPointsX == 1 ? 0.0 :
1103                     static_cast<double>(iX) * nDstXSize / (nPointsX - 1);
1104                 const double dfY = nPointsY == 1 ? 0.0 :
1105                 static_cast<double>(iY) * nDstYSize / (nPointsY - 1);
1106 
1107                 // Reproject each destination sample point and its neighbours
1108                 // at (x+1,y) and (x,y+1), so as to get the local scale.
1109                 padfX.push_back( dfX );
1110                 padfY.push_back( dfY );
1111 
1112                 padfX.push_back( (iX == nPointsX - 1) ? dfX - 1 : dfX + 1 );
1113                 padfY.push_back( dfY);
1114 
1115                 padfX.push_back( dfX );
1116                 padfY.push_back( (iY == nPointsY - 1) ? dfY - 1 : dfY + 1 );
1117             }
1118         }
1119         pfnTransformer(pTransformerArg, TRUE, nPoints,
1120                        &padfX[0], &padfY[0], &padfZ[0], &pabSuccess[0]);
1121 
1122         // Compute the xscale at each sampling point
1123         std::vector<double> adfXScales;
1124         for( int i = 0; i < nPoints; i+=3 )
1125         {
1126             if( pabSuccess[i] && pabSuccess[i+1] && pabSuccess[i+2] )
1127             {
1128                 const double dfPointXScale = 1.0 / std::max(
1129                     std::abs(padfX[i+1] - padfX[i]), std::abs(padfX[i+2] - padfX[i]) );
1130                 adfXScales.push_back(dfPointXScale);
1131             }
1132         }
1133 
1134         // Sort by increasing xcale
1135         std::sort(adfXScales.begin(), adfXScales.end());
1136 
1137         if( !adfXScales.empty() )
1138         {
1139             // Compute the average of scales, but eliminate outliers small
1140             // scales, if some samples are just along the discontinuity.
1141             const double dfMaxPointXScale = adfXScales.back();
1142             double dfSumPointXScale = 0;
1143             int nCountPointScale = 0;
1144             for( double dfPointXScale : adfXScales )
1145             {
1146                 if( dfPointXScale > dfMaxPointXScale / 10 )
1147                 {
1148                     dfSumPointXScale += dfPointXScale;
1149                     nCountPointScale ++;
1150                 }
1151             }
1152             if( nCountPointScale > 0 ) // should always be true
1153             {
1154                 const double dfXScaleFromSampling = dfSumPointXScale / nCountPointScale;
1155 #if DEBUG_VERBOSE
1156                 CPLDebug("WARP", "Correcting dfXScale from %f to %f",
1157                         dfXScale, dfXScaleFromSampling);
1158 #endif
1159                 dfXScale = dfXScaleFromSampling;
1160             }
1161         }
1162     }
1163 
1164 #if DEBUG_VERBOSE
1165     CPLDebug("WARP", "dfXScale = %f, dfYScale = %f", dfXScale, dfYScale);
1166 #endif
1167 
1168     const int bUse4SamplesFormula = dfXScale >= 0.95 && dfYScale >= 0.95;
1169 
1170     // Safety check for callers that would use GDALWarpKernel without using
1171     // GDALWarpOperation.
1172     if( (eResample == GRA_CubicSpline || eResample == GRA_Lanczos ||
1173          ((eResample == GRA_Cubic || eResample == GRA_Bilinear) &&
1174           !bUse4SamplesFormula)) &&
1175         atoi(CSLFetchNameValueDef(papszWarpOptions,
1176                                   "EXTRA_ELTS", "0") ) != WARP_EXTRA_ELTS )
1177     {
1178         CPLError( CE_Failure, CPLE_AppDefined,
1179                   "Source arrays must have WARP_EXTRA_ELTS extra elements at "
1180                   "their end. "
1181                   "See GDALWarpKernel class definition. If this condition is "
1182                   "fulfilled, define a EXTRA_ELTS=%d warp options",
1183                   WARP_EXTRA_ELTS);
1184         return CE_Failure;
1185     }
1186 
1187     dfXFilter = anGWKFilterRadius[eResample];
1188     dfYFilter = anGWKFilterRadius[eResample];
1189 
1190     nXRadius =
1191         dfXScale < 1.0
1192         ? static_cast<int>(ceil(dfXFilter / dfXScale))
1193         : static_cast<int>(dfXFilter);
1194     nYRadius =
1195         dfYScale < 1.0
1196         ? static_cast<int>(ceil(dfYFilter / dfYScale))
1197         : static_cast<int>(dfYFilter);
1198 
1199     // Filter window offset depends on the parity of the kernel radius.
1200     nFiltInitX = ((anGWKFilterRadius[eResample] + 1) % 2) - nXRadius;
1201     nFiltInitY = ((anGWKFilterRadius[eResample] + 1) % 2) - nYRadius;
1202 
1203 /* -------------------------------------------------------------------- */
1204 /*      Set up resampling functions.                                    */
1205 /* -------------------------------------------------------------------- */
1206     if( CPLFetchBool( papszWarpOptions, "USE_GENERAL_CASE", false ) )
1207         return GWKGeneralCase( this );
1208 
1209 #if defined(HAVE_OPENCL)
1210     if( (eWorkingDataType == GDT_Byte
1211          || eWorkingDataType == GDT_CInt16
1212          || eWorkingDataType == GDT_UInt16
1213          || eWorkingDataType == GDT_Int16
1214          || eWorkingDataType == GDT_CFloat32
1215          || eWorkingDataType == GDT_Float32) &&
1216         (eResample == GRA_Bilinear
1217          || eResample == GRA_Cubic
1218          || eResample == GRA_CubicSpline
1219          || eResample == GRA_Lanczos) &&
1220         CPLFetchBool( papszWarpOptions, "USE_OPENCL", true ) )
1221     {
1222         const CPLErr eResult = GWKOpenCLCase( this );
1223 
1224         // CE_Warning tells us a suitable OpenCL environment was not available
1225         // so we fall through to other CPU based methods.
1226         if( eResult != CE_Warning )
1227             return eResult;
1228     }
1229 #endif  // defined HAVE_OPENCL
1230 
1231     const bool bNoMasksOrDstDensityOnly =
1232         papanBandSrcValid == nullptr
1233         && panUnifiedSrcValid == nullptr
1234         && pafUnifiedSrcDensity == nullptr
1235         && panDstValid == nullptr;
1236 
1237     if( eWorkingDataType == GDT_Byte
1238         && eResample == GRA_NearestNeighbour
1239         && bNoMasksOrDstDensityOnly )
1240         return GWKNearestNoMasksOrDstDensityOnlyByte( this );
1241 
1242     if( eWorkingDataType == GDT_Byte
1243         && eResample == GRA_Bilinear
1244         && bNoMasksOrDstDensityOnly )
1245         return GWKBilinearNoMasksOrDstDensityOnlyByte( this );
1246 
1247     if( eWorkingDataType == GDT_Byte
1248         && eResample == GRA_Cubic
1249         && bNoMasksOrDstDensityOnly )
1250         return GWKCubicNoMasksOrDstDensityOnlyByte( this );
1251 
1252     if( eWorkingDataType == GDT_Byte
1253         && eResample == GRA_CubicSpline
1254         && bNoMasksOrDstDensityOnly )
1255         return GWKCubicSplineNoMasksOrDstDensityOnlyByte( this );
1256 
1257     if( eWorkingDataType == GDT_Byte
1258         && eResample == GRA_NearestNeighbour )
1259         return GWKNearestByte( this );
1260 
1261     if( (eWorkingDataType == GDT_Int16 || eWorkingDataType == GDT_UInt16)
1262         && eResample == GRA_NearestNeighbour
1263         && bNoMasksOrDstDensityOnly )
1264         return GWKNearestNoMasksOrDstDensityOnlyShort( this );
1265 
1266     if( (eWorkingDataType == GDT_Int16 )
1267         && eResample == GRA_Cubic
1268         && bNoMasksOrDstDensityOnly )
1269         return GWKCubicNoMasksOrDstDensityOnlyShort( this );
1270 
1271     if( (eWorkingDataType == GDT_Int16 )
1272         && eResample == GRA_CubicSpline
1273         && bNoMasksOrDstDensityOnly )
1274         return GWKCubicSplineNoMasksOrDstDensityOnlyShort( this );
1275 
1276     if( (eWorkingDataType == GDT_Int16 )
1277         && eResample == GRA_Bilinear
1278         && bNoMasksOrDstDensityOnly )
1279         return GWKBilinearNoMasksOrDstDensityOnlyShort( this );
1280 
1281     if( (eWorkingDataType == GDT_UInt16 )
1282         && eResample == GRA_Cubic
1283         && bNoMasksOrDstDensityOnly )
1284         return GWKCubicNoMasksOrDstDensityOnlyUShort( this );
1285 
1286     if( (eWorkingDataType == GDT_UInt16 )
1287         && eResample == GRA_CubicSpline
1288         && bNoMasksOrDstDensityOnly )
1289         return GWKCubicSplineNoMasksOrDstDensityOnlyUShort( this );
1290 
1291     if( (eWorkingDataType == GDT_UInt16 )
1292         && eResample == GRA_Bilinear
1293         && bNoMasksOrDstDensityOnly )
1294         return GWKBilinearNoMasksOrDstDensityOnlyUShort( this );
1295 
1296     if( (eWorkingDataType == GDT_Int16 || eWorkingDataType == GDT_UInt16)
1297         && eResample == GRA_NearestNeighbour )
1298         return GWKNearestShort( this );
1299 
1300     if( eWorkingDataType == GDT_Float32
1301         && eResample == GRA_NearestNeighbour
1302         && bNoMasksOrDstDensityOnly )
1303         return GWKNearestNoMasksOrDstDensityOnlyFloat( this );
1304 
1305     if( eWorkingDataType == GDT_Float32
1306         && eResample == GRA_NearestNeighbour )
1307         return GWKNearestFloat( this );
1308 
1309     if( eWorkingDataType == GDT_Float32
1310         && eResample == GRA_Bilinear
1311         && bNoMasksOrDstDensityOnly )
1312         return GWKBilinearNoMasksOrDstDensityOnlyFloat( this );
1313 
1314     if( eWorkingDataType == GDT_Float32
1315         && eResample == GRA_Cubic
1316         && bNoMasksOrDstDensityOnly )
1317         return GWKCubicNoMasksOrDstDensityOnlyFloat( this );
1318 
1319 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
1320     if( eWorkingDataType == GDT_Float64
1321         && eResample == GRA_Bilinear
1322         && bNoMasksOrDstDensityOnly )
1323         return GWKBilinearNoMasksOrDstDensityOnlyDouble( this );
1324 
1325     if( eWorkingDataType == GDT_Float64
1326         && eResample == GRA_Cubic
1327         && bNoMasksOrDstDensityOnly )
1328         return GWKCubicNoMasksOrDstDensityOnlyDouble( this );
1329 #endif
1330 
1331     if( eResample == GRA_Average )
1332         return GWKAverageOrMode( this );
1333 
1334     if( eResample == GRA_RMS )
1335         return GWKAverageOrMode( this );
1336 
1337     if( eResample == GRA_Mode )
1338         return GWKAverageOrMode( this );
1339 
1340     if( eResample == GRA_Max )
1341         return GWKAverageOrMode( this );
1342 
1343     if( eResample == GRA_Min )
1344         return GWKAverageOrMode( this );
1345 
1346     if( eResample == GRA_Med )
1347         return GWKAverageOrMode( this );
1348 
1349     if( eResample == GRA_Q1 )
1350         return GWKAverageOrMode( this );
1351 
1352     if( eResample == GRA_Q3 )
1353         return GWKAverageOrMode( this );
1354 
1355     if( eResample == GRA_Sum )
1356         return GWKAverageOrMode( this );
1357 
1358     if (!GDALDataTypeIsComplex(eWorkingDataType))
1359     {
1360         return GWKRealCase( this );
1361     }
1362 
1363     return GWKGeneralCase( this );
1364 }
1365 
1366 /************************************************************************/
1367 /*                              Validate()                              */
1368 /************************************************************************/
1369 
1370 /**
1371  * \fn CPLErr GDALWarpKernel::Validate()
1372  *
1373  * Check the settings in the GDALWarpKernel, and issue a CPLError()
1374  * (and return CE_Failure) if the configuration is considered to be
1375  * invalid for some reason.
1376  *
1377  * This method will also do some standard defaulting such as setting
1378  * pfnProgress to GDALDummyProgress() if it is NULL.
1379  *
1380  * @return CE_None on success or CE_Failure if an error is detected.
1381  */
1382 
1383 CPLErr GDALWarpKernel::Validate()
1384 
1385 {
1386     if( static_cast<size_t>(eResample) >=
1387         (sizeof(anGWKFilterRadius) / sizeof(anGWKFilterRadius[0])) )
1388     {
1389         CPLError( CE_Failure, CPLE_AppDefined,
1390                   "Unsupported resampling method %d.",
1391                   static_cast<int>(eResample) );
1392         return CE_Failure;
1393     }
1394 
1395     return CE_None;
1396 }
1397 
1398 /************************************************************************/
1399 /*                         GWKOverlayDensity()                          */
1400 /*                                                                      */
1401 /*      Compute the final density for the destination pixel.  This      */
1402 /*      is a function of the overlay density (passed in) and the        */
1403 /*      original density.                                               */
1404 /************************************************************************/
1405 
1406 static void GWKOverlayDensity( const GDALWarpKernel *poWK, GPtrDiff_t iDstOffset,
1407                                double dfDensity )
1408 {
1409     if( dfDensity < 0.0001 || poWK->pafDstDensity == nullptr )
1410         return;
1411 
1412     poWK->pafDstDensity[iDstOffset] =  static_cast<float>
1413         ( 1.0 - (1.0 - dfDensity) * (1.0 - poWK->pafDstDensity[iDstOffset]) );
1414 }
1415 
1416 /************************************************************************/
1417 /*                          GWKRoundValueT()                            */
1418 /************************************************************************/
1419 
1420 template<class T, EMULATED_BOOL is_signed> struct sGWKRoundValueT
1421 {
1422     static T eval(double);
1423 };
1424 
1425 template<class T> struct sGWKRoundValueT<T, true> /* signed */
1426 {
1427     static T eval(double dfValue) { return static_cast<T>(floor(dfValue + 0.5)); }
1428 };
1429 
1430 template<class T> struct sGWKRoundValueT<T, false> /* unsigned */
1431 {
1432     static T eval(double dfValue) { return static_cast<T>(dfValue + 0.5); }
1433 };
1434 
1435 template<class T> static T GWKRoundValueT(double dfValue)
1436 {
1437     return sGWKRoundValueT<T, std::numeric_limits<T>::is_signed>::eval(dfValue);
1438 }
1439 
1440 template<> float GWKRoundValueT<float>(double dfValue)
1441 {
1442     return static_cast<float>(dfValue);
1443 }
1444 
1445 #ifdef notused
1446 template<> double GWKRoundValueT<double>(double dfValue)
1447 {
1448     return dfValue;
1449 }
1450 #endif
1451 
1452 /************************************************************************/
1453 /*                            GWKClampValueT()                          */
1454 /************************************************************************/
1455 
1456 template<class T>
1457 static CPL_INLINE T GWKClampValueT(double dfValue)
1458 {
1459     if( dfValue < std::numeric_limits<T>::min() )
1460         return std::numeric_limits<T>::min();
1461     else if( dfValue > std::numeric_limits<T>::max() )
1462         return std::numeric_limits<T>::max();
1463     else
1464         return GWKRoundValueT<T>(dfValue);
1465 }
1466 
1467 template<> float GWKClampValueT<float>(double dfValue)
1468 {
1469     return static_cast<float>(dfValue);
1470 }
1471 
1472 #ifdef notused
1473 template<> double GWKClampValueT<double>(double dfValue)
1474 {
1475     return dfValue;
1476 }
1477 #endif
1478 
1479 /************************************************************************/
1480 /*                         GWKSetPixelValueRealT()                      */
1481 /************************************************************************/
1482 
1483 template<class T>
1484 static bool GWKSetPixelValueRealT( const GDALWarpKernel *poWK, int iBand,
1485                                    GPtrDiff_t iDstOffset, double dfDensity,
1486                                    T value)
1487 {
1488     T *pDst = reinterpret_cast<T*>(poWK->papabyDstImage[iBand]);
1489 
1490 /* -------------------------------------------------------------------- */
1491 /*      If the source density is less than 100% we need to fetch the    */
1492 /*      existing destination value, and mix it with the source to       */
1493 /*      get the new "to apply" value.  Also compute composite           */
1494 /*      density.                                                        */
1495 /*                                                                      */
1496 /*      We avoid mixing if density is very near one or risk mixing      */
1497 /*      in very extreme nodata values and causing odd results (#1610)   */
1498 /* -------------------------------------------------------------------- */
1499     if( dfDensity < 0.9999 )
1500     {
1501         if( dfDensity < 0.0001 )
1502             return true;
1503 
1504         double dfDstDensity = 1.0;
1505 
1506         if( poWK->pafDstDensity != nullptr )
1507             dfDstDensity = poWK->pafDstDensity[iDstOffset];
1508         else if( poWK->panDstValid != nullptr
1509                  && !((poWK->panDstValid[iDstOffset>>5]
1510                        & (0x01 << (iDstOffset & 0x1f))) ) )
1511             dfDstDensity = 0.0;
1512 
1513         // It seems like we also ought to be testing panDstValid[] here!
1514 
1515         const double dfDstReal = pDst[iDstOffset];
1516 
1517         // The destination density is really only relative to the portion
1518         // not occluded by the overlay.
1519         const double dfDstInfluence = (1.0 - dfDensity) * dfDstDensity;
1520 
1521         const double dfReal =
1522             (value * dfDensity + dfDstReal * dfDstInfluence)
1523             / (dfDensity + dfDstInfluence);
1524 
1525 /* -------------------------------------------------------------------- */
1526 /*      Actually apply the destination value.                           */
1527 /*                                                                      */
1528 /*      Avoid using the destination nodata value for integer datatypes  */
1529 /*      if by chance it is equal to the computed pixel value.           */
1530 /* -------------------------------------------------------------------- */
1531         pDst[iDstOffset] = GWKClampValueT<T>(dfReal);
1532     }
1533     else
1534     {
1535         pDst[iDstOffset] = value;
1536     }
1537 
1538     if( poWK->padfDstNoDataReal != nullptr &&
1539         poWK->padfDstNoDataReal[iBand] ==
1540         static_cast<double>(pDst[iDstOffset]) )
1541     {
1542         if( pDst[iDstOffset] == std::numeric_limits<T>::min() )
1543             pDst[iDstOffset] = std::numeric_limits<T>::min() + 1;
1544         else
1545             pDst[iDstOffset]--;
1546     }
1547 
1548     return true;
1549 }
1550 
1551 /************************************************************************/
1552 /*                          GWKSetPixelValue()                          */
1553 /************************************************************************/
1554 
1555 static bool GWKSetPixelValue( const GDALWarpKernel *poWK, int iBand,
1556                               GPtrDiff_t iDstOffset, double dfDensity,
1557                               double dfReal, double dfImag )
1558 
1559 {
1560     GByte *pabyDst = poWK->papabyDstImage[iBand];
1561 
1562 /* -------------------------------------------------------------------- */
1563 /*      If the source density is less than 100% we need to fetch the    */
1564 /*      existing destination value, and mix it with the source to       */
1565 /*      get the new "to apply" value.  Also compute composite           */
1566 /*      density.                                                        */
1567 /*                                                                      */
1568 /*      We avoid mixing if density is very near one or risk mixing      */
1569 /*      in very extreme nodata values and causing odd results (#1610)   */
1570 /* -------------------------------------------------------------------- */
1571     if( dfDensity < 0.9999 )
1572     {
1573         if( dfDensity < 0.0001 )
1574             return true;
1575 
1576         double dfDstDensity = 1.0;
1577         if( poWK->pafDstDensity != nullptr )
1578             dfDstDensity = poWK->pafDstDensity[iDstOffset];
1579         else if( poWK->panDstValid != nullptr
1580                  && !((poWK->panDstValid[iDstOffset>>5]
1581                        & (0x01 << (iDstOffset & 0x1f))) ) )
1582             dfDstDensity = 0.0;
1583 
1584         double dfDstReal = 0.0;
1585         double dfDstImag = 0.0;
1586         // It seems like we also ought to be testing panDstValid[] here!
1587 
1588         // TODO(schwehr): Factor out this repreated type of set.
1589         switch( poWK->eWorkingDataType )
1590         {
1591           case GDT_Byte:
1592             dfDstReal = pabyDst[iDstOffset];
1593             dfDstImag = 0.0;
1594             break;
1595 
1596           case GDT_Int16:
1597             dfDstReal = reinterpret_cast<GInt16 *>(pabyDst)[iDstOffset];
1598             dfDstImag = 0.0;
1599             break;
1600 
1601           case GDT_UInt16:
1602             dfDstReal = reinterpret_cast<GUInt16*>(pabyDst)[iDstOffset];
1603             dfDstImag = 0.0;
1604             break;
1605 
1606           case GDT_Int32:
1607             dfDstReal = reinterpret_cast<GInt32*>(pabyDst)[iDstOffset];
1608             dfDstImag = 0.0;
1609             break;
1610 
1611           case GDT_UInt32:
1612             dfDstReal = reinterpret_cast<GUInt32*>(pabyDst)[iDstOffset];
1613             dfDstImag = 0.0;
1614             break;
1615 
1616           case GDT_Float32:
1617             dfDstReal = reinterpret_cast<float*>(pabyDst)[iDstOffset];
1618             dfDstImag = 0.0;
1619             break;
1620 
1621           case GDT_Float64:
1622             dfDstReal = reinterpret_cast<double*>(pabyDst)[iDstOffset];
1623             dfDstImag = 0.0;
1624             break;
1625 
1626           case GDT_CInt16:
1627             dfDstReal = reinterpret_cast<GInt16*>(pabyDst)[iDstOffset*2];
1628             dfDstImag = reinterpret_cast<GInt16*>(pabyDst)[iDstOffset*2+1];
1629             break;
1630 
1631           case GDT_CInt32:
1632             dfDstReal = reinterpret_cast<GInt32*>(pabyDst)[iDstOffset*2];
1633             dfDstImag = reinterpret_cast<GInt32*>(pabyDst)[iDstOffset*2+1];
1634             break;
1635 
1636           case GDT_CFloat32:
1637             dfDstReal = reinterpret_cast<float*>(pabyDst)[iDstOffset*2];
1638             dfDstImag = reinterpret_cast<float*>(pabyDst)[iDstOffset*2+1];
1639             break;
1640 
1641           case GDT_CFloat64:
1642             dfDstReal = reinterpret_cast<double*>(pabyDst)[iDstOffset*2];
1643             dfDstImag = reinterpret_cast<double*>(pabyDst)[iDstOffset*2+1];
1644             break;
1645 
1646           default:
1647             CPLAssert( false );
1648             return false;
1649         }
1650 
1651         // The destination density is really only relative to the portion
1652         // not occluded by the overlay.
1653         const double dfDstInfluence = (1.0 - dfDensity) * dfDstDensity;
1654 
1655         dfReal =
1656             (dfReal * dfDensity + dfDstReal * dfDstInfluence)
1657             / (dfDensity + dfDstInfluence);
1658 
1659         dfImag =
1660             (dfImag * dfDensity + dfDstImag * dfDstInfluence)
1661             / (dfDensity + dfDstInfluence);
1662     }
1663 
1664 /* -------------------------------------------------------------------- */
1665 /*      Actually apply the destination value.                           */
1666 /*                                                                      */
1667 /*      Avoid using the destination nodata value for integer datatypes  */
1668 /*      if by chance it is equal to the computed pixel value.           */
1669 /* -------------------------------------------------------------------- */
1670 
1671 // TODO(schwehr): Can we make this a template?
1672 #define CLAMP(type) \
1673     do { \
1674     type* _pDst = reinterpret_cast<type*>(pabyDst); \
1675     if( dfReal < static_cast<double>(std::numeric_limits<type>::min()) ) \
1676         _pDst[iDstOffset] = static_cast<type>(std::numeric_limits<type>::min()); \
1677     else if( dfReal > static_cast<double>(std::numeric_limits<type>::max()) ) \
1678         _pDst[iDstOffset] = static_cast<type>(std::numeric_limits<type>::max()); \
1679     else \
1680         _pDst[iDstOffset] = \
1681             (std::numeric_limits<type>::is_signed) ? \
1682                 static_cast<type>(floor(dfReal + 0.5)) : \
1683                 static_cast<type>(dfReal + 0.5);  \
1684     if( poWK->padfDstNoDataReal != nullptr && \
1685         poWK->padfDstNoDataReal[iBand] == static_cast<double>(_pDst[iDstOffset]) ) \
1686     { \
1687         if( _pDst[iDstOffset] == static_cast<type>(std::numeric_limits<type>::min()) )  \
1688             _pDst[iDstOffset] = static_cast<type>(std::numeric_limits<type>::min() + 1); \
1689         else \
1690             _pDst[iDstOffset]--; \
1691     } } while( false )
1692 
1693     switch( poWK->eWorkingDataType )
1694     {
1695       case GDT_Byte:
1696         CLAMP(GByte);
1697         break;
1698 
1699       case GDT_Int16:
1700         CLAMP(GInt16);
1701         break;
1702 
1703       case GDT_UInt16:
1704         CLAMP(GUInt16);
1705         break;
1706 
1707       case GDT_UInt32:
1708         CLAMP(GUInt32);
1709         break;
1710 
1711       case GDT_Int32:
1712         CLAMP(GInt32);
1713         break;
1714 
1715       case GDT_Float32:
1716         reinterpret_cast<float*>(pabyDst)[iDstOffset] = static_cast<float>(dfReal);
1717         break;
1718 
1719       case GDT_Float64:
1720         reinterpret_cast<double*>(pabyDst)[iDstOffset] = dfReal;
1721         break;
1722 
1723       case GDT_CInt16:
1724       {
1725         typedef GInt16 T;
1726         if( dfReal < static_cast<double>(std::numeric_limits<T>::min()) )
1727             reinterpret_cast<T*>(pabyDst)[iDstOffset*2] = std::numeric_limits<T>::min();
1728         else if( dfReal > static_cast<double>(std::numeric_limits<T>::max()) )
1729             reinterpret_cast<T*>(pabyDst)[iDstOffset*2] = std::numeric_limits<T>::max();
1730         else
1731             reinterpret_cast<T*>(pabyDst)[iDstOffset*2] = static_cast<T>(floor(dfReal+0.5));
1732         if( dfImag < static_cast<double>(std::numeric_limits<T>::min()) )
1733             reinterpret_cast<T*>(pabyDst)[iDstOffset*2+1] =  std::numeric_limits<T>::min();
1734         else if( dfImag > static_cast<double>(std::numeric_limits<T>::max()) )
1735             reinterpret_cast<T*>(pabyDst)[iDstOffset*2+1] = std::numeric_limits<T>::max();
1736         else
1737             reinterpret_cast<T*>(pabyDst)[iDstOffset*2+1] = static_cast<T>(floor(dfImag+0.5));
1738         break;
1739       }
1740 
1741       case GDT_CInt32:
1742       {
1743         typedef GInt32 T;
1744         if( dfReal < static_cast<double>(std::numeric_limits<T>::min()) )
1745             reinterpret_cast<T*>(pabyDst)[iDstOffset*2] = std::numeric_limits<T>::min();
1746         else if( dfReal > static_cast<double>(std::numeric_limits<T>::max()) )
1747             reinterpret_cast<T*>(pabyDst)[iDstOffset*2] = std::numeric_limits<T>::max();
1748         else
1749             reinterpret_cast<T*>(pabyDst)[iDstOffset*2] = static_cast<T>(floor(dfReal+0.5));
1750         if( dfImag < static_cast<double>(std::numeric_limits<T>::min()) )
1751             reinterpret_cast<T*>(pabyDst)[iDstOffset*2+1] = std::numeric_limits<T>::min();
1752         else if( dfImag > static_cast<double>(std::numeric_limits<T>::max()) )
1753             reinterpret_cast<T*>(pabyDst)[iDstOffset*2+1] = std::numeric_limits<T>::max();
1754         else
1755             reinterpret_cast<T*>(pabyDst)[iDstOffset*2+1] = static_cast<T>(floor(dfImag+0.5));
1756         break;
1757       }
1758 
1759       case GDT_CFloat32:
1760         reinterpret_cast<float*>(pabyDst)[iDstOffset*2] =  static_cast<float>(dfReal);
1761         reinterpret_cast<float*>(pabyDst)[iDstOffset*2+1] = static_cast<float>(dfImag);
1762         break;
1763 
1764       case GDT_CFloat64:
1765         reinterpret_cast<double*>(pabyDst)[iDstOffset*2] = dfReal;
1766         reinterpret_cast<double*>(pabyDst)[iDstOffset*2+1] = dfImag;
1767         break;
1768 
1769       default:
1770         return false;
1771     }
1772 
1773     return true;
1774 }
1775 
1776 /************************************************************************/
1777 /*                       GWKSetPixelValueReal()                         */
1778 /************************************************************************/
1779 
1780 static bool GWKSetPixelValueReal( const GDALWarpKernel *poWK, int iBand,
1781                                   GPtrDiff_t iDstOffset, double dfDensity,
1782                                   double dfReal )
1783 
1784 {
1785     GByte *pabyDst = poWK->papabyDstImage[iBand];
1786 
1787 /* -------------------------------------------------------------------- */
1788 /*      If the source density is less than 100% we need to fetch the    */
1789 /*      existing destination value, and mix it with the source to       */
1790 /*      get the new "to apply" value.  Also compute composite           */
1791 /*      density.                                                        */
1792 /*                                                                      */
1793 /*      We avoid mixing if density is very near one or risk mixing      */
1794 /*      in very extreme nodata values and causing odd results (#1610)   */
1795 /* -------------------------------------------------------------------- */
1796     if( dfDensity < 0.9999 )
1797     {
1798         if( dfDensity < 0.0001 )
1799             return true;
1800 
1801         double dfDstReal = 0.0;
1802         double dfDstDensity = 1.0;
1803 
1804         if( poWK->pafDstDensity != nullptr )
1805             dfDstDensity = poWK->pafDstDensity[iDstOffset];
1806         else if( poWK->panDstValid != nullptr
1807                  && !((poWK->panDstValid[iDstOffset>>5]
1808                        & (0x01 << (iDstOffset & 0x1f))) ) )
1809             dfDstDensity = 0.0;
1810 
1811         // It seems like we also ought to be testing panDstValid[] here!
1812 
1813         switch( poWK->eWorkingDataType )
1814         {
1815           case GDT_Byte:
1816             dfDstReal = pabyDst[iDstOffset];
1817             break;
1818 
1819           case GDT_Int16:
1820             dfDstReal = reinterpret_cast<GInt16*>(pabyDst)[iDstOffset];
1821             break;
1822 
1823           case GDT_UInt16:
1824             dfDstReal = reinterpret_cast<GUInt16*>(pabyDst)[iDstOffset];
1825             break;
1826 
1827           case GDT_Int32:
1828             dfDstReal = reinterpret_cast<GInt32*>(pabyDst)[iDstOffset];
1829             break;
1830 
1831           case GDT_UInt32:
1832             dfDstReal = reinterpret_cast<GUInt32*>(pabyDst)[iDstOffset];
1833             break;
1834 
1835           case GDT_Float32:
1836             dfDstReal = reinterpret_cast<float*>(pabyDst)[iDstOffset];
1837             break;
1838 
1839           case GDT_Float64:
1840             dfDstReal = reinterpret_cast<double*>(pabyDst)[iDstOffset];
1841             break;
1842 
1843           default:
1844             CPLAssert( false );
1845             return false;
1846         }
1847 
1848         // The destination density is really only relative to the portion
1849         // not occluded by the overlay.
1850         const double dfDstInfluence = (1.0 - dfDensity) * dfDstDensity;
1851 
1852         dfReal =
1853             (dfReal * dfDensity + dfDstReal * dfDstInfluence)
1854             / (dfDensity + dfDstInfluence);
1855     }
1856 
1857 /* -------------------------------------------------------------------- */
1858 /*      Actually apply the destination value.                           */
1859 /*                                                                      */
1860 /*      Avoid using the destination nodata value for integer datatypes  */
1861 /*      if by chance it is equal to the computed pixel value.           */
1862 /* -------------------------------------------------------------------- */
1863 
1864     switch( poWK->eWorkingDataType )
1865     {
1866       case GDT_Byte:
1867         CLAMP(GByte);
1868         break;
1869 
1870       case GDT_Int16:
1871         CLAMP(GInt16);
1872         break;
1873 
1874       case GDT_UInt16:
1875         CLAMP(GUInt16);
1876         break;
1877 
1878       case GDT_UInt32:
1879         CLAMP(GUInt32);
1880         break;
1881 
1882       case GDT_Int32:
1883         CLAMP(GInt32);
1884         break;
1885 
1886       case GDT_Float32:
1887         reinterpret_cast<float*>(pabyDst)[iDstOffset] = static_cast<float>(dfReal);
1888         break;
1889 
1890       case GDT_Float64:
1891         reinterpret_cast<double*>(pabyDst)[iDstOffset] = dfReal;
1892         break;
1893 
1894       default:
1895         return false;
1896     }
1897 
1898     return true;
1899 }
1900 
1901 /************************************************************************/
1902 /*                          GWKGetPixelValue()                          */
1903 /************************************************************************/
1904 
1905 /* It is assumed that panUnifiedSrcValid has been checked before */
1906 
1907 static bool GWKGetPixelValue( const GDALWarpKernel *poWK, int iBand,
1908                               GPtrDiff_t iSrcOffset, double *pdfDensity,
1909                               double *pdfReal, double *pdfImag )
1910 
1911 {
1912     GByte *pabySrc = poWK->papabySrcImage[iBand];
1913 
1914     if( poWK->papanBandSrcValid != nullptr
1915         && poWK->papanBandSrcValid[iBand] != nullptr
1916         && !((poWK->papanBandSrcValid[iBand][iSrcOffset>>5]
1917               & (0x01 << (iSrcOffset & 0x1f)))) )
1918     {
1919         *pdfDensity = 0.0;
1920         return false;
1921     }
1922 
1923     // TODO(schwehr): Fix casting.
1924     switch( poWK->eWorkingDataType )
1925     {
1926       case GDT_Byte:
1927         *pdfReal = pabySrc[iSrcOffset];
1928         *pdfImag = 0.0;
1929         break;
1930 
1931       case GDT_Int16:
1932         *pdfReal = reinterpret_cast<GInt16*>(pabySrc)[iSrcOffset];
1933         *pdfImag = 0.0;
1934         break;
1935 
1936       case GDT_UInt16:
1937         *pdfReal = reinterpret_cast<GUInt16*>(pabySrc)[iSrcOffset];
1938         *pdfImag = 0.0;
1939         break;
1940 
1941       case GDT_Int32:
1942         *pdfReal = reinterpret_cast<GInt32*>(pabySrc)[iSrcOffset];
1943         *pdfImag = 0.0;
1944         break;
1945 
1946       case GDT_UInt32:
1947         *pdfReal = reinterpret_cast<GUInt32*>(pabySrc)[iSrcOffset];
1948         *pdfImag = 0.0;
1949         break;
1950 
1951       case GDT_Float32:
1952         *pdfReal = reinterpret_cast<float*>(pabySrc)[iSrcOffset];
1953         *pdfImag = 0.0;
1954         break;
1955 
1956       case GDT_Float64:
1957         *pdfReal = reinterpret_cast<double*>(pabySrc)[iSrcOffset];
1958         *pdfImag = 0.0;
1959         break;
1960 
1961       case GDT_CInt16:
1962         *pdfReal = reinterpret_cast<GInt16*>(pabySrc)[iSrcOffset*2];
1963         *pdfImag = reinterpret_cast<GInt16*>(pabySrc)[iSrcOffset*2+1];
1964         break;
1965 
1966       case GDT_CInt32:
1967         *pdfReal = reinterpret_cast<GInt32*>(pabySrc)[iSrcOffset*2];
1968         *pdfImag = reinterpret_cast<GInt32*>(pabySrc)[iSrcOffset*2+1];
1969         break;
1970 
1971       case GDT_CFloat32:
1972         *pdfReal = reinterpret_cast<float*>(pabySrc)[iSrcOffset*2];
1973         *pdfImag = reinterpret_cast<float*>(pabySrc)[iSrcOffset*2+1];
1974         break;
1975 
1976       case GDT_CFloat64:
1977         *pdfReal = reinterpret_cast<double*>(pabySrc)[iSrcOffset*2];
1978         *pdfImag = reinterpret_cast<double*>(pabySrc)[iSrcOffset*2+1];
1979         break;
1980 
1981       default:
1982         *pdfDensity = 0.0;
1983         return false;
1984     }
1985 
1986     if( poWK->pafUnifiedSrcDensity != nullptr )
1987         *pdfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
1988     else
1989         *pdfDensity = 1.0;
1990 
1991     return *pdfDensity != 0.0;
1992 }
1993 
1994 /************************************************************************/
1995 /*                       GWKGetPixelValueReal()                         */
1996 /************************************************************************/
1997 
1998 static bool GWKGetPixelValueReal( const GDALWarpKernel *poWK, int iBand,
1999                                   GPtrDiff_t iSrcOffset, double *pdfDensity,
2000                                   double *pdfReal )
2001 
2002 {
2003     GByte *pabySrc = poWK->papabySrcImage[iBand];
2004 
2005     if( poWK->papanBandSrcValid != nullptr
2006         && poWK->papanBandSrcValid[iBand] != nullptr
2007         && !((poWK->papanBandSrcValid[iBand][iSrcOffset>>5]
2008               & (0x01 << (iSrcOffset & 0x1f)))) )
2009     {
2010         *pdfDensity = 0.0;
2011         return false;
2012     }
2013 
2014     switch( poWK->eWorkingDataType )
2015     {
2016       case GDT_Byte:
2017         *pdfReal = pabySrc[iSrcOffset];
2018         break;
2019 
2020       case GDT_Int16:
2021         *pdfReal = reinterpret_cast<GInt16*>(pabySrc)[iSrcOffset];
2022         break;
2023 
2024       case GDT_UInt16:
2025         *pdfReal = reinterpret_cast<GUInt16*>(pabySrc)[iSrcOffset];
2026         break;
2027 
2028       case GDT_Int32:
2029         *pdfReal = reinterpret_cast<GInt32*>(pabySrc)[iSrcOffset];
2030         break;
2031 
2032       case GDT_UInt32:
2033         *pdfReal = reinterpret_cast<GUInt32*>(pabySrc)[iSrcOffset];
2034         break;
2035 
2036       case GDT_Float32:
2037         *pdfReal = reinterpret_cast<float*>(pabySrc)[iSrcOffset];
2038         break;
2039 
2040       case GDT_Float64:
2041         *pdfReal = reinterpret_cast<double*>(pabySrc)[iSrcOffset];
2042         break;
2043 
2044       default:
2045         CPLAssert(false);
2046         return false;
2047     }
2048 
2049     if( poWK->pafUnifiedSrcDensity != nullptr )
2050         *pdfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
2051     else
2052         *pdfDensity = 1.0;
2053 
2054     return *pdfDensity != 0.0;
2055 }
2056 
2057 /************************************************************************/
2058 /*                          GWKGetPixelRow()                            */
2059 /************************************************************************/
2060 
2061 /* It is assumed that adfImag[] is set to 0 by caller code for non-complex */
2062 /* data-types. */
2063 
2064 static bool GWKGetPixelRow( const GDALWarpKernel *poWK, int iBand,
2065                             GPtrDiff_t iSrcOffset, int nHalfSrcLen,
2066                             double* padfDensity,
2067                             double adfReal[],
2068                             double* padfImag )
2069 {
2070     // We know that nSrcLen is even, so we can *always* unroll loops 2x.
2071     const int nSrcLen = nHalfSrcLen * 2;
2072     bool bHasValid = false;
2073 
2074     if( padfDensity != nullptr )
2075     {
2076         // Init the density.
2077         for( int i = 0; i < nSrcLen; i += 2 )
2078         {
2079             padfDensity[i] = 1.0;
2080             padfDensity[i+1] = 1.0;
2081         }
2082 
2083         if( poWK->panUnifiedSrcValid != nullptr )
2084         {
2085             for( int i = 0; i < nSrcLen; i += 2 )
2086             {
2087                 if( poWK->panUnifiedSrcValid[(iSrcOffset+i)>>5]
2088                     & (0x01 << ((iSrcOffset+i) & 0x1f)) )
2089                     bHasValid = true;
2090                 else
2091                     padfDensity[i] = 0.0;
2092 
2093                 if( poWK->panUnifiedSrcValid[(iSrcOffset+i+1)>>5]
2094                     & (0x01 << ((iSrcOffset+i+1) & 0x1f)) )
2095                     bHasValid = true;
2096                 else
2097                     padfDensity[i+1] = 0.0;
2098             }
2099 
2100             // Reset or fail as needed.
2101             if( bHasValid )
2102                 bHasValid = false;
2103             else
2104                 return false;
2105         }
2106 
2107         if( poWK->papanBandSrcValid != nullptr
2108             && poWK->papanBandSrcValid[iBand] != nullptr )
2109         {
2110             for( int i = 0; i < nSrcLen; i += 2 )
2111             {
2112                 if( poWK->papanBandSrcValid[iBand][(iSrcOffset+i)>>5]
2113                     & (0x01 << ((iSrcOffset+i) & 0x1f)) )
2114                     bHasValid = true;
2115                 else
2116                     padfDensity[i] = 0.0;
2117 
2118                 if( poWK->papanBandSrcValid[iBand][(iSrcOffset+i+1)>>5]
2119                     & (0x01 << ((iSrcOffset+i+1) & 0x1f)) )
2120                     bHasValid = true;
2121                 else
2122                     padfDensity[i+1] = 0.0;
2123             }
2124 
2125             // Reset or fail as needed.
2126             if( bHasValid )
2127                 bHasValid = false;
2128             else
2129                 return false;
2130         }
2131     }
2132 
2133     // TODO(schwehr): Fix casting.
2134     // Fetch data.
2135     switch( poWK->eWorkingDataType )
2136     {
2137         case GDT_Byte:
2138         {
2139             GByte* pSrc = reinterpret_cast<GByte*>(poWK->papabySrcImage[iBand]);
2140             pSrc += iSrcOffset;
2141             for( int i = 0; i < nSrcLen; i += 2 )
2142             {
2143                 adfReal[i] = pSrc[i];
2144                 adfReal[i+1] = pSrc[i+1];
2145             }
2146             break;
2147         }
2148 
2149         case GDT_Int16:
2150         {
2151             GInt16* pSrc = reinterpret_cast<GInt16*>(poWK->papabySrcImage[iBand]);
2152             pSrc += iSrcOffset;
2153             for( int i = 0; i < nSrcLen; i += 2 )
2154             {
2155                 adfReal[i] = pSrc[i];
2156                 adfReal[i+1] = pSrc[i+1];
2157             }
2158             break;
2159         }
2160 
2161          case GDT_UInt16:
2162          {
2163             GUInt16* pSrc = reinterpret_cast<GUInt16*>(poWK->papabySrcImage[iBand]);
2164             pSrc += iSrcOffset;
2165             for( int i = 0; i < nSrcLen; i += 2 )
2166             {
2167                 adfReal[i] = pSrc[i];
2168                 adfReal[i+1] = pSrc[i+1];
2169             }
2170             break;
2171          }
2172 
2173         case GDT_Int32:
2174         {
2175             GInt32* pSrc = reinterpret_cast<GInt32*>(poWK->papabySrcImage[iBand]);
2176             pSrc += iSrcOffset;
2177             for( int i = 0; i < nSrcLen; i += 2 )
2178             {
2179                 adfReal[i] = pSrc[i];
2180                 adfReal[i+1] = pSrc[i+1];
2181             }
2182             break;
2183         }
2184 
2185         case GDT_UInt32:
2186         {
2187             GUInt32* pSrc = reinterpret_cast<GUInt32*>(poWK->papabySrcImage[iBand]);
2188             pSrc += iSrcOffset;
2189             for( int i = 0; i < nSrcLen; i += 2 )
2190             {
2191                 adfReal[i] = pSrc[i];
2192                 adfReal[i+1] = pSrc[i+1];
2193             }
2194             break;
2195         }
2196 
2197         case GDT_Float32:
2198         {
2199             float* pSrc = reinterpret_cast<float*>(poWK->papabySrcImage[iBand]);
2200             pSrc += iSrcOffset;
2201             for( int i = 0; i < nSrcLen; i += 2 )
2202             {
2203                 adfReal[i] = pSrc[i];
2204                 adfReal[i+1] = pSrc[i+1];
2205             }
2206             break;
2207         }
2208 
2209        case GDT_Float64:
2210        {
2211             double* pSrc = reinterpret_cast<double*>(poWK->papabySrcImage[iBand]);
2212             pSrc += iSrcOffset;
2213             for( int i = 0; i < nSrcLen; i += 2 )
2214             {
2215                 adfReal[i] = pSrc[i];
2216                 adfReal[i+1] = pSrc[i+1];
2217             }
2218             break;
2219        }
2220 
2221         case GDT_CInt16:
2222         {
2223             GInt16* pSrc = reinterpret_cast<GInt16*>(poWK->papabySrcImage[iBand]);
2224             pSrc += 2 * iSrcOffset;
2225             for( int i = 0; i < nSrcLen; i += 2 )
2226             {
2227                 adfReal[i] = pSrc[2*i];
2228                 padfImag[i] = pSrc[2*i+1];
2229 
2230                 adfReal[i+1] = pSrc[2*i+2];
2231                 padfImag[i+1] = pSrc[2*i+3];
2232             }
2233             break;
2234         }
2235 
2236         case GDT_CInt32:
2237         {
2238             GInt32* pSrc = reinterpret_cast<GInt32*>(poWK->papabySrcImage[iBand]);
2239             pSrc += 2 * iSrcOffset;
2240             for( int i = 0; i < nSrcLen; i += 2 )
2241             {
2242                 adfReal[i] = pSrc[2*i];
2243                 padfImag[i] = pSrc[2*i+1];
2244 
2245                 adfReal[i+1] = pSrc[2*i+2];
2246                 padfImag[i+1] = pSrc[2*i+3];
2247             }
2248             break;
2249         }
2250 
2251         case GDT_CFloat32:
2252         {
2253             float* pSrc = reinterpret_cast<float*>(poWK->papabySrcImage[iBand]);
2254             pSrc += 2 * iSrcOffset;
2255             for( int i = 0; i < nSrcLen; i += 2 )
2256             {
2257                 adfReal[i] = pSrc[2*i];
2258                 padfImag[i] = pSrc[2*i+1];
2259 
2260                 adfReal[i+1] = pSrc[2*i+2];
2261                 padfImag[i+1] = pSrc[2*i+3];
2262             }
2263             break;
2264         }
2265 
2266         case GDT_CFloat64:
2267         {
2268             double* pSrc = reinterpret_cast<double*>(poWK->papabySrcImage[iBand]);
2269             pSrc += 2 * iSrcOffset;
2270             for( int i = 0; i < nSrcLen; i += 2 )
2271             {
2272                 adfReal[i] = pSrc[2*i];
2273                 padfImag[i] = pSrc[2*i+1];
2274 
2275                 adfReal[i+1] = pSrc[2*i+2];
2276                 padfImag[i+1] = pSrc[2*i+3];
2277             }
2278             break;
2279         }
2280 
2281         default:
2282             CPLAssert(false);
2283             if( padfDensity )
2284                 memset( padfDensity, 0, nSrcLen * sizeof(double) );
2285             return false;
2286     }
2287 
2288     if( padfDensity == nullptr )
2289         return true;
2290 
2291     if( poWK->pafUnifiedSrcDensity == nullptr )
2292     {
2293         for( int i = 0; i < nSrcLen; i += 2 )
2294         {
2295             // Take into account earlier calcs.
2296             if( padfDensity[i] > SRC_DENSITY_THRESHOLD )
2297             {
2298                 padfDensity[i] = 1.0;
2299                 bHasValid = true;
2300             }
2301 
2302             if( padfDensity[i+1] > SRC_DENSITY_THRESHOLD )
2303             {
2304                 padfDensity[i+1] = 1.0;
2305                 bHasValid = true;
2306             }
2307         }
2308     }
2309     else
2310     {
2311         for( int i = 0; i < nSrcLen; i += 2 )
2312         {
2313             if( padfDensity[i] > SRC_DENSITY_THRESHOLD )
2314                 padfDensity[i] = poWK->pafUnifiedSrcDensity[iSrcOffset+i];
2315             if( padfDensity[i] > SRC_DENSITY_THRESHOLD )
2316                 bHasValid = true;
2317 
2318             if( padfDensity[i+1] > SRC_DENSITY_THRESHOLD )
2319                 padfDensity[i+1] = poWK->pafUnifiedSrcDensity[iSrcOffset+i+1];
2320             if( padfDensity[i+1] > SRC_DENSITY_THRESHOLD )
2321                 bHasValid = true;
2322         }
2323     }
2324 
2325     return bHasValid;
2326 }
2327 
2328 /************************************************************************/
2329 /*                          GWKGetPixelT()                              */
2330 /************************************************************************/
2331 
2332 template<class T>
2333 static bool GWKGetPixelT( const GDALWarpKernel *poWK, int iBand,
2334                           GPtrDiff_t iSrcOffset, double *pdfDensity,
2335                           T *pValue )
2336 
2337 {
2338     T *pSrc = reinterpret_cast<T *>(poWK->papabySrcImage[iBand]);
2339 
2340     if( ( poWK->panUnifiedSrcValid != nullptr
2341           && !((poWK->panUnifiedSrcValid[iSrcOffset>>5]
2342                 & (0x01 << (iSrcOffset & 0x1f))) ) )
2343         || ( poWK->papanBandSrcValid != nullptr
2344              && poWK->papanBandSrcValid[iBand] != nullptr
2345              && !((poWK->papanBandSrcValid[iBand][iSrcOffset>>5]
2346                    & (0x01 << (iSrcOffset & 0x1f)))) ) )
2347     {
2348         *pdfDensity = 0.0;
2349         return false;
2350     }
2351 
2352     *pValue = pSrc[iSrcOffset];
2353 
2354     if( poWK->pafUnifiedSrcDensity == nullptr )
2355         *pdfDensity = 1.0;
2356     else
2357         *pdfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
2358 
2359     return *pdfDensity != 0.0;
2360 }
2361 
2362 /************************************************************************/
2363 /*                        GWKBilinearResample()                         */
2364 /*     Set of bilinear interpolators                                    */
2365 /************************************************************************/
2366 
2367 static bool GWKBilinearResample4Sample( const GDALWarpKernel *poWK, int iBand,
2368                                 double dfSrcX, double dfSrcY,
2369                                 double *pdfDensity,
2370                                 double *pdfReal, double *pdfImag )
2371 
2372 {
2373     // Save as local variables to avoid following pointers.
2374     const int nSrcXSize = poWK->nSrcXSize;
2375     const int nSrcYSize = poWK->nSrcYSize;
2376 
2377     int iSrcX = static_cast<int>(floor(dfSrcX - 0.5));
2378     int iSrcY = static_cast<int>(floor(dfSrcY - 0.5));
2379     double dfRatioX = 1.5 - (dfSrcX - iSrcX);
2380     double dfRatioY = 1.5 - (dfSrcY - iSrcY);
2381     bool bShifted = false;
2382 
2383     if( iSrcX == -1 )
2384     {
2385         iSrcX = 0;
2386         dfRatioX = 1;
2387     }
2388     if( iSrcY == -1 )
2389     {
2390         iSrcY = 0;
2391         dfRatioY = 1;
2392     }
2393     GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
2394 
2395     // Shift so we don't overrun the array.
2396     if( static_cast<GPtrDiff_t>(nSrcXSize) * nSrcYSize == iSrcOffset + 1
2397         || static_cast<GPtrDiff_t>(nSrcXSize) * nSrcYSize == iSrcOffset + nSrcXSize + 1 )
2398     {
2399         bShifted = true;
2400         --iSrcOffset;
2401     }
2402 
2403     double adfDensity[2] = { 0.0, 0.0 };
2404     double adfReal[2] = { 0.0, 0.0 };
2405     double adfImag[2] = { 0.0, 0.0 };
2406     double dfAccumulatorReal = 0.0;
2407     double dfAccumulatorImag = 0.0;
2408     double dfAccumulatorDensity = 0.0;
2409     double dfAccumulatorDivisor = 0.0;
2410 
2411     const GPtrDiff_t nSrcPixels = static_cast<GPtrDiff_t>(nSrcXSize) * nSrcYSize;
2412     // Get pixel row.
2413     if( iSrcY >= 0 && iSrcY < nSrcYSize
2414         && iSrcOffset >= 0 && iSrcOffset < nSrcPixels
2415         && GWKGetPixelRow( poWK, iBand, iSrcOffset, 1,
2416                            adfDensity, adfReal, adfImag ) )
2417     {
2418         double dfMult1 = dfRatioX * dfRatioY;
2419         double dfMult2 = (1.0-dfRatioX) * dfRatioY;
2420 
2421         // Shifting corrected.
2422         if( bShifted )
2423         {
2424             adfReal[0] = adfReal[1];
2425             adfImag[0] = adfImag[1];
2426             adfDensity[0] = adfDensity[1];
2427         }
2428 
2429         // Upper Left Pixel.
2430         if( iSrcX >= 0 && iSrcX < nSrcXSize
2431             && adfDensity[0] > SRC_DENSITY_THRESHOLD )
2432         {
2433             dfAccumulatorDivisor += dfMult1;
2434 
2435             dfAccumulatorReal += adfReal[0] * dfMult1;
2436             dfAccumulatorImag += adfImag[0] * dfMult1;
2437             dfAccumulatorDensity += adfDensity[0] * dfMult1;
2438         }
2439 
2440         // Upper Right Pixel.
2441         if( iSrcX+1 >= 0 && iSrcX+1 < nSrcXSize
2442             && adfDensity[1] > SRC_DENSITY_THRESHOLD )
2443         {
2444             dfAccumulatorDivisor += dfMult2;
2445 
2446             dfAccumulatorReal += adfReal[1] * dfMult2;
2447             dfAccumulatorImag += adfImag[1] * dfMult2;
2448             dfAccumulatorDensity += adfDensity[1] * dfMult2;
2449         }
2450     }
2451 
2452     // Get pixel row.
2453     if( iSrcY+1 >= 0 && iSrcY+1 < nSrcYSize
2454         && iSrcOffset+nSrcXSize >= 0
2455         && iSrcOffset+nSrcXSize < nSrcPixels
2456         && GWKGetPixelRow( poWK, iBand, iSrcOffset+nSrcXSize, 1,
2457                            adfDensity, adfReal, adfImag ) )
2458     {
2459         double dfMult1 = dfRatioX * (1.0-dfRatioY);
2460         double dfMult2 = (1.0-dfRatioX) * (1.0-dfRatioY);
2461 
2462         // Shifting corrected
2463         if ( bShifted )
2464         {
2465             adfReal[0] = adfReal[1];
2466             adfImag[0] = adfImag[1];
2467             adfDensity[0] = adfDensity[1];
2468         }
2469 
2470         // Lower Left Pixel
2471         if ( iSrcX >= 0 && iSrcX < nSrcXSize
2472              && adfDensity[0] > SRC_DENSITY_THRESHOLD )
2473         {
2474             dfAccumulatorDivisor += dfMult1;
2475 
2476             dfAccumulatorReal += adfReal[0] * dfMult1;
2477             dfAccumulatorImag += adfImag[0] * dfMult1;
2478             dfAccumulatorDensity += adfDensity[0] * dfMult1;
2479         }
2480 
2481         // Lower Right Pixel.
2482         if( iSrcX+1 >= 0 && iSrcX+1 < nSrcXSize
2483             && adfDensity[1] > SRC_DENSITY_THRESHOLD )
2484         {
2485             dfAccumulatorDivisor += dfMult2;
2486 
2487             dfAccumulatorReal += adfReal[1] * dfMult2;
2488             dfAccumulatorImag += adfImag[1] * dfMult2;
2489             dfAccumulatorDensity += adfDensity[1] * dfMult2;
2490         }
2491     }
2492 
2493 /* -------------------------------------------------------------------- */
2494 /*      Return result.                                                  */
2495 /* -------------------------------------------------------------------- */
2496     if( dfAccumulatorDivisor == 1.0 )
2497     {
2498         *pdfReal = dfAccumulatorReal;
2499         *pdfImag = dfAccumulatorImag;
2500         *pdfDensity = dfAccumulatorDensity;
2501         return false;
2502     }
2503     else if( dfAccumulatorDivisor < 0.00001 )
2504     {
2505         *pdfReal = 0.0;
2506         *pdfImag = 0.0;
2507         *pdfDensity = 0.0;
2508         return false;
2509     }
2510     else
2511     {
2512         *pdfReal = dfAccumulatorReal / dfAccumulatorDivisor;
2513         *pdfImag = dfAccumulatorImag / dfAccumulatorDivisor;
2514         *pdfDensity = dfAccumulatorDensity / dfAccumulatorDivisor;
2515         return true;
2516     }
2517 }
2518 
2519 template<class T>
2520 static bool GWKBilinearResampleNoMasks4SampleT( const GDALWarpKernel *poWK, int iBand,
2521                                         double dfSrcX, double dfSrcY,
2522                                         T *pValue )
2523 
2524 {
2525 
2526     const int iSrcX = static_cast<int>(floor(dfSrcX - 0.5));
2527     const int iSrcY = static_cast<int>(floor(dfSrcY - 0.5));
2528     GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
2529     const double dfRatioX = 1.5 - (dfSrcX - iSrcX);
2530     const double dfRatioY = 1.5 - (dfSrcY - iSrcY);
2531 
2532     const T* const pSrc = reinterpret_cast<T *>(poWK->papabySrcImage[iBand]);
2533 
2534 
2535     if( iSrcX >= 0 && iSrcX+1 < poWK->nSrcXSize
2536         && iSrcY >= 0 && iSrcY+1 < poWK->nSrcYSize )
2537     {
2538         const double dfAccumulator =
2539             (pSrc[iSrcOffset] * dfRatioX +
2540              pSrc[iSrcOffset+1] * (1.0 - dfRatioX)) *
2541             dfRatioY +
2542             (pSrc[iSrcOffset+poWK->nSrcXSize] * dfRatioX +
2543              pSrc[iSrcOffset+1+poWK->nSrcXSize] *
2544              (1.0 - dfRatioX)) * (1.0-dfRatioY);
2545 
2546         *pValue = GWKRoundValueT<T>(dfAccumulator);
2547 
2548         return true;
2549     }
2550 
2551     double dfAccumulatorDivisor = 0.0;
2552     double dfAccumulator = 0.0;
2553 
2554     // Upper Left Pixel.
2555     if( iSrcX >= 0 && iSrcX < poWK->nSrcXSize
2556         && iSrcY >= 0 && iSrcY < poWK->nSrcYSize )
2557     {
2558         const double dfMult = dfRatioX * dfRatioY;
2559 
2560         dfAccumulatorDivisor += dfMult;
2561 
2562         dfAccumulator += pSrc[iSrcOffset] * dfMult;
2563     }
2564 
2565     // Upper Right Pixel.
2566     if( iSrcX+1 >= 0 && iSrcX+1 < poWK->nSrcXSize
2567         && iSrcY >= 0 && iSrcY < poWK->nSrcYSize )
2568     {
2569         const double dfMult = (1.0 - dfRatioX) * dfRatioY;
2570 
2571         dfAccumulatorDivisor += dfMult;
2572 
2573         dfAccumulator += pSrc[iSrcOffset+1] * dfMult;
2574     }
2575 
2576     // Lower Right Pixel.
2577     if( iSrcX+1 >= 0 && iSrcX+1 < poWK->nSrcXSize
2578         && iSrcY+1 >= 0 && iSrcY+1 < poWK->nSrcYSize )
2579     {
2580         const double dfMult = (1.0 - dfRatioX) * (1.0 - dfRatioY);
2581 
2582         dfAccumulatorDivisor += dfMult;
2583 
2584         dfAccumulator += pSrc[iSrcOffset+1+poWK->nSrcXSize] * dfMult;
2585     }
2586 
2587     // Lower Left Pixel.
2588     if( iSrcX >= 0 && iSrcX < poWK->nSrcXSize
2589         && iSrcY+1 >= 0 && iSrcY+1 < poWK->nSrcYSize )
2590     {
2591         const double dfMult = dfRatioX * (1.0 - dfRatioY);
2592 
2593         dfAccumulatorDivisor += dfMult;
2594 
2595         dfAccumulator += pSrc[iSrcOffset+poWK->nSrcXSize] * dfMult;
2596     }
2597 
2598 /* -------------------------------------------------------------------- */
2599 /*      Return result.                                                  */
2600 /* -------------------------------------------------------------------- */
2601     double dfValue = 0.0;
2602 
2603     if( dfAccumulatorDivisor < 0.00001 )
2604     {
2605         *pValue = 0;
2606         return false;
2607     }
2608     else if( dfAccumulatorDivisor == 1.0 )
2609     {
2610         dfValue = dfAccumulator;
2611     }
2612     else
2613     {
2614         dfValue = dfAccumulator / dfAccumulatorDivisor;
2615     }
2616 
2617     *pValue = GWKRoundValueT<T>(dfValue);
2618 
2619     return true;
2620 }
2621 
2622 /************************************************************************/
2623 /*                        GWKCubicResample()                            */
2624 /*     Set of bicubic interpolators using cubic convolution.            */
2625 /************************************************************************/
2626 
2627 // http://verona.fi-p.unam.mx/boris/practicas/CubConvInterp.pdf Formula 18
2628 // or http://en.wikipedia.org/wiki/Cubic_Hermite_spline : CINTx(p_1,p0,p1,p2)
2629 // http://en.wikipedia.org/wiki/Bicubic_interpolation: matrix notation
2630 
2631 // TODO(schwehr): Use an inline function.
2632 #define CubicConvolution(distance1,distance2,distance3,f0,f1,f2,f3) \
2633      (             f1                                               \
2634       + 0.5 * (distance1*(f2 - f0)                                  \
2635              + distance2*(2.0*f0 - 5.0*f1 + 4.0*f2 - f3)            \
2636              + distance3*(3.0*(f1 - f2) + f3 - f0)))
2637 
2638 /************************************************************************/
2639 /*                       GWKCubicComputeWeights()                       */
2640 /************************************************************************/
2641 
2642 // adfCoeffs[2] = 1.0 - (adfCoeffs[0] + adfCoeffs[1] - adfCoeffs[3]);
2643 
2644 // TODO(schwehr): Use an inline function.
2645 #define GWKCubicComputeWeights(dfX_, adfCoeffs) \
2646 { \
2647     const double dfX = dfX_; \
2648     const double dfHalfX = 0.5 * dfX; \
2649     const double dfThreeX = 3.0 * dfX; \
2650     const double dfHalfX2 = dfHalfX * dfX; \
2651  \
2652     adfCoeffs[0] = dfHalfX * (-1 + dfX * (2 - dfX)); \
2653     adfCoeffs[1] = 1 + dfHalfX2 * (-5 + dfThreeX); \
2654     adfCoeffs[2] = dfHalfX * (1 + dfX * (4 - dfThreeX)); \
2655     adfCoeffs[3] = dfHalfX2 * (-1 + dfX); \
2656 }
2657 
2658 // TODO(schwehr): Use an inline function.
2659 #define CONVOL4(v1, v2) ((v1)[0] * (v2)[0] + (v1)[1] * (v2)[1] + \
2660                          (v1)[2] * (v2)[2] + (v1)[3] * (v2)[3])
2661 
2662 #if 0
2663 // Optimal (in theory...) for max 2 convolutions: 14 multiplications
2664 // instead of 17.
2665 // TODO(schwehr): Use an inline function.
2666 #define GWKCubicComputeWeights_Optim2MAX(dfX_, adfCoeffs, dfHalfX) \
2667 { \
2668     const double dfX = dfX_; \
2669     dfHalfX = 0.5 * dfX; \
2670     const double dfThreeX = 3.0 * dfX; \
2671     const double dfXMinus1 = dfX - 1; \
2672  \
2673     adfCoeffs[0] = -1 + dfX * (2 - dfX); \
2674     adfCoeffs[1] = dfX * (-5 + dfThreeX); \
2675     /*adfCoeffs[2] = 1 + dfX * (4 - dfThreeX);*/ \
2676     adfCoeffs[2] = -dfXMinus1 - adfCoeffs[1]; \
2677     /*adfCoeffs[3] = dfX * (-1 + dfX); */ \
2678     adfCoeffs[3] = dfXMinus1 - adfCoeffs[0]; \
2679 }
2680 
2681 // TODO(schwehr): Use an inline function.
2682 #define CONVOL4_Optim2MAX(adfCoeffs, v, dfHalfX) \
2683       ((v)[1] + (dfHalfX) * (\
2684           (adfCoeffs)[0] * (v)[0] + (adfCoeffs)[1] * (v)[1] + \
2685           (adfCoeffs)[2] * (v)[2] + (adfCoeffs)[3] * (v)[3]))
2686 #endif
2687 
2688 static bool GWKCubicResample4Sample( const GDALWarpKernel *poWK, int iBand,
2689                                      double dfSrcX, double dfSrcY,
2690                                      double *pdfDensity,
2691                                      double *pdfReal, double *pdfImag )
2692 
2693 {
2694     const int iSrcX = static_cast<int>(dfSrcX - 0.5);
2695     const int iSrcY = static_cast<int>(dfSrcY - 0.5);
2696     GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
2697     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
2698     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
2699     double adfDensity[4] = {};
2700     double adfReal[4] = {};
2701     double adfImag[4] = {};
2702 
2703     // Get the bilinear interpolation at the image borders.
2704     if( iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize
2705         || iSrcY - 1 < 0 || iSrcY + 2 >= poWK->nSrcYSize )
2706         return GWKBilinearResample4Sample( poWK, iBand, dfSrcX, dfSrcY,
2707                                            pdfDensity, pdfReal, pdfImag );
2708 
2709     double adfValueDens[4] = {};
2710     double adfValueReal[4] = {};
2711     double adfValueImag[4] = {};
2712 
2713     double adfCoeffsX[4] = {};
2714     GWKCubicComputeWeights(dfDeltaX, adfCoeffsX);
2715 
2716     for( GPtrDiff_t i = -1; i < 3; i++ )
2717     {
2718         if( !GWKGetPixelRow(poWK, iBand, iSrcOffset + i * poWK->nSrcXSize - 1,
2719                             2, adfDensity, adfReal, adfImag)
2720             || adfDensity[0] < SRC_DENSITY_THRESHOLD
2721             || adfDensity[1] < SRC_DENSITY_THRESHOLD
2722             || adfDensity[2] < SRC_DENSITY_THRESHOLD
2723             || adfDensity[3] < SRC_DENSITY_THRESHOLD )
2724         {
2725             return GWKBilinearResample4Sample( poWK, iBand, dfSrcX, dfSrcY,
2726                                        pdfDensity, pdfReal, pdfImag );
2727         }
2728 
2729         adfValueDens[i + 1] = CONVOL4(adfCoeffsX, adfDensity);
2730         adfValueReal[i + 1] = CONVOL4(adfCoeffsX, adfReal);
2731         adfValueImag[i + 1] = CONVOL4(adfCoeffsX, adfImag);
2732     }
2733 
2734 /* -------------------------------------------------------------------- */
2735 /*      For now, if we have any pixels missing in the kernel area,      */
2736 /*      we fallback on using bilinear interpolation.  Ideally we        */
2737 /*      should do "weight adjustment" of our results similarly to       */
2738 /*      what is done for the cubic spline and lanc. interpolators.      */
2739 /* -------------------------------------------------------------------- */
2740 
2741     double adfCoeffsY[4] = {};
2742     GWKCubicComputeWeights(dfDeltaY, adfCoeffsY);
2743 
2744     *pdfDensity = CONVOL4(adfCoeffsY, adfValueDens);
2745     *pdfReal    = CONVOL4(adfCoeffsY, adfValueReal);
2746     *pdfImag    = CONVOL4(adfCoeffsY, adfValueImag);
2747 
2748     return true;
2749 }
2750 
2751 // We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
2752 // perf benefit.
2753 
2754 #if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
2755 
2756 /************************************************************************/
2757 /*                           XMMLoad4Values()                           */
2758 /*                                                                      */
2759 /*  Load 4 packed byte or uint16, cast them to float and put them in a  */
2760 /*  m128 register.                                                      */
2761 /************************************************************************/
2762 
2763 static CPL_INLINE __m128 XMMLoad4Values(const GByte* ptr)
2764 {
2765 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
2766     unsigned int i;
2767     memcpy(&i, ptr, 4);
2768     __m128i xmm_i = _mm_cvtsi32_si128(s);
2769 #else
2770     __m128i xmm_i = _mm_cvtsi32_si128(*(unsigned int*)(ptr));
2771 #endif
2772     // Zero extend 4 packed unsigned 8-bit integers in a to packed
2773     // 32-bit integers.
2774 #if __SSE4_1__
2775     xmm_i = _mm_cvtepu8_epi32(xmm_i);
2776 #else
2777     xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
2778     xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
2779 #endif
2780     return _mm_cvtepi32_ps(xmm_i);
2781 }
2782 
2783 static CPL_INLINE __m128 XMMLoad4Values(const GUInt16* ptr)
2784 {
2785 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
2786     GUInt64 i;
2787     memcpy(&i, ptr, 8);
2788     __m128i xmm_i =  _mm_cvtsi64_si128(s);
2789 #else
2790     __m128i xmm_i =  _mm_cvtsi64_si128(*(GUInt64*)(ptr));
2791 #endif
2792     // Zero extend 4 packed unsigned 16-bit integers in a to packed
2793     // 32-bit integers.
2794 #if __SSE4_1__
2795     xmm_i = _mm_cvtepu16_epi32(xmm_i);
2796 #else
2797     xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
2798 #endif
2799     return _mm_cvtepi32_ps(xmm_i);
2800 }
2801 
2802 /************************************************************************/
2803 /*                           XMMHorizontalAdd()                         */
2804 /*                                                                      */
2805 /*  Return the sum of the 4 floating points of the register.            */
2806 /************************************************************************/
2807 
2808 #if __SSE3__
2809 static CPL_INLINE float XMMHorizontalAdd(__m128 v)
2810 {
2811     __m128 shuf = _mm_movehdup_ps(v);        // (v3   , v3   , v1   , v1)
2812     __m128 sums = _mm_add_ps(v, shuf);       // (v3+v3, v3+v2, v1+v1, v1+v0)
2813     shuf        = _mm_movehl_ps(shuf, sums); // (v3   , v3   , v3+v3, v3+v2)
2814     sums        = _mm_add_ss(sums, shuf);    // (v1+v0)+(v3+v2)
2815     return        _mm_cvtss_f32(sums);
2816 }
2817 #else
2818 static CPL_INLINE float XMMHorizontalAdd(__m128 v)
2819 {
2820     __m128 shuf = _mm_movehl_ps(v, v);           // (v3   , v2   , v3   , v2)
2821     __m128 sums = _mm_add_ps(v, shuf);           // (v3+v3, v2+v2, v3+v1, v2+v0)
2822     shuf        = _mm_shuffle_ps(sums, sums, 1); // (v2+v0, v2+v0, v2+v0, v3+v1)
2823     sums        = _mm_add_ss(sums, shuf);        // (v2+v0)+(v3+v1)
2824     return _mm_cvtss_f32(sums);
2825 }
2826 #endif
2827 
2828 #endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
2829 
2830 /************************************************************************/
2831 /*            GWKCubicResampleSrcMaskIsDensity4SampleRealT()            */
2832 /************************************************************************/
2833 
2834 // Note: if USE_SSE_CUBIC_IMPL, only instantiate that for Byte and UInt16,
2835 // because there are a few assumptions above those types.
2836 
2837 template<class T>
2838 static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
2839     const GDALWarpKernel *poWK, int iBand,
2840     double dfSrcX, double dfSrcY,
2841     double *pdfDensity,
2842     double *pdfReal )
2843 {
2844     const int iSrcX = static_cast<int>(dfSrcX - 0.5);
2845     const int iSrcY = static_cast<int>(dfSrcY - 0.5);
2846     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
2847 
2848     // Get the bilinear interpolation at the image borders.
2849     if( iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize
2850         || iSrcY - 1 < 0 || iSrcY + 2 >= poWK->nSrcYSize )
2851     {
2852         double adfImagIgnored[4] = {};
2853         return GWKBilinearResample4Sample(poWK, iBand, dfSrcX, dfSrcY,
2854                                           pdfDensity, pdfReal, adfImagIgnored);
2855     }
2856 
2857 #if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
2858     const float fDeltaX = static_cast<float>(dfSrcX) - 0.5f - iSrcX;
2859     const float fDeltaY = static_cast<float>(dfSrcY) - 0.5f - iSrcY;
2860 
2861     // TODO(schwehr): Explain the magic numbers.
2862     float  afTemp[4 + 4 + 4 + 1];
2863     float* pafAligned =
2864         reinterpret_cast<float*>(afTemp + ((size_t)afTemp & 0xf));
2865     float* pafCoeffs = pafAligned;
2866     float* pafDensity = pafAligned + 4;
2867     float* pafValue = pafAligned + 8;
2868 
2869     const float fHalfDeltaX = 0.5f * fDeltaX;
2870     const float fThreeDeltaX = 3.0f * fDeltaX;
2871     const float fHalfDeltaX2 = fHalfDeltaX * fDeltaX;
2872 
2873     pafCoeffs[0] = fHalfDeltaX * (-1 + fDeltaX * (2 - fDeltaX));
2874     pafCoeffs[1] = 1 + fHalfDeltaX2 * (-5 + fThreeDeltaX);
2875     pafCoeffs[2] = fHalfDeltaX * (1 + fDeltaX * (4 - fThreeDeltaX));
2876     pafCoeffs[3] = fHalfDeltaX2 * (-1 + fDeltaX);
2877     __m128 xmmCoeffs = _mm_load_ps(pafCoeffs);
2878     const __m128 xmmThreshold = _mm_load1_ps(&SRC_DENSITY_THRESHOLD);
2879 
2880     __m128 xmmMaskLowDensity = _mm_setzero_ps();
2881     for( GPtrDiff_t i = -1, iOffset = iSrcOffset - poWK->nSrcXSize - 1;
2882          i < 3; i++, iOffset += poWK->nSrcXSize )
2883     {
2884         const __m128 xmmDensity = _mm_loadu_ps(
2885                                     poWK->pafUnifiedSrcDensity + iOffset);
2886         xmmMaskLowDensity = _mm_or_ps(xmmMaskLowDensity,
2887                                       _mm_cmplt_ps(xmmDensity, xmmThreshold));
2888         pafDensity[i + 1] = XMMHorizontalAdd(_mm_mul_ps(xmmCoeffs, xmmDensity));
2889 
2890         const __m128 xmmValues = XMMLoad4Values(
2891                               ((T*) poWK->papabySrcImage[iBand]) + iOffset);
2892         pafValue[i + 1] = XMMHorizontalAdd(_mm_mul_ps(xmmCoeffs, xmmValues));
2893     }
2894     if( _mm_movemask_ps(xmmMaskLowDensity) )
2895     {
2896         double adfImagIgnored[4] = {};
2897         return GWKBilinearResample4Sample( poWK, iBand, dfSrcX, dfSrcY,
2898                                   pdfDensity, pdfReal, adfImagIgnored );
2899     }
2900 
2901     const float fHalfDeltaY = 0.5f * fDeltaY;
2902     const float fThreeDeltaY = 3.0f * fDeltaY;
2903     const float fHalfDeltaY2 = fHalfDeltaY * fDeltaY;
2904 
2905     pafCoeffs[0] = fHalfDeltaY * (-1 + fDeltaY * (2 - fDeltaY));
2906     pafCoeffs[1] = 1 + fHalfDeltaY2 * (-5 + fThreeDeltaY);
2907     pafCoeffs[2] = fHalfDeltaY * (1 + fDeltaY * (4 - fThreeDeltaY));
2908     pafCoeffs[3] = fHalfDeltaY2 * (-1 + fDeltaY);
2909 
2910     xmmCoeffs = _mm_load_ps(pafCoeffs);
2911 
2912     const __m128 xmmDensity = _mm_load_ps(pafDensity);
2913     const __m128 xmmValue = _mm_load_ps(pafValue);
2914     *pdfDensity = XMMHorizontalAdd(_mm_mul_ps(xmmCoeffs, xmmDensity));
2915     *pdfReal = XMMHorizontalAdd(_mm_mul_ps(xmmCoeffs, xmmValue));
2916 
2917     // We did all above computations on float32 whereas the general case is
2918     // float64. Not sure if one is fundamentally more correct than the other
2919     // one, but we want our optimization to give the same result as the
2920     // general case as much as possible, so if the resulting value is
2921     // close to some_int_value + 0.5, redo the computation with the general
2922     // case.
2923     // Note: If other types than Byte or UInt16, will need changes.
2924     if( fabs(*pdfReal - static_cast<int>(*pdfReal) - 0.5) > .007 )
2925         return true;
2926 
2927 #endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
2928 
2929     const double  dfDeltaX = dfSrcX - 0.5 - iSrcX;
2930     const double  dfDeltaY = dfSrcY - 0.5 - iSrcY;
2931 
2932     double adfValueDens[4] = {};
2933     double adfValueReal[4] = {};
2934 
2935     double adfCoeffsX[4] = {};
2936     GWKCubicComputeWeights(dfDeltaX, adfCoeffsX);
2937 
2938     double adfCoeffsY[4] = {};
2939     GWKCubicComputeWeights(dfDeltaY, adfCoeffsY);
2940 
2941     for( GPtrDiff_t i = -1; i < 3; i++ )
2942     {
2943         const GPtrDiff_t iOffset = iSrcOffset+i*poWK->nSrcXSize - 1;
2944 #if !(defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64)))
2945         if( poWK->pafUnifiedSrcDensity[iOffset + 0] < SRC_DENSITY_THRESHOLD ||
2946             poWK->pafUnifiedSrcDensity[iOffset + 1] < SRC_DENSITY_THRESHOLD ||
2947             poWK->pafUnifiedSrcDensity[iOffset + 2] < SRC_DENSITY_THRESHOLD ||
2948             poWK->pafUnifiedSrcDensity[iOffset + 3] < SRC_DENSITY_THRESHOLD )
2949         {
2950             double adfImagIgnored[4] = {};
2951             return GWKBilinearResample4Sample( poWK, iBand, dfSrcX, dfSrcY,
2952                                       pdfDensity, pdfReal, adfImagIgnored );
2953         }
2954 #endif
2955 
2956         adfValueDens[i + 1] = CONVOL4(
2957             adfCoeffsX,
2958             poWK->pafUnifiedSrcDensity + iOffset);
2959 
2960         adfValueReal[i + 1] = CONVOL4(
2961             adfCoeffsX,
2962             reinterpret_cast<T*>(poWK->papabySrcImage[iBand]) + iOffset);
2963     }
2964 
2965     *pdfDensity = CONVOL4(adfCoeffsY, adfValueDens);
2966     *pdfReal    = CONVOL4(adfCoeffsY, adfValueReal);
2967 
2968     return true;
2969 }
2970 
2971 /************************************************************************/
2972 /*              GWKCubicResampleSrcMaskIsDensity4SampleReal()             */
2973 /*     Bi-cubic when source has and only has pafUnifiedSrcDensity.      */
2974 /************************************************************************/
2975 
2976 static bool GWKCubicResampleSrcMaskIsDensity4SampleReal(
2977                              const GDALWarpKernel *poWK, int iBand,
2978                              double dfSrcX, double dfSrcY,
2979                              double *pdfDensity,
2980                              double *pdfReal )
2981 
2982 {
2983     const int iSrcX = static_cast<int>(dfSrcX - 0.5);
2984     const int iSrcY = static_cast<int>(dfSrcY - 0.5);
2985     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
2986     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
2987     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
2988 
2989     // Get the bilinear interpolation at the image borders.
2990     if( iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize
2991         || iSrcY - 1 < 0 || iSrcY + 2 >= poWK->nSrcYSize )
2992     {
2993         double adfImagIgnored[4] = {};
2994         return GWKBilinearResample4Sample(poWK, iBand, dfSrcX, dfSrcY,
2995                                           pdfDensity, pdfReal, adfImagIgnored);
2996     }
2997 
2998     double adfCoeffsX[4] = {};
2999     GWKCubicComputeWeights(dfDeltaX, adfCoeffsX);
3000 
3001     double adfCoeffsY[4] = {};
3002     GWKCubicComputeWeights(dfDeltaY, adfCoeffsY);
3003 
3004     double adfValueDens[4] = {};
3005     double adfValueReal[4] = {};
3006     double adfDensity[4] = {};
3007     double adfReal[4] = {};
3008     double adfImagIgnored[4] = {};
3009 
3010     for( GPtrDiff_t i = -1; i < 3; i++ )
3011     {
3012         if( !GWKGetPixelRow(poWK, iBand, iSrcOffset + i * poWK->nSrcXSize - 1,
3013                             2, adfDensity, adfReal, adfImagIgnored)
3014             || adfDensity[0] < SRC_DENSITY_THRESHOLD
3015             || adfDensity[1] < SRC_DENSITY_THRESHOLD
3016             || adfDensity[2] < SRC_DENSITY_THRESHOLD
3017             || adfDensity[3] < SRC_DENSITY_THRESHOLD )
3018         {
3019             return GWKBilinearResample4Sample( poWK, iBand, dfSrcX, dfSrcY,
3020                                       pdfDensity, pdfReal, adfImagIgnored );
3021         }
3022 
3023         adfValueDens[i + 1] = CONVOL4(adfCoeffsX, adfDensity);
3024         adfValueReal[i + 1] = CONVOL4(adfCoeffsX, adfReal);
3025     }
3026 
3027     *pdfDensity = CONVOL4(adfCoeffsY, adfValueDens);
3028     *pdfReal    = CONVOL4(adfCoeffsY, adfValueReal);
3029 
3030     return true;
3031 }
3032 
3033 template<class T>
3034 static bool GWKCubicResampleNoMasks4SampleT( const GDALWarpKernel *poWK, int iBand,
3035                                      double dfSrcX, double dfSrcY,
3036                                      T *pValue )
3037 
3038 {
3039     const int iSrcX = static_cast<int>(dfSrcX - 0.5);
3040     const int iSrcY = static_cast<int>(dfSrcY - 0.5);
3041     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
3042     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
3043     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
3044     const double dfDeltaY2 = dfDeltaY * dfDeltaY;
3045     const double dfDeltaY3 = dfDeltaY2 * dfDeltaY;
3046 
3047     // Get the bilinear interpolation at the image borders.
3048     if( iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize
3049         || iSrcY - 1 < 0 || iSrcY + 2 >= poWK->nSrcYSize )
3050         return GWKBilinearResampleNoMasks4SampleT ( poWK, iBand, dfSrcX, dfSrcY,
3051                                              pValue );
3052 
3053     double adfCoeffs[4] = {};
3054     GWKCubicComputeWeights(dfDeltaX, adfCoeffs);
3055 
3056     double adfValue[4] = {};
3057 
3058     for( GPtrDiff_t i = -1; i < 3; i++ )
3059     {
3060         const GPtrDiff_t iOffset = iSrcOffset + i * poWK->nSrcXSize - 1;
3061 
3062         adfValue[i + 1] = CONVOL4(
3063             adfCoeffs,
3064             reinterpret_cast<T*>(poWK->papabySrcImage[iBand]) + iOffset);
3065     }
3066 
3067     const double dfValue = CubicConvolution(
3068         dfDeltaY, dfDeltaY2, dfDeltaY3,
3069         adfValue[0], adfValue[1], adfValue[2], adfValue[3]);
3070 
3071     *pValue = GWKClampValueT<T>(dfValue);
3072 
3073     return true;
3074 }
3075 
3076 /************************************************************************/
3077 /*                          GWKLanczosSinc()                            */
3078 /************************************************************************/
3079 
3080 /*
3081  * Lanczos windowed sinc interpolation kernel with radius r.
3082  *        /
3083  *        | sinc(x) * sinc(x/r), if |x| < r
3084  * L(x) = | 1, if x = 0                     ,
3085  *        | 0, otherwise
3086  *        \
3087  *
3088  * where sinc(x) = sin(PI * x) / (PI * x).
3089  */
3090 
3091 static double GWKLanczosSinc( double dfX )
3092 {
3093     if( dfX == 0.0 )
3094         return 1.0;
3095 
3096     const double dfPIX = M_PI * dfX;
3097     const double dfPIXoverR = dfPIX / 3;
3098     const double dfPIX2overR = dfPIX * dfPIXoverR;
3099     return sin(dfPIX) * sin(dfPIXoverR) / dfPIX2overR;
3100 }
3101 
3102 static double GWKLanczosSinc4Values( double* padfValues )
3103 {
3104     for( int i = 0; i < 4; i++ )
3105     {
3106         if( padfValues[i] == 0.0 )
3107         {
3108             padfValues[i] = 1.0;
3109         }
3110         else
3111         {
3112             const double dfPIX = M_PI * padfValues[i];
3113             const double dfPIXoverR = dfPIX / 3;
3114             const double dfPIX2overR = dfPIX * dfPIXoverR;
3115             padfValues[i] = sin(dfPIX) * sin(dfPIXoverR) / dfPIX2overR;
3116         }
3117     }
3118     return padfValues[0] + padfValues[1] + padfValues[2] + padfValues[3];
3119 }
3120 
3121 /************************************************************************/
3122 /*                           GWKBilinear()                              */
3123 /************************************************************************/
3124 
3125 static double GWKBilinear( double dfX )
3126 {
3127     double dfAbsX = fabs(dfX);
3128     if( dfAbsX <= 1.0 )
3129         return 1 - dfAbsX;
3130     else
3131         return 0.0;
3132 }
3133 
3134 static double GWKBilinear4Values( double* padfValues )
3135 {
3136     double dfAbsX0 = fabs(padfValues[0]);
3137     double dfAbsX1 = fabs(padfValues[1]);
3138     double dfAbsX2 = fabs(padfValues[2]);
3139     double dfAbsX3 = fabs(padfValues[3]);
3140     if( dfAbsX0 <= 1.0 )
3141         padfValues[0] = 1 - dfAbsX0;
3142     else
3143         padfValues[0] = 0.0;
3144     if( dfAbsX1 <= 1.0 )
3145         padfValues[1] = 1 - dfAbsX1;
3146     else
3147         padfValues[1] = 0.0;
3148     if( dfAbsX2 <= 1.0 )
3149         padfValues[2] = 1 - dfAbsX2;
3150     else
3151         padfValues[2] = 0.0;
3152     if( dfAbsX3 <= 1.0 )
3153         padfValues[3] = 1 - dfAbsX3;
3154     else
3155         padfValues[3] = 0.0;
3156     return  padfValues[0] +  padfValues[1] + padfValues[2] + padfValues[3];
3157 }
3158 
3159 /************************************************************************/
3160 /*                            GWKCubic()                                */
3161 /************************************************************************/
3162 
3163 static double GWKCubic( double dfX )
3164 {
3165     // http://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
3166     // W(x) formula with a = -0.5 (cubic hermite spline )
3167     // or https://www.cs.utexas.edu/~fussell/courses/cs384g-fall2013/lectures/mitchell/Mitchell.pdf
3168     // k(x) (formula 8) with (B,C)=(0,0.5) the Catmull-Rom spline
3169     double dfAbsX = fabs(dfX);
3170     if( dfAbsX <= 1.0 )
3171     {
3172         double dfX2 = dfX * dfX;
3173         return dfX2 * (1.5 * dfAbsX - 2.5) + 1;
3174     }
3175     else if( dfAbsX <= 2.0 )
3176     {
3177         double dfX2 = dfX * dfX;
3178         return dfX2 * (-0.5 * dfAbsX + 2.5) - 4 * dfAbsX + 2;
3179     }
3180     else
3181         return 0.0;
3182 }
3183 
3184 static double GWKCubic4Values( double* padfValues )
3185 {
3186     const double dfAbsX_0 = fabs(padfValues[0]);
3187     const double dfAbsX_1 = fabs(padfValues[1]);
3188     const double dfAbsX_2 = fabs(padfValues[2]);
3189     const double dfAbsX_3 = fabs(padfValues[3]);
3190     const double dfX2_0 = padfValues[0] * padfValues[0];
3191     const double dfX2_1 = padfValues[1] * padfValues[1];
3192     const double dfX2_2 = padfValues[2] * padfValues[2];
3193     const double dfX2_3 = padfValues[3] * padfValues[3];
3194 
3195     double dfVal0 = 0.0;
3196     if( dfAbsX_0 <= 1.0 )
3197         dfVal0 = dfX2_0 * (1.5 * dfAbsX_0 - 2.5) + 1.0;
3198     else if( dfAbsX_0 <= 2.0 )
3199         dfVal0 = dfX2_0 * (-0.5 * dfAbsX_0 + 2.5) - 4.0 * dfAbsX_0 + 2.0;
3200 
3201     double dfVal1 = 0.0;
3202     if( dfAbsX_1 <= 1.0 )
3203         dfVal1 = dfX2_1 * (1.5 * dfAbsX_1 - 2.5) + 1.0;
3204     else if( dfAbsX_1 <= 2.0 )
3205         dfVal1 = dfX2_1 * (-0.5 * dfAbsX_1 + 2.5) - 4.0 * dfAbsX_1 + 2.0;
3206 
3207     double dfVal2 = 0.0;
3208     if( dfAbsX_2 <= 1.0 )
3209         dfVal2 = dfX2_2 * (1.5 * dfAbsX_2 - 2.5) + 1.0;
3210     else if( dfAbsX_2 <= 2.0 )
3211         dfVal2 = dfX2_2 * (-0.5 * dfAbsX_2 + 2.5) - 4.0 * dfAbsX_2 + 2.0;
3212 
3213     double dfVal3 = 0.0;
3214     if( dfAbsX_3 <= 1.0 )
3215         dfVal3 = dfX2_3 * (1.5 * dfAbsX_3 - 2.5) + 1.0;
3216     else if( dfAbsX_3 <= 2.0 )
3217         dfVal3 = dfX2_3 * (-0.5 * dfAbsX_3 + 2.5) - 4.0 * dfAbsX_3 + 2.0;
3218 
3219     padfValues[0] = dfVal0;
3220     padfValues[1] = dfVal1;
3221     padfValues[2] = dfVal2;
3222     padfValues[3] = dfVal3;
3223     return dfVal0 + dfVal1 + dfVal2 + dfVal3;
3224 }
3225 
3226 /************************************************************************/
3227 /*                           GWKBSpline()                               */
3228 /************************************************************************/
3229 
3230 // https://www.cs.utexas.edu/~fussell/courses/cs384g-fall2013/lectures/mitchell/Mitchell.pdf
3231 // Equation 8 with (B,C)=(1,0)
3232 // 1/6 * ( 3 * |x|^3 -  6 * |x|^2 + 4) |x| < 1
3233 // 1/6 * ( -|x|^3 + 6 |x|^2  - 12|x| + 8) |x| >= 1 and |x| < 2
3234 
3235 static double GWKBSpline( double x )
3236 {
3237     const double xp2 = x + 2.0;
3238     const double xp1 = x + 1.0;
3239     const double xm1 = x - 1.0;
3240 
3241     // This will most likely be used, so we'll compute it ahead of time to
3242     // avoid stalling the processor.
3243     const double xp2c = xp2 * xp2 * xp2;
3244 
3245     // Note that the test is computed only if it is needed.
3246     // TODO(schwehr): Make this easier to follow.
3247     return
3248         xp2 > 0.0
3249         ? ((xp1 > 0.0)?((x > 0.0)?((xm1 > 0.0)?
3250                                    -4.0 * xm1*xm1*xm1:0.0) +
3251                         6.0 * x*x*x:0.0) +
3252            -4.0 * xp1*xp1*xp1:0.0) + xp2c
3253         : 0.0;  // * 0.166666666666666666666
3254 }
3255 
3256 static double GWKBSpline4Values( double* padfValues )
3257 {
3258     for( int i = 0; i < 4; i++ )
3259     {
3260         const double x = padfValues[i];
3261         const double xp2 = x + 2.0;
3262         const double xp1 = x + 1.0;
3263         const double xm1 = x - 1.0;
3264 
3265         // This will most likely be used, so we'll compute it ahead of time to
3266         // avoid stalling the processor.
3267         const double xp2c = xp2 * xp2 * xp2;
3268 
3269         // Note that the test is computed only if it is needed.
3270         // TODO(schwehr): Make this easier to follow.
3271         padfValues[i] = (xp2 > 0.0)?((xp1 > 0.0)?((x > 0.0)?((xm1 > 0.0)?
3272                                                     -4.0 * xm1*xm1*xm1:0.0) +
3273                                         6.0 * x*x*x:0.0) +
3274                             -4.0 * xp1*xp1*xp1:0.0) +
3275                 xp2c:0.0;  // * 0.166666666666666666666
3276     }
3277     return padfValues[0] + padfValues[1] + padfValues[2] + padfValues[3];
3278 }
3279 /************************************************************************/
3280 /*                       GWKResampleWrkStruct                           */
3281 /************************************************************************/
3282 
3283 typedef struct _GWKResampleWrkStruct GWKResampleWrkStruct;
3284 
3285 typedef bool (*pfnGWKResampleType) ( const GDALWarpKernel *poWK, int iBand,
3286                                      double dfSrcX, double dfSrcY,
3287                                      double *pdfDensity,
3288                                      double *pdfReal, double *pdfImag,
3289                                      GWKResampleWrkStruct* psWrkStruct );
3290 
3291 struct _GWKResampleWrkStruct
3292 {
3293     pfnGWKResampleType pfnGWKResample;
3294 
3295     // Space for saved X weights.
3296     double  *padfWeightsX;
3297     bool    *pabCalcX;
3298 
3299     double  *padfWeightsY; // Only used by GWKResampleOptimizedLanczos.
3300     int      iLastSrcX; // Only used by GWKResampleOptimizedLanczos.
3301     int      iLastSrcY; // Only used by GWKResampleOptimizedLanczos.
3302     double   dfLastDeltaX; // Only used by GWKResampleOptimizedLanczos.
3303     double   dfLastDeltaY; // Only used by GWKResampleOptimizedLanczos.
3304 
3305     // Space for saving a row of pixels.
3306     double  *padfRowDensity;
3307     double  *padfRowReal;
3308     double  *padfRowImag;
3309 };
3310 
3311 /************************************************************************/
3312 /*                    GWKResampleCreateWrkStruct()                      */
3313 /************************************************************************/
3314 
3315 static bool GWKResample( const GDALWarpKernel *poWK, int iBand,
3316                         double dfSrcX, double dfSrcY,
3317                         double *pdfDensity,
3318                         double *pdfReal, double *pdfImag,
3319                         GWKResampleWrkStruct* psWrkStruct );
3320 
3321 static bool GWKResampleOptimizedLanczos( const GDALWarpKernel *poWK, int iBand,
3322                                         double dfSrcX, double dfSrcY,
3323                                         double *pdfDensity,
3324                                         double *pdfReal, double *pdfImag,
3325                                         GWKResampleWrkStruct* psWrkStruct );
3326 
3327 static GWKResampleWrkStruct* GWKResampleCreateWrkStruct(GDALWarpKernel *poWK)
3328 {
3329     const int nXDist = ( poWK->nXRadius + 1 ) * 2;
3330     const int nYDist = ( poWK->nYRadius + 1 ) * 2;
3331 
3332     GWKResampleWrkStruct* psWrkStruct = static_cast<GWKResampleWrkStruct *>(
3333         CPLMalloc(sizeof(GWKResampleWrkStruct)));
3334 
3335     // Alloc space for saved X weights.
3336     psWrkStruct->padfWeightsX =
3337         static_cast<double *>(CPLCalloc( nXDist, sizeof(double)));
3338     psWrkStruct->pabCalcX =
3339         static_cast<bool *>(CPLMalloc(nXDist * sizeof(bool)));
3340 
3341     psWrkStruct->padfWeightsY =
3342         static_cast<double *>(CPLCalloc(nYDist, sizeof(double)));
3343     psWrkStruct->iLastSrcX = -10;
3344     psWrkStruct->iLastSrcY = -10;
3345     psWrkStruct->dfLastDeltaX = -10;
3346     psWrkStruct->dfLastDeltaY = -10;
3347 
3348     // Alloc space for saving a row of pixels.
3349     if( poWK->pafUnifiedSrcDensity == nullptr &&
3350         poWK->panUnifiedSrcValid == nullptr &&
3351         poWK->papanBandSrcValid == nullptr )
3352     {
3353         psWrkStruct->padfRowDensity = nullptr;
3354     }
3355     else
3356     {
3357         psWrkStruct->padfRowDensity =
3358             static_cast<double *>(CPLCalloc(nXDist, sizeof(double)));
3359     }
3360     psWrkStruct->padfRowReal =
3361         static_cast<double *>(CPLCalloc(nXDist, sizeof(double)));
3362     psWrkStruct->padfRowImag =
3363         static_cast<double *>(CPLCalloc(nXDist, sizeof(double)));
3364 
3365     if( poWK->eResample == GRA_Lanczos )
3366     {
3367         psWrkStruct->pfnGWKResample = GWKResampleOptimizedLanczos;
3368 
3369         const double dfXScale = poWK->dfXScale;
3370         if( dfXScale < 1.0 )
3371         {
3372             int iMin = poWK->nFiltInitX;
3373             int iMax = poWK->nXRadius;
3374             while( iMin * dfXScale < -3.0 )
3375                 iMin++;
3376             while( iMax * dfXScale > 3.0 )
3377                 iMax--;
3378 
3379             for( int i = iMin; i <= iMax; ++i )
3380             {
3381                 psWrkStruct->padfWeightsX[i-poWK->nFiltInitX] =
3382                     GWKLanczosSinc(i * dfXScale);
3383             }
3384         }
3385 
3386         const double dfYScale = poWK->dfYScale;
3387         if( dfYScale < 1.0 )
3388         {
3389             int jMin = poWK->nFiltInitY;
3390             int jMax = poWK->nYRadius;
3391             while( jMin * dfYScale < -3.0 )
3392                 jMin++;
3393             while( jMax * dfYScale > 3.0 )
3394                 jMax--;
3395 
3396             for( int j = jMin; j <= jMax; ++j )
3397             {
3398                 psWrkStruct->padfWeightsY[j-poWK->nFiltInitY] =
3399                     GWKLanczosSinc(j * dfYScale);
3400             }
3401         }
3402     }
3403     else
3404         psWrkStruct->pfnGWKResample = GWKResample;
3405 
3406     return psWrkStruct;
3407 }
3408 
3409 /************************************************************************/
3410 /*                    GWKResampleDeleteWrkStruct()                      */
3411 /************************************************************************/
3412 
3413 static void GWKResampleDeleteWrkStruct(GWKResampleWrkStruct* psWrkStruct)
3414 {
3415     CPLFree( psWrkStruct->padfWeightsX );
3416     CPLFree( psWrkStruct->padfWeightsY );
3417     CPLFree( psWrkStruct->pabCalcX );
3418     CPLFree( psWrkStruct->padfRowDensity );
3419     CPLFree( psWrkStruct->padfRowReal );
3420     CPLFree( psWrkStruct->padfRowImag );
3421     CPLFree( psWrkStruct );
3422 }
3423 
3424 /************************************************************************/
3425 /*                           GWKResample()                              */
3426 /************************************************************************/
3427 
3428 static bool GWKResample( const GDALWarpKernel *poWK, int iBand,
3429                         double dfSrcX, double dfSrcY,
3430                         double *pdfDensity,
3431                         double *pdfReal, double *pdfImag,
3432                         GWKResampleWrkStruct* psWrkStruct )
3433 
3434 {
3435     // Save as local variables to avoid following pointers in loops.
3436     const int nSrcXSize = poWK->nSrcXSize;
3437     const int nSrcYSize = poWK->nSrcYSize;
3438 
3439     double dfAccumulatorReal = 0.0;
3440     double dfAccumulatorImag = 0.0;
3441     double dfAccumulatorDensity = 0.0;
3442     double dfAccumulatorWeight = 0.0;
3443     const int iSrcX = static_cast<int>(floor(dfSrcX - 0.5));
3444     const int iSrcY = static_cast<int>(floor(dfSrcY - 0.5));
3445     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
3446     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
3447     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
3448 
3449     const double dfXScale = poWK->dfXScale;
3450     const double dfYScale = poWK->dfYScale;
3451 
3452     const int nXDist = ( poWK->nXRadius + 1 ) * 2;
3453 
3454     // Space for saved X weights.
3455     double *padfWeightsX = psWrkStruct->padfWeightsX;
3456     bool *pabCalcX = psWrkStruct->pabCalcX;
3457 
3458     // Space for saving a row of pixels.
3459     double *padfRowDensity = psWrkStruct->padfRowDensity;
3460     double *padfRowReal = psWrkStruct->padfRowReal;
3461     double *padfRowImag = psWrkStruct->padfRowImag;
3462 
3463     // Mark as needing calculation (don't calculate the weights yet,
3464     // because a mask may render it unnecessary).
3465     memset( pabCalcX, false, nXDist * sizeof(bool) );
3466 
3467     FilterFuncType pfnGetWeight = apfGWKFilter[poWK->eResample];
3468     CPLAssert(pfnGetWeight);
3469 
3470     // Skip sampling over edge of image.
3471     int j = poWK->nFiltInitY;
3472     int jMax= poWK->nYRadius;
3473     if( iSrcY + j < 0 )
3474         j = -iSrcY;
3475     if( iSrcY + jMax >= nSrcYSize )
3476         jMax = nSrcYSize - iSrcY - 1;
3477 
3478     int iMin = poWK->nFiltInitX;
3479     int iMax = poWK->nXRadius;
3480     if( iSrcX + iMin < 0 )
3481         iMin = -iSrcX;
3482     if( iSrcX + iMax >= nSrcXSize )
3483         iMax = nSrcXSize - iSrcX - 1;
3484 
3485     const int bXScaleBelow1 = ( dfXScale < 1.0 );
3486     const int bYScaleBelow1 = ( dfYScale < 1.0 );
3487 
3488     GPtrDiff_t iRowOffset = iSrcOffset + static_cast<GPtrDiff_t>(j - 1) * nSrcXSize + iMin;
3489 
3490     // Loop over pixel rows in the kernel.
3491     for( ; j <= jMax; ++j )
3492     {
3493         iRowOffset += nSrcXSize;
3494 
3495         // Get pixel values.
3496         // We can potentially read extra elements after the "normal" end of the
3497         // source arrays, but the contract of papabySrcImage[iBand],
3498         // papanBandSrcValid[iBand], panUnifiedSrcValid and pafUnifiedSrcDensity
3499         // is to have WARP_EXTRA_ELTS reserved at their end.
3500         if( !GWKGetPixelRow( poWK, iBand, iRowOffset, (iMax-iMin+2)/2,
3501                              padfRowDensity, padfRowReal, padfRowImag ) )
3502             continue;
3503 
3504         // Calculate the Y weight.
3505         double dfWeight1 = ( bYScaleBelow1 ) ?
3506                 pfnGetWeight((j - dfDeltaY) * dfYScale):
3507                 pfnGetWeight(j - dfDeltaY);
3508 
3509         // Iterate over pixels in row.
3510         double dfAccumulatorRealLocal = 0.0;
3511         double dfAccumulatorImagLocal = 0.0;
3512         double dfAccumulatorDensityLocal = 0.0;
3513         double dfAccumulatorWeightLocal = 0.0;
3514 
3515         for( int i = iMin; i <= iMax; ++i )
3516         {
3517             // Skip sampling if pixel has zero density.
3518             if( padfRowDensity != nullptr &&
3519                 padfRowDensity[i-iMin] < SRC_DENSITY_THRESHOLD )
3520                 continue;
3521 
3522             double dfWeight2 = 0.0;
3523 
3524             // Make or use a cached set of weights for this row.
3525             if( pabCalcX[i-iMin] )
3526             {
3527                 // Use saved weight value instead of recomputing it.
3528                 dfWeight2 = padfWeightsX[i-iMin];
3529             }
3530             else
3531             {
3532                 // Calculate & save the X weight.
3533                 padfWeightsX[i-iMin] = dfWeight2 = ( bXScaleBelow1 ) ?
3534                         pfnGetWeight((i - dfDeltaX) * dfXScale):
3535                         pfnGetWeight(i - dfDeltaX);
3536 
3537                 pabCalcX[i-iMin] = true;
3538             }
3539 
3540             // Accumulate!
3541             dfAccumulatorRealLocal += padfRowReal[i-iMin] * dfWeight2;
3542             dfAccumulatorImagLocal += padfRowImag[i-iMin] * dfWeight2;
3543             if( padfRowDensity != nullptr )
3544                 dfAccumulatorDensityLocal += padfRowDensity[i-iMin] * dfWeight2;
3545             dfAccumulatorWeightLocal += dfWeight2;
3546         }
3547 
3548         dfAccumulatorReal += dfAccumulatorRealLocal * dfWeight1;
3549         dfAccumulatorImag += dfAccumulatorImagLocal * dfWeight1;
3550         dfAccumulatorDensity += dfAccumulatorDensityLocal * dfWeight1;
3551         dfAccumulatorWeight += dfAccumulatorWeightLocal * dfWeight1;
3552     }
3553 
3554     if( dfAccumulatorWeight < 0.000001 ||
3555         (padfRowDensity != nullptr && dfAccumulatorDensity < 0.000001) )
3556     {
3557         *pdfDensity = 0.0;
3558         return false;
3559     }
3560 
3561     // Calculate the output taking into account weighting.
3562     if( dfAccumulatorWeight < 0.99999 || dfAccumulatorWeight > 1.00001 )
3563     {
3564         *pdfReal = dfAccumulatorReal / dfAccumulatorWeight;
3565         *pdfImag = dfAccumulatorImag / dfAccumulatorWeight;
3566         if( padfRowDensity != nullptr )
3567             *pdfDensity = dfAccumulatorDensity / dfAccumulatorWeight;
3568         else
3569             *pdfDensity = 1.0;
3570     }
3571     else
3572     {
3573         *pdfReal = dfAccumulatorReal;
3574         *pdfImag = dfAccumulatorImag;
3575         if( padfRowDensity != nullptr )
3576             *pdfDensity = dfAccumulatorDensity;
3577         else
3578             *pdfDensity = 1.0;
3579     }
3580 
3581     return true;
3582 }
3583 
3584 /************************************************************************/
3585 /*                      GWKResampleOptimizedLanczos()                   */
3586 /************************************************************************/
3587 
3588 static bool GWKResampleOptimizedLanczos( const GDALWarpKernel *poWK, int iBand,
3589                         double dfSrcX, double dfSrcY,
3590                         double *pdfDensity,
3591                         double *pdfReal, double *pdfImag,
3592                         GWKResampleWrkStruct* psWrkStruct )
3593 
3594 {
3595     // Save as local variables to avoid following pointers in loops.
3596     const int nSrcXSize = poWK->nSrcXSize;
3597     const int nSrcYSize = poWK->nSrcYSize;
3598 
3599     double dfAccumulatorReal = 0.0;
3600     double dfAccumulatorImag = 0.0;
3601     double dfAccumulatorDensity = 0.0;
3602     double dfAccumulatorWeight = 0.0;
3603     const int     iSrcX = static_cast<int>(floor( dfSrcX - 0.5 ));
3604     const int     iSrcY = static_cast<int>(floor( dfSrcY - 0.5 ));
3605     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
3606     const double  dfDeltaX = dfSrcX - 0.5 - iSrcX;
3607     const double  dfDeltaY = dfSrcY - 0.5 - iSrcY;
3608 
3609     const double dfXScale = poWK->dfXScale;
3610     const double dfYScale = poWK->dfYScale;
3611 
3612     // Space for saved X weights.
3613     double *padfWeightsX = psWrkStruct->padfWeightsX;
3614     double *padfWeightsY = psWrkStruct->padfWeightsY;
3615 
3616     // Space for saving a row of pixels.
3617     double *padfRowDensity = psWrkStruct->padfRowDensity;
3618     double *padfRowReal = psWrkStruct->padfRowReal;
3619     double *padfRowImag = psWrkStruct->padfRowImag;
3620 
3621     // Skip sampling over edge of image.
3622     int jMin = poWK->nFiltInitY;
3623     int jMax = poWK->nYRadius;
3624     if( iSrcY + jMin < 0 )
3625         jMin = -iSrcY;
3626     if( iSrcY + jMax >= nSrcYSize )
3627         jMax = nSrcYSize - iSrcY - 1;
3628 
3629     int iMin = poWK->nFiltInitX;
3630     int iMax = poWK->nXRadius;
3631     if( iSrcX + iMin < 0 )
3632         iMin = -iSrcX;
3633     if( iSrcX + iMax >= nSrcXSize )
3634         iMax = nSrcXSize - iSrcX - 1;
3635 
3636     if( dfXScale < 1.0 )
3637     {
3638         while( iMin * dfXScale < -3.0 )
3639             iMin++;
3640         while( iMax * dfXScale > 3.0 )
3641             iMax--;
3642         // padfWeightsX computed in GWKResampleCreateWrkStruct.
3643     }
3644     else
3645     {
3646         while( iMin - dfDeltaX < -3.0 )
3647             iMin++;
3648         while( iMax - dfDeltaX > 3.0 )
3649             iMax--;
3650 
3651         if( iSrcX != psWrkStruct->iLastSrcX ||
3652             dfDeltaX != psWrkStruct->dfLastDeltaX )
3653         {
3654             // Optimisation of GWKLanczosSinc(i - dfDeltaX) based on the
3655             // following trigonometric formulas.
3656 
3657 // TODO(schwehr): Move this somewhere where it can be rendered at LaTeX.
3658 // sin(M_PI * (dfBase + k)) = sin(M_PI * dfBase) * cos(M_PI * k) + cos(M_PI * dfBase) * sin(M_PI * k)
3659 // sin(M_PI * (dfBase + k)) = dfSinPIBase * cos(M_PI * k) + dfCosPIBase * sin(M_PI * k)
3660 // sin(M_PI * (dfBase + k)) = dfSinPIBase * cos(M_PI * k)
3661 // sin(M_PI * (dfBase + k)) = dfSinPIBase * (((k % 2) == 0) ? 1 : -1)
3662 
3663 // sin(M_PI / dfR * (dfBase + k)) = sin(M_PI / dfR * dfBase) * cos(M_PI / dfR * k) + cos(M_PI / dfR * dfBase) * sin(M_PI / dfR * k)
3664 // sin(M_PI / dfR * (dfBase + k)) = dfSinPIBaseOverR * cos(M_PI / dfR * k) + dfCosPIBaseOverR * sin(M_PI / dfR * k)
3665 
3666             const double dfSinPIDeltaXOver3 = sin((-M_PI / 3.0) * dfDeltaX);
3667             const double dfSin2PIDeltaXOver3 =
3668                 dfSinPIDeltaXOver3 * dfSinPIDeltaXOver3;
3669             // Ok to use sqrt(1-sin^2) since M_PI / 3 * dfDeltaX < PI/2.
3670             const double dfCosPIDeltaXOver3 = sqrt(1.0 - dfSin2PIDeltaXOver3);
3671             const double dfSinPIDeltaX =
3672                 (3.0 - 4 * dfSin2PIDeltaXOver3) * dfSinPIDeltaXOver3;
3673             const double dfInvPI2Over3 = 3.0 / (M_PI * M_PI);
3674             const double dfInvPI2Over3xSinPIDeltaX =
3675                 dfInvPI2Over3 * dfSinPIDeltaX;
3676             const double dfInvPI2Over3xSinPIDeltaXxm0d5SinPIDeltaXOver3 =
3677                 -0.5 * dfInvPI2Over3xSinPIDeltaX * dfSinPIDeltaXOver3;
3678             const double dfSinPIOver3 = 0.8660254037844386;
3679             const double dfInvPI2Over3xSinPIDeltaXxSinPIOver3xCosPIDeltaXOver3 =
3680                 dfSinPIOver3 * dfInvPI2Over3xSinPIDeltaX * dfCosPIDeltaXOver3;
3681             const double padfCst[] = {
3682                 dfInvPI2Over3xSinPIDeltaX * dfSinPIDeltaXOver3,
3683                 dfInvPI2Over3xSinPIDeltaXxm0d5SinPIDeltaXOver3 -
3684                         dfInvPI2Over3xSinPIDeltaXxSinPIOver3xCosPIDeltaXOver3,
3685                 dfInvPI2Over3xSinPIDeltaXxm0d5SinPIDeltaXOver3 +
3686                         dfInvPI2Over3xSinPIDeltaXxSinPIOver3xCosPIDeltaXOver3 };
3687 
3688             for( int i = iMin; i <= iMax; ++i )
3689             {
3690                 const double dfX = i - dfDeltaX;
3691                 if( dfX == 0.0 )
3692                     padfWeightsX[i-poWK->nFiltInitX] = 1.0;
3693                 else
3694                     padfWeightsX[i-poWK->nFiltInitX] =
3695                                             padfCst[(i + 3) % 3] / (dfX * dfX);
3696 #if DEBUG_VERBOSE
3697                 // TODO(schwehr): AlmostEqual.
3698                 //CPLAssert(fabs(padfWeightsX[i-poWK->nFiltInitX] -
3699                 //               GWKLanczosSinc(dfX, 3.0)) < 1e-10);
3700 #endif
3701             }
3702 
3703             psWrkStruct->iLastSrcX = iSrcX;
3704             psWrkStruct->dfLastDeltaX = dfDeltaX;
3705         }
3706     }
3707 
3708     if( dfYScale < 1.0 )
3709     {
3710         while( jMin * dfYScale < -3.0 )
3711             jMin++;
3712         while( jMax * dfYScale > 3.0 )
3713             jMax--;
3714         // padfWeightsY computed in GWKResampleCreateWrkStruct.
3715     }
3716     else
3717     {
3718         while( jMin - dfDeltaY < -3.0 )
3719             jMin++;
3720         while( jMax - dfDeltaY > 3.0 )
3721             jMax--;
3722 
3723         if( iSrcY != psWrkStruct->iLastSrcY ||
3724             dfDeltaY != psWrkStruct->dfLastDeltaY )
3725         {
3726             const double dfSinPIDeltaYOver3 = sin((-M_PI / 3.0) * dfDeltaY);
3727             const double dfSin2PIDeltaYOver3 =
3728                 dfSinPIDeltaYOver3 * dfSinPIDeltaYOver3;
3729             // Ok to use sqrt(1-sin^2) since M_PI / 3 * dfDeltaY < PI/2.
3730             const double dfCosPIDeltaYOver3 = sqrt(1.0 - dfSin2PIDeltaYOver3);
3731             const double dfSinPIDeltaY =
3732                 (3.0 - 4.0 * dfSin2PIDeltaYOver3) * dfSinPIDeltaYOver3;
3733             const double dfInvPI2Over3 = 3.0 / (M_PI * M_PI);
3734             const double dfInvPI2Over3xSinPIDeltaY =
3735                 dfInvPI2Over3 * dfSinPIDeltaY;
3736             const double dfInvPI2Over3xSinPIDeltaYxm0d5SinPIDeltaYOver3 =
3737                 -0.5 * dfInvPI2Over3xSinPIDeltaY * dfSinPIDeltaYOver3;
3738             const double dfSinPIOver3 = 0.8660254037844386;
3739             const double dfInvPI2Over3xSinPIDeltaYxSinPIOver3xCosPIDeltaYOver3 =
3740                 dfSinPIOver3 * dfInvPI2Over3xSinPIDeltaY * dfCosPIDeltaYOver3;
3741             const double padfCst[] = {
3742                 dfInvPI2Over3xSinPIDeltaY * dfSinPIDeltaYOver3,
3743                 dfInvPI2Over3xSinPIDeltaYxm0d5SinPIDeltaYOver3 -
3744                         dfInvPI2Over3xSinPIDeltaYxSinPIOver3xCosPIDeltaYOver3,
3745                 dfInvPI2Over3xSinPIDeltaYxm0d5SinPIDeltaYOver3 +
3746                         dfInvPI2Over3xSinPIDeltaYxSinPIOver3xCosPIDeltaYOver3 };
3747 
3748             for( int j = jMin; j <= jMax; ++j )
3749             {
3750                 const double dfY = j - dfDeltaY;
3751                 if( dfY == 0.0 )
3752                     padfWeightsY[j-poWK->nFiltInitY] = 1.0;
3753                 else
3754                     padfWeightsY[j-poWK->nFiltInitY] =
3755                                             padfCst[(j + 3) % 3] / (dfY * dfY);
3756 #if DEBUG_VERBOSE
3757                 // TODO(schwehr): AlmostEqual.
3758                 //CPLAssert(fabs(padfWeightsY[j-poWK->nFiltInitY] -
3759                 //               GWKLanczosSinc(dfY, 3.0)) < 1e-10);
3760 #endif
3761             }
3762 
3763             psWrkStruct->iLastSrcY = iSrcY;
3764             psWrkStruct->dfLastDeltaY = dfDeltaY;
3765         }
3766     }
3767 
3768     GPtrDiff_t iRowOffset = iSrcOffset + static_cast<GPtrDiff_t>(jMin - 1) * nSrcXSize + iMin;
3769 
3770     // If we have no density information, we can simply compute the
3771     // accumulated weight.
3772     if( padfRowDensity == nullptr )
3773     {
3774         double dfRowAccWeight = 0.0;
3775         for( int i = iMin; i <= iMax; ++i )
3776         {
3777             dfRowAccWeight += padfWeightsX[i-poWK->nFiltInitX];
3778         }
3779         double dfColAccWeight = 0.0;
3780         for( int j = jMin; j <= jMax; ++j )
3781         {
3782             dfColAccWeight += padfWeightsY[j-poWK->nFiltInitY];
3783         }
3784         dfAccumulatorWeight = dfRowAccWeight * dfColAccWeight;
3785     }
3786 
3787     const bool bIsNonComplex = !GDALDataTypeIsComplex(poWK->eWorkingDataType);
3788 
3789     // Loop over pixel rows in the kernel.
3790     int nCountValid = 0;
3791     for( int j = jMin; j <= jMax; ++j )
3792     {
3793         iRowOffset += nSrcXSize;
3794 
3795         // Get pixel values.
3796         // We can potentially read extra elements after the "normal" end of the
3797         // source arrays, but the contract of papabySrcImage[iBand],
3798         // papanBandSrcValid[iBand], panUnifiedSrcValid and pafUnifiedSrcDensity
3799         // is to have WARP_EXTRA_ELTS reserved at their end.
3800         if( !GWKGetPixelRow( poWK, iBand, iRowOffset, (iMax-iMin+2)/2,
3801                              padfRowDensity, padfRowReal, padfRowImag ) )
3802             continue;
3803 
3804         const double dfWeight1 = padfWeightsY[j-poWK->nFiltInitY];
3805 
3806         // Iterate over pixels in row.
3807         if( padfRowDensity != nullptr )
3808         {
3809             for( int i = iMin; i <= iMax; ++i )
3810             {
3811                 // Skip sampling if pixel has zero density.
3812                 if( padfRowDensity[i - iMin] < SRC_DENSITY_THRESHOLD )
3813                     continue;
3814 
3815                 nCountValid ++;
3816 
3817                 //  Use a cached set of weights for this row.
3818                 const double dfWeight2 =
3819                     dfWeight1 * padfWeightsX[i- poWK->nFiltInitX];
3820 
3821                 // Accumulate!
3822                 dfAccumulatorReal += padfRowReal[i - iMin] * dfWeight2;
3823                 dfAccumulatorImag += padfRowImag[i - iMin] * dfWeight2;
3824                 dfAccumulatorDensity += padfRowDensity[i - iMin] * dfWeight2;
3825                 dfAccumulatorWeight += dfWeight2;
3826             }
3827         }
3828         else if( bIsNonComplex )
3829         {
3830             double dfRowAccReal = 0.0;
3831             for( int i = iMin; i <= iMax; ++i )
3832             {
3833                 const double dfWeight2 = padfWeightsX[i- poWK->nFiltInitX];
3834 
3835                 // Accumulate!
3836                 dfRowAccReal += padfRowReal[i - iMin] * dfWeight2;
3837             }
3838 
3839             dfAccumulatorReal += dfRowAccReal * dfWeight1;
3840         }
3841         else
3842         {
3843             double dfRowAccReal = 0.0;
3844             double dfRowAccImag = 0.0;
3845             for( int i = iMin; i <= iMax; ++i )
3846             {
3847                 const double dfWeight2 = padfWeightsX[i- poWK->nFiltInitX];
3848 
3849                 // Accumulate!
3850                 dfRowAccReal += padfRowReal[i - iMin] * dfWeight2;
3851                 dfRowAccImag += padfRowImag[i - iMin] * dfWeight2;
3852             }
3853 
3854             dfAccumulatorReal += dfRowAccReal * dfWeight1;
3855             dfAccumulatorImag += dfRowAccImag * dfWeight1;
3856         }
3857     }
3858 
3859     if( dfAccumulatorWeight < 0.000001 ||
3860          (padfRowDensity != nullptr && (
3861              dfAccumulatorDensity < 0.000001 ||
3862              nCountValid < (jMax - jMin + 1) * (iMax - iMin + 1)/2)) )
3863     {
3864         *pdfDensity = 0.0;
3865         return false;
3866     }
3867 
3868     // Calculate the output taking into account weighting.
3869     if( dfAccumulatorWeight < 0.99999 || dfAccumulatorWeight > 1.00001 )
3870     {
3871         const double dfInvAcc = 1.0 / dfAccumulatorWeight;
3872         *pdfReal = dfAccumulatorReal * dfInvAcc;
3873         *pdfImag = dfAccumulatorImag * dfInvAcc;
3874         if( padfRowDensity != nullptr )
3875             *pdfDensity = dfAccumulatorDensity * dfInvAcc;
3876         else
3877             *pdfDensity = 1.0;
3878     }
3879     else
3880     {
3881         *pdfReal = dfAccumulatorReal;
3882         *pdfImag = dfAccumulatorImag;
3883         if( padfRowDensity != nullptr )
3884             *pdfDensity = dfAccumulatorDensity;
3885         else
3886             *pdfDensity = 1.0;
3887     }
3888 
3889     return true;
3890 }
3891 
3892 /************************************************************************/
3893 /*                        GWKResampleNoMasksT()                         */
3894 /************************************************************************/
3895 
3896 template <class T>
3897 static bool GWKResampleNoMasksT( const GDALWarpKernel *poWK, int iBand,
3898                                 double dfSrcX, double dfSrcY,
3899                                 T *pValue, double *padfWeight )
3900 
3901 {
3902     // Commonly used; save locally.
3903     const int nSrcXSize = poWK->nSrcXSize;
3904     const int nSrcYSize = poWK->nSrcYSize;
3905 
3906     const int iSrcX = static_cast<int>(floor(dfSrcX - 0.5));
3907     const int iSrcY = static_cast<int>(floor(dfSrcY - 0.5));
3908     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
3909 
3910     const int nXRadius = poWK->nXRadius;
3911     const int nYRadius = poWK->nYRadius;
3912 
3913     // Politely refuse to process invalid coordinates or obscenely small image.
3914     if( iSrcX >= nSrcXSize || iSrcY >= nSrcYSize
3915         || nXRadius > nSrcXSize || nYRadius > nSrcYSize )
3916         return GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
3917                                                   pValue);
3918 
3919     T* pSrcBand = reinterpret_cast<T*>(poWK->papabySrcImage[iBand]);
3920     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
3921     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
3922 
3923     const FilterFuncType pfnGetWeight = apfGWKFilter[poWK->eResample];
3924     CPLAssert(pfnGetWeight);
3925     const FilterFunc4ValuesType pfnGetWeight4Values =
3926         apfGWKFilter4Values[poWK->eResample];
3927     CPLAssert(pfnGetWeight4Values);
3928 
3929     const double dfXScale = std::min(poWK->dfXScale, 1.0);
3930     const double dfYScale = std::min(poWK->dfYScale, 1.0);
3931 
3932     // Loop over all rows in the kernel.
3933     double dfAccumulatorWeightHorizontal = 0.0;
3934     double dfAccumulatorWeightVertical = 0.0;
3935 
3936     int iMin = 1 - nXRadius;
3937     if( iSrcX + iMin < 0 )
3938         iMin = -iSrcX;
3939     int iMax = nXRadius;
3940     if( iSrcX + iMax >= nSrcXSize-1 )
3941         iMax = nSrcXSize-1 - iSrcX;
3942     int i = iMin;  // Used after for.
3943     int iC = 0;  // Used after for.
3944     for( ; i+2 < iMax; i += 4, iC += 4 )
3945     {
3946         padfWeight[iC] = (i - dfDeltaX) * dfXScale;
3947         padfWeight[iC+1] = padfWeight[iC] + dfXScale;
3948         padfWeight[iC+2] = padfWeight[iC+1] + dfXScale;
3949         padfWeight[iC+3] = padfWeight[iC+2] + dfXScale;
3950         dfAccumulatorWeightHorizontal += pfnGetWeight4Values(padfWeight+iC);
3951     }
3952     for( ; i <= iMax; ++i, ++iC )
3953     {
3954         const double dfWeight = pfnGetWeight((i - dfDeltaX) * dfXScale);
3955         padfWeight[iC] = dfWeight;
3956         dfAccumulatorWeightHorizontal += dfWeight;
3957     }
3958 
3959     int j = 1 - nYRadius;
3960     if(  iSrcY + j < 0 )
3961         j = -iSrcY;
3962     int jMax = nYRadius;
3963     if( iSrcY + jMax >= nSrcYSize-1 )
3964         jMax = nSrcYSize-1 - iSrcY;
3965 
3966     double dfAccumulator = 0.0;
3967 
3968     for( ; j <= jMax; ++j )
3969     {
3970         const GPtrDiff_t iSampJ = iSrcOffset + static_cast<GPtrDiff_t>(j) * nSrcXSize;
3971 
3972         // Loop over all pixels in the row.
3973         double dfAccumulatorLocal = 0.0;
3974         double dfAccumulatorLocal2 = 0.0;
3975         iC = 0;
3976         i = iMin;
3977         // Process by chunk of 4 cols.
3978         for( ; i+2 < iMax; i += 4, iC += 4 )
3979         {
3980             // Retrieve the pixel & accumulate.
3981             dfAccumulatorLocal += pSrcBand[i+iSampJ] * padfWeight[iC];
3982             dfAccumulatorLocal += pSrcBand[i+1+iSampJ] * padfWeight[iC+1];
3983             dfAccumulatorLocal2 += pSrcBand[i+2+iSampJ] * padfWeight[iC+2];
3984             dfAccumulatorLocal2 += pSrcBand[i+3+iSampJ] * padfWeight[iC+3];
3985         }
3986         dfAccumulatorLocal += dfAccumulatorLocal2;
3987         if( i < iMax )
3988         {
3989             dfAccumulatorLocal += pSrcBand[i+iSampJ] * padfWeight[iC];
3990             dfAccumulatorLocal += pSrcBand[i+1+iSampJ] * padfWeight[iC+1];
3991             i += 2;
3992             iC += 2;
3993         }
3994         if( i == iMax )
3995         {
3996             dfAccumulatorLocal += pSrcBand[i+iSampJ] * padfWeight[iC];
3997         }
3998 
3999         // Calculate the Y weight.
4000         const double dfWeight = pfnGetWeight((j - dfDeltaY) * dfYScale);
4001         dfAccumulator += dfWeight * dfAccumulatorLocal;
4002         dfAccumulatorWeightVertical += dfWeight;
4003     }
4004 
4005     const double dfAccumulatorWeight =
4006         dfAccumulatorWeightHorizontal * dfAccumulatorWeightVertical;
4007 
4008     *pValue = GWKClampValueT<T>(dfAccumulator / dfAccumulatorWeight);
4009 
4010     return true;
4011 }
4012 
4013 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
4014 /* Could possibly be used too on 32bit, but we would need to check at runtime */
4015 #if defined(__x86_64) || defined(_M_X64)
4016 
4017 /************************************************************************/
4018 /*                    GWKResampleNoMasks_SSE2_T()                       */
4019 /************************************************************************/
4020 
4021 template<class T>
4022 static bool GWKResampleNoMasks_SSE2_T( const GDALWarpKernel *poWK, int iBand,
4023                                       double dfSrcX, double dfSrcY,
4024                                       T *pValue, double *padfWeight )
4025 {
4026     // Commonly used; save locally.
4027     const int nSrcXSize = poWK->nSrcXSize;
4028     const int nSrcYSize = poWK->nSrcYSize;
4029 
4030     const int iSrcX = static_cast<int>(floor(dfSrcX - 0.5));
4031     const int iSrcY = static_cast<int>(floor(dfSrcY - 0.5));
4032     const GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
4033     const int nXRadius = poWK->nXRadius;
4034     const int nYRadius = poWK->nYRadius;
4035 
4036     // Politely refuse to process invalid coordinates or obscenely small image.
4037     if( iSrcX >= nSrcXSize || iSrcY >= nSrcYSize ||
4038         nXRadius > nSrcXSize || nYRadius > nSrcYSize )
4039         return GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
4040                                                   pValue);
4041 
4042     const T* pSrcBand = reinterpret_cast<const T*>(poWK->papabySrcImage[iBand]);
4043 
4044     const FilterFuncType pfnGetWeight = apfGWKFilter[poWK->eResample];
4045     CPLAssert(pfnGetWeight);
4046     const FilterFunc4ValuesType pfnGetWeight4Values =
4047         apfGWKFilter4Values[poWK->eResample];
4048     CPLAssert(pfnGetWeight4Values);
4049 
4050     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
4051     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
4052     const double dfXScale = std::min(poWK->dfXScale, 1.0);
4053     const double dfYScale = std::min(poWK->dfYScale, 1.0);
4054 
4055     // Loop over all rows in the kernel.
4056     double dfAccumulatorWeightHorizontal = 0.0;
4057     double dfAccumulatorWeightVertical = 0.0;
4058     double dfAccumulator = 0.0;
4059 
4060     int iMin = 1 - nXRadius;
4061     if( iSrcX + iMin < 0 )
4062         iMin = -iSrcX;
4063     int iMax = nXRadius;
4064     if( iSrcX + iMax >= nSrcXSize-1 )
4065         iMax = nSrcXSize-1 - iSrcX;
4066     int i, iC;
4067     for( iC = 0, i = iMin; i+2 < iMax; i+=4, iC+=4 )
4068     {
4069         padfWeight[iC] = (i - dfDeltaX) * dfXScale;
4070         padfWeight[iC+1] = padfWeight[iC] + dfXScale;
4071         padfWeight[iC+2] = padfWeight[iC+1] + dfXScale;
4072         padfWeight[iC+3] = padfWeight[iC+2] + dfXScale;
4073         dfAccumulatorWeightHorizontal += pfnGetWeight4Values(padfWeight+iC);
4074     }
4075     for( ; i <= iMax; ++i, ++iC )
4076     {
4077         double dfWeight = pfnGetWeight((i - dfDeltaX) * dfXScale);
4078         padfWeight[iC] = dfWeight;
4079         dfAccumulatorWeightHorizontal += dfWeight;
4080     }
4081 
4082     int j = 1 - nYRadius;
4083     if(  iSrcY + j < 0 )
4084         j = -iSrcY;
4085     int jMax = nYRadius;
4086     if( iSrcY + jMax >= nSrcYSize-1 )
4087         jMax = nSrcYSize-1 - iSrcY;
4088 
4089     // Process by chunk of 4 rows.
4090     for( ; j+2 < jMax; j+=4 )
4091     {
4092         const GPtrDiff_t iSampJ = iSrcOffset + static_cast<GPtrDiff_t>(j) * nSrcXSize;
4093 
4094         // Loop over all pixels in the row.
4095         iC = 0;
4096         i = iMin;
4097         // Process by chunk of 4 cols.
4098         XMMReg4Double v_acc_1 = XMMReg4Double::Zero();
4099         XMMReg4Double v_acc_2 = XMMReg4Double::Zero();
4100         XMMReg4Double v_acc_3 = XMMReg4Double::Zero();
4101         XMMReg4Double v_acc_4 = XMMReg4Double::Zero();
4102         for( ; i+2 < iMax; i+=4, iC+=4 )
4103         {
4104             // Retrieve the pixel & accumulate.
4105             XMMReg4Double v_pixels_1 =
4106                 XMMReg4Double::Load4Val(pSrcBand+i+iSampJ);
4107             XMMReg4Double v_pixels_2 =
4108                 XMMReg4Double::Load4Val(pSrcBand+i+iSampJ+nSrcXSize);
4109             XMMReg4Double v_pixels_3 =
4110                 XMMReg4Double::Load4Val(pSrcBand+i+iSampJ+2*nSrcXSize);
4111             XMMReg4Double v_pixels_4 =
4112                 XMMReg4Double::Load4Val(pSrcBand+i+iSampJ+3*nSrcXSize);
4113 
4114             XMMReg4Double v_padfWeight =
4115                 XMMReg4Double::Load4Val(padfWeight + iC);
4116 
4117             v_acc_1 += v_pixels_1 * v_padfWeight;
4118             v_acc_2 += v_pixels_2 * v_padfWeight;
4119             v_acc_3 += v_pixels_3 * v_padfWeight;
4120             v_acc_4 += v_pixels_4 * v_padfWeight;
4121         }
4122 
4123         if( i < iMax )
4124         {
4125             XMMReg2Double v_pixels_1 =
4126                 XMMReg2Double::Load2Val(pSrcBand+i+iSampJ);
4127             XMMReg2Double v_pixels_2 =
4128                 XMMReg2Double::Load2Val(pSrcBand+i+iSampJ+nSrcXSize);
4129             XMMReg2Double v_pixels_3 =
4130                 XMMReg2Double::Load2Val(pSrcBand+i+iSampJ+2*nSrcXSize);
4131             XMMReg2Double v_pixels_4 =
4132                 XMMReg2Double::Load2Val(pSrcBand+i+iSampJ+3*nSrcXSize);
4133 
4134             XMMReg2Double v_padfWeight =
4135                 XMMReg2Double::Load2Val(padfWeight + iC);
4136 
4137             v_acc_1.AddToLow( v_pixels_1 * v_padfWeight );
4138             v_acc_2.AddToLow( v_pixels_2 * v_padfWeight );
4139             v_acc_3.AddToLow( v_pixels_3 * v_padfWeight );
4140             v_acc_4.AddToLow( v_pixels_4 * v_padfWeight );
4141 
4142             i+=2;
4143             iC+=2;
4144         }
4145 
4146         double dfAccumulatorLocal_1 = v_acc_1.GetHorizSum();
4147         double dfAccumulatorLocal_2 = v_acc_2.GetHorizSum();
4148         double dfAccumulatorLocal_3 = v_acc_3.GetHorizSum();
4149         double dfAccumulatorLocal_4 = v_acc_4.GetHorizSum();
4150 
4151         if( i == iMax )
4152         {
4153             dfAccumulatorLocal_1 +=
4154                 static_cast<double>(pSrcBand[i+iSampJ]) * padfWeight[iC];
4155             dfAccumulatorLocal_2 +=
4156                 static_cast<double>(pSrcBand[i+iSampJ + nSrcXSize]) *
4157                 padfWeight[iC];
4158             dfAccumulatorLocal_3 +=
4159                 static_cast<double>(pSrcBand[i+iSampJ + 2 * nSrcXSize]) *
4160                 padfWeight[iC];
4161             dfAccumulatorLocal_4 +=
4162                 static_cast<double>(pSrcBand[i+iSampJ + 3 * nSrcXSize]) *
4163                 padfWeight[iC];
4164         }
4165 
4166         // Calculate the Y weight.
4167         const double dfWeight0 = (j - dfDeltaY) * dfYScale;
4168         const double dfWeight1 = dfWeight0 + dfYScale;
4169         const double dfWeight2 = dfWeight1 + dfYScale;
4170         const double dfWeight3 = dfWeight2 + dfYScale;
4171         double adfWeight[4] = { dfWeight0, dfWeight1, dfWeight2, dfWeight3 };
4172 
4173         dfAccumulatorWeightVertical += pfnGetWeight4Values(adfWeight);
4174         dfAccumulator += adfWeight[0] * dfAccumulatorLocal_1;
4175         dfAccumulator += adfWeight[1] * dfAccumulatorLocal_2;
4176         dfAccumulator += adfWeight[2] * dfAccumulatorLocal_3;
4177         dfAccumulator += adfWeight[3] * dfAccumulatorLocal_4;
4178     }
4179     for( ; j <= jMax; ++j )
4180     {
4181         const GPtrDiff_t iSampJ = iSrcOffset + static_cast<GPtrDiff_t>(j) * nSrcXSize;
4182 
4183         // Loop over all pixels in the row.
4184         iC = 0;
4185         i = iMin;
4186         // Process by chunk of 4 cols.
4187         XMMReg4Double v_acc = XMMReg4Double::Zero();
4188         for( ; i+2 < iMax; i+=4, iC+=4 )
4189         {
4190             // Retrieve the pixel & accumulate.
4191             XMMReg4Double v_pixels= XMMReg4Double::Load4Val(pSrcBand+i+iSampJ);
4192             XMMReg4Double v_padfWeight =
4193                 XMMReg4Double::Load4Val(padfWeight + iC);
4194 
4195             v_acc += v_pixels * v_padfWeight;
4196         }
4197 
4198         double dfAccumulatorLocal = v_acc.GetHorizSum();
4199 
4200         if( i < iMax )
4201         {
4202             dfAccumulatorLocal += pSrcBand[i+iSampJ] * padfWeight[iC];
4203             dfAccumulatorLocal += pSrcBand[i+1+iSampJ] * padfWeight[iC+1];
4204             i+=2;
4205             iC+=2;
4206         }
4207         if( i == iMax )
4208         {
4209             dfAccumulatorLocal += static_cast<double>(pSrcBand[i+iSampJ]) * padfWeight[iC];
4210         }
4211 
4212         // Calculate the Y weight.
4213         double dfWeight = pfnGetWeight((j - dfDeltaY) * dfYScale);
4214         dfAccumulator += dfWeight * dfAccumulatorLocal;
4215         dfAccumulatorWeightVertical += dfWeight;
4216     }
4217 
4218     const double dfAccumulatorWeight =
4219         dfAccumulatorWeightHorizontal * dfAccumulatorWeightVertical;
4220 
4221     *pValue = GWKClampValueT<T>(dfAccumulator / dfAccumulatorWeight);
4222 
4223     return true;
4224 }
4225 
4226 /************************************************************************/
4227 /*                     GWKResampleNoMasksT<GByte>()                     */
4228 /************************************************************************/
4229 
4230 template<>
4231 bool GWKResampleNoMasksT<GByte>( const GDALWarpKernel *poWK, int iBand,
4232                                  double dfSrcX, double dfSrcY,
4233                                  GByte *pValue, double *padfWeight )
4234 {
4235     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY,
4236                                      pValue, padfWeight);
4237 }
4238 
4239 /************************************************************************/
4240 /*                     GWKResampleNoMasksT<GInt16>()                    */
4241 /************************************************************************/
4242 
4243 template<>
4244 bool GWKResampleNoMasksT<GInt16>( const GDALWarpKernel *poWK, int iBand,
4245                                   double dfSrcX, double dfSrcY,
4246                                   GInt16 *pValue, double *padfWeight )
4247 {
4248     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY,
4249                                      pValue, padfWeight);
4250 }
4251 
4252 /************************************************************************/
4253 /*                     GWKResampleNoMasksT<GUInt16>()                   */
4254 /************************************************************************/
4255 
4256 template<>
4257 bool GWKResampleNoMasksT<GUInt16>( const GDALWarpKernel *poWK, int iBand,
4258                                    double dfSrcX, double dfSrcY,
4259                                    GUInt16 *pValue, double *padfWeight )
4260 {
4261     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY,
4262                                      pValue, padfWeight);
4263 }
4264 
4265 /************************************************************************/
4266 /*                     GWKResampleNoMasksT<float>()                     */
4267 /************************************************************************/
4268 
4269 template<>
4270 bool GWKResampleNoMasksT<float>( const GDALWarpKernel *poWK, int iBand,
4271                                  double dfSrcX, double dfSrcY,
4272                                  float *pValue, double *padfWeight )
4273 {
4274     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY,
4275                                      pValue, padfWeight);
4276 }
4277 
4278 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
4279 
4280 /************************************************************************/
4281 /*                     GWKResampleNoMasksT<double>()                    */
4282 /************************************************************************/
4283 
4284 template<>
4285 bool GWKResampleNoMasksT<double>( const GDALWarpKernel *poWK, int iBand,
4286                                   double dfSrcX, double dfSrcY,
4287                                   double *pValue, double *padfWeight )
4288 {
4289     return GWKResampleNoMasks_SSE2_T(poWK, iBand, dfSrcX, dfSrcY,
4290                                      pValue, padfWeight);
4291 }
4292 
4293 #endif /* INSTANTIATE_FLOAT64_SSE2_IMPL */
4294 
4295 #endif /* defined(__x86_64) || defined(_M_X64) */
4296 
4297 /************************************************************************/
4298 /*                     GWKRoundSourceCoordinates()                      */
4299 /************************************************************************/
4300 
4301 static void GWKRoundSourceCoordinates( int nDstXSize,
4302                                        double* padfX,
4303                                        double* padfY,
4304                                        double* padfZ,
4305                                        int* pabSuccess,
4306                                        double dfSrcCoordPrecision,
4307                                        double dfErrorThreshold,
4308                                        GDALTransformerFunc pfnTransformer,
4309                                        void* pTransformerArg,
4310                                        double dfDstXOff,
4311                                        double dfDstY )
4312 {
4313     double dfPct = 0.8;
4314     if( dfErrorThreshold > 0 && dfSrcCoordPrecision / dfErrorThreshold >= 10.0 )
4315     {
4316         dfPct = 1.0 - 2 * 1.0 / (dfSrcCoordPrecision / dfErrorThreshold);
4317     }
4318     const double dfExactTransformThreshold = 0.5 * dfPct * dfSrcCoordPrecision;
4319 
4320     for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4321     {
4322         const double dfXBefore = padfX[iDstX];
4323         const double dfYBefore = padfY[iDstX];
4324         padfX[iDstX] =
4325             floor(padfX[iDstX] / dfSrcCoordPrecision + 0.5) *
4326             dfSrcCoordPrecision;
4327         padfY[iDstX] =
4328             floor(padfY[iDstX] / dfSrcCoordPrecision + 0.5) *
4329             dfSrcCoordPrecision;
4330 
4331         // If we are in an uncertainty zone, go to non-approximated
4332         // transformation.
4333         // Due to the 80% of half-precision threshold, dfSrcCoordPrecision must
4334         // be at least 10 times greater than the approximation error.
4335         if( fabs(dfXBefore - padfX[iDstX]) > dfExactTransformThreshold ||
4336             fabs(dfYBefore - padfY[iDstX]) > dfExactTransformThreshold )
4337         {
4338             padfX[iDstX] = iDstX + dfDstXOff;
4339             padfY[iDstX] = dfDstY;
4340             padfZ[iDstX] = 0.0;
4341             pfnTransformer( pTransformerArg, TRUE, 1,
4342                             padfX + iDstX, padfY + iDstX,
4343                             padfZ + iDstX, pabSuccess + iDstX );
4344             padfX[iDstX] =
4345                 floor(padfX[iDstX] / dfSrcCoordPrecision + 0.5) *
4346                 dfSrcCoordPrecision;
4347             padfY[iDstX] =
4348                 floor(padfY[iDstX] / dfSrcCoordPrecision + 0.5) *
4349                 dfSrcCoordPrecision;
4350         }
4351     }
4352 }
4353 
4354 /************************************************************************/
4355 /*                           GWKOpenCLCase()                            */
4356 /*                                                                      */
4357 /*      This is identical to GWKGeneralCase(), but functions via        */
4358 /*      OpenCL. This means we have vector optimization (SSE) and/or     */
4359 /*      GPU optimization depending on our prefs. The code itself is     */
4360 /*      general and not optimized, but by defining constants we can     */
4361 /*      make some pretty darn good code on the fly.                     */
4362 /************************************************************************/
4363 
4364 #if defined(HAVE_OPENCL)
4365 static CPLErr GWKOpenCLCase( GDALWarpKernel *poWK )
4366 {
4367     const int nDstXSize = poWK->nDstXSize;
4368     const int nDstYSize = poWK->nDstYSize;
4369     const int nSrcXSize = poWK->nSrcXSize;
4370     const int nSrcYSize = poWK->nSrcYSize;
4371     const int nDstXOff = poWK->nDstXOff;
4372     const int nDstYOff = poWK->nDstYOff;
4373     const int nSrcXOff = poWK->nSrcXOff;
4374     const int nSrcYOff = poWK->nSrcYOff;
4375     bool bUseImag = false;
4376 
4377     cl_channel_type imageFormat;
4378     switch( poWK->eWorkingDataType )
4379     {
4380       case GDT_Byte:
4381         imageFormat = CL_UNORM_INT8;
4382         break;
4383       case GDT_UInt16:
4384         imageFormat = CL_UNORM_INT16;
4385         break;
4386       case GDT_CInt16:
4387         bUseImag = true;
4388         CPL_FALLTHROUGH
4389       case GDT_Int16:
4390         imageFormat = CL_SNORM_INT16;
4391         break;
4392       case GDT_CFloat32:
4393         bUseImag = true;
4394         CPL_FALLTHROUGH
4395       case GDT_Float32:
4396         imageFormat = CL_FLOAT;
4397         break;
4398       default:
4399         // No support for higher precision formats.
4400         CPLDebug("OpenCL",
4401                  "Unsupported resampling OpenCL data type %d.",
4402                  static_cast<int>(poWK->eWorkingDataType));
4403         return CE_Warning;
4404     }
4405 
4406     OCLResampAlg resampAlg;
4407     switch( poWK->eResample )
4408     {
4409       case GRA_Bilinear:
4410         resampAlg = OCL_Bilinear;
4411         break;
4412       case GRA_Cubic:
4413         resampAlg = OCL_Cubic;
4414         break;
4415       case GRA_CubicSpline:
4416         resampAlg = OCL_CubicSpline;
4417         break;
4418       case GRA_Lanczos:
4419         resampAlg = OCL_Lanczos;
4420         break;
4421       default:
4422         // No support for higher precision formats.
4423         CPLDebug("OpenCL",
4424                  "Unsupported resampling OpenCL resampling alg %d.",
4425                  static_cast<int>(poWK->eResample));
4426         return CE_Warning;
4427     }
4428 
4429     struct oclWarper *warper = nullptr;
4430     cl_int err;
4431     CPLErr eErr = CE_None;
4432 
4433     // TODO(schwehr): Fix indenting.
4434   try
4435   {
4436 
4437     // Using a factor of 2 or 4 seems to have much less rounding error
4438     // than 3 on the GPU.
4439     // Then the rounding error can cause strange artifacts under the
4440     // right conditions.
4441     warper =
4442         GDALWarpKernelOpenCL_createEnv(nSrcXSize, nSrcYSize,
4443                                        nDstXSize, nDstYSize,
4444                                        imageFormat, poWK->nBands, 4,
4445                                        bUseImag,
4446                                        poWK->papanBandSrcValid != nullptr,
4447                                        poWK->pafDstDensity,
4448                                        poWK->padfDstNoDataReal,
4449                                        resampAlg, &err);
4450 
4451     if( err != CL_SUCCESS || warper == nullptr )
4452     {
4453         eErr = CE_Warning;
4454         if( warper != nullptr )
4455             throw eErr;
4456         return eErr;
4457     }
4458 
4459     CPLDebug("GDAL", "GDALWarpKernel()::GWKOpenCLCase() "
4460              "Src=%d,%d,%dx%d Dst=%d,%d,%dx%d",
4461              nSrcXOff, nSrcYOff, nSrcXSize, nSrcYSize,
4462              nDstXOff, nDstYOff, nDstXSize, nDstYSize);
4463 
4464     if( !poWK->pfnProgress( poWK->dfProgressBase, "", poWK->pProgress ) )
4465     {
4466         CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
4467         eErr = CE_Failure;
4468         throw eErr;
4469     }
4470 
4471     /* ==================================================================== */
4472     /*      Loop over bands.                                                */
4473     /* ==================================================================== */
4474     for( int iBand = 0; iBand < poWK->nBands; iBand++ )
4475     {
4476         if( poWK->papanBandSrcValid != nullptr &&
4477             poWK->papanBandSrcValid[iBand] != nullptr)
4478         {
4479             GDALWarpKernelOpenCL_setSrcValid(
4480                 warper, reinterpret_cast<int *>(poWK->papanBandSrcValid[iBand]), iBand);
4481             if( err != CL_SUCCESS )
4482             {
4483                 CPLError( CE_Failure, CPLE_AppDefined,
4484                           "OpenCL routines reported failure (%d) on line %d.",
4485                           static_cast<int>(err), __LINE__ );
4486                 eErr = CE_Failure;
4487                 throw eErr;
4488             }
4489         }
4490 
4491         err = GDALWarpKernelOpenCL_setSrcImg(warper,
4492                                              poWK->papabySrcImage[iBand],
4493                                              iBand);
4494         if( err != CL_SUCCESS )
4495         {
4496             CPLError(CE_Failure, CPLE_AppDefined,
4497                      "OpenCL routines reported failure (%d) on line %d.",
4498                      static_cast<int>(err), __LINE__);
4499             eErr = CE_Failure;
4500             throw eErr;
4501         }
4502 
4503         err = GDALWarpKernelOpenCL_setDstImg(warper,
4504                                              poWK->papabyDstImage[iBand],
4505                                              iBand);
4506         if( err != CL_SUCCESS )
4507         {
4508             CPLError(CE_Failure, CPLE_AppDefined,
4509                      "OpenCL routines reported failure (%d) on line %d.",
4510                      static_cast<int>(err), __LINE__);
4511             eErr = CE_Failure;
4512             throw eErr;
4513         }
4514     }
4515 
4516     /* -------------------------------------------------------------------- */
4517     /*      Allocate x,y,z coordinate arrays for transformation ... one     */
4518     /*      scanlines worth of positions.                                   */
4519     /* -------------------------------------------------------------------- */
4520 
4521     // For x, 2 *, because we cache the precomputed values at the end.
4522     double *padfX =
4523         static_cast<double *>(CPLMalloc(2 * sizeof(double) * nDstXSize));
4524     double * padfY =
4525         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
4526     double *padfZ =
4527         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
4528     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
4529     const double dfSrcCoordPrecision = CPLAtof(
4530         CSLFetchNameValueDef(poWK->papszWarpOptions,
4531                              "SRC_COORD_PRECISION", "0"));
4532     const double dfErrorThreshold = CPLAtof(
4533         CSLFetchNameValueDef(poWK->papszWarpOptions, "ERROR_THRESHOLD", "0"));
4534 
4535     // Precompute values.
4536     for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4537         padfX[nDstXSize + iDstX] = iDstX + 0.5 + poWK->nDstXOff;
4538 
4539     /* ==================================================================== */
4540     /*      Loop over output lines.                                         */
4541     /* ==================================================================== */
4542     for( int iDstY = 0; iDstY < nDstYSize && eErr == CE_None; ++iDstY )
4543     {
4544         /* ---------------------------------------------------------------- */
4545         /*      Setup points to transform to source image space.            */
4546         /* ---------------------------------------------------------------- */
4547         memcpy( padfX, padfX + nDstXSize, sizeof(double) * nDstXSize );
4548         const double dfYConst = iDstY + 0.5 + poWK->nDstYOff;
4549         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4550             padfY[iDstX] = dfYConst;
4551         memset( padfZ, 0, sizeof(double) * nDstXSize );
4552 
4553         /* ---------------------------------------------------------------- */
4554         /*      Transform the points from destination pixel/line coordinates*/
4555         /*      to source pixel/line coordinates.                           */
4556         /* ---------------------------------------------------------------- */
4557         poWK->pfnTransformer( poWK->pTransformerArg, TRUE, nDstXSize,
4558                               padfX, padfY, padfZ, pabSuccess );
4559         if( dfSrcCoordPrecision > 0.0 )
4560         {
4561             GWKRoundSourceCoordinates(nDstXSize, padfX, padfY, padfZ,
4562                                       pabSuccess,
4563                                       dfSrcCoordPrecision,
4564                                       dfErrorThreshold,
4565                                       poWK->pfnTransformer,
4566                                       poWK->pTransformerArg,
4567                                       0.5 + nDstXOff,
4568                                       iDstY + 0.5 + nDstYOff);
4569         }
4570 
4571         err = GDALWarpKernelOpenCL_setCoordRow(warper, padfX, padfY,
4572                                                nSrcXOff, nSrcYOff,
4573                                                pabSuccess, iDstY);
4574         if( err != CL_SUCCESS )
4575         {
4576             CPLError(CE_Failure, CPLE_AppDefined,
4577                      "OpenCL routines reported failure (%d) on line %d.",
4578                      static_cast<int>(err), __LINE__);
4579             eErr = CE_Failure;
4580             break;
4581         }
4582 
4583         // Update the valid & density masks because we don't do so in the
4584         // kernel.
4585         for( int iDstX = 0; iDstX < nDstXSize && eErr == CE_None; iDstX++ )
4586         {
4587             const double dfX = padfX[iDstX];
4588             const double dfY = padfY[iDstX];
4589             const GPtrDiff_t iDstOffset = iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
4590 
4591             // See GWKGeneralCase() for appropriate commenting.
4592             if( !pabSuccess[iDstX] || dfX < nSrcXOff || dfY < nSrcYOff )
4593                 continue;
4594 
4595             int iSrcX = static_cast<int>(dfX) - nSrcXOff;
4596             int iSrcY = static_cast<int>(dfY) - nSrcYOff;
4597 
4598             if( iSrcX < 0 || iSrcX >= nSrcXSize ||
4599                 iSrcY < 0 || iSrcY >= nSrcYSize )
4600                 continue;
4601 
4602             GPtrDiff_t iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
4603             double dfDensity = 1.0;
4604 
4605             if( poWK->pafUnifiedSrcDensity != nullptr
4606                 && iSrcX >= 0 && iSrcY >= 0
4607                 && iSrcX < nSrcXSize && iSrcY < nSrcYSize )
4608                 dfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
4609 
4610             GWKOverlayDensity( poWK, iDstOffset, dfDensity );
4611 
4612             // Because this is on the bit-wise level, it can't be done well in
4613             // OpenCL.
4614             if( poWK->panDstValid != nullptr )
4615                 poWK->panDstValid[iDstOffset>>5] |= 0x01 << (iDstOffset & 0x1f);
4616         }
4617     }
4618 
4619     CPLFree( padfX );
4620     CPLFree( padfY );
4621     CPLFree( padfZ );
4622     CPLFree( pabSuccess );
4623 
4624     if( eErr != CE_None )
4625         throw eErr;
4626 
4627     err = GDALWarpKernelOpenCL_runResamp(warper,
4628                                          poWK->pafUnifiedSrcDensity,
4629                                          poWK->panUnifiedSrcValid,
4630                                          poWK->pafDstDensity,
4631                                          poWK->panDstValid,
4632                                          poWK->dfXScale, poWK->dfYScale,
4633                                          poWK->dfXFilter, poWK->dfYFilter,
4634                                          poWK->nXRadius, poWK->nYRadius,
4635                                          poWK->nFiltInitX, poWK->nFiltInitY);
4636 
4637     if( err != CL_SUCCESS )
4638     {
4639         CPLError(CE_Failure, CPLE_AppDefined,
4640                  "OpenCL routines reported failure (%d) on line %d.",
4641                  static_cast<int>(err), __LINE__);
4642         eErr = CE_Failure;
4643         throw eErr;
4644     }
4645 
4646     /* ==================================================================== */
4647     /*      Loop over output lines.                                         */
4648     /* ==================================================================== */
4649     for( int iDstY = 0; iDstY < nDstYSize && eErr == CE_None; iDstY++ )
4650     {
4651         for( int iBand = 0; iBand < poWK->nBands; iBand++ )
4652         {
4653             void *rowReal = nullptr;
4654             void *rowImag = nullptr;
4655             GByte *pabyDst = poWK->papabyDstImage[iBand];
4656 
4657             err = GDALWarpKernelOpenCL_getRow(warper, &rowReal, &rowImag,
4658                                               iDstY, iBand);
4659             if( err != CL_SUCCESS )
4660             {
4661                 CPLError( CE_Failure, CPLE_AppDefined,
4662                           "OpenCL routines reported failure (%d) on line %d.",
4663                           static_cast<int>(err), __LINE__ );
4664                 eErr = CE_Failure;
4665                 throw eErr;
4666             }
4667 
4668             // Copy the data from the warper to GDAL's memory.
4669             switch( poWK->eWorkingDataType )
4670             {
4671               case GDT_Byte:
4672                 memcpy(&(pabyDst[iDstY*nDstXSize]),
4673                        rowReal, sizeof(GByte)*nDstXSize);
4674                 break;
4675               case GDT_Int16:
4676                 memcpy(&(reinterpret_cast<GInt16 *>(pabyDst)[iDstY*nDstXSize]),
4677                        rowReal, sizeof(GInt16)*nDstXSize);
4678                 break;
4679               case GDT_UInt16:
4680                 memcpy(&(reinterpret_cast<GUInt16 *>(pabyDst)[iDstY*nDstXSize]),
4681                        rowReal, sizeof(GUInt16)*nDstXSize);
4682                 break;
4683               case GDT_Float32:
4684                 memcpy(&(reinterpret_cast<float *>(pabyDst)[iDstY*nDstXSize]),
4685                        rowReal, sizeof(float)*nDstXSize);
4686                 break;
4687               case GDT_CInt16:
4688               {
4689                   GInt16 *pabyDstI16 = &(reinterpret_cast<GInt16 *>(pabyDst)[iDstY*nDstXSize]);
4690                   for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4691                   {
4692                       pabyDstI16[iDstX*2  ] = static_cast<GInt16 *>(rowReal)[iDstX];
4693                       pabyDstI16[iDstX*2+1] = static_cast<GInt16 *>(rowImag)[iDstX];
4694                   }
4695               }
4696               break;
4697               case GDT_CFloat32:
4698               {
4699                   float *pabyDstF32 = &(reinterpret_cast<float *>(pabyDst)[iDstY*nDstXSize]);
4700                   for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4701                   {
4702                       pabyDstF32[iDstX*2  ] = static_cast<float *>(rowReal)[iDstX];
4703                       pabyDstF32[iDstX*2+1] = static_cast<float *>(rowImag)[iDstX];
4704                   }
4705               }
4706               break;
4707               default:
4708                 // No support for higher precision formats.
4709                 CPLError(CE_Failure, CPLE_AppDefined,
4710                          "Unsupported resampling OpenCL data type %d.",
4711                          static_cast<int>(poWK->eWorkingDataType) );
4712                 eErr = CE_Failure;
4713                 throw eErr;
4714             }
4715         }
4716     }
4717   }
4718   catch( const CPLErr& )
4719   {
4720   }
4721 
4722     if( (err = GDALWarpKernelOpenCL_deleteEnv(warper)) != CL_SUCCESS )
4723     {
4724         CPLError(CE_Failure, CPLE_AppDefined,
4725                  "OpenCL routines reported failure (%d) on line %d.",
4726                  static_cast<int>(err), __LINE__ );
4727         return CE_Failure;
4728     }
4729 
4730     return eErr;
4731 }
4732 #endif /* defined(HAVE_OPENCL) */
4733 
4734 /************************************************************************/
4735 /*                     GWKCheckAndComputeSrcOffsets()                   */
4736 /************************************************************************/
4737 static CPL_INLINE bool GWKCheckAndComputeSrcOffsets(
4738     const int* _pabSuccess,
4739     int _iDstX,
4740     const double* _padfX,
4741     const double* _padfY,
4742     const GDALWarpKernel* _poWK,
4743     int _nSrcXSize,
4744     int _nSrcYSize,
4745     GPtrDiff_t& iSrcOffset)
4746 {
4747     if( !_pabSuccess[_iDstX] )
4748         return false;
4749 
4750     // If this happens this is likely the symptom of a bug somewhere.
4751     if( CPLIsNan(_padfX[_iDstX]) || CPLIsNan(_padfY[_iDstX]) )
4752     {
4753         static bool bNanCoordFound = false;
4754         if( !bNanCoordFound )
4755         {
4756             CPLDebug("WARP",
4757                      "GWKCheckAndComputeSrcOffsets(): "
4758                      "NaN coordinate found on point %d.",
4759                      _iDstX);
4760             bNanCoordFound = true;
4761         }
4762         return false;
4763     }
4764 
4765 /* -------------------------------------------------------------------- */
4766 /*      Figure out what pixel we want in our source raster, and skip    */
4767 /*      further processing if it is well off the source image.          */
4768 /* -------------------------------------------------------------------- */
4769     /* We test against the value before casting to avoid the */
4770     /* problem of asymmetric truncation effects around zero.  That is */
4771     /* -0.5 will be 0 when cast to an int. */
4772     if( _padfX[_iDstX] < _poWK->nSrcXOff
4773         || _padfY[_iDstX] < _poWK->nSrcYOff )
4774         return false;
4775 
4776     // Check for potential overflow when casting from float to int, (if
4777     // operating outside natural projection area, padfX/Y can be a very huge
4778     // positive number before doing the actual conversion), as such cast is
4779     // undefined behavior that can trigger exception with some compilers
4780     // (see #6753)
4781     if( _padfX[_iDstX] + 1e-10 > _nSrcXSize + _poWK->nSrcXOff ||
4782         _padfY[_iDstX] + 1e-10 > _nSrcYSize + _poWK->nSrcYOff )
4783     {
4784         return false;
4785     }
4786 
4787     int iSrcX =
4788         static_cast<int>(_padfX[_iDstX] + 1.0e-10) - _poWK->nSrcXOff;
4789     int iSrcY =
4790         static_cast<int>(_padfY[_iDstX] + 1.0e-10) - _poWK->nSrcYOff;
4791     if( iSrcX == _nSrcXSize )
4792         iSrcX --;
4793     if( iSrcY == _nSrcYSize )
4794         iSrcY --;
4795 
4796     // Those checks should normally be OK given the previous ones.
4797     CPLAssert( iSrcX >= 0 );
4798     CPLAssert( iSrcY >= 0 );
4799     CPLAssert( iSrcX < _nSrcXSize );
4800     CPLAssert( iSrcY < _nSrcYSize );
4801 
4802     iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * _nSrcXSize;
4803 
4804     return true;
4805 }
4806 
4807 /************************************************************************/
4808 /*                           GWKGeneralCase()                           */
4809 /*                                                                      */
4810 /*      This is the most general case.  It attempts to handle all       */
4811 /*      possible features with relatively little concern for            */
4812 /*      efficiency.                                                     */
4813 /************************************************************************/
4814 
4815 static void GWKGeneralCaseThread( void* pData)
4816 
4817 {
4818     GWKJobStruct* psJob = reinterpret_cast<GWKJobStruct *>(pData);
4819     GDALWarpKernel *poWK = psJob->poWK;
4820     const int iYMin = psJob->iYMin;
4821     const int iYMax = psJob->iYMax;
4822 
4823     int nDstXSize = poWK->nDstXSize;
4824     int nSrcXSize = poWK->nSrcXSize;
4825     int nSrcYSize = poWK->nSrcYSize;
4826 
4827 /* -------------------------------------------------------------------- */
4828 /*      Allocate x,y,z coordinate arrays for transformation ... one     */
4829 /*      scanlines worth of positions.                                   */
4830 /* -------------------------------------------------------------------- */
4831     // For x, 2 *, because we cache the precomputed values at the end.
4832     double *padfX =
4833         static_cast<double *>(CPLMalloc(2 * sizeof(double) * nDstXSize));
4834     double *padfY =
4835         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
4836     double *padfZ =
4837         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
4838     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
4839 
4840     const bool bUse4SamplesFormula =
4841         poWK->dfXScale >= 0.95 && poWK->dfYScale >= 0.95;
4842 
4843     GWKResampleWrkStruct* psWrkStruct = nullptr;
4844     if( poWK->eResample != GRA_NearestNeighbour )
4845     {
4846         psWrkStruct = GWKResampleCreateWrkStruct(poWK);
4847     }
4848     const double dfSrcCoordPrecision = CPLAtof(
4849         CSLFetchNameValueDef(poWK->papszWarpOptions,
4850                              "SRC_COORD_PRECISION", "0"));
4851     const double dfErrorThreshold = CPLAtof(
4852         CSLFetchNameValueDef(poWK->papszWarpOptions, "ERROR_THRESHOLD", "0"));
4853 
4854     // Precompute values.
4855     for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4856         padfX[nDstXSize + iDstX] = iDstX + 0.5 + poWK->nDstXOff;
4857 
4858 /* ==================================================================== */
4859 /*      Loop over output lines.                                         */
4860 /* ==================================================================== */
4861     for( int iDstY = iYMin; iDstY < iYMax; iDstY++ )
4862     {
4863 /* -------------------------------------------------------------------- */
4864 /*      Setup points to transform to source image space.                */
4865 /* -------------------------------------------------------------------- */
4866         memcpy( padfX, padfX + nDstXSize, sizeof(double) * nDstXSize );
4867         const double dfY = iDstY + 0.5 + poWK->nDstYOff;
4868         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4869             padfY[iDstX] = dfY;
4870         memset( padfZ, 0, sizeof(double) * nDstXSize );
4871 
4872 /* -------------------------------------------------------------------- */
4873 /*      Transform the points from destination pixel/line coordinates    */
4874 /*      to source pixel/line coordinates.                               */
4875 /* -------------------------------------------------------------------- */
4876         poWK->pfnTransformer( psJob->pTransformerArg, TRUE, nDstXSize,
4877                               padfX, padfY, padfZ, pabSuccess );
4878         if( dfSrcCoordPrecision > 0.0 )
4879         {
4880             GWKRoundSourceCoordinates(nDstXSize, padfX, padfY, padfZ,
4881                                       pabSuccess,
4882                                       dfSrcCoordPrecision,
4883                                       dfErrorThreshold,
4884                                       poWK->pfnTransformer,
4885                                       psJob->pTransformerArg,
4886                                       0.5 + poWK->nDstXOff,
4887                                       iDstY + 0.5 + poWK->nDstYOff);
4888         }
4889 
4890 /* ==================================================================== */
4891 /*      Loop over pixels in output scanline.                            */
4892 /* ==================================================================== */
4893         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
4894         {
4895             GPtrDiff_t iSrcOffset = 0;
4896             if( !GWKCheckAndComputeSrcOffsets(pabSuccess, iDstX, padfX, padfY,
4897                                     poWK, nSrcXSize, nSrcYSize, iSrcOffset) )
4898                 continue;
4899 
4900 /* -------------------------------------------------------------------- */
4901 /*      Do not try to apply transparent/invalid source pixels to the    */
4902 /*      destination.  This currently ignores the multi-pixel input      */
4903 /*      of bilinear and cubic resamples.                                */
4904 /* -------------------------------------------------------------------- */
4905             double dfDensity = 1.0;
4906 
4907             if( poWK->pafUnifiedSrcDensity != nullptr )
4908             {
4909                 dfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
4910                 if( dfDensity < SRC_DENSITY_THRESHOLD )
4911                     continue;
4912             }
4913 
4914             if( poWK->panUnifiedSrcValid != nullptr
4915                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
4916                      & (0x01 << (iSrcOffset & 0x1f))) )
4917                 continue;
4918 
4919 /* ==================================================================== */
4920 /*      Loop processing each band.                                      */
4921 /* ==================================================================== */
4922             bool bHasFoundDensity = false;
4923 
4924             const GPtrDiff_t iDstOffset = iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
4925             for( int iBand = 0; iBand < poWK->nBands; iBand++ )
4926             {
4927                 double dfBandDensity = 0.0;
4928                 double dfValueReal = 0.0;
4929                 double dfValueImag = 0.0;
4930 
4931 /* -------------------------------------------------------------------- */
4932 /*      Collect the source value.                                       */
4933 /* -------------------------------------------------------------------- */
4934                 if( poWK->eResample == GRA_NearestNeighbour ||
4935                     nSrcXSize == 1 || nSrcYSize == 1 )
4936                 {
4937                     // FALSE is returned if dfBandDensity == 0, which is
4938                     // checked below.
4939                     CPL_IGNORE_RET_VAL(
4940                         GWKGetPixelValue( poWK, iBand, iSrcOffset,
4941                                           &dfBandDensity,
4942                                           &dfValueReal, &dfValueImag ));
4943                 }
4944                 else if( poWK->eResample == GRA_Bilinear &&
4945                          bUse4SamplesFormula )
4946                 {
4947                     GWKBilinearResample4Sample( poWK, iBand,
4948                                          padfX[iDstX]-poWK->nSrcXOff,
4949                                          padfY[iDstX]-poWK->nSrcYOff,
4950                                          &dfBandDensity,
4951                                          &dfValueReal, &dfValueImag );
4952                 }
4953                 else if( poWK->eResample == GRA_Cubic &&
4954                          bUse4SamplesFormula )
4955                 {
4956                     GWKCubicResample4Sample( poWK, iBand,
4957                                         padfX[iDstX]-poWK->nSrcXOff,
4958                                         padfY[iDstX]-poWK->nSrcYOff,
4959                                         &dfBandDensity,
4960                                         &dfValueReal, &dfValueImag );
4961                 }
4962                 else
4963 #ifdef DEBUG
4964                 // Only useful for clang static analyzer.
4965                 if( psWrkStruct != nullptr )
4966 #endif
4967                 {
4968                     psWrkStruct->pfnGWKResample( poWK, iBand,
4969                                  padfX[iDstX]-poWK->nSrcXOff,
4970                                  padfY[iDstX]-poWK->nSrcYOff,
4971                                  &dfBandDensity,
4972                                  &dfValueReal, &dfValueImag, psWrkStruct );
4973                 }
4974 
4975                 // If we didn't find any valid inputs skip to next band.
4976                 if( dfBandDensity < BAND_DENSITY_THRESHOLD )
4977                     continue;
4978 
4979                 bHasFoundDensity = true;
4980 
4981 /* -------------------------------------------------------------------- */
4982 /*      We have a computed value from the source.  Now apply it to      */
4983 /*      the destination pixel.                                          */
4984 /* -------------------------------------------------------------------- */
4985                 GWKSetPixelValue( poWK, iBand, iDstOffset,
4986                                   dfBandDensity,
4987                                   dfValueReal, dfValueImag );
4988             }
4989 
4990             if( !bHasFoundDensity )
4991               continue;
4992 
4993 /* -------------------------------------------------------------------- */
4994 /*      Update destination density/validity masks.                      */
4995 /* -------------------------------------------------------------------- */
4996             GWKOverlayDensity( poWK, iDstOffset, dfDensity );
4997 
4998             if( poWK->panDstValid != nullptr )
4999             {
5000                 poWK->panDstValid[iDstOffset>>5] |=
5001                     0x01 << (iDstOffset & 0x1f);
5002             }
5003         } /* Next iDstX */
5004 
5005 /* -------------------------------------------------------------------- */
5006 /*      Report progress to the user, and optionally cancel out.         */
5007 /* -------------------------------------------------------------------- */
5008         if( psJob->pfnProgress && psJob->pfnProgress(psJob) )
5009             break;
5010     }
5011 
5012 /* -------------------------------------------------------------------- */
5013 /*      Cleanup and return.                                             */
5014 /* -------------------------------------------------------------------- */
5015     CPLFree( padfX );
5016     CPLFree( padfY );
5017     CPLFree( padfZ );
5018     CPLFree( pabSuccess );
5019     if( psWrkStruct )
5020         GWKResampleDeleteWrkStruct(psWrkStruct);
5021 }
5022 
5023 static CPLErr GWKGeneralCase( GDALWarpKernel *poWK )
5024 {
5025     return GWKRun( poWK, "GWKGeneralCase", GWKGeneralCaseThread );
5026 }
5027 
5028 /************************************************************************/
5029 /*                            GWKRealCase()                             */
5030 /*                                                                      */
5031 /*      General case for non-complex data types.                        */
5032 /************************************************************************/
5033 
5034 static void GWKRealCaseThread( void* pData)
5035 
5036 {
5037     GWKJobStruct* psJob = static_cast<GWKJobStruct*>(pData);
5038     GDALWarpKernel *poWK = psJob->poWK;
5039     const int iYMin = psJob->iYMin;
5040     const int iYMax = psJob->iYMax;
5041 
5042     const int nDstXSize = poWK->nDstXSize;
5043     const int nSrcXSize = poWK->nSrcXSize;
5044     const int nSrcYSize = poWK->nSrcYSize;
5045 
5046 /* -------------------------------------------------------------------- */
5047 /*      Allocate x,y,z coordinate arrays for transformation ... one     */
5048 /*      scanlines worth of positions.                                   */
5049 /* -------------------------------------------------------------------- */
5050 
5051     // For x, 2 *, because we cache the precomputed values at the end.
5052     double *padfX =
5053         static_cast<double *>(CPLMalloc(2 * sizeof(double) * nDstXSize));
5054     double *padfY =
5055         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5056     double *padfZ =
5057         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5058     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
5059 
5060     const bool bUse4SamplesFormula =
5061         poWK->dfXScale >= 0.95 && poWK->dfYScale >= 0.95;
5062 
5063     GWKResampleWrkStruct* psWrkStruct = nullptr;
5064     if( poWK->eResample != GRA_NearestNeighbour )
5065     {
5066         psWrkStruct = GWKResampleCreateWrkStruct(poWK);
5067     }
5068     const double dfSrcCoordPrecision = CPLAtof(
5069         CSLFetchNameValueDef(poWK->papszWarpOptions,
5070                              "SRC_COORD_PRECISION", "0"));
5071     const double dfErrorThreshold = CPLAtof(
5072         CSLFetchNameValueDef(poWK->papszWarpOptions, "ERROR_THRESHOLD", "0"));
5073 
5074     const bool bSrcMaskIsDensity =
5075         poWK->panUnifiedSrcValid == nullptr &&
5076         poWK->papanBandSrcValid == nullptr &&
5077         poWK->pafUnifiedSrcDensity != nullptr;
5078 
5079     // Precompute values.
5080     for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5081         padfX[nDstXSize + iDstX] = iDstX + 0.5 + poWK->nDstXOff;
5082 
5083 /* ==================================================================== */
5084 /*      Loop over output lines.                                         */
5085 /* ==================================================================== */
5086     for( int iDstY = iYMin; iDstY < iYMax; iDstY++ )
5087     {
5088 /* -------------------------------------------------------------------- */
5089 /*      Setup points to transform to source image space.                */
5090 /* -------------------------------------------------------------------- */
5091         memcpy( padfX, padfX + nDstXSize, sizeof(double) * nDstXSize );
5092         const double dfY = iDstY + 0.5 + poWK->nDstYOff;
5093         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5094             padfY[iDstX] = dfY;
5095         memset( padfZ, 0, sizeof(double) * nDstXSize );
5096 
5097 /* -------------------------------------------------------------------- */
5098 /*      Transform the points from destination pixel/line coordinates    */
5099 /*      to source pixel/line coordinates.                               */
5100 /* -------------------------------------------------------------------- */
5101         poWK->pfnTransformer( psJob->pTransformerArg, TRUE, nDstXSize,
5102                               padfX, padfY, padfZ, pabSuccess );
5103         if( dfSrcCoordPrecision > 0.0 )
5104         {
5105             GWKRoundSourceCoordinates(nDstXSize, padfX, padfY, padfZ,
5106                                       pabSuccess,
5107                                       dfSrcCoordPrecision,
5108                                       dfErrorThreshold,
5109                                       poWK->pfnTransformer,
5110                                       psJob->pTransformerArg,
5111                                       0.5 + poWK->nDstXOff,
5112                                       iDstY + 0.5 + poWK->nDstYOff);
5113         }
5114 
5115 /* ==================================================================== */
5116 /*      Loop over pixels in output scanline.                            */
5117 /* ==================================================================== */
5118         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5119         {
5120             GPtrDiff_t iSrcOffset = 0;
5121             if( !GWKCheckAndComputeSrcOffsets(pabSuccess, iDstX, padfX, padfY,
5122                                     poWK, nSrcXSize, nSrcYSize, iSrcOffset) )
5123                 continue;
5124 
5125 /* -------------------------------------------------------------------- */
5126 /*      Do not try to apply transparent/invalid source pixels to the    */
5127 /*      destination.  This currently ignores the multi-pixel input      */
5128 /*      of bilinear and cubic resamples.                                */
5129 /* -------------------------------------------------------------------- */
5130             double dfDensity = 1.0;
5131 
5132             if( poWK->pafUnifiedSrcDensity != nullptr )
5133             {
5134                 dfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
5135                 if( dfDensity < SRC_DENSITY_THRESHOLD )
5136                     continue;
5137             }
5138 
5139             if( poWK->panUnifiedSrcValid != nullptr
5140                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
5141                      & (0x01 << (iSrcOffset & 0x1f))) )
5142                 continue;
5143 
5144 /* ==================================================================== */
5145 /*      Loop processing each band.                                      */
5146 /* ==================================================================== */
5147             bool bHasFoundDensity = false;
5148 
5149             const GPtrDiff_t iDstOffset = iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
5150             for( int iBand = 0; iBand < poWK->nBands; iBand++ )
5151             {
5152                 double dfBandDensity = 0.0;
5153                 double dfValueReal = 0.0;
5154 
5155 /* -------------------------------------------------------------------- */
5156 /*      Collect the source value.                                       */
5157 /* -------------------------------------------------------------------- */
5158                 if( poWK->eResample == GRA_NearestNeighbour ||
5159                     nSrcXSize == 1 || nSrcYSize == 1 )
5160                 {
5161                     // FALSE is returned if dfBandDensity == 0, which is
5162                     // checked below.
5163                     CPL_IGNORE_RET_VAL(
5164                         GWKGetPixelValueReal( poWK, iBand, iSrcOffset,
5165                                               &dfBandDensity, &dfValueReal ));
5166                 }
5167                 else if( poWK->eResample == GRA_Bilinear &&
5168                          bUse4SamplesFormula )
5169                 {
5170                     double dfValueImagIgnored = 0.0;
5171                     GWKBilinearResample4Sample( poWK, iBand,
5172                                          padfX[iDstX]-poWK->nSrcXOff,
5173                                          padfY[iDstX]-poWK->nSrcYOff,
5174                                          &dfBandDensity,
5175                                          &dfValueReal, &dfValueImagIgnored );
5176                 }
5177                 else if( poWK->eResample == GRA_Cubic &&
5178                          bUse4SamplesFormula )
5179                 {
5180                     if( bSrcMaskIsDensity )
5181                     {
5182                         if( poWK->eWorkingDataType == GDT_Byte )
5183                         {
5184                             GWKCubicResampleSrcMaskIsDensity4SampleRealT<GByte>(
5185                                                 poWK, iBand,
5186                                                 padfX[iDstX]-poWK->nSrcXOff,
5187                                                 padfY[iDstX]-poWK->nSrcYOff,
5188                                                 &dfBandDensity,
5189                                                 &dfValueReal );
5190                         }
5191                         else if( poWK->eWorkingDataType == GDT_UInt16 )
5192                         {
5193                             GWKCubicResampleSrcMaskIsDensity4SampleRealT<GUInt16>(
5194                                                 poWK, iBand,
5195                                                 padfX[iDstX]-poWK->nSrcXOff,
5196                                                 padfY[iDstX]-poWK->nSrcYOff,
5197                                                 &dfBandDensity,
5198                                                 &dfValueReal );
5199                         }
5200                         else
5201                         {
5202                             GWKCubicResampleSrcMaskIsDensity4SampleReal( poWK, iBand,
5203                                     padfX[iDstX]-poWK->nSrcXOff,
5204                                     padfY[iDstX]-poWK->nSrcYOff,
5205                                     &dfBandDensity,
5206                                     &dfValueReal );
5207                         }
5208                     }
5209                     else
5210                     {
5211                         double dfValueImagIgnored = 0.0;
5212                         GWKCubicResample4Sample( poWK, iBand,
5213                                             padfX[iDstX]-poWK->nSrcXOff,
5214                                             padfY[iDstX]-poWK->nSrcYOff,
5215                                             &dfBandDensity,
5216                                             &dfValueReal,
5217                                             &dfValueImagIgnored);
5218                     }
5219                 }
5220                 else
5221 #ifdef DEBUG
5222                 // Only useful for clang static analyzer.
5223                 if( psWrkStruct != nullptr )
5224 #endif
5225                 {
5226                     double dfValueImagIgnored = 0.0;
5227                     psWrkStruct->pfnGWKResample(
5228                         poWK, iBand,
5229                         padfX[iDstX]-poWK->nSrcXOff,
5230                         padfY[iDstX]-poWK->nSrcYOff,
5231                         &dfBandDensity,
5232                         &dfValueReal, &dfValueImagIgnored, psWrkStruct);
5233                 }
5234 
5235                 // If we didn't find any valid inputs skip to next band.
5236                 if( dfBandDensity < BAND_DENSITY_THRESHOLD )
5237                     continue;
5238 
5239                 bHasFoundDensity = true;
5240 
5241 /* -------------------------------------------------------------------- */
5242 /*      We have a computed value from the source.  Now apply it to      */
5243 /*      the destination pixel.                                          */
5244 /* -------------------------------------------------------------------- */
5245                 GWKSetPixelValueReal(poWK, iBand, iDstOffset,
5246                                      dfBandDensity,
5247                                      dfValueReal);
5248             }
5249 
5250             if( !bHasFoundDensity )
5251               continue;
5252 
5253 /* -------------------------------------------------------------------- */
5254 /*      Update destination density/validity masks.                      */
5255 /* -------------------------------------------------------------------- */
5256             GWKOverlayDensity( poWK, iDstOffset, dfDensity );
5257 
5258             if( poWK->panDstValid != nullptr )
5259             {
5260                 poWK->panDstValid[iDstOffset>>5] |=
5261                     0x01 << (iDstOffset & 0x1f);
5262             }
5263         }  // Next iDstX.
5264 
5265 /* -------------------------------------------------------------------- */
5266 /*      Report progress to the user, and optionally cancel out.         */
5267 /* -------------------------------------------------------------------- */
5268         if( psJob->pfnProgress && psJob->pfnProgress(psJob) )
5269             break;
5270     }
5271 
5272 /* -------------------------------------------------------------------- */
5273 /*      Cleanup and return.                                             */
5274 /* -------------------------------------------------------------------- */
5275     CPLFree( padfX );
5276     CPLFree( padfY );
5277     CPLFree( padfZ );
5278     CPLFree( pabSuccess );
5279     if( psWrkStruct )
5280         GWKResampleDeleteWrkStruct(psWrkStruct);
5281 }
5282 
5283 static CPLErr GWKRealCase( GDALWarpKernel *poWK )
5284 {
5285     return GWKRun( poWK, "GWKRealCase", GWKRealCaseThread );
5286 }
5287 
5288 /************************************************************************/
5289 /*                GWKResampleNoMasksOrDstDensityOnlyThreadInternal()    */
5290 /************************************************************************/
5291 
5292 template<class T, GDALResampleAlg eResample, int bUse4SamplesFormula>
5293 static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal( void* pData )
5294 
5295 {
5296     GWKJobStruct* psJob = static_cast<GWKJobStruct *>(pData);
5297     GDALWarpKernel *poWK = psJob->poWK;
5298     const int iYMin = psJob->iYMin;
5299     const int iYMax = psJob->iYMax;
5300 
5301     const int nDstXSize = poWK->nDstXSize;
5302     const int nSrcXSize = poWK->nSrcXSize;
5303     const int nSrcYSize = poWK->nSrcYSize;
5304 
5305 /* -------------------------------------------------------------------- */
5306 /*      Allocate x,y,z coordinate arrays for transformation ... one     */
5307 /*      scanlines worth of positions.                                   */
5308 /* -------------------------------------------------------------------- */
5309 
5310     // For x, 2 *, because we cache the precomputed values at the end.
5311     double *padfX =
5312         static_cast<double *>(CPLMalloc(2 * sizeof(double) * nDstXSize));
5313     double *padfY =
5314         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5315     double *padfZ =
5316         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5317     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
5318 
5319     const int nXRadius = poWK->nXRadius;
5320     double  *padfWeight =
5321       static_cast<double *>(CPLCalloc(1 + nXRadius * 2, sizeof(double)));
5322     const double dfSrcCoordPrecision = CPLAtof(
5323         CSLFetchNameValueDef(poWK->papszWarpOptions,
5324                              "SRC_COORD_PRECISION", "0"));
5325     const double dfErrorThreshold = CPLAtof(
5326         CSLFetchNameValueDef(poWK->papszWarpOptions, "ERROR_THRESHOLD", "0"));
5327 
5328     // Precompute values.
5329     for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5330         padfX[nDstXSize + iDstX] = iDstX + 0.5 + poWK->nDstXOff;
5331 
5332 /* ==================================================================== */
5333 /*      Loop over output lines.                                         */
5334 /* ==================================================================== */
5335     for( int iDstY = iYMin; iDstY < iYMax; iDstY++ )
5336     {
5337 /* -------------------------------------------------------------------- */
5338 /*      Setup points to transform to source image space.                */
5339 /* -------------------------------------------------------------------- */
5340         memcpy( padfX, padfX + nDstXSize, sizeof(double) * nDstXSize );
5341         const double dfY = iDstY + 0.5 + poWK->nDstYOff;
5342         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5343             padfY[iDstX] = dfY;
5344         memset( padfZ, 0, sizeof(double) * nDstXSize );
5345 
5346 /* -------------------------------------------------------------------- */
5347 /*      Transform the points from destination pixel/line coordinates    */
5348 /*      to source pixel/line coordinates.                               */
5349 /* -------------------------------------------------------------------- */
5350         poWK->pfnTransformer( psJob->pTransformerArg, TRUE, nDstXSize,
5351                               padfX, padfY, padfZ, pabSuccess );
5352         if( dfSrcCoordPrecision > 0.0 )
5353         {
5354             GWKRoundSourceCoordinates(nDstXSize, padfX, padfY, padfZ, pabSuccess,
5355                                       dfSrcCoordPrecision,
5356                                       dfErrorThreshold,
5357                                       poWK->pfnTransformer,
5358                                       psJob->pTransformerArg,
5359                                       0.5 + poWK->nDstXOff,
5360                                       iDstY + 0.5 + poWK->nDstYOff);
5361         }
5362 
5363 /* ==================================================================== */
5364 /*      Loop over pixels in output scanline.                            */
5365 /* ==================================================================== */
5366         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5367         {
5368             GPtrDiff_t iSrcOffset = 0;
5369             if( !GWKCheckAndComputeSrcOffsets(pabSuccess, iDstX, padfX, padfY,
5370                                               poWK, nSrcXSize, nSrcYSize,
5371                                               iSrcOffset) )
5372                 continue;
5373 
5374 /* ==================================================================== */
5375 /*      Loop processing each band.                                      */
5376 /* ==================================================================== */
5377             const GPtrDiff_t iDstOffset = iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
5378 
5379             for( int iBand = 0; iBand < poWK->nBands; iBand++ )
5380             {
5381                 T value = 0;
5382                 if( eResample == GRA_NearestNeighbour )
5383                 {
5384                     value = reinterpret_cast<T *>(
5385                         poWK->papabySrcImage[iBand])[iSrcOffset];
5386                 }
5387                 else if( bUse4SamplesFormula )
5388                 {
5389                     if( eResample == GRA_Bilinear )
5390                         GWKBilinearResampleNoMasks4SampleT( poWK, iBand,
5391                                                 padfX[iDstX]-poWK->nSrcXOff,
5392                                                 padfY[iDstX]-poWK->nSrcYOff,
5393                                                 &value );
5394                     else
5395                         GWKCubicResampleNoMasks4SampleT( poWK, iBand,
5396                                               padfX[iDstX]-poWK->nSrcXOff,
5397                                               padfY[iDstX]-poWK->nSrcYOff,
5398                                               &value );
5399                 }
5400                 else
5401                 {
5402                     GWKResampleNoMasksT( poWK, iBand,
5403                                     padfX[iDstX]-poWK->nSrcXOff,
5404                                     padfY[iDstX]-poWK->nSrcYOff,
5405                                     &value,
5406                                     padfWeight);
5407                 }
5408                 reinterpret_cast<T *>(poWK->papabyDstImage[iBand])[iDstOffset] =
5409                     value;
5410             }
5411 
5412             if( poWK->pafDstDensity )
5413                 poWK->pafDstDensity[iDstOffset] = 1.0f;
5414         }
5415 
5416 /* -------------------------------------------------------------------- */
5417 /*      Report progress to the user, and optionally cancel out.         */
5418 /* -------------------------------------------------------------------- */
5419         if( psJob->pfnProgress && psJob->pfnProgress(psJob) )
5420             break;
5421     }
5422 
5423 /* -------------------------------------------------------------------- */
5424 /*      Cleanup and return.                                             */
5425 /* -------------------------------------------------------------------- */
5426     CPLFree( padfX );
5427     CPLFree( padfY );
5428     CPLFree( padfZ );
5429     CPLFree( pabSuccess );
5430     CPLFree( padfWeight );
5431 }
5432 
5433 template<class T, GDALResampleAlg eResample>
5434 static void GWKResampleNoMasksOrDstDensityOnlyThread( void* pData )
5435 {
5436     GWKResampleNoMasksOrDstDensityOnlyThreadInternal<T, eResample,
5437                                                      FALSE>(pData);
5438 }
5439 
5440 template<class T, GDALResampleAlg eResample>
5441 static void GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread( void* pData )
5442 
5443 {
5444     GWKJobStruct* psJob = static_cast<GWKJobStruct *>(pData);
5445     GDALWarpKernel *poWK = psJob->poWK;
5446     CPLAssert(eResample == GRA_Bilinear || eResample == GRA_Cubic);
5447     const bool bUse4SamplesFormula =
5448         poWK->dfXScale >= 0.95 && poWK->dfYScale >= 0.95;
5449     if( bUse4SamplesFormula )
5450         GWKResampleNoMasksOrDstDensityOnlyThreadInternal<T, eResample,
5451                                                          TRUE>(pData);
5452     else
5453         GWKResampleNoMasksOrDstDensityOnlyThreadInternal<T, eResample,
5454                                                          FALSE>(pData);
5455 }
5456 
5457 static CPLErr GWKNearestNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK )
5458 {
5459     return GWKRun(
5460         poWK, "GWKNearestNoMasksOrDstDensityOnlyByte",
5461         GWKResampleNoMasksOrDstDensityOnlyThread<GByte, GRA_NearestNeighbour>);
5462 }
5463 
5464 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK )
5465 {
5466     return GWKRun(
5467         poWK, "GWKBilinearNoMasksOrDstDensityOnlyByte",
5468         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<GByte,
5469                                                            GRA_Bilinear>);
5470 }
5471 
5472 static CPLErr GWKCubicNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK )
5473 {
5474     return GWKRun(
5475         poWK, "GWKCubicNoMasksOrDstDensityOnlyByte",
5476         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<GByte, GRA_Cubic>);
5477 }
5478 
5479 static CPLErr GWKCubicNoMasksOrDstDensityOnlyFloat( GDALWarpKernel *poWK )
5480 {
5481     return GWKRun(
5482         poWK, "GWKCubicNoMasksOrDstDensityOnlyFloat",
5483         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<float, GRA_Cubic>);
5484 }
5485 
5486 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
5487 
5488 static CPLErr GWKCubicNoMasksOrDstDensityOnlyDouble( GDALWarpKernel *poWK )
5489 {
5490     return GWKRun(
5491         poWK, "GWKCubicNoMasksOrDstDensityOnlyDouble",
5492         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<double, GRA_Cubic>);
5493 }
5494 #endif
5495 
5496 static CPLErr GWKCubicSplineNoMasksOrDstDensityOnlyByte( GDALWarpKernel *poWK )
5497 {
5498     return GWKRun(
5499         poWK, "GWKCubicSplineNoMasksOrDstDensityOnlyByte",
5500         GWKResampleNoMasksOrDstDensityOnlyThread<GByte, GRA_CubicSpline>);
5501 }
5502 
5503 /************************************************************************/
5504 /*                          GWKNearestByte()                            */
5505 /*                                                                      */
5506 /*      Case for 8bit input data with nearest neighbour resampling      */
5507 /*      using valid flags. Should be as fast as possible for this       */
5508 /*      particular transformation type.                                 */
5509 /************************************************************************/
5510 
5511 template<class T>
5512 static void GWKNearestThread( void* pData )
5513 
5514 {
5515     GWKJobStruct* psJob = static_cast<GWKJobStruct *>(pData);
5516     GDALWarpKernel *poWK = psJob->poWK;
5517     const int iYMin = psJob->iYMin;
5518     const int iYMax = psJob->iYMax;
5519 
5520     const int nDstXSize = poWK->nDstXSize;
5521     const int nSrcXSize = poWK->nSrcXSize;
5522     const int nSrcYSize = poWK->nSrcYSize;
5523 
5524 /* -------------------------------------------------------------------- */
5525 /*      Allocate x,y,z coordinate arrays for transformation ... one     */
5526 /*      scanlines worth of positions.                                   */
5527 /* -------------------------------------------------------------------- */
5528 
5529     // For x, 2 *, because we cache the precomputed values at the end.
5530     double *padfX =
5531         static_cast<double *>(CPLMalloc(2 * sizeof(double) * nDstXSize));
5532     double *padfY =
5533         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5534     double *padfZ =
5535        static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5536     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
5537 
5538     const double dfSrcCoordPrecision = CPLAtof(
5539         CSLFetchNameValueDef(poWK->papszWarpOptions,
5540                              "SRC_COORD_PRECISION", "0"));
5541     const double dfErrorThreshold = CPLAtof(
5542         CSLFetchNameValueDef(poWK->papszWarpOptions, "ERROR_THRESHOLD", "0"));
5543 
5544     // Precompute values.
5545     for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5546         padfX[nDstXSize + iDstX] = iDstX + 0.5 + poWK->nDstXOff;
5547 
5548 /* ==================================================================== */
5549 /*      Loop over output lines.                                         */
5550 /* ==================================================================== */
5551     for( int iDstY = iYMin; iDstY < iYMax; iDstY++ )
5552     {
5553 
5554 /* -------------------------------------------------------------------- */
5555 /*      Setup points to transform to source image space.                */
5556 /* -------------------------------------------------------------------- */
5557         memcpy( padfX, padfX + nDstXSize, sizeof(double) * nDstXSize );
5558         const double dfY = iDstY + 0.5 + poWK->nDstYOff;
5559         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5560             padfY[iDstX] = dfY;
5561         memset( padfZ, 0, sizeof(double) * nDstXSize );
5562 
5563 /* -------------------------------------------------------------------- */
5564 /*      Transform the points from destination pixel/line coordinates    */
5565 /*      to source pixel/line coordinates.                               */
5566 /* -------------------------------------------------------------------- */
5567         poWK->pfnTransformer( psJob->pTransformerArg, TRUE, nDstXSize,
5568                               padfX, padfY, padfZ, pabSuccess );
5569         if( dfSrcCoordPrecision > 0.0 )
5570         {
5571             GWKRoundSourceCoordinates(nDstXSize, padfX, padfY, padfZ,
5572                                       pabSuccess,
5573                                       dfSrcCoordPrecision,
5574                                       dfErrorThreshold,
5575                                       poWK->pfnTransformer,
5576                                       psJob->pTransformerArg,
5577                                       0.5 + poWK->nDstXOff,
5578                                       iDstY + 0.5 + poWK->nDstYOff);
5579         }
5580 /* ==================================================================== */
5581 /*      Loop over pixels in output scanline.                            */
5582 /* ==================================================================== */
5583         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5584         {
5585             GPtrDiff_t iSrcOffset = 0;
5586             if( !GWKCheckAndComputeSrcOffsets(pabSuccess, iDstX, padfX, padfY,
5587                                     poWK, nSrcXSize, nSrcYSize, iSrcOffset) )
5588                 continue;
5589 
5590 /* -------------------------------------------------------------------- */
5591 /*      Do not try to apply invalid source pixels to the dest.          */
5592 /* -------------------------------------------------------------------- */
5593             if( poWK->panUnifiedSrcValid != nullptr
5594                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
5595                      & (0x01 << (iSrcOffset & 0x1f))) )
5596                 continue;
5597 
5598 /* -------------------------------------------------------------------- */
5599 /*      Do not try to apply transparent source pixels to the destination.*/
5600 /* -------------------------------------------------------------------- */
5601             double dfDensity = 1.0;
5602 
5603             if( poWK->pafUnifiedSrcDensity != nullptr )
5604             {
5605                 dfDensity = poWK->pafUnifiedSrcDensity[iSrcOffset];
5606                 if( dfDensity < SRC_DENSITY_THRESHOLD )
5607                     continue;
5608             }
5609 
5610 /* ==================================================================== */
5611 /*      Loop processing each band.                                      */
5612 /* ==================================================================== */
5613 
5614             const GPtrDiff_t iDstOffset = iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
5615 
5616             for( int iBand = 0; iBand < poWK->nBands; iBand++ )
5617             {
5618                 T value = 0;
5619                 double dfBandDensity = 0.0;
5620 
5621 /* -------------------------------------------------------------------- */
5622 /*      Collect the source value.                                       */
5623 /* -------------------------------------------------------------------- */
5624                 if( GWKGetPixelT(poWK, iBand, iSrcOffset,
5625                                  &dfBandDensity, &value) )
5626                 {
5627                     if( dfBandDensity < 1.0 )
5628                     {
5629                         if( dfBandDensity == 0.0 )
5630                         {
5631                             // Do nothing.
5632                         }
5633                         else
5634                         {
5635                             // Let the general code take care of mixing.
5636                             GWKSetPixelValueRealT( poWK, iBand, iDstOffset,
5637                                           dfBandDensity, value );
5638                         }
5639                     }
5640                     else
5641                     {
5642                         reinterpret_cast<T *>(
5643                             poWK->papabyDstImage[iBand])[iDstOffset] = value;
5644                     }
5645                 }
5646             }
5647 
5648 /* -------------------------------------------------------------------- */
5649 /*      Mark this pixel valid/opaque in the output.                     */
5650 /* -------------------------------------------------------------------- */
5651             GWKOverlayDensity( poWK, iDstOffset, dfDensity );
5652 
5653             if( poWK->panDstValid != nullptr )
5654             {
5655                 poWK->panDstValid[iDstOffset>>5] |=
5656                     0x01 << (iDstOffset & 0x1f);
5657             }
5658         } /* Next iDstX */
5659 
5660 /* -------------------------------------------------------------------- */
5661 /*      Report progress to the user, and optionally cancel out.         */
5662 /* -------------------------------------------------------------------- */
5663         if( psJob->pfnProgress && psJob->pfnProgress(psJob) )
5664             break;
5665     }
5666 
5667 /* -------------------------------------------------------------------- */
5668 /*      Cleanup and return.                                             */
5669 /* -------------------------------------------------------------------- */
5670     CPLFree( padfX );
5671     CPLFree( padfY );
5672     CPLFree( padfZ );
5673     CPLFree( pabSuccess );
5674 }
5675 
5676 static CPLErr GWKNearestByte( GDALWarpKernel *poWK )
5677 {
5678     return GWKRun( poWK, "GWKNearestByte", GWKNearestThread<GByte> );
5679 }
5680 
5681 static CPLErr GWKNearestNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK )
5682 {
5683     return GWKRun(
5684         poWK, "GWKNearestNoMasksOrDstDensityOnlyShort",
5685         GWKResampleNoMasksOrDstDensityOnlyThread<GInt16, GRA_NearestNeighbour>);
5686 }
5687 
5688 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK )
5689 {
5690     return GWKRun(
5691         poWK, "GWKBilinearNoMasksOrDstDensityOnlyShort",
5692         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<GInt16,
5693                                                            GRA_Bilinear>);
5694 }
5695 
5696 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyUShort( GDALWarpKernel *poWK )
5697 {
5698     return GWKRun(
5699         poWK, "GWKBilinearNoMasksOrDstDensityOnlyUShort",
5700         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<GUInt16,
5701                                                            GRA_Bilinear>);
5702 }
5703 
5704 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyFloat( GDALWarpKernel *poWK )
5705 {
5706     return GWKRun(
5707         poWK, "GWKBilinearNoMasksOrDstDensityOnlyFloat",
5708         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<float,
5709                                                            GRA_Bilinear>);
5710 }
5711 
5712 #ifdef INSTANTIATE_FLOAT64_SSE2_IMPL
5713 
5714 static CPLErr GWKBilinearNoMasksOrDstDensityOnlyDouble( GDALWarpKernel *poWK )
5715 {
5716     return GWKRun(
5717         poWK, "GWKBilinearNoMasksOrDstDensityOnlyDouble",
5718         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<double,
5719                                                            GRA_Bilinear>);
5720 }
5721 #endif
5722 
5723 static CPLErr GWKCubicNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK )
5724 {
5725     return GWKRun(
5726         poWK, "GWKCubicNoMasksOrDstDensityOnlyShort",
5727         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<GInt16, GRA_Cubic>);
5728 }
5729 
5730 static CPLErr GWKCubicNoMasksOrDstDensityOnlyUShort( GDALWarpKernel *poWK )
5731 {
5732     return GWKRun(
5733         poWK, "GWKCubicNoMasksOrDstDensityOnlyUShort",
5734         GWKResampleNoMasksOrDstDensityOnlyHas4SampleThread<GUInt16, GRA_Cubic>);
5735 }
5736 
5737 static CPLErr GWKCubicSplineNoMasksOrDstDensityOnlyShort( GDALWarpKernel *poWK )
5738 {
5739     return GWKRun(
5740         poWK, "GWKCubicSplineNoMasksOrDstDensityOnlyShort",
5741         GWKResampleNoMasksOrDstDensityOnlyThread<GInt16, GRA_CubicSpline>);
5742 }
5743 
5744 static CPLErr GWKCubicSplineNoMasksOrDstDensityOnlyUShort(
5745     GDALWarpKernel *poWK )
5746 {
5747     return GWKRun(
5748         poWK, "GWKCubicSplineNoMasksOrDstDensityOnlyUShort",
5749         GWKResampleNoMasksOrDstDensityOnlyThread<GUInt16, GRA_CubicSpline>);
5750 }
5751 
5752 static CPLErr GWKNearestShort( GDALWarpKernel *poWK )
5753 {
5754     return GWKRun(poWK, "GWKNearestShort", GWKNearestThread<GInt16>);
5755 }
5756 
5757 static CPLErr GWKNearestNoMasksOrDstDensityOnlyFloat( GDALWarpKernel *poWK )
5758 {
5759     return GWKRun(
5760         poWK, "GWKNearestNoMasksOrDstDensityOnlyFloat",
5761         GWKResampleNoMasksOrDstDensityOnlyThread<float, GRA_NearestNeighbour>);
5762 }
5763 
5764 static CPLErr GWKNearestFloat( GDALWarpKernel *poWK )
5765 {
5766     return GWKRun(poWK, "GWKNearestFloat", GWKNearestThread<float>);
5767 }
5768 
5769 /************************************************************************/
5770 /*                           GWKAverageOrMode()                         */
5771 /*                                                                      */
5772 /************************************************************************/
5773 
5774 static void GWKAverageOrModeThread(void* pData);
5775 
5776 static CPLErr GWKAverageOrMode( GDALWarpKernel *poWK )
5777 {
5778     return GWKRun( poWK, "GWKAverageOrMode", GWKAverageOrModeThread );
5779 }
5780 
5781 // Overall logic based on GWKGeneralCaseThread().
5782 static void GWKAverageOrModeThread( void* pData)
5783 {
5784     GWKJobStruct* psJob = static_cast<GWKJobStruct *>(pData);
5785     GDALWarpKernel *poWK = psJob->poWK;
5786     const int iYMin = psJob->iYMin;
5787     const int iYMax = psJob->iYMax;
5788 
5789     const int nDstXSize = poWK->nDstXSize;
5790     const int nSrcXSize = poWK->nSrcXSize;
5791     const int nSrcYSize = poWK->nSrcYSize;
5792 
5793 /* -------------------------------------------------------------------- */
5794 /*      Find out which algorithm to use (small optim.)                  */
5795 /* -------------------------------------------------------------------- */
5796     int nAlgo = 0;
5797 
5798     // These vars only used with nAlgo == 3.
5799     int *panVals = nullptr;
5800     int nBins = 0;
5801     int nBinsOffset = 0;
5802 
5803     // Only used with nAlgo = 2.
5804     float* pafRealVals = nullptr;
5805     float* pafImagVals = nullptr;
5806     int* panRealSums = nullptr;
5807     int* panImagSums = nullptr;
5808 
5809     // Only used with nAlgo = 6.
5810     float quant = 0.5;
5811 
5812     //To control array allocation only when data type is complex
5813     const bool bIsComplex = GDALDataTypeIsComplex(poWK->eWorkingDataType)!=0;
5814 
5815     if( poWK->eResample == GRA_Average )
5816     {
5817         nAlgo = GWKAOM_Average;
5818     }
5819     else if( poWK->eResample == GRA_RMS )
5820     {
5821         nAlgo = GWKAOM_RMS;
5822     }
5823     else if( poWK->eResample == GRA_Mode )
5824     {
5825         // TODO check color table count > 256.
5826         if( poWK->eWorkingDataType == GDT_Byte ||
5827             poWK->eWorkingDataType == GDT_UInt16 ||
5828             poWK->eWorkingDataType == GDT_Int16 )
5829         {
5830             nAlgo = GWKAOM_Imode;
5831 
5832             // In the case of a paletted or non-paletted byte band,
5833             // Input values are between 0 and 255.
5834             if( poWK->eWorkingDataType == GDT_Byte )
5835             {
5836                 nBins = 256;
5837             }
5838             // In the case of Int16, input values are between -32768 and 32767.
5839             else if( poWK->eWorkingDataType == GDT_Int16 )
5840             {
5841                 nBins = 65536;
5842                 nBinsOffset = 32768;
5843             }
5844             // In the case of UInt16, input values are between 0 and 65537.
5845             else if( poWK->eWorkingDataType == GDT_UInt16 )
5846             {
5847                 nBins = 65536;
5848             }
5849             panVals =
5850                 static_cast<int *>(VSI_MALLOC_VERBOSE(nBins * sizeof(int)));
5851             if( panVals == nullptr )
5852                 return;
5853         }
5854         else
5855         {
5856             nAlgo = GWKAOM_Fmode;
5857 
5858             if( nSrcXSize > 0 && nSrcYSize > 0 )
5859             {
5860                 pafRealVals = static_cast<float *>(
5861                     VSI_MALLOC3_VERBOSE(nSrcXSize, nSrcYSize, sizeof(float)));
5862                 panRealSums = static_cast<int *>(
5863                     VSI_MALLOC3_VERBOSE(nSrcXSize, nSrcYSize, sizeof(int)));
5864                 if( pafRealVals == nullptr || panRealSums == nullptr )
5865                 {
5866                     VSIFree(pafRealVals);
5867                     VSIFree(panRealSums);
5868                     return;
5869                 }
5870             }
5871         }
5872     }
5873     else if( poWK->eResample == GRA_Max )
5874     {
5875         nAlgo = GWKAOM_Max;
5876     }
5877     else if( poWK->eResample == GRA_Min )
5878     {
5879         nAlgo = GWKAOM_Min;
5880     }
5881     else if( poWK->eResample == GRA_Med )
5882     {
5883         nAlgo = GWKAOM_Quant;
5884         quant = 0.5;
5885     }
5886     else if( poWK->eResample == GRA_Q1 )
5887     {
5888         nAlgo = GWKAOM_Quant;
5889         quant = 0.25;
5890     }
5891     else if( poWK->eResample == GRA_Q3 )
5892     {
5893         nAlgo = GWKAOM_Quant;
5894         quant = 0.75;
5895     }
5896     else if( poWK->eResample == GRA_Sum )
5897     {
5898         nAlgo = GWKAOM_Sum;
5899     }
5900     else
5901     {
5902         // Other resample algorithms not permitted here.
5903         CPLDebug("GDAL",
5904                  "GDALWarpKernel():GWKAverageOrModeThread() ERROR, "
5905                  "illegal resample" );
5906         return;
5907     }
5908 
5909     CPLDebug("GDAL",
5910              "GDALWarpKernel():GWKAverageOrModeThread() using algo %d", nAlgo);
5911 
5912 /* -------------------------------------------------------------------- */
5913 /*      Allocate x,y,z coordinate arrays for transformation ... two     */
5914 /*      scanlines worth of positions.                                   */
5915 /* -------------------------------------------------------------------- */
5916 
5917     double *padfX =
5918         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5919     double *padfY =
5920         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5921     double *padfZ =
5922         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5923     double *padfX2 =
5924         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5925     double *padfY2 =
5926         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5927     double *padfZ2 =
5928         static_cast<double *>(CPLMalloc(sizeof(double) * nDstXSize));
5929     int *pabSuccess = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
5930     int *pabSuccess2 = static_cast<int *>(CPLMalloc(sizeof(int) * nDstXSize));
5931 
5932     const double dfSrcCoordPrecision = CPLAtof(
5933         CSLFetchNameValueDef(poWK->papszWarpOptions,
5934                              "SRC_COORD_PRECISION", "0"));
5935     const double dfErrorThreshold = CPLAtof(
5936         CSLFetchNameValueDef(poWK->papszWarpOptions, "ERROR_THRESHOLD", "0"));
5937 
5938 /* ==================================================================== */
5939 /*      Loop over output lines.                                         */
5940 /* ==================================================================== */
5941     for( int iDstY = iYMin; iDstY < iYMax; iDstY++ )
5942     {
5943 
5944 /* -------------------------------------------------------------------- */
5945 /*      Setup points to transform to source image space.                */
5946 /* -------------------------------------------------------------------- */
5947         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5948         {
5949             padfX[iDstX] = iDstX + poWK->nDstXOff;
5950             padfY[iDstX] = iDstY + poWK->nDstYOff;
5951             padfZ[iDstX] = 0.0;
5952             padfX2[iDstX] = iDstX + 1.0 + poWK->nDstXOff;
5953             padfY2[iDstX] = iDstY + 1.0 + poWK->nDstYOff;
5954             padfZ2[iDstX] = 0.0;
5955         }
5956 
5957 /* -------------------------------------------------------------------- */
5958 /*      Transform the points from destination pixel/line coordinates    */
5959 /*      to source pixel/line coordinates.                               */
5960 /* -------------------------------------------------------------------- */
5961         poWK->pfnTransformer( psJob->pTransformerArg, TRUE, nDstXSize,
5962                               padfX, padfY, padfZ, pabSuccess );
5963         poWK->pfnTransformer( psJob->pTransformerArg, TRUE, nDstXSize,
5964                               padfX2, padfY2, padfZ2, pabSuccess2 );
5965 
5966         if( dfSrcCoordPrecision > 0.0 )
5967         {
5968             GWKRoundSourceCoordinates(nDstXSize, padfX, padfY, padfZ,
5969                                       pabSuccess,
5970                                       dfSrcCoordPrecision,
5971                                       dfErrorThreshold,
5972                                       poWK->pfnTransformer,
5973                                       psJob->pTransformerArg,
5974                                       poWK->nDstXOff,
5975                                       iDstY + poWK->nDstYOff);
5976             GWKRoundSourceCoordinates(nDstXSize, padfX2, padfY2, padfZ2,
5977                                       pabSuccess2,
5978                                       dfSrcCoordPrecision,
5979                                       dfErrorThreshold,
5980                                       poWK->pfnTransformer,
5981                                       psJob->pTransformerArg,
5982                                       1.0 + poWK->nDstXOff,
5983                                       iDstY + 1.0 + poWK->nDstYOff);
5984         }
5985 
5986 /* ==================================================================== */
5987 /*      Loop over pixels in output scanline.                            */
5988 /* ==================================================================== */
5989         for( int iDstX = 0; iDstX < nDstXSize; iDstX++ )
5990         {
5991             GPtrDiff_t iSrcOffset = 0;
5992             double dfDensity = 1.0;
5993             bool bHasFoundDensity = false;
5994 
5995             if( !pabSuccess[iDstX] || !pabSuccess2[iDstX] )
5996                 continue;
5997 
5998             if( padfX[iDstX] < poWK->nSrcXOff - 1 ||
5999                 padfX2[iDstX] < poWK->nSrcXOff - 1 ||
6000                 padfY[iDstX] < poWK->nSrcYOff - 1 ||
6001                 padfY2[iDstX] < poWK->nSrcYOff -1 ||
6002                 padfX[iDstX] > nSrcXSize + poWK->nSrcXOff + 1 ||
6003                 padfX2[iDstX] > nSrcXSize + poWK->nSrcXOff + 1 ||
6004                 padfY[iDstX] > nSrcYSize + poWK->nSrcYOff + 1 ||
6005                 padfY2[iDstX] > nSrcYSize + poWK->nSrcYOff + 1 )
6006             {
6007                 continue;
6008             }
6009 
6010             const GPtrDiff_t iDstOffset = iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
6011 
6012             // Compute corners in source crs.
6013 
6014             // The transformation might not have preserved ordering of
6015             // coordinates so do the necessary swapping (#5433).
6016             // NOTE: this is really an approximative fix. To do something
6017             // more precise we would for example need to compute the
6018             // transformation of coordinates in the
6019             // [iDstX,iDstY]x[iDstX+1,iDstY+1] square back to source
6020             // coordinates, and take the bounding box of the got source
6021             // coordinates.
6022             const double dfXMin = std::min(padfX[iDstX],padfX2[iDstX]) -
6023                                     poWK->nSrcXOff;
6024             int iSrcXMin =
6025                 std::max(static_cast<int>(floor(dfXMin + 1e-10)), 0);
6026             const double dfXMax = std::max(padfX[iDstX],padfX2[iDstX])  -
6027                                     poWK->nSrcXOff;
6028             int iSrcXMax =
6029                 std::min(static_cast<int>(ceil(dfXMax - 1e-10)), nSrcXSize);
6030             const double dfYMin = std::min(padfY[iDstX],padfY2[iDstX]) -
6031                                     poWK->nSrcYOff;
6032             int iSrcYMin =
6033                 std::max(static_cast<int>(floor(dfYMin + 1e-10)), 0);
6034             const double dfYMax = std::max(padfY[iDstX],padfY2[iDstX]) -
6035                                     poWK->nSrcYOff;
6036             int iSrcYMax =
6037                 std::min(static_cast<int>(ceil(dfYMax - 1e-10)), nSrcYSize);
6038 
6039             if( iSrcXMin == iSrcXMax && iSrcXMax < nSrcXSize )
6040                 iSrcXMax++;
6041             if( iSrcYMin == iSrcYMax && iSrcYMax < nSrcYSize )
6042                 iSrcYMax++;
6043 
6044 /* ==================================================================== */
6045 /*      Loop processing each band.                                      */
6046 /* ==================================================================== */
6047 
6048             for( int iBand = 0; iBand < poWK->nBands; iBand++ )
6049             {
6050                 double dfBandDensity = 0.0;
6051                 double dfValueReal = 0.0;
6052                 double dfValueImag = 0.0;
6053                 double dfValueRealTmp = 0.0;
6054                 double dfValueImagTmp = 0.0;
6055 
6056 /* -------------------------------------------------------------------- */
6057 /*      Collect the source value.                                       */
6058 /* -------------------------------------------------------------------- */
6059 
6060                 // Loop over source lines and pixels - 3 possible algorithms.
6061 
6062 #define COMPUTE_WEIGHT_Y(iSrcY) \
6063     ((iSrcY == iSrcYMin) ? \
6064         ((iSrcYMin + 1 == iSrcYMax) ? 1.0 : 1 - (dfYMin - iSrcYMin)): \
6065      (iSrcY + 1 == iSrcYMax) ? 1 - (iSrcYMax - dfYMax): \
6066      1.0)
6067 
6068 #define COMPUTE_WEIGHT(iSrcX, dfWeightY) \
6069     ((iSrcX == iSrcXMin) ? \
6070         ((iSrcXMin + 1 == iSrcXMax) ? dfWeightY : dfWeightY * (1 - (dfXMin - iSrcXMin))): \
6071      (iSrcX + 1 == iSrcXMax) ? dfWeightY * (1 - (iSrcXMax - dfXMax)): \
6072      dfWeightY)
6073 
6074                 // poWK->eResample == GRA_Average.
6075                 if( nAlgo == GWKAOM_Average )
6076                 {
6077                     double dfTotalReal = 0.0;
6078                     double dfTotalImag = 0.0;
6079                     double dfTotalWeight = 0.0;
6080 
6081                     // This code adapted from GDALDownsampleChunk32R_AverageT()
6082                     // in gcore/overview.cpp.
6083                     for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6084                     {
6085                         const double dfWeightY = COMPUTE_WEIGHT_Y(iSrcY);
6086                         iSrcOffset = iSrcXMin + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6087                         for( int iSrcX = iSrcXMin; iSrcX < iSrcXMax; iSrcX++, iSrcOffset++ )
6088                         {
6089                             if( poWK->panUnifiedSrcValid != nullptr
6090                                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6091                                      & (0x01 << (iSrcOffset & 0x1f))) )
6092                             {
6093                                 continue;
6094                             }
6095 
6096                             if( GWKGetPixelValue(
6097                                     poWK, iBand, iSrcOffset,
6098                                     &dfBandDensity, &dfValueRealTmp,
6099                                     &dfValueImagTmp ) &&
6100                                 dfBandDensity > BAND_DENSITY_THRESHOLD )
6101                             {
6102                                 const double dfWeight = COMPUTE_WEIGHT(iSrcX, dfWeightY);
6103                                 dfTotalWeight += dfWeight;
6104                                 dfTotalReal += dfValueRealTmp * dfWeight;
6105                                 if (bIsComplex)
6106                                 {
6107                                     dfTotalImag += dfValueImagTmp * dfWeight;
6108                                 }
6109                             }
6110                         }
6111                     }
6112 
6113                     if( dfTotalWeight > 0 )
6114                     {
6115                         dfValueReal = dfTotalReal / dfTotalWeight;
6116                         if (bIsComplex)
6117                         {
6118                             dfValueImag = dfTotalImag / dfTotalWeight;
6119                         }
6120                         dfBandDensity = 1;
6121                         bHasFoundDensity = true;
6122                     }
6123                 }  // GRA_Average.
6124                 // poWK->eResample == GRA_RMS.
6125                 if( nAlgo == GWKAOM_RMS )
6126                 {
6127                     double dfTotalReal = 0.0;
6128                     double dfTotalImag = 0.0;
6129                     double dfTotalWeight = 0.0;
6130                     // This code adapted from GDALDownsampleChunk32R_AverageT()
6131                     // in gcore/overview.cpp.
6132                     for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6133                     {
6134                         const double dfWeightY = COMPUTE_WEIGHT_Y(iSrcY);
6135                         iSrcOffset = iSrcXMin + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6136                         for( int iSrcX = iSrcXMin; iSrcX < iSrcXMax; iSrcX++, iSrcOffset++ )
6137                         {
6138                             if( poWK->panUnifiedSrcValid != nullptr
6139                                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6140                                      & (0x01 << (iSrcOffset & 0x1f))) )
6141                             {
6142                                 continue;
6143                             }
6144 
6145                             if( GWKGetPixelValue(
6146                                     poWK, iBand, iSrcOffset,
6147                                     &dfBandDensity, &dfValueRealTmp,
6148                                     &dfValueImagTmp ) &&
6149                                 dfBandDensity > BAND_DENSITY_THRESHOLD )
6150                             {
6151                                 const double dfWeight = COMPUTE_WEIGHT(iSrcX, dfWeightY);
6152                                 dfTotalWeight += dfWeight;
6153                                 dfTotalReal += dfValueRealTmp * dfValueRealTmp * dfWeight;
6154                                 if (bIsComplex)
6155                                     dfTotalImag += dfValueImagTmp * dfValueImagTmp * dfWeight;
6156                             }
6157                         }
6158                     }
6159 
6160                     if( dfTotalWeight > 0 )
6161                     {
6162                         dfValueReal = sqrt( dfTotalReal / dfTotalWeight );
6163                         if (bIsComplex)
6164                             dfValueImag = sqrt( dfTotalImag / dfTotalWeight );
6165 
6166                         dfBandDensity = 1;
6167                         bHasFoundDensity = true;
6168                     }
6169                 }  // GRA_RMS.
6170                 else if( nAlgo == GWKAOM_Sum )
6171                 // poWK->eResample == GRA_Sum
6172                 {
6173                     double dfTotalReal = 0.0;
6174                     double dfTotalImag = 0.0;
6175                     bool bFoundValid = false;
6176 
6177                     for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6178                     {
6179                         const double dfWeightY = COMPUTE_WEIGHT_Y(iSrcY);
6180                         iSrcOffset = iSrcXMin + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6181                         for( int iSrcX = iSrcXMin; iSrcX < iSrcXMax; iSrcX++, iSrcOffset++ )
6182                         {
6183                             if( poWK->panUnifiedSrcValid != nullptr
6184                                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6185                                      & (0x01 << (iSrcOffset & 0x1f))) )
6186                             {
6187                                 continue;
6188                             }
6189 
6190                             if( GWKGetPixelValue(
6191                                     poWK, iBand, iSrcOffset,
6192                                     &dfBandDensity, &dfValueRealTmp,
6193                                     &dfValueImagTmp ) &&
6194                                 dfBandDensity > BAND_DENSITY_THRESHOLD )
6195                             {
6196                                 const double dfWeight = COMPUTE_WEIGHT(iSrcX, dfWeightY);
6197                                 bFoundValid = true;
6198                                 dfTotalReal += dfValueRealTmp * dfWeight;
6199                                 if (bIsComplex)
6200                                 {
6201                                     dfTotalImag += dfValueImagTmp * dfWeight;
6202                                 }
6203                             }
6204                         }
6205                     }
6206 
6207                     if( bFoundValid )
6208                     {
6209                         dfValueReal = dfTotalReal;
6210                         if (bIsComplex)
6211                         {
6212                             dfValueImag = dfTotalImag;
6213                         }
6214                         dfBandDensity = 1;
6215                         bHasFoundDensity = true;
6216                     }
6217                 }  // GRA_Sum.
6218                 else if( nAlgo == GWKAOM_Imode || nAlgo == GWKAOM_Fmode )
6219                 // poWK->eResample == GRA_Mode
6220                 {
6221                     // This code adapted from GDALDownsampleChunk32R_Mode() in
6222                     // gcore/overview.cpp.
6223                     if( nAlgo == GWKAOM_Fmode ) // int32 or float.
6224                     {
6225                         // Does it make sense it makes to run a
6226                         // majority filter on floating point data? But, here it
6227                         // is for the sake of compatibility. It won't look
6228                         // right on RGB images by the nature of the filter.
6229                         int iMaxInd = 0;
6230                         int iMaxVal = -1;
6231                         int i = 0;
6232 
6233                         for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6234                         {
6235                             for( int iSrcX = iSrcXMin;
6236                                  iSrcX < iSrcXMax;
6237                                  iSrcX++ )
6238                             {
6239                                 iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6240 
6241                                 if( poWK->panUnifiedSrcValid != nullptr
6242                                     && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6243                                          & (0x01 << (iSrcOffset & 0x1f))) )
6244                                     continue;
6245 
6246                                 if( GWKGetPixelValue(
6247                                         poWK, iBand, iSrcOffset,
6248                                         &dfBandDensity, &dfValueRealTmp,
6249                                         &dfValueImagTmp ) &&
6250                                     dfBandDensity > BAND_DENSITY_THRESHOLD )
6251                                 {
6252                                     const float fVal =
6253                                         static_cast<float>(dfValueRealTmp);
6254 
6255                                     // Check array for existing entry.
6256                                     for( i = 0; i < iMaxInd; ++i )
6257                                         if( pafRealVals[i] == fVal
6258                                             && ++panRealSums[i] > panRealSums[iMaxVal] )
6259                                         {
6260                                             iMaxVal = i;
6261                                             break;
6262                                         }
6263 
6264                                     // Add to arr if entry not already there.
6265                                     if( i == iMaxInd )
6266                                     {
6267                                         pafRealVals[iMaxInd] = fVal;
6268                                         panRealSums[iMaxInd] = 1;
6269 
6270                                         if( iMaxVal < 0 )
6271                                             iMaxVal = iMaxInd;
6272 
6273                                         ++iMaxInd;
6274                                     }
6275                                 }
6276                             }
6277                         }
6278 
6279                         if( iMaxVal != -1 )
6280                         {
6281                             dfValueReal = pafRealVals[iMaxVal];
6282                             dfBandDensity = 1;
6283                             bHasFoundDensity = true;
6284                         }
6285                     }
6286                     else // byte or int16.
6287                     {
6288                         int nMaxVal = 0;
6289                         int iMaxInd = -1;
6290 
6291                         memset(panVals, 0, nBins*sizeof(int));
6292 
6293                         for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6294                         {
6295                             for( int iSrcX = iSrcXMin;
6296                                  iSrcX < iSrcXMax;
6297                                  iSrcX++ )
6298                             {
6299                                 iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6300 
6301                                 if( poWK->panUnifiedSrcValid != nullptr
6302                                     && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6303                                          & (0x01 << (iSrcOffset & 0x1f))) )
6304                                     continue;
6305 
6306                                 if( GWKGetPixelValue(
6307                                         poWK, iBand, iSrcOffset,
6308                                         &dfBandDensity, &dfValueRealTmp,
6309                                         &dfValueImagTmp ) &&
6310                                     dfBandDensity > BAND_DENSITY_THRESHOLD )
6311                                 {
6312                                     const int nVal =
6313                                         static_cast<int>(dfValueRealTmp);
6314                                     if( ++panVals[nVal+nBinsOffset] > nMaxVal )
6315                                     {
6316                                         // Sum the density.
6317                                         // Is it the most common value so far?
6318                                         iMaxInd = nVal;
6319                                         nMaxVal = panVals[nVal+nBinsOffset];
6320                                     }
6321                                 }
6322                             }
6323                         }
6324 
6325                         if( iMaxInd != -1 )
6326                         {
6327                             dfValueReal = iMaxInd;
6328                             dfBandDensity = 1;
6329                             bHasFoundDensity = true;
6330                         }
6331                     }
6332                 }  // GRA_Mode.
6333                 else if( nAlgo == GWKAOM_Max )
6334                 // poWK->eResample == GRA_Max.
6335                 {
6336                     bool bFoundValid = false;
6337                     double dfTotalReal = std::numeric_limits<double>::lowest();
6338                     // This code adapted from nAlgo 1 method, GRA_Average.
6339                     for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6340                     {
6341                         for( int iSrcX = iSrcXMin; iSrcX < iSrcXMax; iSrcX++ )
6342                         {
6343                             iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6344 
6345                             if( poWK->panUnifiedSrcValid != nullptr
6346                                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6347                                      & (0x01 << (iSrcOffset & 0x1f))) )
6348                             {
6349                                 continue;
6350                             }
6351 
6352                             // Returns pixel value if it is not no data.
6353                             if( GWKGetPixelValue(
6354                                     poWK, iBand, iSrcOffset,
6355                                     &dfBandDensity, &dfValueRealTmp,
6356                                     &dfValueImagTmp ) &&
6357                                 dfBandDensity > BAND_DENSITY_THRESHOLD )
6358                             {
6359                                 bFoundValid = true;
6360                                 if( dfTotalReal < dfValueRealTmp )
6361                                 {
6362                                     dfTotalReal = dfValueRealTmp;
6363                                 }
6364                             }
6365                         }
6366                     }
6367 
6368                     if( bFoundValid )
6369                     {
6370                         dfValueReal = dfTotalReal;
6371                         dfBandDensity = 1;
6372                         bHasFoundDensity = true;
6373                     }
6374                 }  // GRA_Max.
6375                 else if( nAlgo == GWKAOM_Min )
6376                 // poWK->eResample == GRA_Min.
6377                 {
6378                     bool bFoundValid = false;
6379                     double dfTotalReal = std::numeric_limits<double>::max();
6380                     // This code adapted from nAlgo 1 method, GRA_Average.
6381                     for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6382                     {
6383                         for( int iSrcX = iSrcXMin; iSrcX < iSrcXMax; iSrcX++ )
6384                         {
6385                             iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6386 
6387                             if( poWK->panUnifiedSrcValid != nullptr
6388                                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6389                                      & (0x01 << (iSrcOffset & 0x1f))) )
6390                             {
6391                                 continue;
6392                             }
6393 
6394                             // Returns pixel value if it is not no data.
6395                             if( GWKGetPixelValue(
6396                                     poWK, iBand, iSrcOffset,
6397                                     &dfBandDensity, &dfValueRealTmp,
6398                                     &dfValueImagTmp ) &&
6399                                 dfBandDensity > BAND_DENSITY_THRESHOLD )
6400                             {
6401                                 bFoundValid = true;
6402                                 if( dfTotalReal > dfValueRealTmp )
6403                                 {
6404                                     dfTotalReal = dfValueRealTmp;
6405                                 }
6406                             }
6407                         }
6408                     }
6409 
6410                     if( bFoundValid )
6411                     {
6412                         dfValueReal = dfTotalReal;
6413                         dfBandDensity = 1;
6414                         bHasFoundDensity = true;
6415                     }
6416                 }  // GRA_Min.
6417                 else if( nAlgo == GWKAOM_Quant )
6418                 // poWK->eResample == GRA_Med | GRA_Q1 | GRA_Q3.
6419                 {
6420                     bool bFoundValid = false;
6421                     std::vector<double> dfRealValuesTmp;
6422 
6423                     // This code adapted from nAlgo 1 method, GRA_Average.
6424                     for( int iSrcY = iSrcYMin; iSrcY < iSrcYMax; iSrcY++ )
6425                     {
6426                         for( int iSrcX = iSrcXMin; iSrcX < iSrcXMax; iSrcX++ )
6427                         {
6428                             iSrcOffset = iSrcX + static_cast<GPtrDiff_t>(iSrcY) * nSrcXSize;
6429 
6430                             if( poWK->panUnifiedSrcValid != nullptr
6431                                 && !(poWK->panUnifiedSrcValid[iSrcOffset>>5]
6432                                      & (0x01 << (iSrcOffset & 0x1f))) )
6433                             {
6434                                 continue;
6435                             }
6436 
6437                             // Returns pixel value if it is not no data.
6438                             if( GWKGetPixelValue(
6439                                     poWK, iBand, iSrcOffset,
6440                                     &dfBandDensity, &dfValueRealTmp,
6441                                     &dfValueImagTmp ) &&
6442                                 dfBandDensity > BAND_DENSITY_THRESHOLD )
6443                             {
6444                                 bFoundValid = true;
6445                                 dfRealValuesTmp.push_back(dfValueRealTmp);
6446                             }
6447                         }
6448                     }
6449 
6450                     if( bFoundValid )
6451                     {
6452                         std::sort(dfRealValuesTmp.begin(), dfRealValuesTmp.end());
6453                         int quantIdx = static_cast<int>(
6454                             std::ceil(quant * dfRealValuesTmp.size() - 1));
6455                         dfValueReal = dfRealValuesTmp[quantIdx];
6456 
6457                         dfBandDensity = 1;
6458                         bHasFoundDensity = true;
6459                         dfRealValuesTmp.clear();
6460                     }
6461                 }  // Quantile.
6462 
6463 /* -------------------------------------------------------------------- */
6464 /*      We have a computed value from the source.  Now apply it to      */
6465 /*      the destination pixel.                                          */
6466 /* -------------------------------------------------------------------- */
6467                 if( bHasFoundDensity )
6468                 {
6469                     // TODO: Should we compute dfBandDensity in fct of
6470                     // nCount/nCount2, or use as a threshold to set the dest
6471                     // value?
6472                     // dfBandDensity = (float) nCount / nCount2;
6473                     // if( (float) nCount / nCount2 > 0.1 )
6474                     // or fix gdalwarp crop_to_cutline to crop partially
6475                     // overlapping pixels.
6476                     GWKSetPixelValue( poWK, iBand, iDstOffset,
6477                                       dfBandDensity,
6478                                       dfValueReal, dfValueImag );
6479                 }
6480             }
6481 
6482             if( !bHasFoundDensity )
6483                 continue;
6484 
6485 /* -------------------------------------------------------------------- */
6486 /*      Update destination density/validity masks.                      */
6487 /* -------------------------------------------------------------------- */
6488             GWKOverlayDensity( poWK, iDstOffset, dfDensity );
6489 
6490             if( poWK->panDstValid != nullptr )
6491             {
6492                 poWK->panDstValid[iDstOffset>>5] |=
6493                     0x01 << (iDstOffset & 0x1f);
6494             }
6495         } /* Next iDstX */
6496 
6497 /* -------------------------------------------------------------------- */
6498 /*      Report progress to the user, and optionally cancel out.         */
6499 /* -------------------------------------------------------------------- */
6500         if( psJob->pfnProgress && psJob->pfnProgress(psJob) )
6501             break;
6502     }
6503 
6504 /* -------------------------------------------------------------------- */
6505 /*      Cleanup and return.                                             */
6506 /* -------------------------------------------------------------------- */
6507     CPLFree( padfX );
6508     CPLFree( padfY );
6509     CPLFree( padfZ );
6510     CPLFree( padfX2 );
6511     CPLFree( padfY2 );
6512     CPLFree( padfZ2 );
6513     CPLFree( pabSuccess );
6514     CPLFree( pabSuccess2 );
6515     VSIFree( panVals );
6516     VSIFree(pafRealVals);
6517     VSIFree(panRealSums);
6518     if (bIsComplex)
6519     {
6520         VSIFree(pafImagVals);
6521         VSIFree(panImagSums);
6522     }
6523 }
6524