1 /******************************************************************************
2  *
3  * Project:  GDAL Core
4  * Purpose:  Helper code to implement overview support in different drivers.
5  * Author:   Frank Warmerdam, warmerdam@pobox.com
6  *
7  ******************************************************************************
8  * Copyright (c) 2000, Frank Warmerdam
9  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #include "cpl_port.h"
31 #include "gdal_priv.h"
32 
33 #include <cmath>
34 #include <cstddef>
35 #include <cstdlib>
36 
37 #include <algorithm>
38 #include <complex>
39 #include <condition_variable>
40 #include <limits>
41 #include <list>
42 #include <memory>
43 #include <mutex>
44 #include <vector>
45 
46 #include "cpl_conv.h"
47 #include "cpl_error.h"
48 #include "cpl_progress.h"
49 #include "cpl_vsi.h"
50 #include "gdal.h"
51 #include "gdal_thread_pool.h"
52 #include "gdalwarper.h"
53 
54 // Restrict to 64bit processors because they are guaranteed to have SSE2.
55 // Could possibly be used too on 32bit, but we would need to check at runtime.
56 #if defined(__x86_64) || defined(_M_X64)
57 #define USE_SSE2
58 
59 #include "gdalsse_priv.h"
60 #endif
61 
62 CPL_CVSID("$Id: overview.cpp e2f3698499b06dd03205c26333cd316db9e6e65a 2021-08-26 21:37:16 +0200 Even Rouault $")
63 
64 /************************************************************************/
65 /*                     GDALResampleChunk32R_Near()                      */
66 /************************************************************************/
67 
68 template <class T>
69 static CPLErr
GDALResampleChunk32R_NearT(double dfXRatioDstToSrc,double dfYRatioDstToSrc,GDALDataType eWrkDataType,const T * pChunk,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,T ** ppDstBuffer)70 GDALResampleChunk32R_NearT( double dfXRatioDstToSrc,
71                             double dfYRatioDstToSrc,
72                             GDALDataType eWrkDataType,
73                             const T * pChunk,
74                             int nChunkXOff, int nChunkXSize,
75                             int nChunkYOff,
76                             int nDstXOff, int nDstXOff2,
77                             int nDstYOff, int nDstYOff2,
78                             T** ppDstBuffer )
79 
80 {
81     const int nDstXWidth = nDstXOff2 - nDstXOff;
82 
83 /* -------------------------------------------------------------------- */
84 /*      Allocate buffers.                                               */
85 /* -------------------------------------------------------------------- */
86     *ppDstBuffer = static_cast<T *>(
87         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
88                             GDALGetDataTypeSizeBytes(eWrkDataType)));
89     if( *ppDstBuffer == nullptr )
90     {
91         return CE_Failure;
92     }
93     T* const pDstBuffer = *ppDstBuffer;
94 
95     int* panSrcXOff = static_cast<int *>(
96         VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)) );
97 
98     if( panSrcXOff == nullptr )
99     {
100         VSIFree(panSrcXOff);
101         return CE_Failure;
102     }
103 
104 /* ==================================================================== */
105 /*      Precompute inner loop constants.                                */
106 /* ==================================================================== */
107     for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
108     {
109         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
110         if( nSrcXOff < nChunkXOff )
111             nSrcXOff = nChunkXOff;
112 
113         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
114     }
115 
116 /* ==================================================================== */
117 /*      Loop over destination scanlines.                                */
118 /* ==================================================================== */
119     for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
120     {
121         int   nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
122         if( nSrcYOff < nChunkYOff )
123             nSrcYOff = nChunkYOff;
124 
125         const T * const pSrcScanline =
126             pChunk + (static_cast<GPtrDiff_t>(nSrcYOff-nChunkYOff) * nChunkXSize) - nChunkXOff;
127 
128 /* -------------------------------------------------------------------- */
129 /*      Loop over destination pixels                                    */
130 /* -------------------------------------------------------------------- */
131         T* pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
132         for( int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel )
133         {
134             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
135         }
136     }
137 
138     CPLFree( panSrcXOff );
139 
140     return CE_None;
141 }
142 
143 static CPLErr
GDALResampleChunk32R_Near(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double,double,GDALDataType eWrkDataType,const void * pChunk,const GByte *,int nChunkXOff,int nChunkXSize,int nChunkYOff,int,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand *,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char *,int,float,GDALColorTable *,GDALDataType,bool)144 GDALResampleChunk32R_Near( double dfXRatioDstToSrc,
145                            double dfYRatioDstToSrc,
146                            double /* dfSrcXDelta */,
147                            double /* dfSrcYDelta */,
148                            GDALDataType eWrkDataType,
149                            const void * pChunk,
150                            const GByte * /* pabyChunkNodataMask_unused */,
151                            int nChunkXOff, int nChunkXSize,
152                            int nChunkYOff, int /* nChunkYSize */,
153                            int nDstXOff, int nDstXOff2,
154                            int nDstYOff, int nDstYOff2,
155                            GDALRasterBand * /*poOverview*/,
156                            void** ppDstBuffer,
157                            GDALDataType* peDstBufferDataType,
158                            const char * /* pszResampling_unused */,
159                            int /* bHasNoData_unused */,
160                            float /* fNoDataValue_unused */,
161                            GDALColorTable* /* poColorTable_unused */,
162                            GDALDataType /* eSrcDataType */,
163                            bool /* bPropagateNoData */ )
164 {
165     *peDstBufferDataType = eWrkDataType;
166     if( eWrkDataType == GDT_Byte )
167         return GDALResampleChunk32R_NearT(
168             dfXRatioDstToSrc,
169             dfYRatioDstToSrc,
170             eWrkDataType,
171             static_cast<const GByte *>( pChunk ),
172             nChunkXOff, nChunkXSize,
173             nChunkYOff,
174             nDstXOff, nDstXOff2,
175             nDstYOff, nDstYOff2,
176             reinterpret_cast<GByte **>( ppDstBuffer ) );
177     else if( eWrkDataType == GDT_UInt16 )
178         return GDALResampleChunk32R_NearT(
179             dfXRatioDstToSrc,
180             dfYRatioDstToSrc,
181             eWrkDataType,
182             static_cast<const GInt16 *>( pChunk ),
183             nChunkXOff, nChunkXSize,
184             nChunkYOff,
185             nDstXOff, nDstXOff2,
186             nDstYOff, nDstYOff2,
187             reinterpret_cast<GInt16 **>( ppDstBuffer ) );
188     else if( eWrkDataType == GDT_Float32 )
189         return GDALResampleChunk32R_NearT(
190             dfXRatioDstToSrc,
191             dfYRatioDstToSrc,
192             eWrkDataType,
193             static_cast<const float *>( pChunk ),
194             nChunkXOff, nChunkXSize,
195             nChunkYOff,
196             nDstXOff, nDstXOff2,
197             nDstYOff, nDstYOff2,
198             reinterpret_cast<float **>( ppDstBuffer ));
199 
200     CPLAssert(false);
201     return CE_Failure;
202 }
203 
204 /************************************************************************/
205 /*                          GDALFindBestEntry()                         */
206 /************************************************************************/
207 
208 // Find in the color table the entry whose (c1,c2,c3) value is the closest
209 // (using quadratic distance) to the passed (nR,nG,nB) triplet, ignoring
210 // transparent entries.
GDALFindBestEntry(int nEntryCount,const GDALColorEntry * aEntries,int nR,int nG,int nB)211 static int GDALFindBestEntry( int nEntryCount, const GDALColorEntry* aEntries,
212                               int nR, int nG, int nB )
213 {
214     int nMinDist = std::numeric_limits<int>::max();
215     int iBestEntry = 0;
216     for( int i = 0; i < nEntryCount; ++i )
217     {
218         // Ignore transparent entries
219         if( aEntries[i].c4 == 0 )
220             continue;
221         int nDist = (nR - aEntries[i].c1) *  (nR - aEntries[i].c1) +
222             (nG - aEntries[i].c2) *  (nG - aEntries[i].c2) +
223             (nB - aEntries[i].c3) *  (nB - aEntries[i].c3);
224         if( nDist < nMinDist )
225         {
226             nMinDist = nDist;
227             iBestEntry = i;
228         }
229     }
230     return iBestEntry;
231 }
232 
233 /************************************************************************/
234 /*                      ReadColorTableAsArray()                         */
235 /************************************************************************/
236 
ReadColorTableAsArray(const GDALColorTable * poColorTable,int & nEntryCount,GDALColorEntry * & aEntries,int & nTransparentIdx)237 static bool ReadColorTableAsArray( const GDALColorTable* poColorTable,
238                                    int& nEntryCount,
239                                    GDALColorEntry*& aEntries,
240                                    int& nTransparentIdx )
241 {
242     nEntryCount = poColorTable->GetColorEntryCount();
243     aEntries = static_cast<GDALColorEntry *>(
244         VSI_MALLOC2_VERBOSE(sizeof(GDALColorEntry), nEntryCount) );
245     nTransparentIdx = -1;
246     if( aEntries == nullptr )
247         return false;
248     for( int i = 0; i < nEntryCount; ++i )
249     {
250         poColorTable->GetColorEntryAsRGB(i, &aEntries[i]);
251         if( nTransparentIdx < 0 && aEntries[i].c4 == 0 )
252             nTransparentIdx = i;
253     }
254     return true;
255 }
256 
257 /************************************************************************/
258 /*                    GetReplacementValueIfNoData()                     */
259 /************************************************************************/
260 
GetReplacementValueIfNoData(GDALDataType dt,int bHasNoData,float fNoDataValue)261 static float GetReplacementValueIfNoData(GDALDataType dt, int bHasNoData,
262                                          float fNoDataValue)
263 {
264     float fReplacementVal = 0.0f;
265     if( bHasNoData )
266     {
267         if( dt == GDT_Byte )
268         {
269             if( fNoDataValue == std::numeric_limits<unsigned char>::max() )
270                 fReplacementVal = static_cast<float>(
271                     std::numeric_limits<unsigned char>::max() - 1);
272             else
273                 fReplacementVal = fNoDataValue + 1;
274         }
275         else if( dt == GDT_UInt16 )
276         {
277             if( fNoDataValue == std::numeric_limits<GUInt16>::max() )
278                 fReplacementVal = static_cast<float>(
279                     std::numeric_limits<GUInt16>::max() - 1);
280             else
281                 fReplacementVal = fNoDataValue + 1;
282         }
283         else if( dt == GDT_Int16 )
284         {
285             if( fNoDataValue == std::numeric_limits<GInt16>::max() )
286                 fReplacementVal = static_cast<float>(
287                     std::numeric_limits<GInt16>::max() - 1);
288             else
289                 fReplacementVal = fNoDataValue + 1;
290         }
291         else if( dt == GDT_UInt32 )
292         {
293             // Be careful to limited precision of float
294             fReplacementVal = fNoDataValue + 1;
295             double dfVal = fNoDataValue;
296             if( fReplacementVal >= static_cast<double>(std::numeric_limits<GUInt32>::max() - 128) )
297             {
298                 while( fReplacementVal == fNoDataValue )
299                 {
300                     dfVal -= 1.0;
301                     fReplacementVal = static_cast<float>(dfVal);
302                 }
303             }
304             else
305             {
306                 while( fReplacementVal == fNoDataValue )
307                 {
308                     dfVal += 1.0;
309                     fReplacementVal = static_cast<float>(dfVal);
310                 }
311             }
312         }
313         else if( dt == GDT_Int32 )
314         {
315             // Be careful to limited precision of float
316             fReplacementVal = fNoDataValue + 1;
317             double dfVal = fNoDataValue;
318             if( fReplacementVal >= static_cast<double>(std::numeric_limits<GInt32>::max() - 64) )
319             {
320                 while( fReplacementVal == fNoDataValue )
321                 {
322                     dfVal -= 1.0;
323                     fReplacementVal = static_cast<float>(dfVal);
324                 }
325             }
326             else
327             {
328                 while( fReplacementVal == fNoDataValue )
329                 {
330                     dfVal += 1.0;
331                     fReplacementVal = static_cast<float>(dfVal);
332                 }
333             }
334         }
335         else if( dt == GDT_Float32 || dt == GDT_Float64 )
336         {
337             if( fNoDataValue == 0 )
338             {
339                 fReplacementVal = std::numeric_limits<float>::min();
340             }
341             else
342             {
343                 fReplacementVal = static_cast<float>(
344                     fNoDataValue + 1e-7 * fNoDataValue);
345             }
346         }
347     }
348     return fReplacementVal;
349 }
350 
351 /************************************************************************/
352 /*                             SQUARE()                                 */
353 /************************************************************************/
354 
SQUARE(T val)355 template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
356 {
357     return static_cast<Tsquare>(val) * val;
358 }
359 
360 /************************************************************************/
361 /*                          ComputeIntegerRMS()                         */
362 /************************************************************************/
363 // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
364 // integer that minimizes abs(rms**2 - sumSquares / weight)
ComputeIntegerRMS(double sumSquares,double weight)365 template <class T> inline T ComputeIntegerRMS(double sumSquares, double weight)
366 {
367     const double sumDivWeight = sumSquares / weight;
368     T rms = static_cast<T>(sqrt(sumDivWeight));
369 
370     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
371     // Naive version:
372     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
373     if( 2 * rms * (rms + 1) + 1 < 2 * sumDivWeight )
374         rms += 1;
375     return rms;
376 }
377 
ComputeIntegerRMS_4values(Tsum)378 template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
379 {
380     CPLAssert(false);
381     return 0;
382 }
383 
ComputeIntegerRMS_4values(int sumSquares)384 template<> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
385 {
386     // It has been verified that given the correction on rms below, using
387     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
388     // is equivalent, so use the former as it is used twice.
389     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
390     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
391     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
392 
393     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
394     // Naive version:
395     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
396     // Optimized version for integer case and weight == 4
397     if( static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4 )
398         rms += 1;
399     return rms;
400 }
401 
ComputeIntegerRMS_4values(double sumSquares)402 template<> inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
403 {
404     const double sumDivWeight = sumSquares * 0.25;
405     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
406 
407     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
408     // Naive version:
409     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
410     // Optimized version for integer case and weight == 4
411     if( static_cast<GUInt32>(rms) * (rms + 1) < static_cast<GUInt32>(sumDivWeight + 0.25) )
412         rms += 1;
413     return rms;
414 }
415 
416 #ifdef USE_SSE2
417 
418 /************************************************************************/
419 /*                   QuadraticMeanByteSSE2OrAVX2()                      */
420 /************************************************************************/
421 
422 #ifdef __SSE4_1__
423 #define sse2_packus_epi32    _mm_packus_epi32
424 #else
sse2_packus_epi32(__m128i a,__m128i b)425 inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
426 {
427     const auto minus32768_32 = _mm_set1_epi32(-32768);
428     const auto minus32768_16 = _mm_set1_epi16(-32768);
429     a = _mm_add_epi32(a, minus32768_32);
430     b = _mm_add_epi32(b, minus32768_32);
431     a = _mm_packs_epi32(a, b);
432     a = _mm_sub_epi16(a, minus32768_16);
433     return a;
434 }
435 #endif
436 
437 #ifdef __SSSE3__
438 #define sse2_hadd_epi16     _mm_hadd_epi16
439 #else
sse2_hadd_epi16(__m128i a,__m128i b)440 inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
441 {
442     // Horizontal addition of adjacent pairs
443     const auto mask = _mm_set1_epi32(0xFFFF);
444     const auto horizLo = _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
445     const auto horizHi = _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
446 
447     // Recombine low and high parts
448     return _mm_packs_epi32(horizLo, horizHi);
449 }
450 #endif
451 
452 
453 #ifdef __AVX2__
454 #define DEST_ELTS       16
455 #define set1_epi16      _mm256_set1_epi16
456 #define set1_epi32      _mm256_set1_epi32
457 #define setzero         _mm256_setzero_si256
458 #define set1_ps         _mm256_set1_ps
459 #define loadu_int(x)    _mm256_loadu_si256(reinterpret_cast<__m256i const*>(x))
460 #define unpacklo_epi8   _mm256_unpacklo_epi8
461 #define unpackhi_epi8   _mm256_unpackhi_epi8
462 #define madd_epi16      _mm256_madd_epi16
463 #define add_epi32       _mm256_add_epi32
464 #define mul_ps          _mm256_mul_ps
465 #define cvtepi32_ps     _mm256_cvtepi32_ps
466 #define sqrt_ps         _mm256_sqrt_ps
467 #define cvttps_epi32    _mm256_cvttps_epi32
468 #define packs_epi32     _mm256_packs_epi32
469 #define packus_epi32    _mm256_packus_epi32
470 #define srli_epi32      _mm256_srli_epi32
471 #define mullo_epi16     _mm256_mullo_epi16
472 #define srli_epi16      _mm256_srli_epi16
473 #define cmpgt_epi16     _mm256_cmpgt_epi16
474 #define add_epi16       _mm256_add_epi16
475 #define sub_epi16       _mm256_sub_epi16
476 #define packus_epi16    _mm256_packus_epi16
477 /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
478 /* to get the lower 128-bit bits of what would be a true 256-bit vector register */
479 #define store_lo(x,y)   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), \
480                             _mm256_extracti128_si256(_mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
481 #define hadd_epi16      _mm256_hadd_epi16
482 #define zeroupper()     _mm256_zeroupper()
483 #else
484 #define DEST_ELTS       8
485 #define set1_epi16      _mm_set1_epi16
486 #define set1_epi32      _mm_set1_epi32
487 #define setzero         _mm_setzero_si128
488 #define set1_ps         _mm_set1_ps
489 #define loadu_int(x)    _mm_loadu_si128(reinterpret_cast<__m128i const*>(x))
490 #define unpacklo_epi8   _mm_unpacklo_epi8
491 #define unpackhi_epi8   _mm_unpackhi_epi8
492 #define madd_epi16      _mm_madd_epi16
493 #define add_epi32       _mm_add_epi32
494 #define mul_ps          _mm_mul_ps
495 #define cvtepi32_ps     _mm_cvtepi32_ps
496 #define sqrt_ps         _mm_sqrt_ps
497 #define cvttps_epi32    _mm_cvttps_epi32
498 #define packs_epi32     _mm_packs_epi32
499 #define packus_epi32    sse2_packus_epi32
500 #define srli_epi32      _mm_srli_epi32
501 #define mullo_epi16     _mm_mullo_epi16
502 #define srli_epi16      _mm_srli_epi16
503 #define cmpgt_epi16     _mm_cmpgt_epi16
504 #define add_epi16       _mm_add_epi16
505 #define sub_epi16       _mm_sub_epi16
506 #define packus_epi16    _mm_packus_epi16
507 #define store_lo(x,y)   _mm_storel_epi64(reinterpret_cast<__m128i*>(x), (y))
508 #define hadd_epi16      sse2_hadd_epi16
509 #define zeroupper()     (void)0
510 #endif
511 
QuadraticMeanByteSSE2OrAVX2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)512 template<class T> static int QuadraticMeanByteSSE2OrAVX2(
513                                                    int nDstXWidth,
514                                                    int nChunkXSize,
515                                                    const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
516                                                    T* CPL_RESTRICT pDstScanline)
517 {
518     // Optimized implementation for RMS on Byte by
519     // processing by group of 8 output pixels, so as to use
520     // a single _mm_sqrt_ps() call for 4 output pixels
521     const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
522 
523     int iDstPixel = 0;
524     const auto one16 = set1_epi16(1);
525     const auto one32 = set1_epi32(1);
526     const auto zero = setzero();
527     const auto minus32768 = set1_epi16(-32768);
528 
529     for( ; iDstPixel < nDstXWidth - (DEST_ELTS-1); iDstPixel += DEST_ELTS )
530     {
531         // Load 2 * DEST_ELTS bytes from each line
532         auto firstLine = loadu_int(pSrcScanlineShifted);
533         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
534         // Extend those Bytes as UInt16s
535         auto firstLineLo = unpacklo_epi8(firstLine, zero);
536         auto firstLineHi = unpackhi_epi8(firstLine, zero);
537         auto secondLineLo = unpacklo_epi8(secondLine, zero);
538         auto secondLineHi = unpackhi_epi8(secondLine, zero);
539 
540         // Multiplication of 16 bit values and horizontal
541         // addition of 32 bit results
542         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
543         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
544         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
545         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
546         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
547 
548         // Vertical addition
549         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
550         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
551 
552         const auto sumSquaresPlusOneDiv4Lo =
553             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
554         const auto sumSquaresPlusOneDiv4Hi =
555             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
556 
557         // Take square root and truncate/floor to int32
558         const auto rmsLo = cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
559         const auto rmsHi = cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
560 
561         // Merge back low and high registers with each RMS value
562         // as a 16 bit value.
563         auto rms = packs_epi32(rmsLo, rmsHi);
564 
565         // Round to upper value if it minimizes the
566         // error |rms^2 - sumSquares/4|
567         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
568         //    rms += 1;
569         // which is equivalent to:
570         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
571         //    rms += 1;
572         // And both left and right parts fit on 16 (unsigned) bits
573         const auto sumSquaresPlusOneDiv4 = packus_epi32(
574             sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
575         // cmpgt_epi16 operates on signed int16, but here
576         // we have unsigned values, so shift them by -32768 before
577         auto mask = cmpgt_epi16(
578             add_epi16(sumSquaresPlusOneDiv4, minus32768),
579             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
580         // The value of the mask will be -1 when the correction needs to be applied
581         rms = sub_epi16(rms, mask);
582 
583         // Pack each 16 bit RMS value to 8 bits
584         rms = packus_epi16(rms, rms /* could be anything */);
585         store_lo(&pDstScanline[iDstPixel], rms);
586         pSrcScanlineShifted += 2 * DEST_ELTS;
587     }
588     zeroupper();
589 
590     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
591     return iDstPixel;
592 }
593 
594 /************************************************************************/
595 /*                      AverageByteSSE2OrAVX2()                         */
596 /************************************************************************/
597 
AverageByteSSE2OrAVX2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)598 template<class T> static int AverageByteSSE2OrAVX2(
599                                              int nDstXWidth,
600                                              int nChunkXSize,
601                                              const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
602                                              T* CPL_RESTRICT pDstScanline)
603 {
604     // Optimized implementation for average on Byte by
605     // processing by group of 8 output pixels.
606 
607     const auto zero = setzero();
608     const auto two16 = set1_epi16(2);
609     const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
610 
611     int iDstPixel = 0;
612     for( ; iDstPixel < nDstXWidth - (DEST_ELTS-1); iDstPixel += DEST_ELTS )
613     {
614         // Load 2 * DEST_ELTS bytes from each line
615         const auto firstLine = loadu_int(pSrcScanlineShifted);
616         const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
617         // Extend those Bytes as UInt16s
618         const auto firstLineLo = unpacklo_epi8(firstLine, zero);
619         const auto firstLineHi = unpackhi_epi8(firstLine, zero);
620         const auto secondLineLo = unpacklo_epi8(secondLine, zero);
621         const auto secondLineHi = unpackhi_epi8(secondLine, zero);
622 
623         // Vertical addition
624         const auto sumLo = add_epi16(firstLineLo, secondLineLo);
625         const auto sumHi = add_epi16(firstLineHi, secondLineHi);
626 
627         // Horizontal addition of adjacent pairs, and recombine low and high parts
628         const auto sum = hadd_epi16(sumLo, sumHi);
629 
630         // average = (sum + 2) / 4
631         auto average = srli_epi16(add_epi16(sum, two16), 2);
632 
633         // Pack each 16 bit average value to 8 bits
634         average = packus_epi16(average, average /* could be anything */);
635         store_lo(&pDstScanline[iDstPixel], average);
636         pSrcScanlineShifted += 2 * DEST_ELTS;
637     }
638     zeroupper();
639 
640     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
641     return iDstPixel;
642 }
643 
644 /************************************************************************/
645 /*                     QuadraticMeanUInt16SSE2()                        */
646 /************************************************************************/
647 
648 #ifdef __SSE3__
649 #define sse2_hadd_pd _mm_hadd_pd
650 #else
sse2_hadd_pd(__m128d a,__m128d b)651 inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
652 {
653     auto aLo_bLo = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
654     auto aHi_bHi = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
655     return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
656 }
657 #endif
658 
SQUARE(__m128d x)659 inline __m128d SQUARE(__m128d x)
660 {
661     return _mm_mul_pd(x, x);
662 }
663 
QuadraticMeanUInt16SSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)664 template<class T> static int QuadraticMeanUInt16SSE2(int nDstXWidth,
665                                                      int nChunkXSize,
666                                                      const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
667                                                      T* CPL_RESTRICT pDstScanline)
668 {
669     // Optimized implementation for RMS on UInt16 by
670     // processing by group of 4 output pixels.
671     const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
672 
673     int iDstPixel = 0;
674     const auto zero = _mm_setzero_si128();
675     const auto zeroDot25 = _mm_set1_pd(0.25);
676     const auto zeroDot5 = _mm_set1_pd(0.5);
677 
678     for( ; iDstPixel < nDstXWidth - 3; iDstPixel += 4 )
679     {
680         // Load 8 UInt16 from each line
681         const auto firstLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted));
682         const auto secondLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + nChunkXSize));
683 
684         // Detect if all of the source values fit in 14 bits.
685         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
686         // and we can do a much faster implementation.
687         const auto maskTmp = _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
688         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
689             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
690         if( nMaskFitsIn14Bits == 0 )
691         {
692             // Multiplication of 16 bit values and horizontal
693             // addition of 32 bit results
694             const auto firstLineHSumSquare = _mm_madd_epi16(firstLine, firstLine);
695             const auto secondLineHSumSquare = _mm_madd_epi16(secondLine, secondLine);
696             // Vertical addition
697             const auto sumSquares = _mm_add_epi32(firstLineHSumSquare,
698                                                   secondLineHSumSquare);
699             // In theory we should take sqrt(sumSquares * 0.25f)
700             // but given the rounding we do, this is equivalent to
701             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
702             // sumSquares <= 4 * 16383^2
703             const auto one32 = _mm_set1_epi32(1);
704             const auto sumSquaresPlusOneDiv4 =
705                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
706             // Take square root and truncate/floor to int32
707             auto rms = _mm_cvttps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
708 
709             // Round to upper value if it minimizes the
710             // error |rms^2 - sumSquares/4|
711             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
712             //    rms += 1;
713             // which is equivalent to:
714             // if( rms * rms + rms < (sumSquares+1) / 4 )
715             //    rms += 1;
716             auto mask = _mm_cmpgt_epi32(
717                 sumSquaresPlusOneDiv4,
718                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
719             rms = _mm_sub_epi32(rms, mask);
720             // Pack each 32 bit RMS value to 16 bits
721             rms = _mm_packs_epi32(rms, rms /* could be anything */);
722             _mm_storel_epi64(reinterpret_cast<__m128i*>(&pDstScanline[iDstPixel]), rms);
723             pSrcScanlineShifted += 8;
724             continue;
725         }
726 
727         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
728         // to 32 bit would result in 4 multiplications instead of 8, but mullo/mulhi
729         // have a worse throughput than mul_pd.
730 
731         // Extend those UInt16s as UInt32s
732         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
733         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
734         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
735         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
736 
737         // Multiplication of 32 bit values previously converted to 64 bit double
738         const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
739         const auto firstLineLoHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo,8)));
740         const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
741         const auto firstLineHiHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi,8)));
742 
743         const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
744         const auto secondLineLoHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo,8)));
745         const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
746         const auto secondLineHiHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi,8)));
747 
748         // Vertical addition of squares
749         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
750         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
751         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
752         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
753 
754         // Horizontal addition of squares
755         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
756         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
757 
758         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
759         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
760         // Take square root and truncate/floor to int32
761         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
762         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
763 
764         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
765         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
766         //     rms += 1;
767         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
768         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
769         const auto rightLo = _mm_sub_pd(sumDivWeightLo,
770                                 _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
771         const auto rightHi = _mm_sub_pd(sumDivWeightHi,
772                                 _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
773 
774         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
775         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
776         // The value of the mask will be -1 when the correction needs to be applied
777         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
778             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
779 
780         auto rms = _mm_castps_si128(_mm_movelh_ps(
781             _mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
782         // Apply the correction
783         rms = _mm_sub_epi32(rms, mask);
784 
785         // Pack each 32 bit RMS value to 16 bits
786         rms = sse2_packus_epi32(rms, rms /* could be anything */);
787         _mm_storel_epi64(reinterpret_cast<__m128i*>(&pDstScanline[iDstPixel]), rms);
788         pSrcScanlineShifted += 8;
789     }
790 
791     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
792     return iDstPixel;
793 }
794 
795 /************************************************************************/
796 /*                         AverageUInt16SSE2()                          */
797 /************************************************************************/
798 
AverageUInt16SSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)799 template<class T> static int AverageUInt16SSE2(
800                                              int nDstXWidth,
801                                              int nChunkXSize,
802                                              const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
803                                              T* CPL_RESTRICT pDstScanline)
804 {
805     // Optimized implementation for average on UInt16 by
806     // processing by group of 8 output pixels.
807 
808     const auto mask = _mm_set1_epi32(0xFFFF);
809     const auto two = _mm_set1_epi32(2);
810     const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
811 
812     int iDstPixel = 0;
813     for( ; iDstPixel < nDstXWidth - 7; iDstPixel += 8 )
814     {
815         __m128i averageLow;
816         // Load 8 UInt16 from each line
817         {
818         const auto firstLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted));
819         const auto secondLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + nChunkXSize));
820 
821         // Horizontal addition and extension to 32 bit
822         const auto horizAddFirstLine = _mm_add_epi32(_mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
823         const auto horizAddSecondLine = _mm_add_epi32(_mm_and_si128(secondLine, mask), _mm_srli_epi32(secondLine, 16));
824 
825         // Vertical addition and average computation
826         // average = (sum + 2) >> 2
827         const auto sum = _mm_add_epi32(_mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
828         averageLow = _mm_srli_epi32(sum, 2);
829         }
830         // Load 8 UInt16 from each line
831         __m128i averageHigh;
832         {
833         const auto firstLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + 8));
834         const auto secondLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + 8 + nChunkXSize));
835 
836         // Horizontal addition and extension to 32 bit
837         const auto horizAddFirstLine = _mm_add_epi32(_mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
838         const auto horizAddSecondLine = _mm_add_epi32(_mm_and_si128(secondLine, mask), _mm_srli_epi32(secondLine, 16));
839 
840         // Vertical addition and average computation
841         // average = (sum + 2) >> 2
842         const auto sum = _mm_add_epi32(_mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
843         averageHigh = _mm_srli_epi32(sum, 2);
844         }
845 
846         // Pack each 32 bit average value to 16 bits
847         auto average = sse2_packus_epi32(averageLow, averageHigh);
848         _mm_storeu_si128(reinterpret_cast<__m128i*>(&pDstScanline[iDstPixel]), average);
849         pSrcScanlineShifted += 16;
850     }
851 
852     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
853     return iDstPixel;
854 }
855 
856 /************************************************************************/
857 /*                      QuadraticMeanFloatSSE2()                        */
858 /************************************************************************/
859 
SQUARE(__m128 x)860 inline __m128 SQUARE(__m128 x)
861 {
862     return _mm_mul_ps(x, x);
863 }
864 
QuadraticMeanFloatSSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)865 template<class T> static int QuadraticMeanFloatSSE2(int nDstXWidth,
866                                                     int nChunkXSize,
867                                                     const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
868                                                     T* CPL_RESTRICT pDstScanline)
869 {
870     // Optimized implementation for RMS on Float32 by
871     // processing by group of 4 output pixels.
872     const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
873 
874     int iDstPixel = 0;
875     const auto minus_zero = _mm_set1_ps(-0.0f);
876     const auto zeroDot25 = _mm_set1_ps(0.25f);
877     const auto one = _mm_set1_ps(1.0f);
878     const auto infv = _mm_set1_ps(std::numeric_limits<float>::infinity());
879 
880     for( ; iDstPixel < nDstXWidth - 3; iDstPixel += 4 )
881     {
882         // Load 8 Float32 from each line
883         auto firstLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted));
884         auto firstLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4));
885         auto secondLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + nChunkXSize));
886         auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4 + nChunkXSize));
887 
888         // Take the absolute value
889         firstLineLo = _mm_andnot_ps(minus_zero, firstLineLo);
890         firstLineHi = _mm_andnot_ps(minus_zero, firstLineHi);
891         secondLineLo = _mm_andnot_ps(minus_zero, secondLineLo);
892         secondLineHi = _mm_andnot_ps(minus_zero, secondLineHi);
893 
894         // Compute the maximum of each 4 value to RMS-average
895         auto maxLo = _mm_max_ps(firstLineLo, secondLineLo);
896         maxLo = _mm_max_ps(maxLo, _mm_shuffle_ps(maxLo, maxLo, _MM_SHUFFLE(2,3,0,1)));
897         auto maxHi = _mm_max_ps(firstLineHi, secondLineHi);
898         maxHi = _mm_max_ps(maxHi, _mm_shuffle_ps(maxHi, maxHi, _MM_SHUFFLE(2,3,0,1)));
899 
900         const auto maxV = _mm_shuffle_ps(maxLo, maxHi, _MM_SHUFFLE(2,0,2,0));
901 
902         // Normalize each value by the maximum of the 4 ones.
903         // This step is important to avoid that the square evaluates to infinity
904         // for sufficiently big input.
905         auto invMax = _mm_div_ps(one, maxV);
906         // Deal with 0 being the maximum to correct division by zero
907         // note: comparing to -0 leads to identical results as to comparing with 0
908         invMax = _mm_andnot_ps(_mm_cmpeq_ps(maxV, minus_zero), invMax);
909 
910         const auto invMaxLo = _mm_shuffle_ps(invMax, invMax, _MM_SHUFFLE(1,1,0,0));
911         const auto invMaxHi = _mm_shuffle_ps(invMax, invMax, _MM_SHUFFLE(3,3,2,2));
912 
913         firstLineLo = _mm_mul_ps(firstLineLo, invMaxLo);
914         firstLineHi = _mm_mul_ps(firstLineHi, invMaxHi);
915         secondLineLo = _mm_mul_ps(secondLineLo, invMaxLo);
916         secondLineHi = _mm_mul_ps(secondLineHi, invMaxHi);
917 
918         // Compute squares
919         firstLineLo = SQUARE(firstLineLo);
920         firstLineHi = SQUARE(firstLineHi);
921         secondLineLo = SQUARE(secondLineLo);
922         secondLineHi = SQUARE(secondLineHi);
923 
924         // Vertical addition
925         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
926         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
927 
928         // Horizontal addition
929         const auto A = _mm_shuffle_ps(sumLo, sumHi, _MM_SHUFFLE(2,0,2,0));
930         const auto B = _mm_shuffle_ps(sumLo, sumHi, _MM_SHUFFLE(3,1,3,1));
931         const auto sumSquares = _mm_add_ps(A, B);
932 
933         auto rms = _mm_mul_ps(maxV, _mm_sqrt_ps(_mm_mul_ps(sumSquares, zeroDot25)));
934 
935         // Deal with infinity being the maximum
936         const auto maskIsInf = _mm_cmpeq_ps(maxV, infv);
937         rms = _mm_or_ps(_mm_andnot_ps(maskIsInf, rms), _mm_and_ps(maskIsInf, infv));
938 
939         // coverity[incompatible_cast]
940         _mm_storeu_ps(reinterpret_cast<float*>(&pDstScanline[iDstPixel]), rms);
941         pSrcScanlineShifted += 8;
942     }
943 
944     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
945     return iDstPixel;
946 }
947 
948 /************************************************************************/
949 /*                        AverageFloatSSE2()                            */
950 /************************************************************************/
951 
AverageFloatSSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)952 template<class T> static int AverageFloatSSE2(int nDstXWidth,
953                                               int nChunkXSize,
954                                               const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
955                                               T* CPL_RESTRICT pDstScanline)
956 {
957     // Optimized implementation for average on Float32 by
958     // processing by group of 4 output pixels.
959     const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
960 
961     int iDstPixel = 0;
962     const auto zeroDot25 = _mm_set1_ps(0.25f);
963 
964     for( ; iDstPixel < nDstXWidth - 3; iDstPixel += 4 )
965     {
966         // Load 8 Float32 from each line
967         const auto firstLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted));
968         const auto firstLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4));
969         const auto secondLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + nChunkXSize));
970         const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4 + nChunkXSize));
971 
972         // Vertical addition
973         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
974         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
975 
976         // Horizontal addition
977         const auto A = _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
978         const auto B = _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
979         const auto sum = _mm_add_ps(A, B);
980 
981         const auto average = _mm_mul_ps(sum, zeroDot25);
982 
983         // coverity[incompatible_cast]
984         _mm_storeu_ps(reinterpret_cast<float*>(&pDstScanline[iDstPixel]), average);
985         pSrcScanlineShifted += 8;
986     }
987 
988     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
989     return iDstPixel;
990 }
991 
992 #endif
993 
994 /************************************************************************/
995 /*                    GDALResampleChunk32R_Average()                    */
996 /************************************************************************/
997 
998 template <class T, class Tsum, GDALDataType eWrkDataType>
999 static CPLErr
GDALResampleChunk32R_AverageT(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,const T * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,const char * pszResampling,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,bool bPropagateNoData)1000 GDALResampleChunk32R_AverageT( double dfXRatioDstToSrc,
1001                                double dfYRatioDstToSrc,
1002                                double dfSrcXDelta,
1003                                double dfSrcYDelta,
1004                                const T* pChunk,
1005                                const GByte * pabyChunkNodataMask,
1006                                int nChunkXOff, int nChunkXSize,
1007                                int nChunkYOff, int nChunkYSize,
1008                                int nDstXOff, int nDstXOff2,
1009                                int nDstYOff, int nDstYOff2,
1010                                GDALRasterBand * poOverview,
1011                                void** ppDstBuffer,
1012                                const char * pszResampling,
1013                                int bHasNoData,  // TODO(schwehr): bool.
1014                                float fNoDataValue,
1015                                GDALColorTable* poColorTable,
1016                                bool bPropagateNoData )
1017 {
1018     // AVERAGE_BIT2GRAYSCALE
1019     const bool bBit2Grayscale =
1020         CPL_TO_BOOL( STARTS_WITH_CI( pszResampling, "AVERAGE_BIT2G" ) );
1021     const bool bQuadraticMean =
1022         CPL_TO_BOOL( EQUAL( pszResampling, "RMS" ) );
1023     if( bBit2Grayscale )
1024         poColorTable = nullptr;
1025 
1026     T tNoDataValue;
1027     if( !bHasNoData )
1028         tNoDataValue = 0;
1029     else
1030         tNoDataValue = static_cast<T>(fNoDataValue);
1031     const T tReplacementVal = static_cast<T>(GetReplacementValueIfNoData(
1032         poOverview->GetRasterDataType(), bHasNoData, fNoDataValue));
1033 
1034     int nChunkRightXOff = nChunkXOff + nChunkXSize;
1035     int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1036     int nDstXWidth = nDstXOff2 - nDstXOff;
1037 
1038 /* -------------------------------------------------------------------- */
1039 /*      Allocate buffers.                                               */
1040 /* -------------------------------------------------------------------- */
1041     *ppDstBuffer = static_cast<T *>(
1042         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1043                             GDALGetDataTypeSizeBytes(eWrkDataType)));
1044     if( *ppDstBuffer == nullptr )
1045     {
1046         return CE_Failure;
1047     }
1048     T* const pDstBuffer = static_cast<T*>(*ppDstBuffer);
1049 
1050     struct PrecomputedXValue
1051     {
1052         int nLeftXOffShifted;
1053         int nRightXOffShifted;
1054         double dfLeftWeight;
1055         double dfRightWeight;
1056         double dfTotalWeightFullLine;
1057     };
1058     PrecomputedXValue* pasSrcX = static_cast<PrecomputedXValue *>(
1059         VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue) ) );
1060 
1061     if( pasSrcX == nullptr )
1062     {
1063         VSIFree(pasSrcX);
1064         return CE_Failure;
1065     }
1066 
1067     int nEntryCount = 0;
1068     GDALColorEntry* aEntries = nullptr;
1069     int nTransparentIdx = -1;
1070 
1071     if( poColorTable &&
1072         !ReadColorTableAsArray(poColorTable, nEntryCount, aEntries,
1073                                nTransparentIdx) )
1074     {
1075         VSIFree(pasSrcX);
1076         return CE_Failure;
1077     }
1078 
1079     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1080     // it as nodata value
1081     if( bHasNoData && fNoDataValue >= 0.0f && tNoDataValue < nEntryCount )
1082     {
1083         if( aEntries == nullptr )
1084         {
1085             CPLError(CE_Failure, CPLE_ObjectNull, "No aEntries.");
1086             VSIFree(pasSrcX);
1087             return CE_Failure;
1088         }
1089         aEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1090     }
1091     // Or if we have no explicit nodata, but a color table entry that is
1092     // transparent, consider it as the nodata value
1093     else if( !bHasNoData && nTransparentIdx >= 0 )
1094     {
1095         bHasNoData = TRUE;
1096         tNoDataValue = static_cast<T>(nTransparentIdx);
1097     }
1098 
1099 /* ==================================================================== */
1100 /*      Precompute inner loop constants.                                */
1101 /* ==================================================================== */
1102     bool bSrcXSpacingIsTwo = true;
1103     int nLastSrcXOff2 = -1;
1104     for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
1105     {
1106         double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1107         // Apply some epsilon to avoid numerical precision issues
1108         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1109         double dfSrcXOff2 = dfSrcXDelta + (iDstPixel+1)* dfXRatioDstToSrc;
1110         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1111 
1112         if( nSrcXOff < nChunkXOff )
1113             nSrcXOff = nChunkXOff;
1114         if( nSrcXOff2 == nSrcXOff )
1115             nSrcXOff2 ++;
1116         if( nSrcXOff2 > nChunkRightXOff )
1117             nSrcXOff2 = nChunkRightXOff;
1118 
1119         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1120         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted = nSrcXOff2 - nChunkXOff;
1121         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight = ( nSrcXOff2 == nSrcXOff + 1 ) ? 1.0 :  1 - (dfSrcXOff - nSrcXOff);
1122         pasSrcX[iDstPixel - nDstXOff].dfRightWeight = 1 - (nSrcXOff2 - dfSrcXOff2);
1123         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine = pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1124         if( nSrcXOff + 1 < nSrcXOff2 )
1125         {
1126             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine += nSrcXOff2 - nSrcXOff - 2;
1127             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine += pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1128         }
1129 
1130         if( nSrcXOff2 - nSrcXOff != 2 ||
1131             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff) )
1132         {
1133             bSrcXSpacingIsTwo = false;
1134         }
1135         nLastSrcXOff2 = nSrcXOff2;
1136     }
1137 
1138 /* ==================================================================== */
1139 /*      Loop over destination scanlines.                                */
1140 /* ==================================================================== */
1141     for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
1142     {
1143         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1144         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1145         if( nSrcYOff < nChunkYOff )
1146             nSrcYOff = nChunkYOff;
1147 
1148         double dfSrcYOff2 = dfSrcYDelta + (iDstLine+1) * dfYRatioDstToSrc;
1149         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1150         if( nSrcYOff2 == nSrcYOff )
1151             ++nSrcYOff2;
1152         if( nSrcYOff2 > nChunkBottomYOff )
1153             nSrcYOff2 = nChunkBottomYOff;
1154 
1155         T* const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1156 
1157 /* -------------------------------------------------------------------- */
1158 /*      Loop over destination pixels                                    */
1159 /* -------------------------------------------------------------------- */
1160         if( poColorTable == nullptr )
1161         {
1162             if( bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1163                 pabyChunkNodataMask == nullptr )
1164             {
1165               if( eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16 )
1166               {
1167                 // Optimized case : no nodata, overview by a factor of 2 and
1168                 // regular x and y src spacing.
1169                 const T* pSrcScanlineShifted =
1170                     pChunk + pasSrcX[0].nLeftXOffShifted +
1171                     static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1172                 int iDstPixel = 0;
1173 #ifdef USE_SSE2
1174                 if( bQuadraticMean && eWrkDataType == GDT_Byte )
1175                 {
1176                     iDstPixel = QuadraticMeanByteSSE2OrAVX2(nDstXWidth,
1177                                                             nChunkXSize,
1178                                                             pSrcScanlineShifted,
1179                                                             pDstScanline);
1180                 }
1181                 else if( bQuadraticMean /* && eWrkDataType == GDT_UInt16 */ )
1182                 {
1183                     iDstPixel = QuadraticMeanUInt16SSE2(nDstXWidth,
1184                                                         nChunkXSize,
1185                                                         pSrcScanlineShifted,
1186                                                         pDstScanline);
1187                 }
1188                 else if( /* !bQuadraticMean && */ eWrkDataType == GDT_Byte )
1189                 {
1190                     iDstPixel = AverageByteSSE2OrAVX2(nDstXWidth,
1191                                                       nChunkXSize,
1192                                                       pSrcScanlineShifted,
1193                                                       pDstScanline);
1194                 }
1195                 else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 ) */
1196                 {
1197                     iDstPixel = AverageUInt16SSE2(nDstXWidth,
1198                                                   nChunkXSize,
1199                                                   pSrcScanlineShifted,
1200                                                   pDstScanline);
1201                 }
1202 #endif
1203                 for( ; iDstPixel < nDstXWidth; ++iDstPixel )
1204                 {
1205                     Tsum nTotal = 0;
1206                     T nVal;
1207                     if( bQuadraticMean )
1208                         nTotal =
1209                             SQUARE<Tsum>(pSrcScanlineShifted[0])
1210                             + SQUARE<Tsum>(pSrcScanlineShifted[1])
1211                             + SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize])
1212                             + SQUARE<Tsum>(pSrcScanlineShifted[1+nChunkXSize]);
1213                     else
1214                         nTotal =
1215                             pSrcScanlineShifted[0]
1216                             + pSrcScanlineShifted[1]
1217                             + pSrcScanlineShifted[nChunkXSize]
1218                             + pSrcScanlineShifted[1+nChunkXSize];
1219 
1220                     constexpr int nTotalWeight = 4;
1221                     if( bQuadraticMean )
1222                         nVal = ComputeIntegerRMS_4values<T>(nTotal);
1223                     else
1224                         nVal = static_cast<T>((nTotal + nTotalWeight/2) / nTotalWeight);
1225 
1226                     // No need to compare nVal against tNoDataValue as we are
1227                     // in a case where pabyChunkNodataMask == nullptr implies
1228                     // the absence of nodata value.
1229                     pDstScanline[iDstPixel] = nVal;
1230                     pSrcScanlineShifted += 2;
1231                 }
1232               }
1233               else
1234               {
1235                 CPLAssert( eWrkDataType == GDT_Float32 );
1236                 const T* pSrcScanlineShifted =
1237                     pChunk + pasSrcX[0].nLeftXOffShifted +
1238                     static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1239                 int iDstPixel = 0;
1240 #ifdef USE_SSE2
1241                 if( bQuadraticMean )
1242                 {
1243                     iDstPixel = QuadraticMeanFloatSSE2(nDstXWidth,
1244                                                        nChunkXSize,
1245                                                        pSrcScanlineShifted,
1246                                                        pDstScanline);
1247                 }
1248                 else
1249                 {
1250                     iDstPixel = AverageFloatSSE2(nDstXWidth,
1251                                                  nChunkXSize,
1252                                                  pSrcScanlineShifted,
1253                                                  pDstScanline);
1254                 }
1255 #endif
1256 
1257                 for( ; iDstPixel < nDstXWidth; ++iDstPixel )
1258                 {
1259                     T nVal;
1260                     if( bQuadraticMean )
1261                     {
1262                         // Cast to double to avoid overflows
1263                         // (using std::hypot() is much slower)
1264                         nVal = static_cast<T>(std::sqrt(0.25 *
1265                             ( SQUARE<double>(pSrcScanlineShifted[0])
1266                             + SQUARE<double>(pSrcScanlineShifted[1])
1267                             + SQUARE<double>(pSrcScanlineShifted[nChunkXSize])
1268                             + SQUARE<double>(pSrcScanlineShifted[1+nChunkXSize]))));
1269                     }
1270                     else
1271                     {
1272                         nVal = static_cast<T>(0.25f *
1273                             ( pSrcScanlineShifted[0]
1274                             + pSrcScanlineShifted[1]
1275                             + pSrcScanlineShifted[nChunkXSize]
1276                             + pSrcScanlineShifted[1+nChunkXSize]));
1277                     }
1278 
1279                     // No need to compare nVal against tNoDataValue as we are
1280                     // in a case where pabyChunkNodataMask == nullptr implies
1281                     // the absence of nodata value.
1282                     pDstScanline[iDstPixel] = nVal;
1283                     pSrcScanlineShifted += 2;
1284                 }
1285               }
1286             }
1287             else
1288             {
1289                 const double dfBottomWeight =
1290                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0 : 1.0 - (dfSrcYOff - nSrcYOff);
1291                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1292                 nSrcYOff -= nChunkYOff;
1293                 nSrcYOff2 -= nChunkYOff;
1294 
1295                 double dfTotalWeightFullColumn = dfBottomWeight;
1296                 if( nSrcYOff + 1 < nSrcYOff2 )
1297                 {
1298                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1299                     dfTotalWeightFullColumn += dfTopWeight;
1300                 }
1301 
1302                 for( int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel )
1303                 {
1304                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1305                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1306 
1307                     double dfTotal = 0;
1308                     double dfTotalWeight = 0;
1309                     if( pabyChunkNodataMask == nullptr )
1310                     {
1311                         auto pChunkShifted = pChunk +
1312                                 static_cast<GPtrDiff_t>(nSrcYOff) *nChunkXSize;
1313                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1314                         double dfWeightY = dfBottomWeight;
1315                         while( true )
1316                         {
1317                             double dfTotalLine;
1318                             if( bQuadraticMean )
1319                             {
1320                                 // Left pixel
1321                                 {
1322                                     const T val = pChunkShifted[nSrcXOff];
1323                                     dfTotalLine = SQUARE<double>(val) * pasSrcX[iDstPixel].dfLeftWeight;
1324                                 }
1325 
1326                                 if( nSrcXOff + 1 < nSrcXOff2 )
1327                                 {
1328                                     // Middle pixels
1329                                     for( int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2; ++iX )
1330                                     {
1331                                         const T val = pChunkShifted[iX];
1332                                         dfTotalLine += SQUARE<double>(val);
1333                                     }
1334 
1335                                     // Right pixel
1336                                     {
1337                                         const T val = pChunkShifted[nSrcXOff2 - 1];
1338                                         dfTotalLine += SQUARE<double>(val) * pasSrcX[iDstPixel].dfRightWeight;
1339                                     }
1340                                 }
1341                             }
1342                             else
1343                             {
1344                                 // Left pixel
1345                                 {
1346                                     const T val = pChunkShifted[nSrcXOff];
1347                                     dfTotalLine = val * pasSrcX[iDstPixel].dfLeftWeight;
1348                                 }
1349 
1350                                 if( nSrcXOff + 1 < nSrcXOff2 )
1351                                 {
1352                                     // Middle pixels
1353                                     for( int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2; ++iX )
1354                                     {
1355                                         const T val = pChunkShifted[iX];
1356                                         dfTotalLine += val;
1357                                     }
1358 
1359                                     // Right pixel
1360                                     {
1361                                         const T val = pChunkShifted[nSrcXOff2 - 1];
1362                                         dfTotalLine += val * pasSrcX[iDstPixel].dfRightWeight;
1363                                     }
1364                                 }
1365                             }
1366 
1367                             dfTotal += dfTotalLine * dfWeightY;
1368                             --nCounterY;
1369                             if( nCounterY < 0 )
1370                                 break;
1371                             pChunkShifted += nChunkXSize;
1372                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1373                         }
1374 
1375                         dfTotalWeight = pasSrcX[iDstPixel].dfTotalWeightFullLine * dfTotalWeightFullColumn;
1376                     }
1377                     else
1378                     {
1379                         GPtrDiff_t nCount = 0;
1380                         for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
1381                         {
1382                             const auto pChunkShifted = pChunk +
1383                                 static_cast<GPtrDiff_t>(iY) *nChunkXSize;
1384 
1385                             double dfTotalLine = 0;
1386                             double dfTotalWeightLine = 0;
1387                             // Left pixel
1388                             {
1389                                 const int iX = nSrcXOff;
1390                                 const T val = pChunkShifted[iX];
1391                                 if( pabyChunkNodataMask[iX + iY *nChunkXSize] )
1392                                 {
1393                                     nCount ++;
1394                                     const double dfWeightX = pasSrcX[iDstPixel].dfLeftWeight;
1395                                     dfTotalWeightLine = dfWeightX;
1396                                     if( bQuadraticMean )
1397                                         dfTotalLine = SQUARE<double>(val) * dfWeightX;
1398                                     else
1399                                         dfTotalLine = val * dfWeightX;
1400                                 }
1401                             }
1402 
1403                             if( nSrcXOff + 1 < nSrcXOff2 )
1404                             {
1405                                 // Middle pixels
1406                                 for( int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2; ++iX )
1407                                 {
1408                                     const T val = pChunkShifted[iX];
1409                                     if( pabyChunkNodataMask[iX + iY *nChunkXSize] )
1410                                     {
1411                                         nCount ++;
1412                                         dfTotalWeightLine += 1;
1413                                         if( bQuadraticMean )
1414                                             dfTotalLine += SQUARE<double>(val);
1415                                         else
1416                                             dfTotalLine += val;
1417                                     }
1418                                 }
1419 
1420                                 // Right pixel
1421                                 {
1422                                     const int iX = nSrcXOff2 - 1;
1423                                     const T val = pChunkShifted[iX];
1424                                     if( pabyChunkNodataMask[iX + iY *nChunkXSize] )
1425                                     {
1426                                         nCount ++;
1427                                         const double dfWeightX = pasSrcX[iDstPixel].dfRightWeight;
1428                                         dfTotalWeightLine += dfWeightX;
1429                                         if( bQuadraticMean )
1430                                             dfTotalLine += SQUARE<double>(val) * dfWeightX;
1431                                         else
1432                                             dfTotalLine += val * dfWeightX;
1433                                     }
1434                                 }
1435                             }
1436 
1437                             const double dfWeightY =
1438                                 (iY == nSrcYOff) ? dfBottomWeight :
1439                                 (iY + 1 == nSrcYOff2) ? dfTopWeight :
1440                                 1.0;
1441                             dfTotal += dfTotalLine * dfWeightY;
1442                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
1443                         }
1444 
1445                         if( nCount == 0 ||
1446                             (bPropagateNoData && nCount <
1447                                 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) * (nSrcXOff2 - nSrcXOff)))
1448                         {
1449                             pDstScanline[iDstPixel] = tNoDataValue;
1450                             continue;
1451                         }
1452                     }
1453                     if( eWrkDataType == GDT_Byte ||
1454                              eWrkDataType == GDT_UInt16)
1455                     {
1456                         T nVal;
1457                         if( bQuadraticMean )
1458                             nVal = ComputeIntegerRMS<T>(dfTotal, dfTotalWeight);
1459                         else
1460                             nVal = static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1461                         if( bHasNoData && nVal == tNoDataValue )
1462                             nVal = tReplacementVal;
1463                         pDstScanline[iDstPixel] = nVal;
1464                     }
1465                     else
1466                     {
1467                         T nVal;
1468                         if( bQuadraticMean )
1469                             nVal = static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1470                         else
1471                             nVal = static_cast<T>(dfTotal / dfTotalWeight);
1472                         if( bHasNoData && nVal == tNoDataValue )
1473                             nVal = tReplacementVal;
1474                         pDstScanline[iDstPixel] = nVal;
1475                     }
1476                 }
1477             }
1478         }
1479         else
1480         {
1481             nSrcYOff -= nChunkYOff;
1482             nSrcYOff2 -= nChunkYOff;
1483 
1484             for( int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel )
1485             {
1486                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1487                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1488 
1489                 GPtrDiff_t nTotalR = 0;
1490                 GPtrDiff_t nTotalG = 0;
1491                 GPtrDiff_t nTotalB = 0;
1492                 GPtrDiff_t nCount = 0;
1493 
1494                 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
1495                 {
1496                     for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
1497                     {
1498                         const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *nChunkXSize];
1499                         const unsigned nVal = static_cast<unsigned>(val);
1500                         if( nVal < static_cast<unsigned>(nEntryCount) && aEntries[nVal].c4 )
1501                         {
1502                             if( bQuadraticMean )
1503                             {
1504                                 nTotalR += SQUARE<int>(aEntries[nVal].c1);
1505                                 nTotalG += SQUARE<int>(aEntries[nVal].c2);
1506                                 nTotalB += SQUARE<int>(aEntries[nVal].c3);
1507                                 ++nCount;
1508                             }
1509                             else
1510                             {
1511                                 nTotalR += aEntries[nVal].c1;
1512                                 nTotalG += aEntries[nVal].c2;
1513                                 nTotalB += aEntries[nVal].c3;
1514                                 ++nCount;
1515                             }
1516                         }
1517                     }
1518                 }
1519 
1520                 if( nCount == 0 ||
1521                     (bPropagateNoData && nCount <
1522                         static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) * (nSrcXOff2 - nSrcXOff)) )
1523                 {
1524                     pDstScanline[iDstPixel] = tNoDataValue;
1525                 }
1526                 else
1527                 {
1528                     int nR, nG, nB;
1529                     if( bQuadraticMean )
1530                     {
1531                         nR = static_cast<int>(sqrt(nTotalR / nCount) + 0.5);
1532                         nG = static_cast<int>(sqrt(nTotalG / nCount) + 0.5);
1533                         nB = static_cast<int>(sqrt(nTotalB / nCount) + 0.5);
1534                     }
1535                     else
1536                     {
1537                         nR = static_cast<int>((nTotalR + nCount / 2) / nCount);
1538                         nG = static_cast<int>((nTotalG + nCount / 2) / nCount);
1539                         nB = static_cast<int>((nTotalB + nCount / 2) / nCount);
1540                     }
1541                     pDstScanline[iDstPixel] = static_cast<T>(GDALFindBestEntry(
1542                         nEntryCount, aEntries, nR, nG, nB));
1543                 }
1544             }
1545         }
1546     }
1547 
1548     CPLFree( aEntries );
1549     CPLFree( pasSrcX );
1550 
1551     return CE_None;
1552 }
1553 
1554 static CPLErr
GDALResampleChunk32R_Average(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,GDALDataType eWrkDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char * pszResampling,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,GDALDataType,bool bPropagateNoData)1555 GDALResampleChunk32R_Average( double dfXRatioDstToSrc, double dfYRatioDstToSrc,
1556                               double dfSrcXDelta,
1557                               double dfSrcYDelta,
1558                               GDALDataType eWrkDataType,
1559                               const void * pChunk,
1560                               const GByte * pabyChunkNodataMask,
1561                               int nChunkXOff, int nChunkXSize,
1562                               int nChunkYOff, int nChunkYSize,
1563                               int nDstXOff, int nDstXOff2,
1564                               int nDstYOff, int nDstYOff2,
1565                               GDALRasterBand * poOverview,
1566                               void** ppDstBuffer,
1567                               GDALDataType* peDstBufferDataType,
1568                               const char * pszResampling,
1569                               int bHasNoData, float fNoDataValue,
1570                               GDALColorTable* poColorTable,
1571                               GDALDataType /* eSrcDataType */,
1572                               bool bPropagateNoData )
1573 {
1574     if( eWrkDataType == GDT_Byte )
1575     {
1576         *peDstBufferDataType = eWrkDataType;
1577         return GDALResampleChunk32R_AverageT<GByte, int, GDT_Byte>(
1578             dfXRatioDstToSrc, dfYRatioDstToSrc,
1579             dfSrcXDelta, dfSrcYDelta,
1580             static_cast<const GByte *>( pChunk ),
1581             pabyChunkNodataMask,
1582             nChunkXOff, nChunkXSize,
1583             nChunkYOff, nChunkYSize,
1584             nDstXOff, nDstXOff2,
1585             nDstYOff, nDstYOff2,
1586             poOverview,
1587             ppDstBuffer,
1588             pszResampling,
1589             bHasNoData, fNoDataValue,
1590             poColorTable,
1591             bPropagateNoData );
1592     }
1593     else if( eWrkDataType == GDT_UInt16  )
1594     {
1595         *peDstBufferDataType = eWrkDataType;
1596         if( EQUAL(pszResampling, "RMS") )
1597         {
1598             // Use double as accumulation type, because UInt32 could overflow
1599             return GDALResampleChunk32R_AverageT<GUInt16, double, GDT_UInt16>(
1600                 dfXRatioDstToSrc, dfYRatioDstToSrc,
1601                 dfSrcXDelta, dfSrcYDelta,
1602                 static_cast<const GUInt16 *>( pChunk ),
1603                 pabyChunkNodataMask,
1604                 nChunkXOff, nChunkXSize,
1605                 nChunkYOff, nChunkYSize,
1606                 nDstXOff, nDstXOff2,
1607                 nDstYOff, nDstYOff2,
1608                 poOverview,
1609                 ppDstBuffer,
1610                 pszResampling,
1611                 bHasNoData, fNoDataValue,
1612                 poColorTable,
1613                 bPropagateNoData );
1614         }
1615         else
1616         {
1617             return GDALResampleChunk32R_AverageT<GUInt16, GUInt32, GDT_UInt16>(
1618                 dfXRatioDstToSrc, dfYRatioDstToSrc,
1619                 dfSrcXDelta, dfSrcYDelta,
1620                 static_cast<const GUInt16 *>( pChunk ),
1621                 pabyChunkNodataMask,
1622                 nChunkXOff, nChunkXSize,
1623                 nChunkYOff, nChunkYSize,
1624                 nDstXOff, nDstXOff2,
1625                 nDstYOff, nDstYOff2,
1626                 poOverview,
1627                 ppDstBuffer,
1628                 pszResampling,
1629                 bHasNoData, fNoDataValue,
1630                 poColorTable,
1631                 bPropagateNoData );
1632         }
1633     }
1634     else if( eWrkDataType == GDT_Float32 )
1635     {
1636         *peDstBufferDataType = eWrkDataType;
1637         return GDALResampleChunk32R_AverageT<float, double, GDT_Float32>(
1638             dfXRatioDstToSrc, dfYRatioDstToSrc,
1639             dfSrcXDelta, dfSrcYDelta,
1640             static_cast<const float *>( pChunk ),
1641             pabyChunkNodataMask,
1642             nChunkXOff, nChunkXSize,
1643             nChunkYOff, nChunkYSize,
1644             nDstXOff, nDstXOff2,
1645             nDstYOff, nDstYOff2,
1646             poOverview,
1647             ppDstBuffer,
1648             pszResampling,
1649             bHasNoData, fNoDataValue,
1650             poColorTable,
1651             bPropagateNoData );
1652     }
1653 
1654     CPLAssert(false);
1655     return CE_Failure;
1656 }
1657 
1658 /************************************************************************/
1659 /*                    GDALResampleChunk32R_Gauss()                      */
1660 /************************************************************************/
1661 
1662 static CPLErr
GDALResampleChunk32R_Gauss(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double,double,GDALDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char *,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,GDALDataType,bool)1663 GDALResampleChunk32R_Gauss( double dfXRatioDstToSrc, double dfYRatioDstToSrc,
1664                             double /* dfSrcXDelta */,
1665                             double /* dfSrcYDelta */,
1666                             GDALDataType /* eWrkDataType */,
1667                             const void * pChunk,
1668                             const GByte * pabyChunkNodataMask,
1669                             int nChunkXOff, int nChunkXSize,
1670                             int nChunkYOff, int nChunkYSize,
1671                             int nDstXOff, int nDstXOff2,
1672                             int nDstYOff, int nDstYOff2,
1673                             GDALRasterBand * poOverview,
1674                             void** ppDstBuffer,
1675                             GDALDataType* peDstBufferDataType,
1676                             const char * /* pszResampling */,
1677                             int bHasNoData, float fNoDataValue,
1678                             GDALColorTable* poColorTable,
1679                             GDALDataType /* eSrcDataType */,
1680                             bool /* bPropagateNoData */ )
1681 
1682 {
1683     const float * const pafChunk = static_cast<const float *>( pChunk );
1684 
1685     *ppDstBuffer =
1686         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1687                             GDALGetDataTypeSizeBytes(GDT_Float32));
1688     if( *ppDstBuffer == nullptr )
1689     {
1690         return CE_Failure;
1691     }
1692     *peDstBufferDataType = GDT_Float32;
1693     float* const pafDstBuffer = static_cast<float*>(*ppDstBuffer);
1694 
1695 /* -------------------------------------------------------------------- */
1696 /*      Create the filter kernel and allocate scanline buffer.          */
1697 /* -------------------------------------------------------------------- */
1698     int nGaussMatrixDim = 3;
1699     const int *panGaussMatrix;
1700     constexpr int anGaussMatrix3x3[] ={
1701         1, 2, 1,
1702         2, 4, 2,
1703         1, 2, 1
1704     };
1705     constexpr int anGaussMatrix5x5[] = {
1706         1, 4, 6, 4, 1,
1707         4, 16, 24, 16, 4,
1708         6, 24, 36, 24, 6,
1709         4, 16, 24, 16, 4,
1710         1, 4, 6, 4, 1};
1711     constexpr int anGaussMatrix7x7[] = {
1712         1, 6, 15, 20, 15, 6, 1,
1713         6, 36, 90, 120, 90, 36, 6,
1714         15, 90, 225, 300, 225, 90, 15,
1715         20, 120, 300, 400, 300, 120, 20,
1716         15, 90, 225, 300, 225, 90, 15,
1717         6, 36, 90, 120, 90, 36, 6,
1718         1, 6, 15, 20, 15, 6, 1};
1719 
1720     const int nOXSize = poOverview->GetXSize();
1721     const int nOYSize = poOverview->GetYSize();
1722     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1723 
1724     // matrix for gauss filter
1725     if(nResYFactor <= 2 )
1726     {
1727         panGaussMatrix = anGaussMatrix3x3;
1728         nGaussMatrixDim=3;
1729     }
1730     else if( nResYFactor <= 4 )
1731     {
1732         panGaussMatrix = anGaussMatrix5x5;
1733         nGaussMatrixDim=5;
1734     }
1735     else
1736     {
1737         panGaussMatrix = anGaussMatrix7x7;
1738         nGaussMatrixDim=7;
1739     }
1740 
1741 #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1742     int* panGaussMatrixDup = static_cast<int*>(
1743         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1744     memcpy(panGaussMatrixDup, panGaussMatrix,
1745            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1746     panGaussMatrix = panGaussMatrixDup;
1747 #endif
1748 
1749     if( !bHasNoData )
1750         fNoDataValue = 0.0f;
1751 
1752     int nEntryCount = 0;
1753     GDALColorEntry* aEntries = nullptr;
1754     int nTransparentIdx = -1;
1755     if( poColorTable &&
1756         !ReadColorTableAsArray(poColorTable, nEntryCount, aEntries,
1757                                nTransparentIdx) )
1758     {
1759         return CE_Failure;
1760     }
1761 
1762     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1763     // it as nodata value.
1764     if( bHasNoData && fNoDataValue >= 0.0f && fNoDataValue < nEntryCount )
1765     {
1766         if( aEntries == nullptr )
1767         {
1768             CPLError(CE_Failure, CPLE_ObjectNull, "No aEntries");
1769             return CE_Failure;
1770         }
1771         aEntries[static_cast<int>(fNoDataValue)].c4 = 0;
1772     }
1773     // Or if we have no explicit nodata, but a color table entry that is
1774     // transparent, consider it as the nodata value.
1775     else if( !bHasNoData && nTransparentIdx >= 0 )
1776     {
1777         fNoDataValue = static_cast<float>(nTransparentIdx);
1778     }
1779 
1780     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1781     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1782     const int nDstXWidth = nDstXOff2 - nDstXOff;
1783 
1784 /* ==================================================================== */
1785 /*      Loop over destination scanlines.                                */
1786 /* ==================================================================== */
1787     for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
1788     {
1789         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1790         int nSrcYOff2 =
1791             static_cast<int>(0.5 + (iDstLine+1) * dfYRatioDstToSrc) + 1;
1792 
1793         if( nSrcYOff < nChunkYOff )
1794         {
1795             nSrcYOff = nChunkYOff;
1796             nSrcYOff2++;
1797         }
1798 
1799         const int iSizeY = nSrcYOff2 - nSrcYOff;
1800         nSrcYOff = nSrcYOff + iSizeY/2 - nGaussMatrixDim/2;
1801         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1802 
1803         if( nSrcYOff2 > nChunkBottomYOff ||
1804             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize-1) )
1805         {
1806             nSrcYOff2 = std::min(nChunkBottomYOff,
1807                                  nSrcYOff + nGaussMatrixDim);
1808         }
1809 
1810         int nYShiftGaussMatrix = 0;
1811         if(nSrcYOff < nChunkYOff)
1812         {
1813             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1814             nSrcYOff = nChunkYOff;
1815         }
1816 
1817         const float * const pafSrcScanline =
1818             pafChunk + ((nSrcYOff-nChunkYOff) * nChunkXSize);
1819         const GByte *pabySrcScanlineNodataMask = nullptr;
1820         if( pabyChunkNodataMask != nullptr )
1821             pabySrcScanlineNodataMask =
1822                 pabyChunkNodataMask + ((nSrcYOff-nChunkYOff) * nChunkXSize);
1823 
1824 /* -------------------------------------------------------------------- */
1825 /*      Loop over destination pixels                                    */
1826 /* -------------------------------------------------------------------- */
1827         float* const pafDstScanline = pafDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1828         for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
1829         {
1830             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1831             int nSrcXOff2 =
1832                 static_cast<int>(0.5 + (iDstPixel+1) * dfXRatioDstToSrc) + 1;
1833 
1834             if( nSrcXOff < nChunkXOff )
1835             {
1836                 nSrcXOff = nChunkXOff;
1837                 nSrcXOff2++;
1838             }
1839 
1840             const int iSizeX = nSrcXOff2 - nSrcXOff;
1841             nSrcXOff = nSrcXOff + iSizeX/2 - nGaussMatrixDim/2;
1842             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1843 
1844             if( nSrcXOff2 > nChunkRightXOff ||
1845                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize-1) )
1846             {
1847                 nSrcXOff2 = std::min(nChunkRightXOff,
1848                                      nSrcXOff + nGaussMatrixDim);
1849             }
1850 
1851             int nXShiftGaussMatrix = 0;
1852             if(nSrcXOff < nChunkXOff)
1853             {
1854                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1855                 nSrcXOff = nChunkXOff;
1856             }
1857 
1858             if( poColorTable == nullptr )
1859             {
1860                 double dfTotal = 0.0;
1861                 GInt64 nCount = 0;
1862                 const int *panLineWeight = panGaussMatrix +
1863                     nYShiftGaussMatrix * nGaussMatrixDim + nXShiftGaussMatrix;
1864 
1865                 for( int j=0, iY = nSrcYOff;
1866                      iY < nSrcYOff2;
1867                      ++iY, ++j, panLineWeight += nGaussMatrixDim )
1868                 {
1869                     for( int i=0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i )
1870                     {
1871                         const double val =
1872                             pafSrcScanline[iX-nChunkXOff+static_cast<GPtrDiff_t>(iY-nSrcYOff)
1873                                            * nChunkXSize];
1874                         if( pabySrcScanlineNodataMask == nullptr ||
1875                             pabySrcScanlineNodataMask[iX - nChunkXOff
1876                                                       +static_cast<GPtrDiff_t>(iY - nSrcYOff)
1877                                                       * nChunkXSize] )
1878                         {
1879                             const int nWeight = panLineWeight[i];
1880                             dfTotal += val * nWeight;
1881                             nCount += nWeight;
1882                         }
1883                     }
1884                 }
1885 
1886                 if( nCount == 0 )
1887                 {
1888                     pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
1889                 }
1890                 else
1891                 {
1892                     pafDstScanline[iDstPixel - nDstXOff] =
1893                         static_cast<float>(dfTotal / nCount);
1894                 }
1895             }
1896             else
1897             {
1898                 GInt64 nTotalR = 0;
1899                 GInt64 nTotalG = 0;
1900                 GInt64 nTotalB = 0;
1901                 GInt64 nTotalWeight = 0;
1902                 const int *panLineWeight =
1903                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1904                     nXShiftGaussMatrix;
1905 
1906                 for( int j=0, iY = nSrcYOff; iY < nSrcYOff2;
1907                         ++iY, ++j, panLineWeight += nGaussMatrixDim )
1908                 {
1909                     for( int i=0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i )
1910                     {
1911                         const double val =
1912                             pafSrcScanline[iX - nChunkXOff +
1913                                            static_cast<GPtrDiff_t>(iY-nSrcYOff) * nChunkXSize];
1914                         int nVal = static_cast<int>(val);
1915                         if( nVal >= 0 && nVal < nEntryCount &&
1916                             aEntries[nVal].c4 )
1917                         {
1918                             const int nWeight = panLineWeight[i];
1919                             nTotalR += aEntries[nVal].c1 * nWeight;
1920                             nTotalG += aEntries[nVal].c2 * nWeight;
1921                             nTotalB += aEntries[nVal].c3 * nWeight;
1922                             nTotalWeight += nWeight;
1923                         }
1924                     }
1925                 }
1926 
1927                 if( nTotalWeight == 0 )
1928                 {
1929                     pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
1930                 }
1931                 else
1932                 {
1933                     const int nR =
1934                         static_cast<int>((nTotalR + nTotalWeight / 2) / nTotalWeight);
1935                     const int nG =
1936                         static_cast<int>((nTotalG + nTotalWeight / 2) / nTotalWeight);
1937                     const int nB =
1938                         static_cast<int>((nTotalB + nTotalWeight / 2) / nTotalWeight);
1939                     pafDstScanline[iDstPixel - nDstXOff] =
1940                         static_cast<float>( GDALFindBestEntry(
1941                             nEntryCount, aEntries, nR, nG, nB ) );
1942                 }
1943             }
1944         }
1945     }
1946 
1947     CPLFree( aEntries );
1948 #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1949     CPLFree( panGaussMatrixDup );
1950 #endif
1951 
1952     return CE_None;
1953 }
1954 
1955 /************************************************************************/
1956 /*                    GDALResampleChunk32R_Mode()                       */
1957 /************************************************************************/
1958 
1959 static CPLErr
GDALResampleChunk32R_Mode(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,GDALDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand *,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char *,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,GDALDataType eSrcDataType,bool)1960 GDALResampleChunk32R_Mode( double dfXRatioDstToSrc, double dfYRatioDstToSrc,
1961                            double dfSrcXDelta,
1962                            double dfSrcYDelta,
1963                            GDALDataType /* eWrkDataType */,
1964                            const void * pChunk,
1965                            const GByte * pabyChunkNodataMask,
1966                            int nChunkXOff, int nChunkXSize,
1967                            int nChunkYOff, int nChunkYSize,
1968                            int nDstXOff, int nDstXOff2,
1969                            int nDstYOff, int nDstYOff2,
1970                            GDALRasterBand * /* poOverview */,
1971                            void** ppDstBuffer,
1972                            GDALDataType* peDstBufferDataType,
1973                            const char * /* pszResampling */,
1974                            int bHasNoData, float fNoDataValue,
1975                            GDALColorTable* poColorTable,
1976                            GDALDataType eSrcDataType,
1977                            bool /* bPropagateNoData */ )
1978 
1979 {
1980     const float * const pafChunk = static_cast<const float *>( pChunk );
1981 
1982     const int nDstXSize = nDstXOff2 - nDstXOff;
1983     *ppDstBuffer =
1984         VSI_MALLOC3_VERBOSE(nDstXSize, nDstYOff2 - nDstYOff,
1985                             GDALGetDataTypeSizeBytes(GDT_Float32));
1986     if( *ppDstBuffer == nullptr )
1987     {
1988         return CE_Failure;
1989     }
1990     *peDstBufferDataType = GDT_Float32;
1991     float* const pafDstBuffer = static_cast<float*>(*ppDstBuffer);
1992 
1993 /* -------------------------------------------------------------------- */
1994 /*      Create the filter kernel and allocate scanline buffer.          */
1995 /* -------------------------------------------------------------------- */
1996 
1997     if( !bHasNoData )
1998         fNoDataValue = 0.0f;
1999     int nEntryCount = 0;
2000     GDALColorEntry* aEntries = nullptr;
2001     int nTransparentIdx = -1;
2002     if( poColorTable &&
2003         !ReadColorTableAsArray(poColorTable, nEntryCount,
2004                                aEntries, nTransparentIdx) )
2005     {
2006         return CE_Failure;
2007     }
2008 
2009     size_t nMaxNumPx = 0;
2010     float *pafVals = nullptr;
2011     int *panSums = nullptr;
2012 
2013     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2014     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2015     std::vector<int> anVals(256, 0);
2016 
2017 /* ==================================================================== */
2018 /*      Loop over destination scanlines.                                */
2019 /* ==================================================================== */
2020     for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
2021     {
2022         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2023         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2024 #ifdef only_pixels_with_more_than_10_pct_participation
2025         // When oversampling, don't take into account pixels that have a tiny
2026         // participation in the resulting pixel
2027         if( dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2028             nSrcYOff < nChunkBottomYOff)
2029             nSrcYOff ++;
2030 #endif
2031         if( nSrcYOff < nChunkYOff )
2032             nSrcYOff = nChunkYOff;
2033 
2034         double dfSrcYOff2 = dfSrcYDelta + (iDstLine+1) * dfYRatioDstToSrc;
2035         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2036 #ifdef only_pixels_with_more_than_10_pct_participation
2037         // When oversampling, don't take into account pixels that have a tiny
2038         // participation in the resulting pixel
2039         if( dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2040             nSrcYOff2 > nChunkYOff)
2041             nSrcYOff2 --;
2042 #endif
2043         if( nSrcYOff2 == nSrcYOff )
2044             ++nSrcYOff2;
2045         if( nSrcYOff2 > nChunkBottomYOff )
2046             nSrcYOff2 = nChunkBottomYOff;
2047 
2048         const float * const pafSrcScanline =
2049             pafChunk + (static_cast<GPtrDiff_t>(nSrcYOff-nChunkYOff) * nChunkXSize);
2050         const GByte *pabySrcScanlineNodataMask = nullptr;
2051         if( pabyChunkNodataMask != nullptr )
2052             pabySrcScanlineNodataMask =
2053                 pabyChunkNodataMask + static_cast<GPtrDiff_t>(nSrcYOff-nChunkYOff) * nChunkXSize;
2054 
2055         float* const pafDstScanline = pafDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2056 /* -------------------------------------------------------------------- */
2057 /*      Loop over destination pixels                                    */
2058 /* -------------------------------------------------------------------- */
2059         for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
2060         {
2061             double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2062             // Apply some epsilon to avoid numerical precision issues
2063             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2064 #ifdef only_pixels_with_more_than_10_pct_participation
2065             // When oversampling, don't take into account pixels that have a tiny
2066             // participation in the resulting pixel
2067             if( dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2068                 nSrcXOff < nChunkRightXOff)
2069                 nSrcXOff ++;
2070 #endif
2071             if( nSrcXOff < nChunkXOff )
2072                 nSrcXOff = nChunkXOff;
2073 
2074             double dfSrcXOff2 = dfSrcXDelta + (iDstPixel+1)* dfXRatioDstToSrc;
2075             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2076 #ifdef only_pixels_with_more_than_10_pct_participation
2077             // When oversampling, don't take into account pixels that have a tiny
2078             // participation in the resulting pixel
2079             if( dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2080                 nSrcXOff2 > nChunkXOff)
2081                 nSrcXOff2 --;
2082 #endif
2083             if( nSrcXOff2 == nSrcXOff )
2084                 nSrcXOff2 ++;
2085             if( nSrcXOff2 > nChunkRightXOff )
2086                 nSrcXOff2 = nChunkRightXOff;
2087 
2088             if( eSrcDataType != GDT_Byte || nEntryCount > 256 )
2089             {
2090                 // Not sure how much sense it makes to run a majority
2091                 // filter on floating point data, but here it is for the sake
2092                 // of compatibility. It won't look right on RGB images by the
2093                 // nature of the filter.
2094 
2095                 if( nSrcYOff2 - nSrcYOff <= 0 ||
2096                     nSrcXOff2 - nSrcXOff <= 0 ||
2097                     nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2098                     static_cast<size_t>(nSrcYOff2-nSrcYOff)*
2099                         static_cast<size_t>(nSrcXOff2-nSrcXOff) >
2100                             std::numeric_limits<size_t>::max() / sizeof(float) )
2101                 {
2102                     CPLError(CE_Failure, CPLE_NotSupported,
2103                              "Too big downsampling factor");
2104                     CPLFree( aEntries );
2105                     CPLFree( pafVals );
2106                     CPLFree( panSums );
2107                     return CE_Failure;
2108                 }
2109                 const size_t nNumPx = static_cast<size_t>(nSrcYOff2-nSrcYOff)*
2110                                       static_cast<size_t>(nSrcXOff2-nSrcXOff);
2111                 size_t iMaxInd = 0;
2112                 size_t iMaxVal = 0;
2113                 bool biMaxValdValid = false;
2114 
2115                 if( pafVals == nullptr || nNumPx > nMaxNumPx )
2116                 {
2117                     float* pafValsNew = static_cast<float *>(
2118                         VSI_REALLOC_VERBOSE(pafVals, nNumPx * sizeof(float)) );
2119                     int* panSumsNew = static_cast<int *>(
2120                         VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)) );
2121                     if( pafValsNew != nullptr )
2122                         pafVals = pafValsNew;
2123                     if( panSumsNew != nullptr )
2124                         panSums = panSumsNew;
2125                     if( pafValsNew == nullptr || panSumsNew == nullptr )
2126                     {
2127                         CPLFree( aEntries );
2128                         CPLFree( pafVals );
2129                         CPLFree( panSums );
2130                         return CE_Failure;
2131                     }
2132                     nMaxNumPx = nNumPx;
2133                 }
2134 
2135                 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
2136                 {
2137                     const GPtrDiff_t iTotYOff = static_cast<GPtrDiff_t>(iY-nSrcYOff)*nChunkXSize-nChunkXOff;
2138                     for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
2139                     {
2140                         if( pabySrcScanlineNodataMask == nullptr ||
2141                             pabySrcScanlineNodataMask[iX+iTotYOff] )
2142                         {
2143                             const float fVal = pafSrcScanline[iX+iTotYOff];
2144                             size_t i = 0;  // Used after for.
2145 
2146                             // Check array for existing entry.
2147                             for( ; i < iMaxInd; ++i )
2148                                 if( pafVals[i] == fVal
2149                                     && ++panSums[i] > panSums[iMaxVal] )
2150                                 {
2151                                     iMaxVal = i;
2152                                     biMaxValdValid = true;
2153                                     break;
2154                                 }
2155 
2156                             // Add to arr if entry not already there.
2157                             if( i == iMaxInd )
2158                             {
2159                                 pafVals[iMaxInd] = fVal;
2160                                 panSums[iMaxInd] = 1;
2161 
2162                                 if( !biMaxValdValid )
2163                                 {
2164                                     iMaxVal = iMaxInd;
2165                                     biMaxValdValid = true;
2166                                 }
2167 
2168                                 ++iMaxInd;
2169                             }
2170                         }
2171                     }
2172                 }
2173 
2174                 if( !biMaxValdValid )
2175                     pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
2176                 else
2177                     pafDstScanline[iDstPixel - nDstXOff] = pafVals[iMaxVal];
2178             }
2179             else // if( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2180             {
2181                 // So we go here for a paletted or non-paletted byte band.
2182                 // The input values are then between 0 and 255.
2183                 int nMaxVal = 0;
2184                 int iMaxInd = -1;
2185 
2186                 // The cost of this zeroing might be high. Perhaps we should just
2187                 // use the above generic case, and go to this one if the number
2188                 // of source pixels is large enough
2189                 std::fill(anVals.begin(), anVals.end(), 0);
2190 
2191                 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
2192                 {
2193                     const GPtrDiff_t iTotYOff =
2194                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize - nChunkXOff;
2195                     for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
2196                     {
2197                         const float val = pafSrcScanline[iX+iTotYOff];
2198                         if( bHasNoData == FALSE || val != fNoDataValue )
2199                         {
2200                             int nVal = static_cast<int>(val);
2201                             if( ++anVals[nVal] > nMaxVal)
2202                             {
2203                                 // Sum the density.
2204                                 // Is it the most common value so far?
2205                                 iMaxInd = nVal;
2206                                 nMaxVal = anVals[nVal];
2207                             }
2208                         }
2209                     }
2210                 }
2211 
2212                 if( iMaxInd == -1 )
2213                     pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
2214                 else
2215                     pafDstScanline[iDstPixel - nDstXOff] =
2216                         static_cast<float>(iMaxInd);
2217             }
2218         }
2219     }
2220 
2221     CPLFree( aEntries );
2222     CPLFree( pafVals );
2223     CPLFree( panSums );
2224 
2225     return CE_None;
2226 }
2227 
2228 /************************************************************************/
2229 /*                  GDALResampleConvolutionHorizontal()                 */
2230 /************************************************************************/
2231 
GDALResampleConvolutionHorizontal(const T * pChunk,const double * padfWeights,int nSrcPixelCount)2232 template<class T> static inline double GDALResampleConvolutionHorizontal(
2233     const T* pChunk, const double* padfWeights, int nSrcPixelCount )
2234 {
2235     double dfVal1 = 0.0;
2236     double dfVal2 = 0.0;
2237     int i = 0;  // Used after for.
2238     for( ; i + 3 < nSrcPixelCount; i += 4 )
2239     {
2240         dfVal1 += pChunk[i] * padfWeights[i];
2241         dfVal1 += pChunk[i+1] * padfWeights[i+1];
2242         dfVal2 += pChunk[i+2] * padfWeights[i+2];
2243         dfVal2 += pChunk[i+3] * padfWeights[i+3];
2244     }
2245     for( ; i < nSrcPixelCount; ++i )
2246     {
2247         dfVal1 += pChunk[i] * padfWeights[i];
2248     }
2249     return dfVal1 + dfVal2;
2250 }
2251 
GDALResampleConvolutionHorizontalWithMask(const T * pChunk,const GByte * pabyMask,const double * padfWeights,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2252 template<class T> static inline void GDALResampleConvolutionHorizontalWithMask(
2253     const T* pChunk, const GByte* pabyMask,
2254     const double* padfWeights, int nSrcPixelCount,
2255     double& dfVal, double &dfWeightSum)
2256 {
2257     dfVal = 0;
2258     dfWeightSum = 0;
2259     int i = 0;
2260     for( ; i + 3 < nSrcPixelCount; i += 4 )
2261     {
2262         const double dfWeight0 = padfWeights[i] * pabyMask[i];
2263         const double dfWeight1 = padfWeights[i+1] * pabyMask[i+1];
2264         const double dfWeight2 = padfWeights[i+2] * pabyMask[i+2];
2265         const double dfWeight3 = padfWeights[i+3] * pabyMask[i+3];
2266         dfVal += pChunk[i] * dfWeight0;
2267         dfVal += pChunk[i+1] * dfWeight1;
2268         dfVal += pChunk[i+2] * dfWeight2;
2269         dfVal += pChunk[i+3] * dfWeight3;
2270         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2271     }
2272     for( ; i < nSrcPixelCount; ++i )
2273     {
2274         const double dfWeight = padfWeights[i] * pabyMask[i];
2275         dfVal += pChunk[i] * dfWeight;
2276         dfWeightSum += dfWeight;
2277     }
2278 }
2279 
GDALResampleConvolutionHorizontal_3rows(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeights,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2280 template<class T> static inline void GDALResampleConvolutionHorizontal_3rows(
2281     const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2282     const double* padfWeights, int nSrcPixelCount,
2283     double& dfRes1, double& dfRes2, double& dfRes3)
2284 {
2285     double dfVal1 = 0.0;
2286     double dfVal2 = 0.0;
2287     double dfVal3 = 0.0;
2288     double dfVal4 = 0.0;
2289     double dfVal5 = 0.0;
2290     double dfVal6 = 0.0;
2291     int i = 0;  // Used after for.
2292     for( ; i + 3 < nSrcPixelCount; i += 4 )
2293     {
2294         dfVal1 += pChunkRow1[i] * padfWeights[i];
2295         dfVal1 += pChunkRow1[i+1] * padfWeights[i+1];
2296         dfVal2 += pChunkRow1[i+2] * padfWeights[i+2];
2297         dfVal2 += pChunkRow1[i+3] * padfWeights[i+3];
2298         dfVal3 += pChunkRow2[i] * padfWeights[i];
2299         dfVal3 += pChunkRow2[i+1] * padfWeights[i+1];
2300         dfVal4 += pChunkRow2[i+2] * padfWeights[i+2];
2301         dfVal4 += pChunkRow2[i+3] * padfWeights[i+3];
2302         dfVal5 += pChunkRow3[i] * padfWeights[i];
2303         dfVal5 += pChunkRow3[i+1] * padfWeights[i+1];
2304         dfVal6 += pChunkRow3[i+2] * padfWeights[i+2];
2305         dfVal6 += pChunkRow3[i+3] * padfWeights[i+3];
2306     }
2307     for( ; i < nSrcPixelCount; ++i )
2308     {
2309         dfVal1 += pChunkRow1[i] * padfWeights[i];
2310         dfVal3 += pChunkRow2[i] * padfWeights[i];
2311         dfVal5 += pChunkRow3[i] * padfWeights[i];
2312     }
2313     dfRes1 = dfVal1 + dfVal2;
2314     dfRes2 = dfVal3 + dfVal4;
2315     dfRes3 = dfVal5 + dfVal6;
2316 }
2317 
2318 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeights,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2319 GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2320     const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2321     const double* padfWeights, int nSrcPixelCount,
2322     double& dfRes1, double& dfRes2, double& dfRes3 )
2323 {
2324     GDALResampleConvolutionHorizontal_3rows(
2325         pChunkRow1, pChunkRow2, pChunkRow3,
2326         padfWeights, nSrcPixelCount,
2327         dfRes1, dfRes2, dfRes3 );
2328 }
2329 
2330 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeights,double & dfRes1,double & dfRes2,double & dfRes3)2331 GDALResampleConvolutionHorizontalPixelCount4_3rows(
2332     const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2333     const double* padfWeights,
2334     double& dfRes1, double& dfRes2, double& dfRes3 )
2335 {
2336     GDALResampleConvolutionHorizontal_3rows(
2337         pChunkRow1, pChunkRow2, pChunkRow3,
2338         padfWeights, 4,
2339         dfRes1, dfRes2, dfRes3 );
2340 }
2341 
2342 /************************************************************************/
2343 /*                  GDALResampleConvolutionVertical()                   */
2344 /************************************************************************/
2345 
GDALResampleConvolutionVertical(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount)2346 template<class T> static inline double GDALResampleConvolutionVertical(
2347     const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount )
2348 {
2349     double dfVal1 = 0.0;
2350     double dfVal2 = 0.0;
2351     int i = 0;
2352     int j = 0;
2353     for( ; i + 3 < nSrcLineCount; i+=4, j+=4*nStride)
2354     {
2355         dfVal1 += pChunk[j] * padfWeights[i];
2356         dfVal1 += pChunk[j + nStride] * padfWeights[i+1];
2357         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i+2];
2358         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i+3];
2359     }
2360     for( ; i < nSrcLineCount; ++i, j += nStride)
2361     {
2362         dfVal1 += pChunk[j] * padfWeights[i];
2363     }
2364     return dfVal1 + dfVal2;
2365 }
2366 
GDALResampleConvolutionVertical_2cols(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount,double & dfRes1,double & dfRes2)2367 template<class T> static inline void GDALResampleConvolutionVertical_2cols(
2368     const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount,
2369     double& dfRes1, double& dfRes2 )
2370 {
2371     double dfVal1 = 0.0;
2372     double dfVal2 = 0.0;
2373     double dfVal3 = 0.0;
2374     double dfVal4 = 0.0;
2375     int i = 0;
2376     int j = 0;
2377     for(;i+3<nSrcLineCount;i+=4, j+=4*nStride)
2378     {
2379         dfVal1 += pChunk[j] * padfWeights[i];
2380         dfVal3 += pChunk[j+1] * padfWeights[i];
2381         dfVal1 += pChunk[j + nStride] * padfWeights[i+1];
2382         dfVal3 += pChunk[j+1 + nStride] * padfWeights[i+1];
2383         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i+2];
2384         dfVal4 += pChunk[j+1 + 2 * nStride] * padfWeights[i+2];
2385         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i+3];
2386         dfVal4 += pChunk[j+1 + 3 * nStride] * padfWeights[i+3];
2387     }
2388     for( ; i < nSrcLineCount; ++i, j += nStride )
2389     {
2390         dfVal1 += pChunk[j] * padfWeights[i];
2391         dfVal3 += pChunk[j+1] * padfWeights[i];
2392     }
2393     dfRes1 = dfVal1 + dfVal2;
2394     dfRes2 = dfVal3 + dfVal4;
2395 }
2396 
2397 #ifdef USE_SSE2
2398 
2399 #ifdef __AVX__
2400 /************************************************************************/
2401 /*             GDALResampleConvolutionVertical_16cols<T>                */
2402 /************************************************************************/
2403 
GDALResampleConvolutionVertical_16cols(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount,float * afDest)2404 template<class T> static inline void GDALResampleConvolutionVertical_16cols(
2405     const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount,
2406     float* afDest )
2407 {
2408     int i = 0;
2409     int j = 0;
2410     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2411     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2412     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2413     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2414     for(;i+3<nSrcLineCount;i+=4, j+=4*nStride)
2415     {
2416         XMMReg4Double w0 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+0);
2417         XMMReg4Double w1 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+1);
2418         XMMReg4Double w2 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+2);
2419         XMMReg4Double w3 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+3);
2420         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+0*nStride) * w0;
2421         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+0*nStride) * w0;
2422         v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+0*nStride) * w0;
2423         v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+0*nStride) * w0;
2424         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+1*nStride) * w1;
2425         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+1*nStride) * w1;
2426         v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+1*nStride) * w1;
2427         v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+1*nStride) * w1;
2428         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+2*nStride) * w2;
2429         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+2*nStride) * w2;
2430         v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+2*nStride) * w2;
2431         v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+2*nStride) * w2;
2432         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+3*nStride) * w3;
2433         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+3*nStride) * w3;
2434         v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+3*nStride) * w3;
2435         v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+3*nStride) * w3;
2436     }
2437     for( ; i < nSrcLineCount; ++i, j += nStride )
2438     {
2439         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights+i);
2440         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0) * w;
2441         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4) * w;
2442         v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8) * w;
2443         v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12) * w;
2444     }
2445     v_acc0.Store4Val(afDest);
2446     v_acc1.Store4Val(afDest+4);
2447     v_acc2.Store4Val(afDest+8);
2448     v_acc3.Store4Val(afDest+12);
2449 }
2450 
2451 #else
2452 
2453 
2454 /************************************************************************/
2455 /*              GDALResampleConvolutionVertical_8cols<T>                */
2456 /************************************************************************/
2457 
GDALResampleConvolutionVertical_8cols(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount,float * afDest)2458 template<class T> static inline void GDALResampleConvolutionVertical_8cols(
2459     const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount,
2460     float* afDest )
2461 {
2462     int i = 0;
2463     int j = 0;
2464     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2465     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2466     for(;i+3<nSrcLineCount;i+=4, j+=4*nStride)
2467     {
2468         XMMReg4Double w0 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+0);
2469         XMMReg4Double w1 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+1);
2470         XMMReg4Double w2 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+2);
2471         XMMReg4Double w3 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+3);
2472         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+0*nStride) * w0;
2473         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+0*nStride) * w0;
2474         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+1*nStride) * w1;
2475         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+1*nStride) * w1;
2476         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+2*nStride) * w2;
2477         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+2*nStride) * w2;
2478         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+3*nStride) * w3;
2479         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+3*nStride) * w3;
2480     }
2481     for( ; i < nSrcLineCount; ++i, j += nStride )
2482     {
2483         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights+i);
2484         v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0) * w;
2485         v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4) * w;
2486     }
2487     v_acc0.Store4Val(afDest);
2488     v_acc1.Store4Val(afDest+4);
2489 }
2490 
2491 #endif // __AVX__
2492 
2493 /************************************************************************/
2494 /*              GDALResampleConvolutionHorizontalSSE2<T>                */
2495 /************************************************************************/
2496 
GDALResampleConvolutionHorizontalSSE2(const T * pChunk,const double * padfWeightsAligned,int nSrcPixelCount)2497 template<class T> static inline double GDALResampleConvolutionHorizontalSSE2(
2498     const T* pChunk, const double* padfWeightsAligned, int nSrcPixelCount )
2499 {
2500     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2501     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2502     int i = 0;  // Used after for.
2503     for( ; i + 7 < nSrcPixelCount; i += 8 )
2504     {
2505         // Retrieve the pixel & accumulate
2506         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk+i);
2507         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk+i+4);
2508         const XMMReg4Double v_weight1 =
2509             XMMReg4Double::Load4ValAligned(padfWeightsAligned+i);
2510         const XMMReg4Double v_weight2 =
2511             XMMReg4Double::Load4ValAligned(padfWeightsAligned+i+4);
2512 
2513         v_acc1 += v_pixels1 * v_weight1;
2514         v_acc2 += v_pixels2 * v_weight2;
2515     }
2516 
2517     v_acc1 += v_acc2;
2518 
2519     double dfVal = v_acc1.GetHorizSum();
2520     for( ; i < nSrcPixelCount; ++i )
2521     {
2522         dfVal += pChunk[i] * padfWeightsAligned[i];
2523     }
2524     return dfVal;
2525 }
2526 
2527 /************************************************************************/
2528 /*              GDALResampleConvolutionHorizontal<GByte>                */
2529 /************************************************************************/
2530 
GDALResampleConvolutionHorizontal(const GByte * pChunk,const double * padfWeightsAligned,int nSrcPixelCount)2531 template<> inline double GDALResampleConvolutionHorizontal<GByte>(
2532     const GByte* pChunk, const double* padfWeightsAligned, int nSrcPixelCount )
2533 {
2534     return GDALResampleConvolutionHorizontalSSE2( pChunk, padfWeightsAligned,
2535                                                   nSrcPixelCount );
2536 }
2537 
GDALResampleConvolutionHorizontal(const GUInt16 * pChunk,const double * padfWeightsAligned,int nSrcPixelCount)2538 template<> inline double GDALResampleConvolutionHorizontal<GUInt16>(
2539     const GUInt16* pChunk, const double* padfWeightsAligned,
2540     int nSrcPixelCount )
2541 {
2542     return GDALResampleConvolutionHorizontalSSE2( pChunk, padfWeightsAligned,
2543                                                   nSrcPixelCount) ;
2544 }
2545 
2546 /************************************************************************/
2547 /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2548 /************************************************************************/
2549 
2550 template<class T> static inline void
GDALResampleConvolutionHorizontalWithMaskSSE2(const T * pChunk,const GByte * pabyMask,const double * padfWeightsAligned,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2551 GDALResampleConvolutionHorizontalWithMaskSSE2(
2552     const T* pChunk, const GByte* pabyMask,
2553     const double* padfWeightsAligned, int nSrcPixelCount,
2554     double& dfVal, double &dfWeightSum )
2555 {
2556     int i = 0;  // Used after for.
2557     XMMReg4Double v_acc = XMMReg4Double::Zero();
2558     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2559     for( ; i + 3 < nSrcPixelCount; i += 4 )
2560     {
2561         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk+i);
2562         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask+i);
2563         XMMReg4Double v_weight =
2564             XMMReg4Double::Load4ValAligned(padfWeightsAligned+i);
2565         v_weight *= v_mask;
2566         v_acc += v_pixels * v_weight;
2567         v_acc_weight += v_weight;
2568     }
2569 
2570     dfVal = v_acc.GetHorizSum();
2571     dfWeightSum = v_acc_weight.GetHorizSum();
2572     for( ; i < nSrcPixelCount; ++i )
2573     {
2574         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2575         dfVal += pChunk[i] * dfWeight;
2576         dfWeightSum += dfWeight;
2577     }
2578 }
2579 
2580 /************************************************************************/
2581 /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2582 /************************************************************************/
2583 
GDALResampleConvolutionHorizontalWithMask(const GByte * pChunk,const GByte * pabyMask,const double * padfWeightsAligned,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2584 template<> inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2585     const GByte* pChunk, const GByte* pabyMask,
2586     const double* padfWeightsAligned, int nSrcPixelCount,
2587     double& dfVal, double &dfWeightSum)
2588 {
2589     GDALResampleConvolutionHorizontalWithMaskSSE2(pChunk, pabyMask,
2590                                                   padfWeightsAligned,
2591                                                   nSrcPixelCount,
2592                                                   dfVal, dfWeightSum);
2593 }
2594 
GDALResampleConvolutionHorizontalWithMask(const GUInt16 * pChunk,const GByte * pabyMask,const double * padfWeightsAligned,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2595 template<> inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2596     const GUInt16* pChunk, const GByte* pabyMask,
2597     const double* padfWeightsAligned, int nSrcPixelCount,
2598     double& dfVal, double &dfWeightSum )
2599 {
2600     GDALResampleConvolutionHorizontalWithMaskSSE2( pChunk, pabyMask,
2601                                                    padfWeightsAligned,
2602                                                    nSrcPixelCount,
2603                                                    dfVal, dfWeightSum );
2604 }
2605 
2606 /************************************************************************/
2607 /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2608 /************************************************************************/
2609 
2610 template<class T> static inline void
GDALResampleConvolutionHorizontal_3rows_SSE2(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2611 GDALResampleConvolutionHorizontal_3rows_SSE2(
2612     const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2613     const double* padfWeightsAligned, int nSrcPixelCount,
2614     double& dfRes1, double& dfRes2, double& dfRes3 )
2615 {
2616     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2617                   v_acc2 = XMMReg4Double::Zero(),
2618                   v_acc3 = XMMReg4Double::Zero();
2619     int i = 0;
2620     for( ; i + 7 < nSrcPixelCount; i += 8 )
2621     {
2622         // Retrieve the pixel & accumulate.
2623         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1+i);
2624         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1+i+4);
2625         const XMMReg4Double v_weight1 =
2626             XMMReg4Double::Load4ValAligned(padfWeightsAligned+i);
2627         const XMMReg4Double v_weight2 =
2628             XMMReg4Double::Load4ValAligned(padfWeightsAligned+i+4);
2629 
2630         v_acc1 += v_pixels1 * v_weight1;
2631         v_acc1 += v_pixels2 * v_weight2;
2632 
2633         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2+i);
2634         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2+i+4);
2635         v_acc2 += v_pixels1 * v_weight1;
2636         v_acc2 += v_pixels2 * v_weight2;
2637 
2638         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3+i);
2639         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3+i+4);
2640         v_acc3 += v_pixels1 * v_weight1;
2641         v_acc3 += v_pixels2 * v_weight2;
2642     }
2643 
2644     dfRes1 = v_acc1.GetHorizSum();
2645     dfRes2 = v_acc2.GetHorizSum();
2646     dfRes3 = v_acc3.GetHorizSum();
2647     for( ; i < nSrcPixelCount; ++i )
2648     {
2649         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2650         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2651         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2652     }
2653 }
2654 
2655 /************************************************************************/
2656 /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
2657 /************************************************************************/
2658 
GDALResampleConvolutionHorizontal_3rows(const GByte * pChunkRow1,const GByte * pChunkRow2,const GByte * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2659 template<> inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2660     const GByte* pChunkRow1, const GByte* pChunkRow2, const GByte* pChunkRow3,
2661     const double* padfWeightsAligned, int nSrcPixelCount,
2662     double& dfRes1, double& dfRes2, double& dfRes3 )
2663 {
2664     GDALResampleConvolutionHorizontal_3rows_SSE2(
2665         pChunkRow1, pChunkRow2, pChunkRow3,
2666         padfWeightsAligned, nSrcPixelCount,
2667         dfRes1, dfRes2, dfRes3 );
2668 }
2669 
GDALResampleConvolutionHorizontal_3rows(const GUInt16 * pChunkRow1,const GUInt16 * pChunkRow2,const GUInt16 * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2670 template<> inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2671     const GUInt16* pChunkRow1, const GUInt16* pChunkRow2,
2672     const GUInt16* pChunkRow3,
2673     const double* padfWeightsAligned, int nSrcPixelCount,
2674     double& dfRes1, double& dfRes2, double& dfRes3 )
2675 {
2676     GDALResampleConvolutionHorizontal_3rows_SSE2(
2677         pChunkRow1, pChunkRow2, pChunkRow3,
2678         padfWeightsAligned, nSrcPixelCount,
2679         dfRes1, dfRes2, dfRes3);
2680 }
2681 
2682 /************************************************************************/
2683 /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
2684 /************************************************************************/
2685 
2686 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2687 GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2688     const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2689     const double* padfWeightsAligned, int nSrcPixelCount,
2690     double& dfRes1, double& dfRes2, double& dfRes3)
2691 {
2692     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2693     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2694     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2695     int i = 0;  // Use after for.
2696     for( ; i + 3 < nSrcPixelCount; i += 4)
2697     {
2698         // Retrieve the pixel & accumulate.
2699         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1+i);
2700         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2+i);
2701         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3+i);
2702         const XMMReg4Double v_weight =
2703             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2704 
2705         v_acc1 += v_pixels1 * v_weight;
2706         v_acc2 += v_pixels2 * v_weight;
2707         v_acc3 += v_pixels3 * v_weight;
2708     }
2709 
2710     dfRes1 = v_acc1.GetHorizSum();
2711     dfRes2 = v_acc2.GetHorizSum();
2712     dfRes3 = v_acc3.GetHorizSum();
2713 
2714     for( ; i < nSrcPixelCount; ++i )
2715     {
2716         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2717         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2718         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2719     }
2720 }
2721 
2722 /************************************************************************/
2723 /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
2724 /************************************************************************/
2725 
2726 template<> inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows(const GByte * pChunkRow1,const GByte * pChunkRow2,const GByte * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2727 GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
2728     const GByte* pChunkRow1, const GByte* pChunkRow2, const GByte* pChunkRow3,
2729     const double* padfWeightsAligned, int nSrcPixelCount,
2730     double& dfRes1, double& dfRes2, double& dfRes3 )
2731 {
2732     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2733         pChunkRow1, pChunkRow2, pChunkRow3,
2734         padfWeightsAligned, nSrcPixelCount,
2735         dfRes1, dfRes2, dfRes3 );
2736 }
2737 
2738 template<> inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows(const GUInt16 * pChunkRow1,const GUInt16 * pChunkRow2,const GUInt16 * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2739 GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
2740     const GUInt16* pChunkRow1, const GUInt16* pChunkRow2,
2741     const GUInt16* pChunkRow3,
2742     const double* padfWeightsAligned, int nSrcPixelCount,
2743     double& dfRes1, double& dfRes2, double& dfRes3 )
2744 {
2745     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2746         pChunkRow1, pChunkRow2, pChunkRow3,
2747         padfWeightsAligned, nSrcPixelCount,
2748         dfRes1, dfRes2, dfRes3 );
2749 }
2750 
2751 /************************************************************************/
2752 /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
2753 /************************************************************************/
2754 
2755 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeightsAligned,double & dfRes1,double & dfRes2,double & dfRes3)2756 GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2757     const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2758     const double* padfWeightsAligned,
2759     double& dfRes1, double& dfRes2, double& dfRes3)
2760 {
2761     const XMMReg4Double v_weight =
2762         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
2763 
2764     // Retrieve the pixel & accumulate.
2765     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
2766     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
2767     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
2768 
2769     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
2770     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
2771     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
2772 
2773     dfRes1 = v_acc1.GetHorizSum();
2774     dfRes2 = v_acc2.GetHorizSum();
2775     dfRes3 = v_acc3.GetHorizSum();
2776 }
2777 
2778 /************************************************************************/
2779 /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
2780 /************************************************************************/
2781 
2782 template<> inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows(const GByte * pChunkRow1,const GByte * pChunkRow2,const GByte * pChunkRow3,const double * padfWeightsAligned,double & dfRes1,double & dfRes2,double & dfRes3)2783 GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
2784     const GByte* pChunkRow1, const GByte* pChunkRow2, const GByte* pChunkRow3,
2785     const double* padfWeightsAligned,
2786     double& dfRes1, double& dfRes2, double& dfRes3 )
2787 {
2788     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2789         pChunkRow1, pChunkRow2, pChunkRow3,
2790         padfWeightsAligned,
2791         dfRes1, dfRes2, dfRes3 );
2792 }
2793 
2794 template<> inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows(const GUInt16 * pChunkRow1,const GUInt16 * pChunkRow2,const GUInt16 * pChunkRow3,const double * padfWeightsAligned,double & dfRes1,double & dfRes2,double & dfRes3)2795 GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
2796     const GUInt16* pChunkRow1, const GUInt16* pChunkRow2,
2797     const GUInt16* pChunkRow3,
2798     const double* padfWeightsAligned,
2799     double& dfRes1, double& dfRes2, double& dfRes3 )
2800 {
2801     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2802         pChunkRow1, pChunkRow2, pChunkRow3,
2803         padfWeightsAligned,
2804         dfRes1, dfRes2, dfRes3 );
2805 }
2806 
2807 #endif  // USE_SSE2
2808 
2809 /************************************************************************/
2810 /*                   GDALResampleChunk32R_Convolution()                 */
2811 /************************************************************************/
2812 
2813 template<class T> static CPLErr
GDALResampleChunk32R_ConvolutionT(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,const T * pChunk,int nBands,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poDstBand,void * pDstBuffer,int bHasNoData,float fNoDataValue,FilterFuncType pfnFilterFunc,FilterFunc4ValuesType pfnFilterFunc4Values,int nKernelRadius,bool bKernelWithNegativeWeights,float fMaxVal)2814 GDALResampleChunk32R_ConvolutionT( double dfXRatioDstToSrc,
2815                                    double dfYRatioDstToSrc,
2816                                    double dfSrcXDelta,
2817                                    double dfSrcYDelta,
2818                                    const T * pChunk, int nBands,
2819                                    const GByte * pabyChunkNodataMask,
2820                                    int nChunkXOff, int nChunkXSize,
2821                                    int nChunkYOff, int nChunkYSize,
2822                                    int nDstXOff, int nDstXOff2,
2823                                    int nDstYOff, int nDstYOff2,
2824                                    GDALRasterBand * poDstBand,
2825                                    void* pDstBuffer,
2826                                    int bHasNoData,
2827                                    float fNoDataValue,
2828                                    FilterFuncType pfnFilterFunc,
2829                                    FilterFunc4ValuesType pfnFilterFunc4Values,
2830                                    int nKernelRadius,
2831                                    bool bKernelWithNegativeWeights,
2832                                    float fMaxVal )
2833 
2834 {
2835     if( !bHasNoData )
2836         fNoDataValue = 0.0f;
2837     const auto dstDataType = poDstBand->GetRasterDataType();
2838     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
2839     const float fReplacementVal = GetReplacementValueIfNoData(
2840         dstDataType, bHasNoData, fNoDataValue);
2841     // cppcheck-suppress unreadVariable
2842     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
2843     const auto nNodataValueInt64 = static_cast<GInt64>(fNoDataValue);
2844 
2845     // TODO: we should have some generic function to do this.
2846     float fDstMin = -std::numeric_limits<float>::max();
2847     float fDstMax = std::numeric_limits<float>::max();
2848     if( dstDataType == GDT_Byte )
2849     {
2850         fDstMin = std::numeric_limits<GByte>::min();
2851         fDstMax = std::numeric_limits<GByte>::max();
2852     }
2853     else if( dstDataType == GDT_UInt16 )
2854     {
2855         fDstMin = std::numeric_limits<GUInt16>::min();
2856         fDstMax = std::numeric_limits<GUInt16>::max();
2857     }
2858     else if( dstDataType == GDT_Int16 )
2859     {
2860         fDstMin = std::numeric_limits<GInt16>::min();
2861         fDstMax = std::numeric_limits<GInt16>::max();
2862     }
2863     else if( dstDataType == GDT_UInt32 )
2864     {
2865         fDstMin = static_cast<float>(std::numeric_limits<GUInt32>::min());
2866         fDstMax = static_cast<float>(std::numeric_limits<GUInt32>::max());
2867     }
2868     else if( dstDataType == GDT_Int32 )
2869     {
2870         // cppcheck-suppress unreadVariable
2871         fDstMin = static_cast<float>(std::numeric_limits<GInt32>::min());
2872         // cppcheck-suppress unreadVariable
2873         fDstMax = static_cast<float>(std::numeric_limits<GInt32>::max());
2874     }
2875 
2876     auto replaceValIfNodata =
2877         [bHasNoData, isIntegerDT, fDstMin, fDstMax, nNodataValueInt64,
2878          fNoDataValue, fReplacementVal](float fVal)
2879     {
2880         if( !bHasNoData )
2881             return fVal;
2882 
2883         // Clamp value before comparing to nodata: this is only needed for
2884         // kernels with negative weights (Lanczos)
2885         float fClamped = fVal;
2886         if( fClamped < fDstMin )
2887             fClamped = fDstMin;
2888         else if( fClamped > fDstMax )
2889             fClamped = fDstMax;
2890         if( isIntegerDT )
2891         {
2892             if( nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)) )
2893             {
2894                 // Do not use the nodata value
2895                 return fReplacementVal;
2896             }
2897         }
2898         else if( fNoDataValue == fClamped )
2899         {
2900             // Do not use the nodata value
2901             return fReplacementVal;
2902         }
2903         return fClamped;
2904     };
2905 
2906 /* -------------------------------------------------------------------- */
2907 /*      Allocate work buffers.                                          */
2908 /* -------------------------------------------------------------------- */
2909     const int nDstXSize = nDstXOff2 - nDstXOff;
2910     float* pafWrkScanline = nullptr;
2911     if( dstDataType != GDT_Float32 )
2912     {
2913         pafWrkScanline = static_cast<float*>(
2914             VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(float)));
2915         if( pafWrkScanline == nullptr )
2916             return CE_Failure;
2917     }
2918 
2919     const double dfXScale = 1.0 / dfXRatioDstToSrc;
2920     const double dfXScaleWeight = ( dfXScale >= 1.0 ) ? 1.0 : dfXScale;
2921     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
2922     const double dfYScale = 1.0 / dfYRatioDstToSrc;
2923     const double dfYScaleWeight = ( dfYScale >= 1.0 ) ? 1.0 : dfYScale;
2924     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
2925 
2926     // Temporary array to store result of horizontal filter.
2927     double* padfHorizontalFiltered = static_cast<double*>(
2928         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands) );
2929 
2930     // To store convolution coefficients.
2931     double* padfWeights = static_cast<double *>(
2932         VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
2933             static_cast<int>(
2934                 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
2935                 0.5) * sizeof(double) ) );
2936 
2937     GByte* pabyChunkNodataMaskHorizontalFiltered = nullptr;
2938     if( pabyChunkNodataMask )
2939         pabyChunkNodataMaskHorizontalFiltered = static_cast<GByte*>(
2940             VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize) );
2941     if( padfHorizontalFiltered == nullptr ||
2942         padfWeights == nullptr ||
2943         (pabyChunkNodataMask != nullptr &&
2944          pabyChunkNodataMaskHorizontalFiltered == nullptr) )
2945     {
2946         VSIFree(pafWrkScanline);
2947         VSIFree(padfHorizontalFiltered);
2948         VSIFreeAligned(padfWeights);
2949         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
2950         return CE_Failure;
2951     }
2952 
2953 /* ==================================================================== */
2954 /*      First pass: horizontal filter                                   */
2955 /* ==================================================================== */
2956     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2957 #ifdef USE_SSE2
2958     bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
2959 #endif
2960     for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
2961     {
2962         const double dfSrcPixel =
2963             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
2964         int nSrcPixelStart =
2965             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
2966         if( nSrcPixelStart < nChunkXOff )
2967             nSrcPixelStart = nChunkXOff;
2968         int nSrcPixelStop =
2969             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
2970         if( nSrcPixelStop > nChunkRightXOff )
2971             nSrcPixelStop = nChunkRightXOff;
2972 #if 0
2973         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
2974         {
2975             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
2976         }
2977         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
2978         {
2979             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
2980         }
2981 #endif
2982         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
2983         double dfWeightSum = 0.0;
2984 
2985         // Compute convolution coefficients.
2986         int nSrcPixel = nSrcPixelStart;
2987         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
2988         for( ; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel+=4)
2989         {
2990             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
2991             dfX += dfXScaleWeight;
2992             padfWeights[nSrcPixel+1 - nSrcPixelStart] = dfX;
2993             dfX += dfXScaleWeight;
2994             padfWeights[nSrcPixel+2 - nSrcPixelStart] = dfX;
2995             dfX += dfXScaleWeight;
2996             padfWeights[nSrcPixel+3 - nSrcPixelStart] = dfX;
2997             dfX += dfXScaleWeight;
2998             dfWeightSum +=
2999                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3000         }
3001         for( ; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3002         {
3003             const double dfWeight = pfnFilterFunc(dfX);
3004             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3005             dfWeightSum += dfWeight;
3006         }
3007 
3008         const int nHeight = nChunkYSize * nBands;
3009         if( pabyChunkNodataMask == nullptr )
3010         {
3011             if( dfWeightSum != 0 )
3012             {
3013                 const double dfInvWeightSum = 1.0 / dfWeightSum;
3014                 for( int i = 0; i < nSrcPixelCount; ++i )
3015                     padfWeights[i] *= dfInvWeightSum;
3016             }
3017             int iSrcLineOff = 0;
3018 #ifdef USE_SSE2
3019             if( nSrcPixelCount == 4 )
3020             {
3021                 for( ; iSrcLineOff+2 < nHeight; iSrcLineOff +=3 )
3022                 {
3023                     const GPtrDiff_t j =
3024                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3025                         (nSrcPixelStart - nChunkXOff);
3026                     double dfVal1 = 0.0;
3027                     double dfVal2 = 0.0;
3028                     double dfVal3 = 0.0;
3029                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
3030                         pChunk + j, pChunk + j + nChunkXSize,
3031                         pChunk + j + 2 * nChunkXSize,
3032                         padfWeights, dfVal1, dfVal2, dfVal3);
3033                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3034                                            iDstPixel - nDstXOff] = dfVal1;
3035                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 1) * nDstXSize +
3036                                            iDstPixel - nDstXOff] = dfVal2;
3037                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 2) * nDstXSize +
3038                                            iDstPixel - nDstXOff] = dfVal3;
3039                 }
3040             }
3041             else if( bSrcPixelCountLess8 )
3042             {
3043                 for( ; iSrcLineOff+2 < nHeight; iSrcLineOff +=3 )
3044                 {
3045                     const GPtrDiff_t j =
3046                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3047                         (nSrcPixelStart - nChunkXOff);
3048                     double dfVal1 = 0.0;
3049                     double dfVal2 = 0.0;
3050                     double dfVal3 = 0.0;
3051                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3052                         pChunk + j, pChunk + j + nChunkXSize,
3053                         pChunk + j + 2 * nChunkXSize,
3054                         padfWeights, nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3055                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3056                                            iDstPixel - nDstXOff] = dfVal1;
3057                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 1) * nDstXSize +
3058                                            iDstPixel - nDstXOff] = dfVal2;
3059                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 2) * nDstXSize +
3060                                            iDstPixel - nDstXOff] = dfVal3;
3061                 }
3062             }
3063             else
3064 #endif
3065             {
3066                 for( ; iSrcLineOff+2 < nHeight; iSrcLineOff +=3 )
3067                 {
3068                     const GPtrDiff_t j =
3069                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3070                         (nSrcPixelStart - nChunkXOff);
3071                     double dfVal1 = 0.0;
3072                     double dfVal2 = 0.0;
3073                     double dfVal3 = 0.0;
3074                     GDALResampleConvolutionHorizontal_3rows(
3075                         pChunk + j,
3076                         pChunk + j + nChunkXSize,
3077                         pChunk + j + 2 * nChunkXSize,
3078                         padfWeights, nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3079                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3080                                            iDstPixel - nDstXOff] = dfVal1;
3081                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 1) * nDstXSize +
3082                                            iDstPixel - nDstXOff] = dfVal2;
3083                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 2) * nDstXSize +
3084                                            iDstPixel - nDstXOff] = dfVal3;
3085                 }
3086             }
3087             for( ; iSrcLineOff < nHeight; ++iSrcLineOff )
3088             {
3089                 const GPtrDiff_t j =
3090                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize + (nSrcPixelStart - nChunkXOff);
3091                 const double dfVal =
3092                     GDALResampleConvolutionHorizontal(pChunk + j,
3093                                                 padfWeights, nSrcPixelCount);
3094                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3095                                        iDstPixel - nDstXOff] = dfVal;
3096             }
3097         }
3098         else
3099         {
3100             for( int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff )
3101             {
3102                 const GPtrDiff_t j =
3103                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize + (nSrcPixelStart - nChunkXOff);
3104 
3105                 if( bKernelWithNegativeWeights )
3106                 {
3107                     int nConsecutiveValid = 0;
3108                     int nMaxConsecutiveValid = 0;
3109                     for( int k = 0; k < nSrcPixelCount; k++ )
3110                     {
3111                         if( pabyChunkNodataMask[j + k] )
3112                             nConsecutiveValid ++;
3113                         else if( nConsecutiveValid )
3114                         {
3115                             nMaxConsecutiveValid = std::max(nMaxConsecutiveValid,
3116                                                             nConsecutiveValid);
3117                             nConsecutiveValid = 0;
3118                         }
3119                     }
3120                     nMaxConsecutiveValid = std::max(nMaxConsecutiveValid,
3121                                                     nConsecutiveValid);
3122                     if( nMaxConsecutiveValid < nSrcPixelCount / 2 )
3123                     {
3124                         const size_t nTempOffset =
3125                             static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel - nDstXOff;
3126                         padfHorizontalFiltered[nTempOffset] = 0.0;
3127                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3128                         continue;
3129                     }
3130                 }
3131 
3132                 double dfVal = 0.0;
3133                 GDALResampleConvolutionHorizontalWithMask(
3134                     pChunk + j, pabyChunkNodataMask + j,
3135                     padfWeights, nSrcPixelCount,
3136                     dfVal, dfWeightSum );
3137                 const size_t nTempOffset =
3138                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel - nDstXOff;
3139                 if( dfWeightSum > 0.0 )
3140                 {
3141                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3142                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3143                 }
3144                 else
3145                 {
3146                     padfHorizontalFiltered[nTempOffset] = 0.0;
3147                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3148                 }
3149             }
3150         }
3151     }
3152 
3153 /* ==================================================================== */
3154 /*      Second pass: vertical filter                                    */
3155 /* ==================================================================== */
3156     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3157 
3158     for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
3159     {
3160         float* const pafDstScanline = pafWrkScanline ? pafWrkScanline:
3161             static_cast<float*>(pDstBuffer) + (iDstLine - nDstYOff)  * nDstXSize;
3162 
3163         const double dfSrcLine =
3164             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3165         int nSrcLineStart =
3166             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3167         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3168         if( nSrcLineStart < nChunkYOff )
3169             nSrcLineStart = nChunkYOff;
3170         if( nSrcLineStop > nChunkBottomYOff )
3171             nSrcLineStop = nChunkBottomYOff;
3172 #if 0
3173         if( nSrcLineStart < nChunkYOff &&
3174             nChunkYOff > 0 )
3175         {
3176             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3177         }
3178         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3179         {
3180             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3181         }
3182 #endif
3183         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3184         double dfWeightSum = 0.0;
3185 
3186         // Compute convolution coefficients.
3187         int nSrcLine = nSrcLineStart;  // Used after for.
3188         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3189         for( ;
3190              nSrcLine + 3 < nSrcLineStop;
3191              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3192         {
3193             padfWeights[nSrcLine - nSrcLineStart] = dfY;
3194             padfWeights[nSrcLine+1 - nSrcLineStart] = dfY + dfYScaleWeight;
3195             padfWeights[nSrcLine+2 - nSrcLineStart] = dfY + 2 * dfYScaleWeight;
3196             padfWeights[nSrcLine+3 - nSrcLineStart] = dfY + 3 * dfYScaleWeight;
3197             dfWeightSum +=
3198                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3199         }
3200         for( ; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight )
3201         {
3202             const double dfWeight = pfnFilterFunc(dfY);
3203             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3204             dfWeightSum += dfWeight;
3205         }
3206 
3207         if( pabyChunkNodataMask == nullptr )
3208         {
3209             if( dfWeightSum != 0 )
3210             {
3211                 const double dfInvWeightSum = 1.0 / dfWeightSum;
3212                 for( int i = 0; i < nSrcLineCount; ++i )
3213                     padfWeights[i] *= dfInvWeightSum;
3214             }
3215         }
3216 
3217         if( pabyChunkNodataMask == nullptr )
3218         {
3219             int iFilteredPixelOff = 0;  // Used after for.
3220             // j used after for.
3221             size_t j = (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3222 #ifdef USE_SSE2
3223 
3224 #ifdef __AVX__
3225             for( ;
3226                  iFilteredPixelOff+15 < nDstXSize;
3227                  iFilteredPixelOff += 16, j += 16 )
3228             {
3229                 GDALResampleConvolutionVertical_16cols(
3230                     padfHorizontalFiltered + j, nDstXSize, padfWeights,
3231                     nSrcLineCount, pafDstScanline + iFilteredPixelOff );
3232                 if( bHasNoData )
3233                 {
3234                     for( int k = 0; k < 16; k++ )
3235                     {
3236                         pafDstScanline[iFilteredPixelOff + k] =
3237                             replaceValIfNodata(pafDstScanline[iFilteredPixelOff + k]);
3238                     }
3239                 }
3240             }
3241 #else
3242             for( ;
3243                  iFilteredPixelOff+7 < nDstXSize;
3244                  iFilteredPixelOff += 8, j += 8 )
3245             {
3246                 GDALResampleConvolutionVertical_8cols(
3247                     padfHorizontalFiltered + j, nDstXSize, padfWeights,
3248                     nSrcLineCount, pafDstScanline + iFilteredPixelOff );
3249                 if( bHasNoData )
3250                 {
3251                     for( int k = 0; k < 8; k++ )
3252                     {
3253                         pafDstScanline[iFilteredPixelOff + k] =
3254                             replaceValIfNodata(pafDstScanline[iFilteredPixelOff + k]);
3255                     }
3256                 }
3257             }
3258 #endif
3259 
3260             for( ; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++ )
3261             {
3262                 const float fVal = static_cast<float>(
3263                     GDALResampleConvolutionVertical(
3264                         padfHorizontalFiltered + j,
3265                         nDstXSize, padfWeights, nSrcLineCount ));
3266                 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(fVal);
3267             }
3268 #else
3269             for( ;
3270                  iFilteredPixelOff+1 < nDstXSize;
3271                  iFilteredPixelOff += 2, j += 2 )
3272             {
3273                 double dfVal1 = 0.0;
3274                 double dfVal2 = 0.0;
3275                 GDALResampleConvolutionVertical_2cols(
3276                     padfHorizontalFiltered + j, nDstXSize, padfWeights,
3277                     nSrcLineCount, dfVal1, dfVal2 );
3278                 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3279                     static_cast<float>(dfVal1));
3280                 pafDstScanline[iFilteredPixelOff+1] = replaceValIfNodata(
3281                     static_cast<float>(dfVal2));
3282             }
3283             if( iFilteredPixelOff < nDstXSize )
3284             {
3285                 const double dfVal =
3286                     GDALResampleConvolutionVertical(
3287                         padfHorizontalFiltered + j,
3288                         nDstXSize, padfWeights, nSrcLineCount );
3289                 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3290                     static_cast<float>(dfVal));
3291             }
3292 #endif
3293         }
3294         else
3295         {
3296             for( int iFilteredPixelOff = 0;
3297                  iFilteredPixelOff < nDstXSize;
3298                  ++iFilteredPixelOff )
3299             {
3300                 double dfVal = 0.0;
3301                 dfWeightSum = 0.0;
3302                 size_t j = (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize)
3303                              + iFilteredPixelOff;
3304                 if( bKernelWithNegativeWeights )
3305                 {
3306                     int nConsecutiveValid = 0;
3307                     int nMaxConsecutiveValid = 0;
3308                     for(int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3309                     {
3310                         const double dfWeight =
3311                             padfWeights[i]
3312                             * pabyChunkNodataMaskHorizontalFiltered[j];
3313                         if( pabyChunkNodataMaskHorizontalFiltered[j] )
3314                         {
3315                             nConsecutiveValid ++;
3316                         }
3317                         else if( nConsecutiveValid )
3318                         {
3319                             nMaxConsecutiveValid = std::max(
3320                                 nMaxConsecutiveValid, nConsecutiveValid);
3321                             nConsecutiveValid = 0;
3322                         }
3323                         dfVal += padfHorizontalFiltered[j] * dfWeight;
3324                         dfWeightSum += dfWeight;
3325                     }
3326                     nMaxConsecutiveValid = std::max(nMaxConsecutiveValid,
3327                                                     nConsecutiveValid);
3328                     if( nMaxConsecutiveValid < nSrcLineCount / 2 )
3329                     {
3330                         pafDstScanline[iFilteredPixelOff] = fNoDataValue;
3331                         continue;
3332                     }
3333                 }
3334                 else
3335                 {
3336                     for(int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3337                     {
3338                         const double dfWeight =
3339                             padfWeights[i]
3340                             * pabyChunkNodataMaskHorizontalFiltered[j];
3341                         dfVal += padfHorizontalFiltered[j] * dfWeight;
3342                         dfWeightSum += dfWeight;
3343                     }
3344                 }
3345                 if( dfWeightSum > 0.0 )
3346                 {
3347                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3348                         static_cast<float>(dfVal / dfWeightSum));
3349                 }
3350                 else
3351                 {
3352                     pafDstScanline[iFilteredPixelOff] = fNoDataValue;
3353                 }
3354             }
3355         }
3356 
3357         if( fMaxVal != 0.0f )
3358         {
3359             for( int i = 0; i < nDstXSize; ++i )
3360             {
3361                 if( pafDstScanline[i] > fMaxVal )
3362                     pafDstScanline[i] = fMaxVal;
3363             }
3364         }
3365 
3366         if( pafWrkScanline )
3367         {
3368             GDALCopyWords(pafWrkScanline, GDT_Float32, 4,
3369                           static_cast<GByte*>(pDstBuffer) +
3370                             static_cast<size_t>(iDstLine - nDstYOff) *
3371                                 nDstXSize * nDstDataTypeSize,
3372                           dstDataType, nDstDataTypeSize,
3373                           nDstXSize);
3374         }
3375     }
3376 
3377     VSIFree(pafWrkScanline);
3378     VSIFreeAligned( padfWeights );
3379     VSIFree( padfHorizontalFiltered );
3380     VSIFree( pabyChunkNodataMaskHorizontalFiltered );
3381 
3382     return CE_None;
3383 }
3384 
GDALResampleChunk32R_Convolution(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,GDALDataType eWrkDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char * pszResampling,int bHasNoData,float fNoDataValue,GDALColorTable *,GDALDataType,bool)3385 static CPLErr GDALResampleChunk32R_Convolution(
3386     double dfXRatioDstToSrc, double dfYRatioDstToSrc,
3387     double dfSrcXDelta,
3388     double dfSrcYDelta,
3389     GDALDataType eWrkDataType,
3390     const void * pChunk,
3391     const GByte * pabyChunkNodataMask,
3392     int nChunkXOff, int nChunkXSize,
3393     int nChunkYOff, int nChunkYSize,
3394     int nDstXOff, int nDstXOff2,
3395     int nDstYOff, int nDstYOff2,
3396     GDALRasterBand * poOverview,
3397     void** ppDstBuffer,
3398     GDALDataType* peDstBufferDataType,
3399     const char * pszResampling,
3400     int bHasNoData, float fNoDataValue,
3401     GDALColorTable* /* poColorTable_unused */,
3402     GDALDataType /* eSrcDataType */,
3403     bool /* bPropagateNoData */ )
3404 {
3405     GDALResampleAlg eResample;
3406     bool bKernelWithNegativeWeights = false;
3407     if( EQUAL(pszResampling, "BILINEAR") )
3408         eResample = GRA_Bilinear;
3409     else if( EQUAL(pszResampling, "CUBIC") )
3410         eResample = GRA_Cubic;
3411     else if( EQUAL(pszResampling, "CUBICSPLINE") )
3412         eResample = GRA_CubicSpline;
3413     else if( EQUAL(pszResampling, "LANCZOS") )
3414     {
3415         eResample = GRA_Lanczos;
3416         bKernelWithNegativeWeights = true;
3417     }
3418     else
3419     {
3420         CPLAssert(false);
3421         return CE_Failure;
3422     }
3423     const int nKernelRadius = GWKGetFilterRadius(eResample);
3424     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3425     const FilterFunc4ValuesType pfnFilterFunc4Values =
3426         GWKGetFilterFunc4Values(eResample);
3427 
3428     float fMaxVal = 0.f;
3429     // Cubic, etc... can have overshoots, so make sure we clamp values to the
3430     // maximum value if NBITS is set.
3431     const char* pszNBITS =
3432         poOverview->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
3433     GDALDataType eBandDT = poOverview->GetRasterDataType();
3434     if( eResample != GRA_Bilinear && pszNBITS != nullptr &&
3435         (eBandDT == GDT_Byte || eBandDT == GDT_UInt16 ||
3436          eBandDT == GDT_UInt32) )
3437     {
3438         int nBits = atoi(pszNBITS);
3439         if( nBits == GDALGetDataTypeSize(eBandDT) )
3440             nBits = 0;
3441         if( nBits > 0 && nBits < 32 )
3442             fMaxVal = static_cast<float>((1U << nBits) -1);
3443     }
3444 
3445     *ppDstBuffer =
3446         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff,
3447                             nDstYOff2 - nDstYOff,
3448                             GDALGetDataTypeSizeBytes(eBandDT));
3449     if( *ppDstBuffer == nullptr )
3450     {
3451         return CE_Failure;
3452     }
3453     *peDstBufferDataType = eBandDT;
3454 
3455     if( eWrkDataType == GDT_Byte )
3456         return GDALResampleChunk32R_ConvolutionT<GByte>(
3457             dfXRatioDstToSrc, dfYRatioDstToSrc,
3458             dfSrcXDelta, dfSrcYDelta,
3459             static_cast<const GByte *>( pChunk ),
3460             1,
3461             pabyChunkNodataMask,
3462             nChunkXOff, nChunkXSize,
3463             nChunkYOff, nChunkYSize,
3464             nDstXOff, nDstXOff2,
3465             nDstYOff, nDstYOff2,
3466             poOverview,
3467             *ppDstBuffer,
3468             bHasNoData, fNoDataValue,
3469             pfnFilterFunc,
3470             pfnFilterFunc4Values,
3471             nKernelRadius,
3472             bKernelWithNegativeWeights,
3473             fMaxVal );
3474     else if( eWrkDataType == GDT_UInt16 )
3475         return GDALResampleChunk32R_ConvolutionT<GUInt16>(
3476             dfXRatioDstToSrc, dfYRatioDstToSrc,
3477             dfSrcXDelta, dfSrcYDelta,
3478             static_cast<const GUInt16 *>( pChunk ),
3479             1,
3480             pabyChunkNodataMask,
3481             nChunkXOff, nChunkXSize,
3482             nChunkYOff, nChunkYSize,
3483             nDstXOff, nDstXOff2,
3484             nDstYOff, nDstYOff2,
3485             poOverview,
3486             *ppDstBuffer,
3487             bHasNoData, fNoDataValue,
3488             pfnFilterFunc,
3489             pfnFilterFunc4Values,
3490             nKernelRadius,
3491             bKernelWithNegativeWeights,
3492             fMaxVal );
3493     else if( eWrkDataType == GDT_Float32 )
3494         return GDALResampleChunk32R_ConvolutionT<float>(
3495             dfXRatioDstToSrc, dfYRatioDstToSrc,
3496             dfSrcXDelta, dfSrcYDelta,
3497             static_cast<const float *>( pChunk ),
3498             1,
3499             pabyChunkNodataMask,
3500             nChunkXOff, nChunkXSize,
3501             nChunkYOff, nChunkYSize,
3502             nDstXOff, nDstXOff2,
3503             nDstYOff, nDstYOff2,
3504             poOverview,
3505             *ppDstBuffer,
3506             bHasNoData, fNoDataValue,
3507             pfnFilterFunc,
3508             pfnFilterFunc4Values,
3509             nKernelRadius,
3510             bKernelWithNegativeWeights,
3511             fMaxVal );
3512 
3513     CPLAssert(false);
3514     return CE_Failure;
3515 }
3516 
3517 /************************************************************************/
3518 /*                       GDALResampleChunkC32R()                        */
3519 /************************************************************************/
3520 
3521 static CPLErr
GDALResampleChunkC32R(int nSrcWidth,int nSrcHeight,const float * pafChunk,int nChunkYOff,int nChunkYSize,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char * pszResampling)3522 GDALResampleChunkC32R( int nSrcWidth, int nSrcHeight,
3523                        const float * pafChunk, int nChunkYOff, int nChunkYSize,
3524                        int nDstYOff, int nDstYOff2,
3525                        GDALRasterBand * poOverview,
3526                        void** ppDstBuffer,
3527                        GDALDataType* peDstBufferDataType,
3528                        const char * pszResampling )
3529 
3530 {
3531     enum Method
3532     {
3533         NEAR,
3534         AVERAGE,
3535         AVERAGE_MAGPHASE,
3536         RMS,
3537     };
3538 
3539     Method eMethod = NEAR;
3540     if( STARTS_WITH_CI(pszResampling, "NEAR") )
3541     {
3542         eMethod = NEAR;
3543     }
3544     else if( EQUAL(pszResampling, "AVERAGE_MAGPHASE") )
3545     {
3546         eMethod = AVERAGE_MAGPHASE;
3547     }
3548     else if( EQUAL(pszResampling, "RMS") )
3549     {
3550         eMethod = RMS;
3551     }
3552     else if( STARTS_WITH_CI(pszResampling, "AVER") )
3553     {
3554         eMethod = AVERAGE;
3555     }
3556     else
3557     {
3558         CPLError(CE_Failure, CPLE_NotSupported,
3559                  "Unsupported resampling method %s for GDALResampleChunkC32R",
3560                  pszResampling);
3561         return CE_Failure;
3562     }
3563 
3564     const int nOXSize = poOverview->GetXSize();
3565     *ppDstBuffer =
3566         VSI_MALLOC3_VERBOSE(nOXSize,
3567                             nDstYOff2 - nDstYOff,
3568                             GDALGetDataTypeSizeBytes(GDT_CFloat32));
3569     if( *ppDstBuffer == nullptr )
3570     {
3571         return CE_Failure;
3572     }
3573     float* const pafDstBuffer = static_cast<float*>(*ppDstBuffer);
3574     *peDstBufferDataType = GDT_CFloat32;
3575 
3576     const int nOYSize = poOverview->GetYSize();
3577     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3578     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3579 
3580 /* ==================================================================== */
3581 /*      Loop over destination scanlines.                                */
3582 /* ==================================================================== */
3583     for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
3584     {
3585         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3586         if( nSrcYOff < nChunkYOff )
3587             nSrcYOff = nChunkYOff;
3588 
3589         int nSrcYOff2 = static_cast<int>(0.5 + (iDstLine+1) * dfYRatioDstToSrc);
3590         if( nSrcYOff2 == nSrcYOff )
3591             nSrcYOff2 ++;
3592 
3593         if( nSrcYOff2 > nSrcHeight || iDstLine == nOYSize-1 )
3594         {
3595             if( nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff )
3596                 nSrcYOff = nSrcHeight - 1;
3597             nSrcYOff2 = nSrcHeight;
3598         }
3599         if( nSrcYOff2 > nChunkYOff + nChunkYSize )
3600             nSrcYOff2 = nChunkYOff + nChunkYSize;
3601 
3602         const float * const pafSrcScanline =
3603             pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3604         float* const pafDstScanline =
3605             pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3606 
3607 /* -------------------------------------------------------------------- */
3608 /*      Loop over destination pixels                                    */
3609 /* -------------------------------------------------------------------- */
3610         for( int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel )
3611         {
3612             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3613             int nSrcXOff2 = static_cast<int>(
3614                 0.5 + (iDstPixel+1) * dfXRatioDstToSrc);
3615             if( nSrcXOff2 == nSrcXOff )
3616                 nSrcXOff2 ++;
3617             if( nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize-1 )
3618             {
3619                 if( nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0 )
3620                     nSrcXOff = nSrcWidth - 1;
3621                 nSrcXOff2 = nSrcWidth;
3622             }
3623 
3624             if( eMethod == NEAR )
3625             {
3626                 pafDstScanline[iDstPixel*2] = pafSrcScanline[nSrcXOff*2];
3627                 pafDstScanline[iDstPixel*2+1] = pafSrcScanline[nSrcXOff*2+1];
3628             }
3629             else if( eMethod == AVERAGE_MAGPHASE )
3630             {
3631                 double dfTotalR = 0.0;
3632                 double dfTotalI = 0.0;
3633                 double dfTotalM = 0.0;
3634                 int nCount = 0;
3635 
3636                 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
3637                 {
3638                     for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
3639                     {
3640                         const double dfR =
3641                             pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2];
3642                         const double dfI =
3643                             pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2+1];
3644                         dfTotalR += dfR;
3645                         dfTotalI += dfI;
3646                         dfTotalM += std::hypot(dfR, dfI);
3647                         ++nCount;
3648                     }
3649                 }
3650 
3651                 CPLAssert( nCount > 0 );
3652                 if( nCount == 0 )
3653                 {
3654                     pafDstScanline[iDstPixel*2] = 0.0;
3655                     pafDstScanline[iDstPixel*2+1] = 0.0;
3656                 }
3657                 else
3658                 {
3659                     pafDstScanline[iDstPixel*2  ] =
3660                         static_cast<float>(dfTotalR/nCount);
3661                     pafDstScanline[iDstPixel*2+1] =
3662                         static_cast<float>(dfTotalI/nCount);
3663                     const double dfM = std::hypot(pafDstScanline[iDstPixel*2],
3664                                                   pafDstScanline[iDstPixel*2+1]);
3665                     const double dfDesiredM = dfTotalM / nCount;
3666                     double dfRatio = 1.0;
3667                     if( dfM != 0.0 )
3668                         dfRatio = dfDesiredM / dfM;
3669 
3670                     pafDstScanline[iDstPixel*2] *=
3671                         static_cast<float>(dfRatio);
3672                     pafDstScanline[iDstPixel*2+1] *=
3673                         static_cast<float>(dfRatio);
3674                 }
3675             }
3676             else if( eMethod == RMS )
3677             {
3678                 double dfTotalR = 0.0;
3679                 double dfTotalI = 0.0;
3680                 int nCount = 0;
3681 
3682                 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
3683                 {
3684                     for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
3685                     {
3686                         const double dfR = pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2];
3687                         const double dfI = pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2+1];
3688 
3689                         dfTotalR += SQUARE(dfR);
3690                         dfTotalI += SQUARE(dfI);
3691 
3692                         ++nCount;
3693                     }
3694                 }
3695 
3696                 CPLAssert( nCount > 0 );
3697                 if( nCount == 0 )
3698                 {
3699                     pafDstScanline[iDstPixel*2] = 0.0;
3700                     pafDstScanline[iDstPixel*2+1] = 0.0;
3701                 }
3702                 else
3703                 {
3704                     /* compute RMS */
3705                     pafDstScanline[iDstPixel*2  ] = static_cast<float>(sqrt(dfTotalR/nCount));
3706                     pafDstScanline[iDstPixel*2+1] = static_cast<float>(sqrt(dfTotalI/nCount));
3707                 }
3708             }
3709             else if( eMethod == AVERAGE )
3710             {
3711                 double dfTotalR = 0.0;
3712                 double dfTotalI = 0.0;
3713                 int nCount = 0;
3714 
3715                 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
3716                 {
3717                     for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
3718                     {
3719                         // TODO(schwehr): Maybe use std::complex?
3720                         dfTotalR +=
3721                             pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2];
3722                         dfTotalI +=
3723                             pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2+1];
3724                         ++nCount;
3725                     }
3726                 }
3727 
3728                 CPLAssert( nCount > 0 );
3729                 if( nCount == 0 )
3730                 {
3731                     pafDstScanline[iDstPixel*2] = 0.0;
3732                     pafDstScanline[iDstPixel*2+1] = 0.0;
3733                 }
3734                 else
3735                 {
3736                     pafDstScanline[iDstPixel*2  ] =
3737                         static_cast<float>(dfTotalR/nCount);
3738                     pafDstScanline[iDstPixel*2+1] =
3739                         static_cast<float>(dfTotalI/nCount);
3740                 }
3741             }
3742         }
3743     }
3744 
3745     return CE_None;
3746 }
3747 
3748 /************************************************************************/
3749 /*                  GDALRegenerateCascadingOverviews()                  */
3750 /*                                                                      */
3751 /*      Generate a list of overviews in order from largest to           */
3752 /*      smallest, computing each from the next larger.                  */
3753 /************************************************************************/
3754 
3755 static CPLErr
GDALRegenerateCascadingOverviews(GDALRasterBand * poSrcBand,int nOverviews,GDALRasterBand ** papoOvrBands,const char * pszResampling,GDALProgressFunc pfnProgress,void * pProgressData)3756 GDALRegenerateCascadingOverviews(
3757     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
3758     const char * pszResampling,
3759     GDALProgressFunc pfnProgress, void * pProgressData )
3760 
3761 {
3762 /* -------------------------------------------------------------------- */
3763 /*      First, we must put the overviews in order from largest to       */
3764 /*      smallest.                                                       */
3765 /* -------------------------------------------------------------------- */
3766     for( int i = 0; i < nOverviews-1; ++i )
3767     {
3768         for( int j = 0; j < nOverviews - i - 1; ++j )
3769         {
3770             if( papoOvrBands[j]->GetXSize()
3771                 * static_cast<float>( papoOvrBands[j]->GetYSize() ) <
3772                 papoOvrBands[j+1]->GetXSize()
3773                 * static_cast<float>( papoOvrBands[j+1]->GetYSize() ) )
3774             {
3775                 GDALRasterBand *poTempBand = papoOvrBands[j];
3776                 papoOvrBands[j] = papoOvrBands[j+1];
3777                 papoOvrBands[j+1] = poTempBand;
3778             }
3779         }
3780     }
3781 
3782 /* -------------------------------------------------------------------- */
3783 /*      Count total pixels so we can prepare appropriate scaled         */
3784 /*      progress functions.                                             */
3785 /* -------------------------------------------------------------------- */
3786     double dfTotalPixels = 0.0;
3787 
3788     for( int i = 0; i < nOverviews; ++i )
3789     {
3790         dfTotalPixels += papoOvrBands[i]->GetXSize()
3791             * static_cast<double>( papoOvrBands[i]->GetYSize() );
3792     }
3793 
3794 /* -------------------------------------------------------------------- */
3795 /*      Generate all the bands.                                         */
3796 /* -------------------------------------------------------------------- */
3797     double dfPixelsProcessed = 0.0;
3798 
3799     for( int i = 0; i < nOverviews; ++i )
3800     {
3801         GDALRasterBand *poBaseBand = poSrcBand;
3802         if( i != 0 )
3803             poBaseBand = papoOvrBands[i-1];
3804 
3805         double dfPixels =
3806             papoOvrBands[i]->GetXSize()
3807             * static_cast<double>( papoOvrBands[i]->GetYSize() );
3808 
3809         void *pScaledProgressData =
3810             GDALCreateScaledProgress(
3811             dfPixelsProcessed / dfTotalPixels,
3812             (dfPixelsProcessed + dfPixels) / dfTotalPixels,
3813             pfnProgress, pProgressData );
3814 
3815         const CPLErr eErr =
3816             GDALRegenerateOverviews(
3817                 poBaseBand,
3818                 1,
3819                 reinterpret_cast<GDALRasterBandH *>( papoOvrBands ) + i,
3820                 pszResampling,
3821                 GDALScaledProgress,
3822                 pScaledProgressData );
3823         GDALDestroyScaledProgress( pScaledProgressData );
3824 
3825         if( eErr != CE_None )
3826             return eErr;
3827 
3828         dfPixelsProcessed += dfPixels;
3829 
3830         // Only do the bit2grayscale promotion on the base band.
3831         if( STARTS_WITH_CI( pszResampling,
3832                             "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */) )
3833             pszResampling = "AVERAGE";
3834     }
3835 
3836     return CE_None;
3837 }
3838 
3839 /************************************************************************/
3840 /*                    GDALGetResampleFunction()                         */
3841 /************************************************************************/
3842 
GDALGetResampleFunction(const char * pszResampling,int * pnRadius)3843 GDALResampleFunction GDALGetResampleFunction( const char* pszResampling,
3844                                               int* pnRadius )
3845 {
3846     if( pnRadius ) *pnRadius = 0;
3847     if( STARTS_WITH_CI(pszResampling, "NEAR") )
3848         return GDALResampleChunk32R_Near;
3849     else if( STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") )
3850         return GDALResampleChunk32R_Average;
3851     else if( STARTS_WITH_CI(pszResampling, "GAUSS") )
3852     {
3853         if( pnRadius ) *pnRadius = 1;
3854         return GDALResampleChunk32R_Gauss;
3855     }
3856     else if( STARTS_WITH_CI(pszResampling, "MODE") )
3857         return GDALResampleChunk32R_Mode;
3858     else if( EQUAL(pszResampling,"CUBIC") )
3859     {
3860         if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_Cubic);
3861         return GDALResampleChunk32R_Convolution;
3862     }
3863     else if( EQUAL(pszResampling,"CUBICSPLINE") )
3864     {
3865         if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
3866         return GDALResampleChunk32R_Convolution;
3867     }
3868     else if( EQUAL(pszResampling,"LANCZOS") )
3869     {
3870         if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
3871         return GDALResampleChunk32R_Convolution;
3872     }
3873     else if( EQUAL(pszResampling,"BILINEAR") )
3874     {
3875         if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
3876         return GDALResampleChunk32R_Convolution;
3877     }
3878     else
3879     {
3880        CPLError(
3881            CE_Failure, CPLE_AppDefined,
3882            "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
3883            pszResampling );
3884         return nullptr;
3885     }
3886 }
3887 
3888 /************************************************************************/
3889 /*                      GDALGetOvrWorkDataType()                        */
3890 /************************************************************************/
3891 
GDALGetOvrWorkDataType(const char * pszResampling,GDALDataType eSrcDataType)3892 GDALDataType GDALGetOvrWorkDataType( const char* pszResampling,
3893                                      GDALDataType eSrcDataType )
3894 {
3895     if( (STARTS_WITH_CI(pszResampling, "NEAR") ||
3896          STARTS_WITH_CI(pszResampling, "AVER") ||
3897          EQUAL(pszResampling, "RMS") ||
3898          EQUAL(pszResampling, "CUBIC") ||
3899          EQUAL(pszResampling, "CUBICSPLINE") ||
3900          EQUAL(pszResampling, "LANCZOS") ||
3901          EQUAL(pszResampling, "BILINEAR")) &&
3902         eSrcDataType == GDT_Byte )
3903     {
3904         return GDT_Byte;
3905     }
3906     else if( (STARTS_WITH_CI(pszResampling, "NEAR") ||
3907          STARTS_WITH_CI(pszResampling, "AVER") ||
3908          EQUAL(pszResampling, "RMS") ||
3909          EQUAL(pszResampling, "CUBIC") ||
3910          EQUAL(pszResampling, "CUBICSPLINE") ||
3911          EQUAL(pszResampling, "LANCZOS") ||
3912          EQUAL(pszResampling, "BILINEAR")) &&
3913         eSrcDataType == GDT_UInt16 )
3914     {
3915         return GDT_UInt16;
3916     }
3917 
3918     return GDT_Float32;
3919 }
3920 
3921 
3922 namespace {
3923 // Structure to hold a pointer to free with CPLFree()
3924 struct PointerHolder
3925 {
3926     void* ptr = nullptr;
3927 
PointerHolder__anon9571b05b0211::PointerHolder3928     explicit PointerHolder(void* ptrIn): ptr(ptrIn) {}
~PointerHolder__anon9571b05b0211::PointerHolder3929     ~PointerHolder() { CPLFree(ptr); }
3930     PointerHolder(const PointerHolder&) = delete;
3931     PointerHolder& operator=(const PointerHolder&) = delete;
3932 };
3933 }
3934 
3935 /************************************************************************/
3936 /*                      GDALRegenerateOverviews()                       */
3937 /************************************************************************/
3938 
3939 /**
3940  * \brief Generate downsampled overviews.
3941  *
3942  * This function will generate one or more overview images from a base image
3943  * using the requested downsampling algorithm.  Its primary use is for
3944  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
3945  * used to generate downsampled images in one file from another outside the
3946  * overview architecture.
3947  *
3948  * The output bands need to exist in advance.
3949  *
3950  * The full set of resampling algorithms is documented in
3951  * GDALDataset::BuildOverviews().
3952  *
3953  * This function will honour properly NODATA_VALUES tuples (special dataset
3954  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
3955  * considered as the nodata value and not each value of the triplet
3956  * independently per band.
3957  *
3958  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
3959  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
3960  * overview computation.
3961  *
3962  * @param hSrcBand the source (base level) band.
3963  * @param nOverviewCount the number of downsampled bands being generated.
3964  * @param pahOvrBands the list of downsampled bands to be generated.
3965  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
3966  * @param pfnProgress progress report function.
3967  * @param pProgressData progress function callback data.
3968  * @return CE_None on success or CE_Failure on failure.
3969  */
3970 CPLErr
GDALRegenerateOverviews(GDALRasterBandH hSrcBand,int nOverviewCount,GDALRasterBandH * pahOvrBands,const char * pszResampling,GDALProgressFunc pfnProgress,void * pProgressData)3971 GDALRegenerateOverviews( GDALRasterBandH hSrcBand,
3972                          int nOverviewCount, GDALRasterBandH *pahOvrBands,
3973                          const char * pszResampling,
3974                          GDALProgressFunc pfnProgress, void * pProgressData )
3975 
3976 {
3977     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle( hSrcBand );
3978     GDALRasterBand **papoOvrBands =
3979         reinterpret_cast<GDALRasterBand **>( pahOvrBands );
3980 
3981     if( pfnProgress == nullptr )
3982         pfnProgress = GDALDummyProgress;
3983 
3984     if( EQUAL(pszResampling,"NONE") )
3985         return CE_None;
3986 
3987     int nKernelRadius = 0;
3988     GDALResampleFunction pfnResampleFn
3989         = GDALGetResampleFunction(pszResampling, &nKernelRadius);
3990 
3991     if( pfnResampleFn == nullptr )
3992         return CE_Failure;
3993 
3994 /* -------------------------------------------------------------------- */
3995 /*      Check color tables...                                           */
3996 /* -------------------------------------------------------------------- */
3997     GDALColorTable* poColorTable = nullptr;
3998 
3999     if( (STARTS_WITH_CI(pszResampling, "AVER")
4000          || EQUAL(pszResampling, "RMS")
4001          || STARTS_WITH_CI(pszResampling, "MODE")
4002          || STARTS_WITH_CI(pszResampling, "GAUSS")) &&
4003         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex )
4004     {
4005         poColorTable = poSrcBand->GetColorTable();
4006         if( poColorTable != nullptr )
4007         {
4008             if( poColorTable->GetPaletteInterpretation() != GPI_RGB )
4009             {
4010                 CPLError(
4011                     CE_Warning, CPLE_AppDefined,
4012                     "Computing overviews on palette index raster bands "
4013                     "with a palette whose color interpretation is not RGB "
4014                     "will probably lead to unexpected results." );
4015                 poColorTable = nullptr;
4016             }
4017         }
4018         else
4019         {
4020             CPLError( CE_Warning, CPLE_AppDefined,
4021                       "Computing overviews on palette index raster bands "
4022                       "without a palette will probably lead to unexpected "
4023                       "results." );
4024         }
4025     }
4026     // Not ready yet
4027     else if( (EQUAL(pszResampling, "CUBIC") ||
4028               EQUAL(pszResampling, "CUBICSPLINE") ||
4029               EQUAL(pszResampling, "LANCZOS") ||
4030               EQUAL(pszResampling, "BILINEAR") )
4031         && poSrcBand->GetColorInterpretation() == GCI_PaletteIndex )
4032     {
4033         CPLError( CE_Warning, CPLE_AppDefined,
4034                   "Computing %s overviews on palette index raster bands "
4035                   "will probably lead to unexpected results.", pszResampling );
4036     }
4037 
4038     // If we have a nodata mask and we are doing something more complicated
4039     // than nearest neighbouring, we have to fetch to nodata mask.
4040 
4041     GDALRasterBand* poMaskBand = nullptr;
4042     bool bUseNoDataMask = false;
4043     bool bCanUseCascaded = true;
4044 
4045     if( !STARTS_WITH_CI(pszResampling, "NEAR") )
4046     {
4047         int nMaskFlags;
4048         // Special case if we are the alpha band. We want it to be considered
4049         // as the mask band to avoid alpha=0 to be taken into account in average
4050         // computation.
4051         if( poSrcBand->GetColorInterpretation() == GCI_AlphaBand )
4052         {
4053             poMaskBand = poSrcBand;
4054             nMaskFlags = GMF_ALPHA | GMF_PER_DATASET;
4055         }
4056         // Same as above for mask band. I'd wish we had a better way of conveying this !
4057         else if( CPLTestBool(CPLGetConfigOption(
4058                     "GDAL_REGENERATED_BAND_IS_MASK", "NO")) )
4059         {
4060             poMaskBand = poSrcBand;
4061             nMaskFlags = GMF_PER_DATASET;
4062         }
4063         else
4064         {
4065             poMaskBand = poSrcBand->GetMaskBand();
4066             nMaskFlags = poSrcBand->GetMaskFlags();
4067             bCanUseCascaded = (nMaskFlags == GMF_NODATA ||
4068                                nMaskFlags == GMF_ALL_VALID);
4069         }
4070 
4071         bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4072     }
4073 
4074 /* -------------------------------------------------------------------- */
4075 /*      If we are operating on multiple overviews, and using            */
4076 /*      averaging, lets do them in cascading order to reduce the        */
4077 /*      amount of computation.                                          */
4078 /* -------------------------------------------------------------------- */
4079 
4080     // In case the mask made be computed from another band of the dataset,
4081     // we can't use cascaded generation, as the computation of the overviews
4082     // of the band used for the mask band may not have yet occurred (#3033).
4083     if( (STARTS_WITH_CI(pszResampling, "AVER") ||
4084          STARTS_WITH_CI(pszResampling, "GAUSS") ||
4085          EQUAL(pszResampling, "RMS") ||
4086          EQUAL(pszResampling, "CUBIC") ||
4087          EQUAL(pszResampling, "CUBICSPLINE") ||
4088          EQUAL(pszResampling, "LANCZOS") ||
4089          EQUAL(pszResampling, "BILINEAR")) && nOverviewCount > 1
4090          && bCanUseCascaded )
4091         return GDALRegenerateCascadingOverviews( poSrcBand,
4092                                                  nOverviewCount, papoOvrBands,
4093                                                  pszResampling,
4094                                                  pfnProgress,
4095                                                  pProgressData );
4096 
4097 /* -------------------------------------------------------------------- */
4098 /*      Setup one horizontal swath to read from the raw buffer.         */
4099 /* -------------------------------------------------------------------- */
4100     int nFRXBlockSize = 0;
4101     int nFRYBlockSize = 0;
4102     poSrcBand->GetBlockSize( &nFRXBlockSize, &nFRYBlockSize );
4103 
4104     int nFullResYChunk = 0;
4105     if( nFRYBlockSize < 16 || nFRYBlockSize > 256 )
4106         nFullResYChunk = 64;
4107     else
4108         nFullResYChunk = nFRYBlockSize;
4109 
4110     // Only configurable for debug / testing
4111     const char* pszChunkYSize = CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4112     if( pszChunkYSize )
4113     {
4114         // coverity[tainted_data]
4115         nFullResYChunk = atoi(pszChunkYSize);
4116     }
4117 
4118     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4119     const GDALDataType eWrkDataType = GDALDataTypeIsComplex( eSrcDataType )?
4120         GDT_CFloat32 :
4121         GDALGetOvrWorkDataType( pszResampling, eSrcDataType );
4122 
4123     const int nWidth = poSrcBand->GetXSize();
4124     const int nHeight = poSrcBand->GetYSize();
4125 
4126     int nMaxOvrFactor = 1;
4127     for( int iOverview = 0; iOverview < nOverviewCount; ++iOverview )
4128     {
4129         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4130         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4131         nMaxOvrFactor = std::max(
4132             nMaxOvrFactor,
4133             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5) );
4134         nMaxOvrFactor = std::max(
4135             nMaxOvrFactor,
4136             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5) );
4137     }
4138     // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff + nFullResYChunk) / nMaxOvrFactor)
4139     nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
4140     const int nMaxChunkYSizeQueried =
4141         nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4142 
4143     int bHasNoData = FALSE;
4144     const float fNoDataValue =
4145         static_cast<float>( poSrcBand->GetNoDataValue(&bHasNoData) );
4146     const bool bPropagateNoData =
4147         CPLTestBool( CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO") );
4148 
4149     // Structure describing a resampling job
4150     struct OvrJob
4151     {
4152         // Buffers to free when job is finished
4153         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4154         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4155         std::unique_ptr<PointerHolder> oDstBufferHolder{};
4156 
4157         // Input parameters of pfnResampleFn
4158         GDALResampleFunction pfnResampleFn = nullptr;
4159         double dfXRatioDstToSrc{};
4160         double dfYRatioDstToSrc{};
4161         GDALDataType eWrkDataType = GDT_Unknown;
4162         const void * pChunk = nullptr;
4163         const GByte * pabyChunkNodataMask = nullptr;
4164         int nWidth = 0;
4165         int nHeight = 0;
4166         int nChunkYOff= 0;
4167         int nChunkYSize = 0;
4168         int nDstWidth = 0;
4169         int nDstYOff = 0;
4170         int nDstYOff2 = 0;
4171         GDALRasterBand* poDstBand = nullptr;
4172         const char * pszResampling = nullptr;
4173         int bHasNoData = 0;
4174         float fNoDataValue = 0.0f;
4175         GDALColorTable* poColorTable = nullptr;
4176         GDALDataType eSrcDataType = GDT_Unknown;
4177         bool bPropagateNoData = false;
4178 
4179         // Output values of resampling function
4180         CPLErr eErr = CE_Failure;
4181         void* pDstBuffer = nullptr;
4182         GDALDataType eDstBufferDataType = GDT_Unknown;
4183 
4184         // Synchronization
4185         bool                    bFinished = false;
4186         std::mutex              mutex{};
4187         std::condition_variable cv{};
4188     };
4189 
4190     // Thread function to resample
4191     const auto JobResampleFunc = [](void* pData)
4192     {
4193         OvrJob* poJob = static_cast<OvrJob*>(pData);
4194 
4195         if( poJob->eWrkDataType != GDT_CFloat32 )
4196         {
4197             poJob->eErr = poJob->pfnResampleFn(
4198                 poJob->dfXRatioDstToSrc,
4199                 poJob->dfYRatioDstToSrc,
4200                 0.0, 0.0,
4201                 poJob->eWrkDataType,
4202                 poJob->pChunk,
4203                 poJob->pabyChunkNodataMask,
4204                 0,
4205                 poJob->nWidth,
4206                 poJob->nChunkYOff,
4207                 poJob->nChunkYSize,
4208                 0, poJob->nDstWidth,
4209                 poJob->nDstYOff,
4210                 poJob->nDstYOff2,
4211                 poJob->poDstBand,
4212                 &(poJob->pDstBuffer),
4213                 &(poJob->eDstBufferDataType),
4214                 poJob->pszResampling,
4215                 poJob->bHasNoData,
4216                 poJob->fNoDataValue,
4217                 poJob->poColorTable,
4218                 poJob->eSrcDataType,
4219                 poJob->bPropagateNoData);
4220         }
4221         else
4222         {
4223             poJob->eErr = GDALResampleChunkC32R(
4224                 poJob->nWidth,
4225                 poJob->nHeight,
4226                 static_cast<const float*>(poJob->pChunk),
4227                 poJob->nChunkYOff,
4228                 poJob->nChunkYSize,
4229                 poJob->nDstYOff,
4230                 poJob->nDstYOff2,
4231                 poJob->poDstBand,
4232                 &(poJob->pDstBuffer),
4233                 &(poJob->eDstBufferDataType),
4234                 poJob->pszResampling);
4235         }
4236 
4237         poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
4238 
4239         {
4240             std::lock_guard<std::mutex> guard(poJob->mutex);
4241             poJob->bFinished = true;
4242             poJob->cv.notify_one();
4243         }
4244     };
4245 
4246     // Function to write resample data to target band
4247     const auto WriteJobData = [](const OvrJob* poJob)
4248     {
4249         return poJob->poDstBand->RasterIO( GF_Write,
4250                                             0,
4251                                             poJob->nDstYOff,
4252                                             poJob->nDstWidth,
4253                                             poJob->nDstYOff2 - poJob->nDstYOff,
4254                                             poJob->pDstBuffer,
4255                                             poJob->nDstWidth,
4256                                             poJob->nDstYOff2 - poJob->nDstYOff,
4257                                             poJob->eDstBufferDataType,
4258                                             0, 0, nullptr );
4259     };
4260 
4261     // Wait for completion of oldest job and serialize it
4262     const auto WaitAndFinalizeOldestJob = [WriteJobData](
4263                         std::list<std::unique_ptr<OvrJob>>& jobList)
4264     {
4265         auto poOldestJob = jobList.front().get();
4266         {
4267             std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4268             while( !poOldestJob->bFinished )
4269             {
4270                 poOldestJob->cv.wait(oGuard);
4271             }
4272         }
4273         CPLErr l_eErr = poOldestJob->eErr;
4274         if( l_eErr == CE_None )
4275         {
4276             l_eErr = WriteJobData(poOldestJob);
4277         }
4278 
4279         jobList.pop_front();
4280         return l_eErr;
4281     };
4282 
4283     // Queue of jobs
4284     std::list<std::unique_ptr<OvrJob>> jobList;
4285 
4286     GByte *pabyChunkNodataMask = nullptr;
4287     void *pChunk = nullptr;
4288 
4289     const char* pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4290     const int nThreads = std::max(1, std::min(128,
4291             EQUAL(pszThreads, "ALL_CPUS") ? CPLGetNumCPUs() : atoi(pszThreads)));
4292     auto poThreadPool = nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4293     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue() :
4294                             std::unique_ptr<CPLJobQueue>(nullptr);
4295 
4296 /* -------------------------------------------------------------------- */
4297 /*      Loop over image operating on chunks.                            */
4298 /* -------------------------------------------------------------------- */
4299     int nChunkYOff = 0;
4300     CPLErr eErr = CE_None;
4301 
4302     for( nChunkYOff = 0;
4303          nChunkYOff < nHeight && eErr == CE_None;
4304          nChunkYOff += nFullResYChunk )
4305     {
4306         if( !pfnProgress( nChunkYOff / static_cast<double>( nHeight ),
4307                           nullptr, pProgressData ) )
4308         {
4309             CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
4310             eErr = CE_Failure;
4311         }
4312 
4313         if( nFullResYChunk + nChunkYOff > nHeight )
4314             nFullResYChunk = nHeight - nChunkYOff;
4315 
4316         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4317         int nChunkYSizeQueried =
4318             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4319         if( nChunkYOffQueried < 0 )
4320         {
4321             nChunkYSizeQueried += nChunkYOffQueried;
4322             nChunkYOffQueried = 0;
4323         }
4324         if( nChunkYOffQueried + nChunkYSizeQueried > nHeight )
4325             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4326 
4327         // Avoid accumulating too many tasks and exhaust RAM
4328         // Try to complete already finished jobs
4329         while( eErr == CE_None && !jobList.empty() )
4330         {
4331             auto poOldestJob = jobList.front().get();
4332             {
4333                 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4334                 if( !poOldestJob->bFinished )
4335                 {
4336                     break;
4337                 }
4338             }
4339             eErr = poOldestJob->eErr;
4340             if( eErr == CE_None )
4341             {
4342                 eErr = WriteJobData(poOldestJob);
4343             }
4344 
4345             jobList.pop_front();
4346         }
4347 
4348         // And in case we have saturated the number of threads,
4349         // wait for completion of tasks to go below the threshold.
4350         while( eErr == CE_None &&
4351                 jobList.size() >= static_cast<size_t>(nThreads) )
4352         {
4353             eErr = WaitAndFinalizeOldestJob(jobList);
4354         }
4355 
4356         // (Re)allocate buffers if needed
4357         if( pChunk == nullptr )
4358         {
4359             pChunk = VSI_MALLOC3_VERBOSE(
4360                 GDALGetDataTypeSizeBytes(eWrkDataType), nMaxChunkYSizeQueried, nWidth );
4361         }
4362         if( bUseNoDataMask && pabyChunkNodataMask == nullptr )
4363         {
4364             pabyChunkNodataMask = static_cast<GByte*>(
4365                 VSI_MALLOC2_VERBOSE( nMaxChunkYSizeQueried, nWidth ));
4366         }
4367 
4368         if( pChunk == nullptr || (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4369         {
4370             CPLFree(pChunk);
4371             CPLFree(pabyChunkNodataMask);
4372             return CE_Failure;
4373         }
4374 
4375         // Read chunk.
4376         if( eErr == CE_None )
4377             eErr = poSrcBand->RasterIO(
4378                 GF_Read, 0, nChunkYOffQueried, nWidth, nChunkYSizeQueried,
4379                 pChunk, nWidth, nChunkYSizeQueried, eWrkDataType,
4380                 0, 0, nullptr );
4381         if( eErr == CE_None && bUseNoDataMask )
4382             eErr = poMaskBand->RasterIO(
4383                 GF_Read, 0, nChunkYOffQueried, nWidth, nChunkYSizeQueried,
4384                 pabyChunkNodataMask, nWidth, nChunkYSizeQueried, GDT_Byte,
4385                 0, 0, nullptr );
4386 
4387         // Special case to promote 1bit data to 8bit 0/255 values.
4388         if( EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE") )
4389         {
4390             if( eWrkDataType == GDT_Float32 )
4391             {
4392                 float* pafChunk = static_cast<float*>(pChunk);
4393                 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4394                 {
4395                     if( pafChunk[i] == 1.0 )
4396                         pafChunk[i] = 255.0;
4397                 }
4398             }
4399             else if( eWrkDataType == GDT_Byte )
4400             {
4401                 GByte* pabyChunk = static_cast<GByte*>(pChunk);
4402                 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4403                 {
4404                     if( pabyChunk[i] == 1 )
4405                         pabyChunk[i] = 255;
4406                 }
4407             }
4408             else if( eWrkDataType == GDT_UInt16 )
4409             {
4410                 GUInt16* pasChunk = static_cast<GUInt16*>(pChunk);
4411                 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4412                 {
4413                     if( pasChunk[i] == 1 )
4414                         pasChunk[i] = 255;
4415                 }
4416             }
4417             else {
4418                 CPLAssert(false);
4419             }
4420         }
4421         else if( EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE") )
4422         {
4423             if( eWrkDataType == GDT_Float32 )
4424             {
4425                 float* pafChunk = static_cast<float*>(pChunk);
4426                 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4427                 {
4428                     if( pafChunk[i] == 1.0 )
4429                         pafChunk[i] = 0.0;
4430                     else if( pafChunk[i] == 0.0 )
4431                         pafChunk[i] = 255.0;
4432                 }
4433             }
4434             else if( eWrkDataType == GDT_Byte )
4435             {
4436                 GByte* pabyChunk = static_cast<GByte*>(pChunk);
4437                 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4438                 {
4439                     if( pabyChunk[i] == 1 )
4440                         pabyChunk[i] = 0;
4441                     else if( pabyChunk[i] == 0 )
4442                         pabyChunk[i] = 255;
4443                 }
4444             }
4445             else if( eWrkDataType == GDT_UInt16 )
4446             {
4447                 GUInt16* pasChunk = static_cast<GUInt16*>(pChunk);
4448                 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4449                 {
4450                     if( pasChunk[i] == 1 )
4451                         pasChunk[i] = 0;
4452                     else if( pasChunk[i] == 0 )
4453                         pasChunk[i] = 255;
4454                 }
4455             }
4456             else {
4457                 CPLAssert(false);
4458             }
4459         }
4460 
4461         std::shared_ptr<PointerHolder> oSrcBufferHolder(
4462             new PointerHolder(poJobQueue ? pChunk : nullptr));
4463         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder(
4464             new PointerHolder(poJobQueue ? pabyChunkNodataMask : nullptr));
4465 
4466         for( int iOverview = 0;
4467              iOverview < nOverviewCount && eErr == CE_None;
4468              ++iOverview )
4469         {
4470             GDALRasterBand* poDstBand = papoOvrBands[iOverview];
4471             const int nDstWidth = poDstBand->GetXSize();
4472             const int nDstHeight = poDstBand->GetYSize();
4473 
4474             const double dfXRatioDstToSrc =
4475                 static_cast<double>(nWidth) / nDstWidth;
4476             const double dfYRatioDstToSrc =
4477                 static_cast<double>(nHeight) / nDstHeight;
4478 
4479 /* -------------------------------------------------------------------- */
4480 /*      Figure out the line to start writing to, and the first line     */
4481 /*      to not write to.  In theory this approach should ensure that    */
4482 /*      every output line will be written if all input chunks are       */
4483 /*      processed.                                                      */
4484 /* -------------------------------------------------------------------- */
4485             int nDstYOff = static_cast<int>(0.5 + nChunkYOff/dfYRatioDstToSrc);
4486             if( nDstYOff == nDstHeight )
4487                 continue;
4488             int nDstYOff2 = static_cast<int>(
4489                 0.5 + (nChunkYOff+nFullResYChunk)/dfYRatioDstToSrc);
4490 
4491             if( nChunkYOff + nFullResYChunk == nHeight )
4492                 nDstYOff2 = nDstHeight;
4493 #if DEBUG_VERBOSE
4494             CPLDebug(
4495                 "GDAL",
4496                 "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
4497                 0, nChunkYOffQueried, nWidth, nChunkYSizeQueried,
4498                 0, nDstYOff, nDstWidth, nDstYOff2 - nDstYOff );
4499 #endif
4500 
4501             auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
4502             poJob->pfnResampleFn = pfnResampleFn;
4503             poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
4504             poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
4505             poJob->eWrkDataType = eWrkDataType;
4506             poJob->pChunk = pChunk;
4507             poJob->pabyChunkNodataMask = pabyChunkNodataMask;
4508             poJob->nWidth = nWidth;
4509             poJob->nHeight = nHeight;
4510             poJob->nChunkYOff = nChunkYOffQueried;
4511             poJob->nChunkYSize = nChunkYSizeQueried;
4512             poJob->nDstWidth = nDstWidth;
4513             poJob->nDstYOff = nDstYOff;
4514             poJob->nDstYOff2 = nDstYOff2;
4515             poJob->poDstBand = poDstBand;
4516             poJob->pszResampling = pszResampling;
4517             poJob->bHasNoData = bHasNoData;
4518             poJob->fNoDataValue = fNoDataValue;
4519             poJob->poColorTable = poColorTable;
4520             poJob->eSrcDataType = eSrcDataType;
4521             poJob->bPropagateNoData = bPropagateNoData;
4522 
4523             if( poJobQueue )
4524             {
4525                 poJob->oSrcMaskBufferHolder = oSrcMaskBufferHolder;
4526                 poJob->oSrcBufferHolder = oSrcBufferHolder;
4527                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
4528                 jobList.emplace_back(std::move(poJob));
4529             }
4530             else
4531             {
4532                 JobResampleFunc(poJob.get());
4533                 eErr = poJob->eErr;
4534                 if( eErr == CE_None )
4535                 {
4536                     eErr = WriteJobData(poJob.get());
4537                 }
4538             }
4539         }
4540 
4541         if( poJobQueue )
4542         {
4543             pChunk = nullptr;
4544             pabyChunkNodataMask = nullptr;
4545         }
4546     }
4547 
4548     VSIFree( pChunk );
4549     VSIFree( pabyChunkNodataMask );
4550 
4551     // Wait for all pending jobs to complete
4552     while( !jobList.empty() )
4553     {
4554         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
4555         if( l_eErr != CE_None && eErr == CE_None )
4556             eErr = l_eErr;
4557     }
4558 
4559 /* -------------------------------------------------------------------- */
4560 /*      Renormalized overview mean / stddev if needed.                  */
4561 /* -------------------------------------------------------------------- */
4562     if( eErr == CE_None && EQUAL(pszResampling,"AVERAGE_MP") )
4563     {
4564         GDALOverviewMagnitudeCorrection(
4565             poSrcBand,
4566             nOverviewCount,
4567             reinterpret_cast<GDALRasterBandH *>( papoOvrBands ),
4568             GDALDummyProgress, nullptr );
4569     }
4570 
4571 /* -------------------------------------------------------------------- */
4572 /*      It can be important to flush out data to overviews.             */
4573 /* -------------------------------------------------------------------- */
4574     for( int iOverview = 0;
4575          eErr == CE_None && iOverview < nOverviewCount;
4576          ++iOverview )
4577     {
4578         eErr = papoOvrBands[iOverview]->FlushCache();
4579     }
4580 
4581     if( eErr == CE_None )
4582         pfnProgress( 1.0, nullptr, pProgressData );
4583 
4584     return eErr;
4585 }
4586 
4587 /************************************************************************/
4588 /*            GDALRegenerateOverviewsMultiBand()                        */
4589 /************************************************************************/
4590 
4591 /**
4592  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
4593  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
4594  *
4595  * This function will generate one or more overview images from a base
4596  * image using the requested downsampling algorithm.  Its primary use
4597  * is for generating overviews via GDALDataset::BuildOverviews(), but it
4598  * can also be used to generate downsampled images in one file from another
4599  * outside the overview architecture.
4600  *
4601  * The output bands need to exist in advance and share the same characteristics
4602  * (type, dimensions)
4603  *
4604  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
4605  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
4606  *
4607  * It does not support color tables or complex data types.
4608  *
4609  * The pseudo-algorithm used by the function is :
4610  *    for each overview
4611  *       iterate on lines of the source by a step of deltay
4612  *           iterate on columns of the source  by a step of deltax
4613  *               read the source data of size deltax * deltay for all the bands
4614  *               generate the corresponding overview block for all the bands
4615  *
4616  * This function will honour properly NODATA_VALUES tuples (special dataset
4617  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4618  * considered as the nodata value and not each value of the triplet
4619  * independently per band.
4620  *
4621  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4622  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4623  * overview computation.
4624  *
4625  * @param nBands the number of bands, size of papoSrcBands and size of
4626  *               first dimension of papapoOverviewBands
4627  * @param papoSrcBands the list of source bands to downsample
4628  * @param nOverviews the number of downsampled overview levels being generated.
4629  * @param papapoOverviewBands bidimension array of bands. First dimension is
4630  *                            indexed by nBands. Second dimension is indexed by
4631  *                            nOverviews.
4632  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
4633  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
4634  * @param pfnProgress progress report function.
4635  * @param pProgressData progress function callback data.
4636  * @return CE_None on success or CE_Failure on failure.
4637  */
4638 
4639 CPLErr
GDALRegenerateOverviewsMultiBand(int nBands,GDALRasterBand ** papoSrcBands,int nOverviews,GDALRasterBand *** papapoOverviewBands,const char * pszResampling,GDALProgressFunc pfnProgress,void * pProgressData)4640 GDALRegenerateOverviewsMultiBand( int nBands, GDALRasterBand** papoSrcBands,
4641                                   int nOverviews,
4642                                   GDALRasterBand*** papapoOverviewBands,
4643                                   const char * pszResampling,
4644                                   GDALProgressFunc pfnProgress,
4645                                   void * pProgressData )
4646 {
4647     if( pfnProgress == nullptr )
4648         pfnProgress = GDALDummyProgress;
4649 
4650     if( EQUAL(pszResampling,"NONE") )
4651         return CE_None;
4652 
4653     // Sanity checks.
4654     if( !STARTS_WITH_CI(pszResampling, "NEAR") &&
4655         !EQUAL(pszResampling, "RMS") &&
4656         !EQUAL(pszResampling, "AVERAGE") &&
4657         !EQUAL(pszResampling, "GAUSS") &&
4658         !EQUAL(pszResampling, "CUBIC") &&
4659         !EQUAL(pszResampling, "CUBICSPLINE") &&
4660         !EQUAL(pszResampling, "LANCZOS") &&
4661         !EQUAL(pszResampling, "BILINEAR") )
4662     {
4663         CPLError(
4664             CE_Failure, CPLE_NotSupported,
4665             "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
4666             "not supported", pszResampling );
4667         return CE_Failure;
4668     }
4669 
4670     int nKernelRadius = 0;
4671     GDALResampleFunction pfnResampleFn =
4672         GDALGetResampleFunction(pszResampling, &nKernelRadius);
4673     if( pfnResampleFn == nullptr )
4674         return CE_Failure;
4675 
4676     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
4677     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
4678     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
4679     for( int iBand = 1; iBand < nBands; ++iBand )
4680     {
4681         if( papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
4682             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight )
4683         {
4684             CPLError(
4685                 CE_Failure, CPLE_NotSupported,
4686                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
4687                 "have the same dimensions" );
4688             return CE_Failure;
4689         }
4690         if( papoSrcBands[iBand]->GetRasterDataType() != eDataType )
4691         {
4692             CPLError(
4693                 CE_Failure, CPLE_NotSupported,
4694                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
4695                 "have the same data type" );
4696             return CE_Failure;
4697         }
4698     }
4699 
4700     for( int iOverview = 0; iOverview < nOverviews; ++iOverview )
4701     {
4702         const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
4703         const int nDstHeight = papapoOverviewBands[0][iOverview]->GetYSize();
4704         for( int iBand = 1; iBand < nBands; ++iBand )
4705         {
4706             if( papapoOverviewBands[iBand][iOverview]->GetXSize() !=
4707                 nDstWidth ||
4708                 papapoOverviewBands[iBand][iOverview]->GetYSize()
4709                 != nDstHeight )
4710             {
4711                 CPLError(
4712                     CE_Failure, CPLE_NotSupported,
4713                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
4714                     "of the same level must have the same dimensions" );
4715                 return CE_Failure;
4716             }
4717             if( papapoOverviewBands[iBand][iOverview]->GetRasterDataType() !=
4718                 eDataType )
4719             {
4720                 CPLError(
4721                     CE_Failure, CPLE_NotSupported,
4722                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
4723                     "must have the same data type as the source bands" );
4724                 return CE_Failure;
4725             }
4726         }
4727     }
4728 
4729     // First pass to compute the total number of pixels to read.
4730     double dfTotalPixelCount = 0;
4731     for( int iOverview = 0; iOverview < nOverviews; ++iOverview )
4732     {
4733         int nSrcWidth = nToplevelSrcWidth;
4734         int nSrcHeight = nToplevelSrcHeight;
4735         const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
4736         // Try to use previous level of overview as the source to compute
4737         // the next level.
4738         if( iOverview > 0 &&
4739             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstWidth )
4740         {
4741             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
4742             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
4743         }
4744 
4745         dfTotalPixelCount += static_cast<double>(nSrcWidth) * nSrcHeight;
4746     }
4747 
4748     const GDALDataType eWrkDataType =
4749         GDALGetOvrWorkDataType(pszResampling, eDataType);
4750 
4751     // I'd wish we had a better way of conveying this !
4752     const bool bIsMask = CPLTestBool(CPLGetConfigOption(
4753                     "GDAL_REGENERATED_BAND_IS_MASK", "NO"));
4754 
4755     // If we have a nodata mask and we are doing something more complicated
4756     // than nearest neighbouring, we have to fetch to nodata mask.
4757     const bool bUseNoDataMask =
4758         !STARTS_WITH_CI(pszResampling, "NEAR") &&
4759         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
4760 
4761     int* const pabHasNoData = static_cast<int *>(
4762         VSI_MALLOC_VERBOSE(nBands * sizeof(int)) );
4763     float* const pafNoDataValue = static_cast<float *>(
4764         VSI_MALLOC_VERBOSE(nBands * sizeof(float)) );
4765     if( pabHasNoData == nullptr || pafNoDataValue == nullptr )
4766     {
4767         CPLFree(pabHasNoData);
4768         CPLFree(pafNoDataValue);
4769         return CE_Failure;
4770     }
4771 
4772     for( int iBand = 0; iBand < nBands; ++iBand )
4773     {
4774         pabHasNoData[iBand] = FALSE;
4775         pafNoDataValue[iBand] = static_cast<float>(
4776             papoSrcBands[iBand]->GetNoDataValue(&pabHasNoData[iBand]) );
4777     }
4778     const bool bPropagateNoData =
4779         CPLTestBool( CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO") );
4780 
4781     const char* pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4782     const int nThreads = std::max(1, std::min(128,
4783             EQUAL(pszThreads, "ALL_CPUS") ? CPLGetNumCPUs() : atoi(pszThreads)));
4784     auto poThreadPool = nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4785     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue() :
4786                             std::unique_ptr<CPLJobQueue>(nullptr);
4787 
4788     // Only configurable for debug / testing
4789     const int nChunkMaxSize =
4790         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4791 
4792     // Second pass to do the real job.
4793     double dfCurPixelCount = 0;
4794     CPLErr eErr = CE_None;
4795     for( int iOverview = 0;
4796          iOverview < nOverviews && eErr == CE_None;
4797          ++iOverview )
4798     {
4799         int iSrcOverview = -1;  // -1 means the source bands.
4800 
4801         int nDstChunkXSize = 0;
4802         int nDstChunkYSize = 0;
4803         papapoOverviewBands[0][iOverview]->GetBlockSize( &nDstChunkXSize,
4804                                                          &nDstChunkYSize );
4805 
4806         const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
4807         const int nDstHeight = papapoOverviewBands[0][iOverview]->GetYSize();
4808 
4809         // Try to use previous level of overview as the source to compute
4810         // the next level.
4811         int nSrcWidth = nToplevelSrcWidth;
4812         int nSrcHeight = nToplevelSrcHeight;
4813         if( iOverview > 0 &&
4814             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstWidth )
4815         {
4816             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
4817             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
4818             iSrcOverview = iOverview - 1;
4819         }
4820 
4821         const double dfXRatioDstToSrc =
4822             static_cast<double>(nSrcWidth) / nDstWidth;
4823         const double dfYRatioDstToSrc =
4824             static_cast<double>(nSrcHeight) / nDstHeight;
4825 
4826         int nOvrFactor = std::max( static_cast<int>(0.5 + dfXRatioDstToSrc),
4827                                    static_cast<int>(0.5 + dfYRatioDstToSrc) );
4828         if( nOvrFactor == 0 ) nOvrFactor = 1;
4829 
4830         // Try to extend the chunk size so that the memory needed to acquire
4831         // source pixels goes up to 10 MB.
4832         // This can help for drivers that support multi-threaded reading
4833         const int nFullResYChunk =
4834             2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
4835         const int nFullResYChunkQueried =
4836             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
4837         while( nDstChunkXSize < nDstWidth )
4838         {
4839             const int nFullResXChunk =
4840                 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
4841 
4842             const int nFullResXChunkQueried =
4843                 nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
4844 
4845             if( static_cast<GIntBig>(nFullResXChunkQueried) *
4846                   nFullResYChunkQueried * nBands *
4847                     GDALGetDataTypeSizeBytes(eWrkDataType) > nChunkMaxSize )
4848             {
4849                 break;
4850             }
4851 
4852             nDstChunkXSize *= 2;
4853         }
4854         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
4855 
4856         const int nFullResXChunk =
4857             2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
4858         const int nFullResXChunkQueried =
4859             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
4860 
4861         // Structure describing a resampling job
4862         struct OvrJob
4863         {
4864             // Buffers to free when job is finished
4865             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
4866             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
4867             std::unique_ptr<PointerHolder> oDstBufferHolder{};
4868 
4869             // Input parameters of pfnResampleFn
4870             GDALResampleFunction pfnResampleFn = nullptr;
4871             double dfXRatioDstToSrc{};
4872             double dfYRatioDstToSrc{};
4873             GDALDataType eWrkDataType = GDT_Unknown;
4874             const void * pChunk = nullptr;
4875             const GByte * pabyChunkNodataMask = nullptr;
4876             int nChunkXOff = 0;
4877             int nChunkXSize = 0;
4878             int nChunkYOff = 0;
4879             int nChunkYSize = 0;
4880             int nDstXOff = 0;
4881             int nDstXOff2 = 0;
4882             int nDstYOff = 0;
4883             int nDstYOff2 = 0;
4884             GDALRasterBand* poOverview = nullptr;
4885             const char * pszResampling = nullptr;
4886             int bHasNoData = 0;
4887             float fNoDataValue = 0.0f;
4888             GDALDataType eSrcDataType = GDT_Unknown;
4889             bool bPropagateNoData = false;
4890 
4891             // Output values of resampling function
4892             CPLErr eErr = CE_Failure;
4893             void* pDstBuffer = nullptr;
4894             GDALDataType eDstBufferDataType = GDT_Unknown;
4895 
4896             // Synchronization
4897             bool                    bFinished = false;
4898             std::mutex              mutex{};
4899             std::condition_variable cv{};
4900         };
4901 
4902         // Thread function to resample
4903         const auto JobResampleFunc = [](void* pData)
4904         {
4905             OvrJob* poJob = static_cast<OvrJob*>(pData);
4906 
4907             poJob->eErr = poJob->pfnResampleFn(
4908                 poJob->dfXRatioDstToSrc,
4909                 poJob->dfYRatioDstToSrc,
4910                 0.0, 0.0,
4911                 poJob->eWrkDataType,
4912                 poJob->pChunk,
4913                 poJob->pabyChunkNodataMask,
4914                 poJob->nChunkXOff,
4915                 poJob->nChunkXSize,
4916                 poJob->nChunkYOff,
4917                 poJob->nChunkYSize,
4918                 poJob->nDstXOff,
4919                 poJob->nDstXOff2,
4920                 poJob->nDstYOff,
4921                 poJob->nDstYOff2,
4922                 poJob->poOverview,
4923                 &(poJob->pDstBuffer),
4924                 &(poJob->eDstBufferDataType),
4925                 poJob->pszResampling,
4926                 poJob->bHasNoData,
4927                 poJob->fNoDataValue,
4928                 nullptr,
4929                 poJob->eSrcDataType,
4930                 poJob->bPropagateNoData);
4931 
4932             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
4933 
4934             {
4935                 std::lock_guard<std::mutex> guard(poJob->mutex);
4936                 poJob->bFinished = true;
4937                 poJob->cv.notify_one();
4938             }
4939         };
4940 
4941         // Function to write resample data to target band
4942         const auto WriteJobData = [](const OvrJob* poJob)
4943         {
4944             return poJob->poOverview->RasterIO(
4945                                 GF_Write,
4946                                 poJob->nDstXOff,
4947                                 poJob->nDstYOff,
4948                                 poJob->nDstXOff2 - poJob->nDstXOff,
4949                                 poJob->nDstYOff2 - poJob->nDstYOff,
4950                                 poJob->pDstBuffer,
4951                                 poJob->nDstXOff2 - poJob->nDstXOff,
4952                                 poJob->nDstYOff2 - poJob->nDstYOff,
4953                                 poJob->eDstBufferDataType,
4954                                 0, 0, nullptr );
4955         };
4956 
4957         // Wait for completion of oldest job and serialize it
4958         const auto WaitAndFinalizeOldestJob = [WriteJobData](
4959                             std::list<std::unique_ptr<OvrJob>>& jobList)
4960         {
4961             auto poOldestJob = jobList.front().get();
4962             {
4963                 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4964                 while( !poOldestJob->bFinished )
4965                 {
4966                     poOldestJob->cv.wait(oGuard);
4967                 }
4968             }
4969             CPLErr l_eErr = poOldestJob->eErr;
4970             if( l_eErr == CE_None )
4971             {
4972                 l_eErr = WriteJobData(poOldestJob);
4973             }
4974 
4975             jobList.pop_front();
4976             return l_eErr;
4977         };
4978 
4979         // Queue of jobs
4980         std::list<std::unique_ptr<OvrJob>> jobList;
4981 
4982         std::vector<void*> apaChunk(nBands);
4983         std::vector<GByte*> apabyChunkNoDataMask(nBands);
4984 
4985         int nDstYOff = 0;
4986         // Iterate on destination overview, block by block.
4987         for( nDstYOff = 0;
4988              nDstYOff < nDstHeight && eErr == CE_None;
4989              nDstYOff += nDstChunkYSize )
4990         {
4991             int nDstYCount;
4992             if( nDstYOff + nDstChunkYSize <= nDstHeight )
4993                 nDstYCount = nDstChunkYSize;
4994             else
4995                 nDstYCount = nDstHeight - nDstYOff;
4996 
4997             int nChunkYOff =
4998                 static_cast<int>(nDstYOff * dfYRatioDstToSrc);
4999             int nChunkYOff2 =
5000                 static_cast<int>(
5001                     ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc) );
5002             if( nChunkYOff2 > nSrcHeight || nDstYOff + nDstYCount == nDstHeight)
5003                 nChunkYOff2 = nSrcHeight;
5004             int nYCount = nChunkYOff2 - nChunkYOff;
5005             CPLAssert(nYCount <= nFullResYChunk);
5006 
5007             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5008             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
5009             if( nChunkYOffQueried < 0 )
5010             {
5011                 nChunkYSizeQueried += nChunkYOffQueried;
5012                 nChunkYOffQueried = 0;
5013             }
5014             if( nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight )
5015                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5016             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5017 
5018             if( !pfnProgress( dfCurPixelCount / dfTotalPixelCount,
5019                               nullptr, pProgressData ) )
5020             {
5021                 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5022                 eErr = CE_Failure;
5023             }
5024 
5025             int nDstXOff = 0;
5026             // Iterate on destination overview, block by block.
5027             for( nDstXOff = 0;
5028                  nDstXOff < nDstWidth && eErr == CE_None;
5029                  nDstXOff += nDstChunkXSize )
5030             {
5031                 int nDstXCount = 0;
5032                 if( nDstXOff + nDstChunkXSize <= nDstWidth )
5033                     nDstXCount = nDstChunkXSize;
5034                 else
5035                     nDstXCount = nDstWidth - nDstXOff;
5036 
5037                 int nChunkXOff =
5038                     static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5039                 int nChunkXOff2 =
5040                     static_cast<int>(
5041                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc) );
5042                 if( nChunkXOff2 > nSrcWidth ||
5043                     nDstXOff + nDstXCount == nDstWidth )
5044                     nChunkXOff2 = nSrcWidth;
5045                 const int nXCount = nChunkXOff2 - nChunkXOff;
5046                 CPLAssert(nXCount <= nFullResXChunk);
5047 
5048                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5049                 int nChunkXSizeQueried =
5050                     nXCount + 2 * nKernelRadius * nOvrFactor;
5051                 if( nChunkXOffQueried < 0 )
5052                 {
5053                     nChunkXSizeQueried += nChunkXOffQueried;
5054                     nChunkXOffQueried = 0;
5055                 }
5056                 if( nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth )
5057                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5058                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5059 #if DEBUG_VERBOSE
5060                 CPLDebug(
5061                     "GDAL",
5062                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5063                     nChunkXOffQueried, nChunkYOffQueried, nChunkXSizeQueried, nChunkYSizeQueried,
5064                     nDstXOff, nDstYOff, nDstXCount, nDstYCount );
5065 #endif
5066 
5067                 // Avoid accumulating too many tasks and exhaust RAM
5068 
5069                 // Try to complete already finished jobs
5070                 while( eErr == CE_None && !jobList.empty() )
5071                 {
5072                     auto poOldestJob = jobList.front().get();
5073                     {
5074                         std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5075                         if( !poOldestJob->bFinished )
5076                         {
5077                             break;
5078                         }
5079                     }
5080                     eErr = poOldestJob->eErr;
5081                     if( eErr == CE_None )
5082                     {
5083                         eErr = WriteJobData(poOldestJob);
5084                     }
5085 
5086                     jobList.pop_front();
5087                 }
5088 
5089                 // And in case we have saturated the number of threads,
5090                 // wait for completion of tasks to go below the threshold.
5091                 while( eErr == CE_None &&
5092                        jobList.size() >= static_cast<size_t>(nThreads) )
5093                 {
5094                     eErr = WaitAndFinalizeOldestJob(jobList);
5095                 }
5096 
5097                 // (Re)allocate buffers if needed
5098                 for( int iBand = 0; iBand < nBands; ++iBand )
5099                 {
5100                     if( apaChunk[iBand] == nullptr )
5101                     {
5102                         apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
5103                             nFullResXChunkQueried,
5104                             nFullResYChunkQueried,
5105                             GDALGetDataTypeSizeBytes(eWrkDataType) );
5106                         if( apaChunk[iBand] == nullptr )
5107                         {
5108                             eErr = CE_Failure;
5109                         }
5110                     }
5111                     if( bUseNoDataMask && apabyChunkNoDataMask[iBand] == nullptr  )
5112                     {
5113                         apabyChunkNoDataMask[iBand] = static_cast<GByte *>(
5114                             VSI_MALLOC2_VERBOSE( nFullResXChunkQueried,
5115                                                 nFullResYChunkQueried ) );
5116                         if( apabyChunkNoDataMask[iBand] == nullptr )
5117                         {
5118                             eErr = CE_Failure;
5119                         }
5120                     }
5121                 }
5122 
5123                 // Read the source buffers for all the bands.
5124                 for( int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand )
5125                 {
5126                     GDALRasterBand* poSrcBand = nullptr;
5127                     if( iSrcOverview == -1 )
5128                         poSrcBand = papoSrcBands[iBand];
5129                     else
5130                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5131                     eErr = poSrcBand->RasterIO(
5132                         GF_Read,
5133                         nChunkXOffQueried, nChunkYOffQueried,
5134                         nChunkXSizeQueried, nChunkYSizeQueried,
5135                         apaChunk[iBand],
5136                         nChunkXSizeQueried, nChunkYSizeQueried,
5137                         eWrkDataType, 0, 0, nullptr );
5138 
5139                     if( bUseNoDataMask && eErr == CE_None )
5140                     {
5141                         const bool bUseSrcAsMaskBand = bIsMask || papoSrcBands[iBand]->GetColorInterpretation() == GCI_AlphaBand;
5142                         auto poMaskBand = bUseSrcAsMaskBand ? poSrcBand : poSrcBand->GetMaskBand();
5143                         eErr = poMaskBand->RasterIO(
5144                             GF_Read,
5145                             nChunkXOffQueried, nChunkYOffQueried,
5146                             nChunkXSizeQueried, nChunkYSizeQueried,
5147                             apabyChunkNoDataMask[iBand],
5148                             nChunkXSizeQueried, nChunkYSizeQueried,
5149                             GDT_Byte, 0, 0, nullptr );
5150                     }
5151                 }
5152 
5153                 // Compute the resulting overview block.
5154                 for( int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand )
5155                 {
5156                     auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
5157                     poJob->pfnResampleFn = pfnResampleFn;
5158                     poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
5159                     poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
5160                     poJob->eWrkDataType = eWrkDataType;
5161                     poJob->pChunk = apaChunk[iBand];
5162                     poJob->pabyChunkNodataMask = apabyChunkNoDataMask[iBand];
5163                     poJob->nChunkXOff = nChunkXOffQueried;
5164                     poJob->nChunkXSize = nChunkXSizeQueried;
5165                     poJob->nChunkYOff = nChunkYOffQueried;
5166                     poJob->nChunkYSize = nChunkYSizeQueried;
5167                     poJob->nDstXOff = nDstXOff;
5168                     poJob->nDstXOff2 = nDstXOff + nDstXCount;
5169                     poJob->nDstYOff = nDstYOff;
5170                     poJob->nDstYOff2 = nDstYOff + nDstYCount;
5171                     poJob->poOverview = papapoOverviewBands[iBand][iOverview];
5172                     poJob->pszResampling = pszResampling;
5173                     poJob->bHasNoData = pabHasNoData[iBand];
5174                     poJob->fNoDataValue = pafNoDataValue[iBand];
5175                     poJob->eSrcDataType = eDataType;
5176                     poJob->bPropagateNoData = bPropagateNoData;
5177 
5178                     if( poJobQueue )
5179                     {
5180                         poJob->oSrcMaskBufferHolder.reset(
5181                             new PointerHolder(apabyChunkNoDataMask[iBand]));
5182                         apabyChunkNoDataMask[iBand] = nullptr;
5183 
5184                         poJob->oSrcBufferHolder.reset(
5185                             new PointerHolder(apaChunk[iBand]));
5186                         apaChunk[iBand] = nullptr;
5187 
5188                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5189                         jobList.emplace_back(std::move(poJob));
5190                     }
5191                     else
5192                     {
5193                         JobResampleFunc(poJob.get());
5194                         eErr = poJob->eErr;
5195                         if( eErr == CE_None )
5196                         {
5197                             eErr = WriteJobData(poJob.get());
5198                         }
5199                     }
5200                 }
5201             }
5202 
5203             dfCurPixelCount += static_cast<double>(nYCount) * nSrcWidth;
5204         }
5205 
5206         // Wait for all pending jobs to complete
5207         while( !jobList.empty() )
5208         {
5209             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5210             if( l_eErr != CE_None && eErr == CE_None )
5211                 eErr = l_eErr;
5212         }
5213 
5214         // Flush the data to overviews.
5215         for( int iBand = 0; iBand < nBands; ++iBand )
5216         {
5217             CPLFree(apaChunk[iBand]);
5218             papapoOverviewBands[iBand][iOverview]->FlushCache();
5219 
5220             CPLFree(apabyChunkNoDataMask[iBand]);
5221         }
5222     }
5223 
5224     CPLFree(pabHasNoData);
5225     CPLFree(pafNoDataValue);
5226 
5227     if( eErr == CE_None )
5228         pfnProgress( 1.0, nullptr, pProgressData );
5229 
5230     return eErr;
5231 }
5232 
5233 /************************************************************************/
5234 /*                        GDALComputeBandStats()                        */
5235 /************************************************************************/
5236 
5237 /** Undocumented
5238  * @param hSrcBand undocumented.
5239  * @param nSampleStep undocumented.
5240  * @param pdfMean undocumented.
5241  * @param pdfStdDev undocumented.
5242  * @param pfnProgress undocumented.
5243  * @param pProgressData undocumented.
5244  * @return undocumented
5245  */
5246 CPLErr CPL_STDCALL
GDALComputeBandStats(GDALRasterBandH hSrcBand,int nSampleStep,double * pdfMean,double * pdfStdDev,GDALProgressFunc pfnProgress,void * pProgressData)5247 GDALComputeBandStats( GDALRasterBandH hSrcBand,
5248                       int nSampleStep,
5249                       double *pdfMean, double *pdfStdDev,
5250                       GDALProgressFunc pfnProgress,
5251                       void *pProgressData )
5252 
5253 {
5254     VALIDATE_POINTER1( hSrcBand, "GDALComputeBandStats", CE_Failure );
5255 
5256     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle( hSrcBand );
5257 
5258     if( pfnProgress == nullptr )
5259         pfnProgress = GDALDummyProgress;
5260 
5261     const int nWidth = poSrcBand->GetXSize();
5262     const int nHeight = poSrcBand->GetYSize();
5263 
5264     if( nSampleStep >= nHeight || nSampleStep < 1 )
5265         nSampleStep = 1;
5266 
5267     GDALDataType eWrkType = GDT_Unknown;
5268     float *pafData = nullptr;
5269     GDALDataType eType = poSrcBand->GetRasterDataType();
5270     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
5271     if( bComplex )
5272     {
5273         pafData = static_cast<float *>(
5274             VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)) );
5275         eWrkType = GDT_CFloat32;
5276     }
5277     else
5278     {
5279         pafData = static_cast<float *>(
5280             VSI_MALLOC_VERBOSE(nWidth * sizeof(float)) );
5281         eWrkType = GDT_Float32;
5282     }
5283 
5284     if( nWidth == 0 || pafData == nullptr )
5285     {
5286         VSIFree(pafData);
5287         return CE_Failure;
5288     }
5289 
5290 /* -------------------------------------------------------------------- */
5291 /*      Loop over all sample lines.                                     */
5292 /* -------------------------------------------------------------------- */
5293     double dfSum = 0.0;
5294     double dfSum2 = 0.0;
5295     int iLine = 0;
5296     GIntBig nSamples = 0;
5297 
5298     do
5299     {
5300         if( !pfnProgress( iLine / static_cast<double>( nHeight ),
5301                           nullptr, pProgressData ) )
5302         {
5303             CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5304             CPLFree( pafData );
5305             return CE_Failure;
5306         }
5307 
5308         const CPLErr eErr =
5309             poSrcBand->RasterIO( GF_Read, 0, iLine, nWidth, 1,
5310                                  pafData, nWidth, 1, eWrkType,
5311                                  0, 0, nullptr );
5312         if( eErr != CE_None )
5313         {
5314             CPLFree( pafData );
5315             return eErr;
5316         }
5317 
5318         for( int iPixel = 0; iPixel < nWidth; ++iPixel )
5319         {
5320             float fValue = 0.0f;
5321 
5322             if( bComplex )
5323             {
5324                 // Compute the magnitude of the complex value.
5325                 fValue = std::hypot(pafData[iPixel*2],
5326                                     pafData[iPixel*2+1]);
5327             }
5328             else
5329             {
5330                 fValue = pafData[iPixel];
5331             }
5332 
5333             dfSum  += fValue;
5334             dfSum2 += fValue * fValue;
5335         }
5336 
5337         nSamples += nWidth;
5338         iLine += nSampleStep;
5339     } while( iLine < nHeight );
5340 
5341     if( !pfnProgress( 1.0, nullptr, pProgressData ) )
5342     {
5343         CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5344         CPLFree( pafData );
5345         return CE_Failure;
5346     }
5347 
5348 /* -------------------------------------------------------------------- */
5349 /*      Produce the result values.                                      */
5350 /* -------------------------------------------------------------------- */
5351     if( pdfMean != nullptr )
5352         *pdfMean = dfSum / nSamples;
5353 
5354     if( pdfStdDev != nullptr )
5355     {
5356         const double dfMean = dfSum / nSamples;
5357 
5358         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
5359     }
5360 
5361     CPLFree( pafData );
5362 
5363     return CE_None;
5364 }
5365 
5366 /************************************************************************/
5367 /*                  GDALOverviewMagnitudeCorrection()                   */
5368 /*                                                                      */
5369 /*      Correct the mean and standard deviation of the overviews of     */
5370 /*      the given band to match the base layer approximately.           */
5371 /************************************************************************/
5372 
5373 /** Undocumented
5374  * @param hBaseBand undocumented.
5375  * @param nOverviewCount undocumented.
5376  * @param pahOverviews undocumented.
5377  * @param pfnProgress undocumented.
5378  * @param pProgressData undocumented.
5379  * @return undocumented
5380  */
5381 CPLErr
GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,int nOverviewCount,GDALRasterBandH * pahOverviews,GDALProgressFunc pfnProgress,void * pProgressData)5382 GDALOverviewMagnitudeCorrection( GDALRasterBandH hBaseBand,
5383                                  int nOverviewCount,
5384                                  GDALRasterBandH *pahOverviews,
5385                                  GDALProgressFunc pfnProgress,
5386                                  void *pProgressData )
5387 
5388 {
5389     VALIDATE_POINTER1( hBaseBand, "GDALOverviewMagnitudeCorrection",
5390                        CE_Failure );
5391 
5392 /* -------------------------------------------------------------------- */
5393 /*      Compute mean/stddev for source raster.                          */
5394 /* -------------------------------------------------------------------- */
5395     double dfOrigMean = 0.0;
5396     double dfOrigStdDev = 0.0;
5397     {
5398         const CPLErr eErr
5399             = GDALComputeBandStats( hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
5400                                     pfnProgress, pProgressData );
5401 
5402         if( eErr != CE_None )
5403             return eErr;
5404     }
5405 
5406 /* -------------------------------------------------------------------- */
5407 /*      Loop on overview bands.                                         */
5408 /* -------------------------------------------------------------------- */
5409     for( int iOverview = 0; iOverview < nOverviewCount; ++iOverview )
5410     {
5411         GDALRasterBand *poOverview = GDALRasterBand::FromHandle(
5412             pahOverviews[iOverview]);
5413         double  dfOverviewMean, dfOverviewStdDev;
5414 
5415         const CPLErr eErr =
5416             GDALComputeBandStats( pahOverviews[iOverview], 1,
5417                                   &dfOverviewMean, &dfOverviewStdDev,
5418                                   pfnProgress, pProgressData );
5419 
5420         if( eErr != CE_None )
5421             return eErr;
5422 
5423         double dfGain = 1.0;
5424         if( dfOrigStdDev >= 0.0001 )
5425             dfGain = dfOrigStdDev / dfOverviewStdDev;
5426 
5427 /* -------------------------------------------------------------------- */
5428 /*      Apply gain and offset.                                          */
5429 /* -------------------------------------------------------------------- */
5430         const int nWidth = poOverview->GetXSize();
5431         const int nHeight = poOverview->GetYSize();
5432 
5433         GDALDataType eWrkType = GDT_Unknown;
5434         float *pafData = nullptr;
5435         const GDALDataType eType = poOverview->GetRasterDataType();
5436         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
5437         if( bComplex )
5438         {
5439             pafData = static_cast<float *>(
5440                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)) );
5441             eWrkType = GDT_CFloat32;
5442         }
5443         else
5444         {
5445             pafData = static_cast<float *>(
5446                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)) );
5447             eWrkType = GDT_Float32;
5448         }
5449 
5450         if( pafData == nullptr )
5451         {
5452             return CE_Failure;
5453         }
5454 
5455         for( int iLine = 0; iLine < nHeight; ++iLine )
5456         {
5457             if( !pfnProgress( iLine / static_cast<double>( nHeight ),
5458                               nullptr, pProgressData ) )
5459             {
5460                 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5461                 CPLFree( pafData );
5462                 return CE_Failure;
5463             }
5464 
5465             if( poOverview->RasterIO( GF_Read, 0, iLine, nWidth, 1,
5466                                       pafData, nWidth, 1, eWrkType,
5467                                       0, 0, nullptr ) != CE_None )
5468             {
5469                 CPLFree( pafData );
5470                 return CE_Failure;
5471             }
5472 
5473             for( int iPixel = 0; iPixel < nWidth; ++iPixel )
5474             {
5475                 if( bComplex )
5476                 {
5477                     pafData[iPixel*2] *= static_cast<float>( dfGain );
5478                     pafData[iPixel*2+1] *= static_cast<float>( dfGain );
5479                 }
5480                 else
5481                 {
5482                     pafData[iPixel] = static_cast<float>(
5483                         (pafData[iPixel] - dfOverviewMean)
5484                         * dfGain + dfOrigMean );
5485                 }
5486             }
5487 
5488             if( poOverview->RasterIO( GF_Write, 0, iLine, nWidth, 1,
5489                                       pafData, nWidth, 1, eWrkType,
5490                                       0, 0, nullptr ) != CE_None )
5491             {
5492                 CPLFree( pafData );
5493                 return CE_Failure;
5494             }
5495         }
5496 
5497         if( !pfnProgress( 1.0, nullptr, pProgressData ) )
5498         {
5499             CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5500             CPLFree( pafData );
5501             return CE_Failure;
5502         }
5503 
5504         CPLFree( pafData );
5505     }
5506 
5507     return CE_None;
5508 }
5509