1 /******************************************************************************
2 *
3 * Project: GDAL Core
4 * Purpose: Helper code to implement overview support in different drivers.
5 * Author: Frank Warmerdam, warmerdam@pobox.com
6 *
7 ******************************************************************************
8 * Copyright (c) 2000, Frank Warmerdam
9 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
10 *
11 * Permission is hereby granted, free of charge, to any person obtaining a
12 * copy of this software and associated documentation files (the "Software"),
13 * to deal in the Software without restriction, including without limitation
14 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 * and/or sell copies of the Software, and to permit persons to whom the
16 * Software is furnished to do so, subject to the following conditions:
17 *
18 * The above copyright notice and this permission notice shall be included
19 * in all copies or substantial portions of the Software.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 * DEALINGS IN THE SOFTWARE.
28 ****************************************************************************/
29
30 #include "cpl_port.h"
31 #include "gdal_priv.h"
32
33 #include <cmath>
34 #include <cstddef>
35 #include <cstdlib>
36
37 #include <algorithm>
38 #include <complex>
39 #include <condition_variable>
40 #include <limits>
41 #include <list>
42 #include <memory>
43 #include <mutex>
44 #include <vector>
45
46 #include "cpl_conv.h"
47 #include "cpl_error.h"
48 #include "cpl_progress.h"
49 #include "cpl_vsi.h"
50 #include "gdal.h"
51 #include "gdal_thread_pool.h"
52 #include "gdalwarper.h"
53
54 // Restrict to 64bit processors because they are guaranteed to have SSE2.
55 // Could possibly be used too on 32bit, but we would need to check at runtime.
56 #if defined(__x86_64) || defined(_M_X64)
57 #define USE_SSE2
58
59 #include "gdalsse_priv.h"
60 #endif
61
62 CPL_CVSID("$Id: overview.cpp e2f3698499b06dd03205c26333cd316db9e6e65a 2021-08-26 21:37:16 +0200 Even Rouault $")
63
64 /************************************************************************/
65 /* GDALResampleChunk32R_Near() */
66 /************************************************************************/
67
68 template <class T>
69 static CPLErr
GDALResampleChunk32R_NearT(double dfXRatioDstToSrc,double dfYRatioDstToSrc,GDALDataType eWrkDataType,const T * pChunk,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,T ** ppDstBuffer)70 GDALResampleChunk32R_NearT( double dfXRatioDstToSrc,
71 double dfYRatioDstToSrc,
72 GDALDataType eWrkDataType,
73 const T * pChunk,
74 int nChunkXOff, int nChunkXSize,
75 int nChunkYOff,
76 int nDstXOff, int nDstXOff2,
77 int nDstYOff, int nDstYOff2,
78 T** ppDstBuffer )
79
80 {
81 const int nDstXWidth = nDstXOff2 - nDstXOff;
82
83 /* -------------------------------------------------------------------- */
84 /* Allocate buffers. */
85 /* -------------------------------------------------------------------- */
86 *ppDstBuffer = static_cast<T *>(
87 VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
88 GDALGetDataTypeSizeBytes(eWrkDataType)));
89 if( *ppDstBuffer == nullptr )
90 {
91 return CE_Failure;
92 }
93 T* const pDstBuffer = *ppDstBuffer;
94
95 int* panSrcXOff = static_cast<int *>(
96 VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)) );
97
98 if( panSrcXOff == nullptr )
99 {
100 VSIFree(panSrcXOff);
101 return CE_Failure;
102 }
103
104 /* ==================================================================== */
105 /* Precompute inner loop constants. */
106 /* ==================================================================== */
107 for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
108 {
109 int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
110 if( nSrcXOff < nChunkXOff )
111 nSrcXOff = nChunkXOff;
112
113 panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
114 }
115
116 /* ==================================================================== */
117 /* Loop over destination scanlines. */
118 /* ==================================================================== */
119 for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
120 {
121 int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
122 if( nSrcYOff < nChunkYOff )
123 nSrcYOff = nChunkYOff;
124
125 const T * const pSrcScanline =
126 pChunk + (static_cast<GPtrDiff_t>(nSrcYOff-nChunkYOff) * nChunkXSize) - nChunkXOff;
127
128 /* -------------------------------------------------------------------- */
129 /* Loop over destination pixels */
130 /* -------------------------------------------------------------------- */
131 T* pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
132 for( int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel )
133 {
134 pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
135 }
136 }
137
138 CPLFree( panSrcXOff );
139
140 return CE_None;
141 }
142
143 static CPLErr
GDALResampleChunk32R_Near(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double,double,GDALDataType eWrkDataType,const void * pChunk,const GByte *,int nChunkXOff,int nChunkXSize,int nChunkYOff,int,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand *,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char *,int,float,GDALColorTable *,GDALDataType,bool)144 GDALResampleChunk32R_Near( double dfXRatioDstToSrc,
145 double dfYRatioDstToSrc,
146 double /* dfSrcXDelta */,
147 double /* dfSrcYDelta */,
148 GDALDataType eWrkDataType,
149 const void * pChunk,
150 const GByte * /* pabyChunkNodataMask_unused */,
151 int nChunkXOff, int nChunkXSize,
152 int nChunkYOff, int /* nChunkYSize */,
153 int nDstXOff, int nDstXOff2,
154 int nDstYOff, int nDstYOff2,
155 GDALRasterBand * /*poOverview*/,
156 void** ppDstBuffer,
157 GDALDataType* peDstBufferDataType,
158 const char * /* pszResampling_unused */,
159 int /* bHasNoData_unused */,
160 float /* fNoDataValue_unused */,
161 GDALColorTable* /* poColorTable_unused */,
162 GDALDataType /* eSrcDataType */,
163 bool /* bPropagateNoData */ )
164 {
165 *peDstBufferDataType = eWrkDataType;
166 if( eWrkDataType == GDT_Byte )
167 return GDALResampleChunk32R_NearT(
168 dfXRatioDstToSrc,
169 dfYRatioDstToSrc,
170 eWrkDataType,
171 static_cast<const GByte *>( pChunk ),
172 nChunkXOff, nChunkXSize,
173 nChunkYOff,
174 nDstXOff, nDstXOff2,
175 nDstYOff, nDstYOff2,
176 reinterpret_cast<GByte **>( ppDstBuffer ) );
177 else if( eWrkDataType == GDT_UInt16 )
178 return GDALResampleChunk32R_NearT(
179 dfXRatioDstToSrc,
180 dfYRatioDstToSrc,
181 eWrkDataType,
182 static_cast<const GInt16 *>( pChunk ),
183 nChunkXOff, nChunkXSize,
184 nChunkYOff,
185 nDstXOff, nDstXOff2,
186 nDstYOff, nDstYOff2,
187 reinterpret_cast<GInt16 **>( ppDstBuffer ) );
188 else if( eWrkDataType == GDT_Float32 )
189 return GDALResampleChunk32R_NearT(
190 dfXRatioDstToSrc,
191 dfYRatioDstToSrc,
192 eWrkDataType,
193 static_cast<const float *>( pChunk ),
194 nChunkXOff, nChunkXSize,
195 nChunkYOff,
196 nDstXOff, nDstXOff2,
197 nDstYOff, nDstYOff2,
198 reinterpret_cast<float **>( ppDstBuffer ));
199
200 CPLAssert(false);
201 return CE_Failure;
202 }
203
204 /************************************************************************/
205 /* GDALFindBestEntry() */
206 /************************************************************************/
207
208 // Find in the color table the entry whose (c1,c2,c3) value is the closest
209 // (using quadratic distance) to the passed (nR,nG,nB) triplet, ignoring
210 // transparent entries.
GDALFindBestEntry(int nEntryCount,const GDALColorEntry * aEntries,int nR,int nG,int nB)211 static int GDALFindBestEntry( int nEntryCount, const GDALColorEntry* aEntries,
212 int nR, int nG, int nB )
213 {
214 int nMinDist = std::numeric_limits<int>::max();
215 int iBestEntry = 0;
216 for( int i = 0; i < nEntryCount; ++i )
217 {
218 // Ignore transparent entries
219 if( aEntries[i].c4 == 0 )
220 continue;
221 int nDist = (nR - aEntries[i].c1) * (nR - aEntries[i].c1) +
222 (nG - aEntries[i].c2) * (nG - aEntries[i].c2) +
223 (nB - aEntries[i].c3) * (nB - aEntries[i].c3);
224 if( nDist < nMinDist )
225 {
226 nMinDist = nDist;
227 iBestEntry = i;
228 }
229 }
230 return iBestEntry;
231 }
232
233 /************************************************************************/
234 /* ReadColorTableAsArray() */
235 /************************************************************************/
236
ReadColorTableAsArray(const GDALColorTable * poColorTable,int & nEntryCount,GDALColorEntry * & aEntries,int & nTransparentIdx)237 static bool ReadColorTableAsArray( const GDALColorTable* poColorTable,
238 int& nEntryCount,
239 GDALColorEntry*& aEntries,
240 int& nTransparentIdx )
241 {
242 nEntryCount = poColorTable->GetColorEntryCount();
243 aEntries = static_cast<GDALColorEntry *>(
244 VSI_MALLOC2_VERBOSE(sizeof(GDALColorEntry), nEntryCount) );
245 nTransparentIdx = -1;
246 if( aEntries == nullptr )
247 return false;
248 for( int i = 0; i < nEntryCount; ++i )
249 {
250 poColorTable->GetColorEntryAsRGB(i, &aEntries[i]);
251 if( nTransparentIdx < 0 && aEntries[i].c4 == 0 )
252 nTransparentIdx = i;
253 }
254 return true;
255 }
256
257 /************************************************************************/
258 /* GetReplacementValueIfNoData() */
259 /************************************************************************/
260
GetReplacementValueIfNoData(GDALDataType dt,int bHasNoData,float fNoDataValue)261 static float GetReplacementValueIfNoData(GDALDataType dt, int bHasNoData,
262 float fNoDataValue)
263 {
264 float fReplacementVal = 0.0f;
265 if( bHasNoData )
266 {
267 if( dt == GDT_Byte )
268 {
269 if( fNoDataValue == std::numeric_limits<unsigned char>::max() )
270 fReplacementVal = static_cast<float>(
271 std::numeric_limits<unsigned char>::max() - 1);
272 else
273 fReplacementVal = fNoDataValue + 1;
274 }
275 else if( dt == GDT_UInt16 )
276 {
277 if( fNoDataValue == std::numeric_limits<GUInt16>::max() )
278 fReplacementVal = static_cast<float>(
279 std::numeric_limits<GUInt16>::max() - 1);
280 else
281 fReplacementVal = fNoDataValue + 1;
282 }
283 else if( dt == GDT_Int16 )
284 {
285 if( fNoDataValue == std::numeric_limits<GInt16>::max() )
286 fReplacementVal = static_cast<float>(
287 std::numeric_limits<GInt16>::max() - 1);
288 else
289 fReplacementVal = fNoDataValue + 1;
290 }
291 else if( dt == GDT_UInt32 )
292 {
293 // Be careful to limited precision of float
294 fReplacementVal = fNoDataValue + 1;
295 double dfVal = fNoDataValue;
296 if( fReplacementVal >= static_cast<double>(std::numeric_limits<GUInt32>::max() - 128) )
297 {
298 while( fReplacementVal == fNoDataValue )
299 {
300 dfVal -= 1.0;
301 fReplacementVal = static_cast<float>(dfVal);
302 }
303 }
304 else
305 {
306 while( fReplacementVal == fNoDataValue )
307 {
308 dfVal += 1.0;
309 fReplacementVal = static_cast<float>(dfVal);
310 }
311 }
312 }
313 else if( dt == GDT_Int32 )
314 {
315 // Be careful to limited precision of float
316 fReplacementVal = fNoDataValue + 1;
317 double dfVal = fNoDataValue;
318 if( fReplacementVal >= static_cast<double>(std::numeric_limits<GInt32>::max() - 64) )
319 {
320 while( fReplacementVal == fNoDataValue )
321 {
322 dfVal -= 1.0;
323 fReplacementVal = static_cast<float>(dfVal);
324 }
325 }
326 else
327 {
328 while( fReplacementVal == fNoDataValue )
329 {
330 dfVal += 1.0;
331 fReplacementVal = static_cast<float>(dfVal);
332 }
333 }
334 }
335 else if( dt == GDT_Float32 || dt == GDT_Float64 )
336 {
337 if( fNoDataValue == 0 )
338 {
339 fReplacementVal = std::numeric_limits<float>::min();
340 }
341 else
342 {
343 fReplacementVal = static_cast<float>(
344 fNoDataValue + 1e-7 * fNoDataValue);
345 }
346 }
347 }
348 return fReplacementVal;
349 }
350
351 /************************************************************************/
352 /* SQUARE() */
353 /************************************************************************/
354
SQUARE(T val)355 template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
356 {
357 return static_cast<Tsquare>(val) * val;
358 }
359
360 /************************************************************************/
361 /* ComputeIntegerRMS() */
362 /************************************************************************/
363 // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
364 // integer that minimizes abs(rms**2 - sumSquares / weight)
ComputeIntegerRMS(double sumSquares,double weight)365 template <class T> inline T ComputeIntegerRMS(double sumSquares, double weight)
366 {
367 const double sumDivWeight = sumSquares / weight;
368 T rms = static_cast<T>(sqrt(sumDivWeight));
369
370 // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
371 // Naive version:
372 // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
373 if( 2 * rms * (rms + 1) + 1 < 2 * sumDivWeight )
374 rms += 1;
375 return rms;
376 }
377
ComputeIntegerRMS_4values(Tsum)378 template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
379 {
380 CPLAssert(false);
381 return 0;
382 }
383
ComputeIntegerRMS_4values(int sumSquares)384 template<> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
385 {
386 // It has been verified that given the correction on rms below, using
387 // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
388 // is equivalent, so use the former as it is used twice.
389 const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
390 const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
391 GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
392
393 // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
394 // Naive version:
395 // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
396 // Optimized version for integer case and weight == 4
397 if( static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4 )
398 rms += 1;
399 return rms;
400 }
401
ComputeIntegerRMS_4values(double sumSquares)402 template<> inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
403 {
404 const double sumDivWeight = sumSquares * 0.25;
405 GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
406
407 // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
408 // Naive version:
409 // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
410 // Optimized version for integer case and weight == 4
411 if( static_cast<GUInt32>(rms) * (rms + 1) < static_cast<GUInt32>(sumDivWeight + 0.25) )
412 rms += 1;
413 return rms;
414 }
415
416 #ifdef USE_SSE2
417
418 /************************************************************************/
419 /* QuadraticMeanByteSSE2OrAVX2() */
420 /************************************************************************/
421
422 #ifdef __SSE4_1__
423 #define sse2_packus_epi32 _mm_packus_epi32
424 #else
sse2_packus_epi32(__m128i a,__m128i b)425 inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
426 {
427 const auto minus32768_32 = _mm_set1_epi32(-32768);
428 const auto minus32768_16 = _mm_set1_epi16(-32768);
429 a = _mm_add_epi32(a, minus32768_32);
430 b = _mm_add_epi32(b, minus32768_32);
431 a = _mm_packs_epi32(a, b);
432 a = _mm_sub_epi16(a, minus32768_16);
433 return a;
434 }
435 #endif
436
437 #ifdef __SSSE3__
438 #define sse2_hadd_epi16 _mm_hadd_epi16
439 #else
sse2_hadd_epi16(__m128i a,__m128i b)440 inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
441 {
442 // Horizontal addition of adjacent pairs
443 const auto mask = _mm_set1_epi32(0xFFFF);
444 const auto horizLo = _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
445 const auto horizHi = _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
446
447 // Recombine low and high parts
448 return _mm_packs_epi32(horizLo, horizHi);
449 }
450 #endif
451
452
453 #ifdef __AVX2__
454 #define DEST_ELTS 16
455 #define set1_epi16 _mm256_set1_epi16
456 #define set1_epi32 _mm256_set1_epi32
457 #define setzero _mm256_setzero_si256
458 #define set1_ps _mm256_set1_ps
459 #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const*>(x))
460 #define unpacklo_epi8 _mm256_unpacklo_epi8
461 #define unpackhi_epi8 _mm256_unpackhi_epi8
462 #define madd_epi16 _mm256_madd_epi16
463 #define add_epi32 _mm256_add_epi32
464 #define mul_ps _mm256_mul_ps
465 #define cvtepi32_ps _mm256_cvtepi32_ps
466 #define sqrt_ps _mm256_sqrt_ps
467 #define cvttps_epi32 _mm256_cvttps_epi32
468 #define packs_epi32 _mm256_packs_epi32
469 #define packus_epi32 _mm256_packus_epi32
470 #define srli_epi32 _mm256_srli_epi32
471 #define mullo_epi16 _mm256_mullo_epi16
472 #define srli_epi16 _mm256_srli_epi16
473 #define cmpgt_epi16 _mm256_cmpgt_epi16
474 #define add_epi16 _mm256_add_epi16
475 #define sub_epi16 _mm256_sub_epi16
476 #define packus_epi16 _mm256_packus_epi16
477 /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
478 /* to get the lower 128-bit bits of what would be a true 256-bit vector register */
479 #define store_lo(x,y) _mm_storeu_si128(reinterpret_cast<__m128i*>(x), \
480 _mm256_extracti128_si256(_mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
481 #define hadd_epi16 _mm256_hadd_epi16
482 #define zeroupper() _mm256_zeroupper()
483 #else
484 #define DEST_ELTS 8
485 #define set1_epi16 _mm_set1_epi16
486 #define set1_epi32 _mm_set1_epi32
487 #define setzero _mm_setzero_si128
488 #define set1_ps _mm_set1_ps
489 #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const*>(x))
490 #define unpacklo_epi8 _mm_unpacklo_epi8
491 #define unpackhi_epi8 _mm_unpackhi_epi8
492 #define madd_epi16 _mm_madd_epi16
493 #define add_epi32 _mm_add_epi32
494 #define mul_ps _mm_mul_ps
495 #define cvtepi32_ps _mm_cvtepi32_ps
496 #define sqrt_ps _mm_sqrt_ps
497 #define cvttps_epi32 _mm_cvttps_epi32
498 #define packs_epi32 _mm_packs_epi32
499 #define packus_epi32 sse2_packus_epi32
500 #define srli_epi32 _mm_srli_epi32
501 #define mullo_epi16 _mm_mullo_epi16
502 #define srli_epi16 _mm_srli_epi16
503 #define cmpgt_epi16 _mm_cmpgt_epi16
504 #define add_epi16 _mm_add_epi16
505 #define sub_epi16 _mm_sub_epi16
506 #define packus_epi16 _mm_packus_epi16
507 #define store_lo(x,y) _mm_storel_epi64(reinterpret_cast<__m128i*>(x), (y))
508 #define hadd_epi16 sse2_hadd_epi16
509 #define zeroupper() (void)0
510 #endif
511
QuadraticMeanByteSSE2OrAVX2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)512 template<class T> static int QuadraticMeanByteSSE2OrAVX2(
513 int nDstXWidth,
514 int nChunkXSize,
515 const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
516 T* CPL_RESTRICT pDstScanline)
517 {
518 // Optimized implementation for RMS on Byte by
519 // processing by group of 8 output pixels, so as to use
520 // a single _mm_sqrt_ps() call for 4 output pixels
521 const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
522
523 int iDstPixel = 0;
524 const auto one16 = set1_epi16(1);
525 const auto one32 = set1_epi32(1);
526 const auto zero = setzero();
527 const auto minus32768 = set1_epi16(-32768);
528
529 for( ; iDstPixel < nDstXWidth - (DEST_ELTS-1); iDstPixel += DEST_ELTS )
530 {
531 // Load 2 * DEST_ELTS bytes from each line
532 auto firstLine = loadu_int(pSrcScanlineShifted);
533 auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
534 // Extend those Bytes as UInt16s
535 auto firstLineLo = unpacklo_epi8(firstLine, zero);
536 auto firstLineHi = unpackhi_epi8(firstLine, zero);
537 auto secondLineLo = unpacklo_epi8(secondLine, zero);
538 auto secondLineHi = unpackhi_epi8(secondLine, zero);
539
540 // Multiplication of 16 bit values and horizontal
541 // addition of 32 bit results
542 // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
543 firstLineLo = madd_epi16(firstLineLo, firstLineLo);
544 firstLineHi = madd_epi16(firstLineHi, firstLineHi);
545 secondLineLo = madd_epi16(secondLineLo, secondLineLo);
546 secondLineHi = madd_epi16(secondLineHi, secondLineHi);
547
548 // Vertical addition
549 const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
550 const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
551
552 const auto sumSquaresPlusOneDiv4Lo =
553 srli_epi32(add_epi32(sumSquaresLo, one32), 2);
554 const auto sumSquaresPlusOneDiv4Hi =
555 srli_epi32(add_epi32(sumSquaresHi, one32), 2);
556
557 // Take square root and truncate/floor to int32
558 const auto rmsLo = cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
559 const auto rmsHi = cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
560
561 // Merge back low and high registers with each RMS value
562 // as a 16 bit value.
563 auto rms = packs_epi32(rmsLo, rmsHi);
564
565 // Round to upper value if it minimizes the
566 // error |rms^2 - sumSquares/4|
567 // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
568 // rms += 1;
569 // which is equivalent to:
570 // if( rms * (rms + 1) < (sumSquares+1) / 4 )
571 // rms += 1;
572 // And both left and right parts fit on 16 (unsigned) bits
573 const auto sumSquaresPlusOneDiv4 = packus_epi32(
574 sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
575 // cmpgt_epi16 operates on signed int16, but here
576 // we have unsigned values, so shift them by -32768 before
577 auto mask = cmpgt_epi16(
578 add_epi16(sumSquaresPlusOneDiv4, minus32768),
579 add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
580 // The value of the mask will be -1 when the correction needs to be applied
581 rms = sub_epi16(rms, mask);
582
583 // Pack each 16 bit RMS value to 8 bits
584 rms = packus_epi16(rms, rms /* could be anything */);
585 store_lo(&pDstScanline[iDstPixel], rms);
586 pSrcScanlineShifted += 2 * DEST_ELTS;
587 }
588 zeroupper();
589
590 pSrcScanlineShiftedInOut = pSrcScanlineShifted;
591 return iDstPixel;
592 }
593
594 /************************************************************************/
595 /* AverageByteSSE2OrAVX2() */
596 /************************************************************************/
597
AverageByteSSE2OrAVX2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)598 template<class T> static int AverageByteSSE2OrAVX2(
599 int nDstXWidth,
600 int nChunkXSize,
601 const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
602 T* CPL_RESTRICT pDstScanline)
603 {
604 // Optimized implementation for average on Byte by
605 // processing by group of 8 output pixels.
606
607 const auto zero = setzero();
608 const auto two16 = set1_epi16(2);
609 const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
610
611 int iDstPixel = 0;
612 for( ; iDstPixel < nDstXWidth - (DEST_ELTS-1); iDstPixel += DEST_ELTS )
613 {
614 // Load 2 * DEST_ELTS bytes from each line
615 const auto firstLine = loadu_int(pSrcScanlineShifted);
616 const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
617 // Extend those Bytes as UInt16s
618 const auto firstLineLo = unpacklo_epi8(firstLine, zero);
619 const auto firstLineHi = unpackhi_epi8(firstLine, zero);
620 const auto secondLineLo = unpacklo_epi8(secondLine, zero);
621 const auto secondLineHi = unpackhi_epi8(secondLine, zero);
622
623 // Vertical addition
624 const auto sumLo = add_epi16(firstLineLo, secondLineLo);
625 const auto sumHi = add_epi16(firstLineHi, secondLineHi);
626
627 // Horizontal addition of adjacent pairs, and recombine low and high parts
628 const auto sum = hadd_epi16(sumLo, sumHi);
629
630 // average = (sum + 2) / 4
631 auto average = srli_epi16(add_epi16(sum, two16), 2);
632
633 // Pack each 16 bit average value to 8 bits
634 average = packus_epi16(average, average /* could be anything */);
635 store_lo(&pDstScanline[iDstPixel], average);
636 pSrcScanlineShifted += 2 * DEST_ELTS;
637 }
638 zeroupper();
639
640 pSrcScanlineShiftedInOut = pSrcScanlineShifted;
641 return iDstPixel;
642 }
643
644 /************************************************************************/
645 /* QuadraticMeanUInt16SSE2() */
646 /************************************************************************/
647
648 #ifdef __SSE3__
649 #define sse2_hadd_pd _mm_hadd_pd
650 #else
sse2_hadd_pd(__m128d a,__m128d b)651 inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
652 {
653 auto aLo_bLo = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
654 auto aHi_bHi = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
655 return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
656 }
657 #endif
658
SQUARE(__m128d x)659 inline __m128d SQUARE(__m128d x)
660 {
661 return _mm_mul_pd(x, x);
662 }
663
QuadraticMeanUInt16SSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)664 template<class T> static int QuadraticMeanUInt16SSE2(int nDstXWidth,
665 int nChunkXSize,
666 const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
667 T* CPL_RESTRICT pDstScanline)
668 {
669 // Optimized implementation for RMS on UInt16 by
670 // processing by group of 4 output pixels.
671 const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
672
673 int iDstPixel = 0;
674 const auto zero = _mm_setzero_si128();
675 const auto zeroDot25 = _mm_set1_pd(0.25);
676 const auto zeroDot5 = _mm_set1_pd(0.5);
677
678 for( ; iDstPixel < nDstXWidth - 3; iDstPixel += 4 )
679 {
680 // Load 8 UInt16 from each line
681 const auto firstLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted));
682 const auto secondLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + nChunkXSize));
683
684 // Detect if all of the source values fit in 14 bits.
685 // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
686 // and we can do a much faster implementation.
687 const auto maskTmp = _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
688 const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
689 _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
690 if( nMaskFitsIn14Bits == 0 )
691 {
692 // Multiplication of 16 bit values and horizontal
693 // addition of 32 bit results
694 const auto firstLineHSumSquare = _mm_madd_epi16(firstLine, firstLine);
695 const auto secondLineHSumSquare = _mm_madd_epi16(secondLine, secondLine);
696 // Vertical addition
697 const auto sumSquares = _mm_add_epi32(firstLineHSumSquare,
698 secondLineHSumSquare);
699 // In theory we should take sqrt(sumSquares * 0.25f)
700 // but given the rounding we do, this is equivalent to
701 // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
702 // sumSquares <= 4 * 16383^2
703 const auto one32 = _mm_set1_epi32(1);
704 const auto sumSquaresPlusOneDiv4 =
705 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
706 // Take square root and truncate/floor to int32
707 auto rms = _mm_cvttps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
708
709 // Round to upper value if it minimizes the
710 // error |rms^2 - sumSquares/4|
711 // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
712 // rms += 1;
713 // which is equivalent to:
714 // if( rms * rms + rms < (sumSquares+1) / 4 )
715 // rms += 1;
716 auto mask = _mm_cmpgt_epi32(
717 sumSquaresPlusOneDiv4,
718 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
719 rms = _mm_sub_epi32(rms, mask);
720 // Pack each 32 bit RMS value to 16 bits
721 rms = _mm_packs_epi32(rms, rms /* could be anything */);
722 _mm_storel_epi64(reinterpret_cast<__m128i*>(&pDstScanline[iDstPixel]), rms);
723 pSrcScanlineShifted += 8;
724 continue;
725 }
726
727 // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
728 // to 32 bit would result in 4 multiplications instead of 8, but mullo/mulhi
729 // have a worse throughput than mul_pd.
730
731 // Extend those UInt16s as UInt32s
732 const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
733 const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
734 const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
735 const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
736
737 // Multiplication of 32 bit values previously converted to 64 bit double
738 const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
739 const auto firstLineLoHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo,8)));
740 const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
741 const auto firstLineHiHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi,8)));
742
743 const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
744 const auto secondLineLoHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo,8)));
745 const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
746 const auto secondLineHiHi = SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi,8)));
747
748 // Vertical addition of squares
749 const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
750 const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
751 const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
752 const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
753
754 // Horizontal addition of squares
755 const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
756 const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
757
758 const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
759 const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
760 // Take square root and truncate/floor to int32
761 const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
762 const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
763
764 // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
765 // if( 0.5 < sumDivWeight - (rms * rms + rms) )
766 // rms += 1;
767 const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
768 const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
769 const auto rightLo = _mm_sub_pd(sumDivWeightLo,
770 _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
771 const auto rightHi = _mm_sub_pd(sumDivWeightHi,
772 _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
773
774 const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
775 const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
776 // The value of the mask will be -1 when the correction needs to be applied
777 const auto mask = _mm_castps_si128(_mm_shuffle_ps(
778 maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
779
780 auto rms = _mm_castps_si128(_mm_movelh_ps(
781 _mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
782 // Apply the correction
783 rms = _mm_sub_epi32(rms, mask);
784
785 // Pack each 32 bit RMS value to 16 bits
786 rms = sse2_packus_epi32(rms, rms /* could be anything */);
787 _mm_storel_epi64(reinterpret_cast<__m128i*>(&pDstScanline[iDstPixel]), rms);
788 pSrcScanlineShifted += 8;
789 }
790
791 pSrcScanlineShiftedInOut = pSrcScanlineShifted;
792 return iDstPixel;
793 }
794
795 /************************************************************************/
796 /* AverageUInt16SSE2() */
797 /************************************************************************/
798
AverageUInt16SSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)799 template<class T> static int AverageUInt16SSE2(
800 int nDstXWidth,
801 int nChunkXSize,
802 const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
803 T* CPL_RESTRICT pDstScanline)
804 {
805 // Optimized implementation for average on UInt16 by
806 // processing by group of 8 output pixels.
807
808 const auto mask = _mm_set1_epi32(0xFFFF);
809 const auto two = _mm_set1_epi32(2);
810 const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
811
812 int iDstPixel = 0;
813 for( ; iDstPixel < nDstXWidth - 7; iDstPixel += 8 )
814 {
815 __m128i averageLow;
816 // Load 8 UInt16 from each line
817 {
818 const auto firstLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted));
819 const auto secondLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + nChunkXSize));
820
821 // Horizontal addition and extension to 32 bit
822 const auto horizAddFirstLine = _mm_add_epi32(_mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
823 const auto horizAddSecondLine = _mm_add_epi32(_mm_and_si128(secondLine, mask), _mm_srli_epi32(secondLine, 16));
824
825 // Vertical addition and average computation
826 // average = (sum + 2) >> 2
827 const auto sum = _mm_add_epi32(_mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
828 averageLow = _mm_srli_epi32(sum, 2);
829 }
830 // Load 8 UInt16 from each line
831 __m128i averageHigh;
832 {
833 const auto firstLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + 8));
834 const auto secondLine = _mm_loadu_si128(reinterpret_cast<__m128i const*>(pSrcScanlineShifted + 8 + nChunkXSize));
835
836 // Horizontal addition and extension to 32 bit
837 const auto horizAddFirstLine = _mm_add_epi32(_mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
838 const auto horizAddSecondLine = _mm_add_epi32(_mm_and_si128(secondLine, mask), _mm_srli_epi32(secondLine, 16));
839
840 // Vertical addition and average computation
841 // average = (sum + 2) >> 2
842 const auto sum = _mm_add_epi32(_mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
843 averageHigh = _mm_srli_epi32(sum, 2);
844 }
845
846 // Pack each 32 bit average value to 16 bits
847 auto average = sse2_packus_epi32(averageLow, averageHigh);
848 _mm_storeu_si128(reinterpret_cast<__m128i*>(&pDstScanline[iDstPixel]), average);
849 pSrcScanlineShifted += 16;
850 }
851
852 pSrcScanlineShiftedInOut = pSrcScanlineShifted;
853 return iDstPixel;
854 }
855
856 /************************************************************************/
857 /* QuadraticMeanFloatSSE2() */
858 /************************************************************************/
859
SQUARE(__m128 x)860 inline __m128 SQUARE(__m128 x)
861 {
862 return _mm_mul_ps(x, x);
863 }
864
QuadraticMeanFloatSSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)865 template<class T> static int QuadraticMeanFloatSSE2(int nDstXWidth,
866 int nChunkXSize,
867 const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
868 T* CPL_RESTRICT pDstScanline)
869 {
870 // Optimized implementation for RMS on Float32 by
871 // processing by group of 4 output pixels.
872 const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
873
874 int iDstPixel = 0;
875 const auto minus_zero = _mm_set1_ps(-0.0f);
876 const auto zeroDot25 = _mm_set1_ps(0.25f);
877 const auto one = _mm_set1_ps(1.0f);
878 const auto infv = _mm_set1_ps(std::numeric_limits<float>::infinity());
879
880 for( ; iDstPixel < nDstXWidth - 3; iDstPixel += 4 )
881 {
882 // Load 8 Float32 from each line
883 auto firstLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted));
884 auto firstLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4));
885 auto secondLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + nChunkXSize));
886 auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4 + nChunkXSize));
887
888 // Take the absolute value
889 firstLineLo = _mm_andnot_ps(minus_zero, firstLineLo);
890 firstLineHi = _mm_andnot_ps(minus_zero, firstLineHi);
891 secondLineLo = _mm_andnot_ps(minus_zero, secondLineLo);
892 secondLineHi = _mm_andnot_ps(minus_zero, secondLineHi);
893
894 // Compute the maximum of each 4 value to RMS-average
895 auto maxLo = _mm_max_ps(firstLineLo, secondLineLo);
896 maxLo = _mm_max_ps(maxLo, _mm_shuffle_ps(maxLo, maxLo, _MM_SHUFFLE(2,3,0,1)));
897 auto maxHi = _mm_max_ps(firstLineHi, secondLineHi);
898 maxHi = _mm_max_ps(maxHi, _mm_shuffle_ps(maxHi, maxHi, _MM_SHUFFLE(2,3,0,1)));
899
900 const auto maxV = _mm_shuffle_ps(maxLo, maxHi, _MM_SHUFFLE(2,0,2,0));
901
902 // Normalize each value by the maximum of the 4 ones.
903 // This step is important to avoid that the square evaluates to infinity
904 // for sufficiently big input.
905 auto invMax = _mm_div_ps(one, maxV);
906 // Deal with 0 being the maximum to correct division by zero
907 // note: comparing to -0 leads to identical results as to comparing with 0
908 invMax = _mm_andnot_ps(_mm_cmpeq_ps(maxV, minus_zero), invMax);
909
910 const auto invMaxLo = _mm_shuffle_ps(invMax, invMax, _MM_SHUFFLE(1,1,0,0));
911 const auto invMaxHi = _mm_shuffle_ps(invMax, invMax, _MM_SHUFFLE(3,3,2,2));
912
913 firstLineLo = _mm_mul_ps(firstLineLo, invMaxLo);
914 firstLineHi = _mm_mul_ps(firstLineHi, invMaxHi);
915 secondLineLo = _mm_mul_ps(secondLineLo, invMaxLo);
916 secondLineHi = _mm_mul_ps(secondLineHi, invMaxHi);
917
918 // Compute squares
919 firstLineLo = SQUARE(firstLineLo);
920 firstLineHi = SQUARE(firstLineHi);
921 secondLineLo = SQUARE(secondLineLo);
922 secondLineHi = SQUARE(secondLineHi);
923
924 // Vertical addition
925 const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
926 const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
927
928 // Horizontal addition
929 const auto A = _mm_shuffle_ps(sumLo, sumHi, _MM_SHUFFLE(2,0,2,0));
930 const auto B = _mm_shuffle_ps(sumLo, sumHi, _MM_SHUFFLE(3,1,3,1));
931 const auto sumSquares = _mm_add_ps(A, B);
932
933 auto rms = _mm_mul_ps(maxV, _mm_sqrt_ps(_mm_mul_ps(sumSquares, zeroDot25)));
934
935 // Deal with infinity being the maximum
936 const auto maskIsInf = _mm_cmpeq_ps(maxV, infv);
937 rms = _mm_or_ps(_mm_andnot_ps(maskIsInf, rms), _mm_and_ps(maskIsInf, infv));
938
939 // coverity[incompatible_cast]
940 _mm_storeu_ps(reinterpret_cast<float*>(&pDstScanline[iDstPixel]), rms);
941 pSrcScanlineShifted += 8;
942 }
943
944 pSrcScanlineShiftedInOut = pSrcScanlineShifted;
945 return iDstPixel;
946 }
947
948 /************************************************************************/
949 /* AverageFloatSSE2() */
950 /************************************************************************/
951
AverageFloatSSE2(int nDstXWidth,int nChunkXSize,const T * & CPL_RESTRICT pSrcScanlineShiftedInOut,T * CPL_RESTRICT pDstScanline)952 template<class T> static int AverageFloatSSE2(int nDstXWidth,
953 int nChunkXSize,
954 const T*& CPL_RESTRICT pSrcScanlineShiftedInOut,
955 T* CPL_RESTRICT pDstScanline)
956 {
957 // Optimized implementation for average on Float32 by
958 // processing by group of 4 output pixels.
959 const T* CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
960
961 int iDstPixel = 0;
962 const auto zeroDot25 = _mm_set1_ps(0.25f);
963
964 for( ; iDstPixel < nDstXWidth - 3; iDstPixel += 4 )
965 {
966 // Load 8 Float32 from each line
967 const auto firstLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted));
968 const auto firstLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4));
969 const auto secondLineLo = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + nChunkXSize));
970 const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const*>(pSrcScanlineShifted + 4 + nChunkXSize));
971
972 // Vertical addition
973 const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
974 const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
975
976 // Horizontal addition
977 const auto A = _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
978 const auto B = _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
979 const auto sum = _mm_add_ps(A, B);
980
981 const auto average = _mm_mul_ps(sum, zeroDot25);
982
983 // coverity[incompatible_cast]
984 _mm_storeu_ps(reinterpret_cast<float*>(&pDstScanline[iDstPixel]), average);
985 pSrcScanlineShifted += 8;
986 }
987
988 pSrcScanlineShiftedInOut = pSrcScanlineShifted;
989 return iDstPixel;
990 }
991
992 #endif
993
994 /************************************************************************/
995 /* GDALResampleChunk32R_Average() */
996 /************************************************************************/
997
998 template <class T, class Tsum, GDALDataType eWrkDataType>
999 static CPLErr
GDALResampleChunk32R_AverageT(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,const T * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,const char * pszResampling,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,bool bPropagateNoData)1000 GDALResampleChunk32R_AverageT( double dfXRatioDstToSrc,
1001 double dfYRatioDstToSrc,
1002 double dfSrcXDelta,
1003 double dfSrcYDelta,
1004 const T* pChunk,
1005 const GByte * pabyChunkNodataMask,
1006 int nChunkXOff, int nChunkXSize,
1007 int nChunkYOff, int nChunkYSize,
1008 int nDstXOff, int nDstXOff2,
1009 int nDstYOff, int nDstYOff2,
1010 GDALRasterBand * poOverview,
1011 void** ppDstBuffer,
1012 const char * pszResampling,
1013 int bHasNoData, // TODO(schwehr): bool.
1014 float fNoDataValue,
1015 GDALColorTable* poColorTable,
1016 bool bPropagateNoData )
1017 {
1018 // AVERAGE_BIT2GRAYSCALE
1019 const bool bBit2Grayscale =
1020 CPL_TO_BOOL( STARTS_WITH_CI( pszResampling, "AVERAGE_BIT2G" ) );
1021 const bool bQuadraticMean =
1022 CPL_TO_BOOL( EQUAL( pszResampling, "RMS" ) );
1023 if( bBit2Grayscale )
1024 poColorTable = nullptr;
1025
1026 T tNoDataValue;
1027 if( !bHasNoData )
1028 tNoDataValue = 0;
1029 else
1030 tNoDataValue = static_cast<T>(fNoDataValue);
1031 const T tReplacementVal = static_cast<T>(GetReplacementValueIfNoData(
1032 poOverview->GetRasterDataType(), bHasNoData, fNoDataValue));
1033
1034 int nChunkRightXOff = nChunkXOff + nChunkXSize;
1035 int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1036 int nDstXWidth = nDstXOff2 - nDstXOff;
1037
1038 /* -------------------------------------------------------------------- */
1039 /* Allocate buffers. */
1040 /* -------------------------------------------------------------------- */
1041 *ppDstBuffer = static_cast<T *>(
1042 VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1043 GDALGetDataTypeSizeBytes(eWrkDataType)));
1044 if( *ppDstBuffer == nullptr )
1045 {
1046 return CE_Failure;
1047 }
1048 T* const pDstBuffer = static_cast<T*>(*ppDstBuffer);
1049
1050 struct PrecomputedXValue
1051 {
1052 int nLeftXOffShifted;
1053 int nRightXOffShifted;
1054 double dfLeftWeight;
1055 double dfRightWeight;
1056 double dfTotalWeightFullLine;
1057 };
1058 PrecomputedXValue* pasSrcX = static_cast<PrecomputedXValue *>(
1059 VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue) ) );
1060
1061 if( pasSrcX == nullptr )
1062 {
1063 VSIFree(pasSrcX);
1064 return CE_Failure;
1065 }
1066
1067 int nEntryCount = 0;
1068 GDALColorEntry* aEntries = nullptr;
1069 int nTransparentIdx = -1;
1070
1071 if( poColorTable &&
1072 !ReadColorTableAsArray(poColorTable, nEntryCount, aEntries,
1073 nTransparentIdx) )
1074 {
1075 VSIFree(pasSrcX);
1076 return CE_Failure;
1077 }
1078
1079 // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1080 // it as nodata value
1081 if( bHasNoData && fNoDataValue >= 0.0f && tNoDataValue < nEntryCount )
1082 {
1083 if( aEntries == nullptr )
1084 {
1085 CPLError(CE_Failure, CPLE_ObjectNull, "No aEntries.");
1086 VSIFree(pasSrcX);
1087 return CE_Failure;
1088 }
1089 aEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1090 }
1091 // Or if we have no explicit nodata, but a color table entry that is
1092 // transparent, consider it as the nodata value
1093 else if( !bHasNoData && nTransparentIdx >= 0 )
1094 {
1095 bHasNoData = TRUE;
1096 tNoDataValue = static_cast<T>(nTransparentIdx);
1097 }
1098
1099 /* ==================================================================== */
1100 /* Precompute inner loop constants. */
1101 /* ==================================================================== */
1102 bool bSrcXSpacingIsTwo = true;
1103 int nLastSrcXOff2 = -1;
1104 for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
1105 {
1106 double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1107 // Apply some epsilon to avoid numerical precision issues
1108 int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1109 double dfSrcXOff2 = dfSrcXDelta + (iDstPixel+1)* dfXRatioDstToSrc;
1110 int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1111
1112 if( nSrcXOff < nChunkXOff )
1113 nSrcXOff = nChunkXOff;
1114 if( nSrcXOff2 == nSrcXOff )
1115 nSrcXOff2 ++;
1116 if( nSrcXOff2 > nChunkRightXOff )
1117 nSrcXOff2 = nChunkRightXOff;
1118
1119 pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1120 pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted = nSrcXOff2 - nChunkXOff;
1121 pasSrcX[iDstPixel - nDstXOff].dfLeftWeight = ( nSrcXOff2 == nSrcXOff + 1 ) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1122 pasSrcX[iDstPixel - nDstXOff].dfRightWeight = 1 - (nSrcXOff2 - dfSrcXOff2);
1123 pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine = pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1124 if( nSrcXOff + 1 < nSrcXOff2 )
1125 {
1126 pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine += nSrcXOff2 - nSrcXOff - 2;
1127 pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine += pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1128 }
1129
1130 if( nSrcXOff2 - nSrcXOff != 2 ||
1131 (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff) )
1132 {
1133 bSrcXSpacingIsTwo = false;
1134 }
1135 nLastSrcXOff2 = nSrcXOff2;
1136 }
1137
1138 /* ==================================================================== */
1139 /* Loop over destination scanlines. */
1140 /* ==================================================================== */
1141 for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
1142 {
1143 double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1144 int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1145 if( nSrcYOff < nChunkYOff )
1146 nSrcYOff = nChunkYOff;
1147
1148 double dfSrcYOff2 = dfSrcYDelta + (iDstLine+1) * dfYRatioDstToSrc;
1149 int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1150 if( nSrcYOff2 == nSrcYOff )
1151 ++nSrcYOff2;
1152 if( nSrcYOff2 > nChunkBottomYOff )
1153 nSrcYOff2 = nChunkBottomYOff;
1154
1155 T* const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1156
1157 /* -------------------------------------------------------------------- */
1158 /* Loop over destination pixels */
1159 /* -------------------------------------------------------------------- */
1160 if( poColorTable == nullptr )
1161 {
1162 if( bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1163 pabyChunkNodataMask == nullptr )
1164 {
1165 if( eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16 )
1166 {
1167 // Optimized case : no nodata, overview by a factor of 2 and
1168 // regular x and y src spacing.
1169 const T* pSrcScanlineShifted =
1170 pChunk + pasSrcX[0].nLeftXOffShifted +
1171 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1172 int iDstPixel = 0;
1173 #ifdef USE_SSE2
1174 if( bQuadraticMean && eWrkDataType == GDT_Byte )
1175 {
1176 iDstPixel = QuadraticMeanByteSSE2OrAVX2(nDstXWidth,
1177 nChunkXSize,
1178 pSrcScanlineShifted,
1179 pDstScanline);
1180 }
1181 else if( bQuadraticMean /* && eWrkDataType == GDT_UInt16 */ )
1182 {
1183 iDstPixel = QuadraticMeanUInt16SSE2(nDstXWidth,
1184 nChunkXSize,
1185 pSrcScanlineShifted,
1186 pDstScanline);
1187 }
1188 else if( /* !bQuadraticMean && */ eWrkDataType == GDT_Byte )
1189 {
1190 iDstPixel = AverageByteSSE2OrAVX2(nDstXWidth,
1191 nChunkXSize,
1192 pSrcScanlineShifted,
1193 pDstScanline);
1194 }
1195 else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 ) */
1196 {
1197 iDstPixel = AverageUInt16SSE2(nDstXWidth,
1198 nChunkXSize,
1199 pSrcScanlineShifted,
1200 pDstScanline);
1201 }
1202 #endif
1203 for( ; iDstPixel < nDstXWidth; ++iDstPixel )
1204 {
1205 Tsum nTotal = 0;
1206 T nVal;
1207 if( bQuadraticMean )
1208 nTotal =
1209 SQUARE<Tsum>(pSrcScanlineShifted[0])
1210 + SQUARE<Tsum>(pSrcScanlineShifted[1])
1211 + SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize])
1212 + SQUARE<Tsum>(pSrcScanlineShifted[1+nChunkXSize]);
1213 else
1214 nTotal =
1215 pSrcScanlineShifted[0]
1216 + pSrcScanlineShifted[1]
1217 + pSrcScanlineShifted[nChunkXSize]
1218 + pSrcScanlineShifted[1+nChunkXSize];
1219
1220 constexpr int nTotalWeight = 4;
1221 if( bQuadraticMean )
1222 nVal = ComputeIntegerRMS_4values<T>(nTotal);
1223 else
1224 nVal = static_cast<T>((nTotal + nTotalWeight/2) / nTotalWeight);
1225
1226 // No need to compare nVal against tNoDataValue as we are
1227 // in a case where pabyChunkNodataMask == nullptr implies
1228 // the absence of nodata value.
1229 pDstScanline[iDstPixel] = nVal;
1230 pSrcScanlineShifted += 2;
1231 }
1232 }
1233 else
1234 {
1235 CPLAssert( eWrkDataType == GDT_Float32 );
1236 const T* pSrcScanlineShifted =
1237 pChunk + pasSrcX[0].nLeftXOffShifted +
1238 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1239 int iDstPixel = 0;
1240 #ifdef USE_SSE2
1241 if( bQuadraticMean )
1242 {
1243 iDstPixel = QuadraticMeanFloatSSE2(nDstXWidth,
1244 nChunkXSize,
1245 pSrcScanlineShifted,
1246 pDstScanline);
1247 }
1248 else
1249 {
1250 iDstPixel = AverageFloatSSE2(nDstXWidth,
1251 nChunkXSize,
1252 pSrcScanlineShifted,
1253 pDstScanline);
1254 }
1255 #endif
1256
1257 for( ; iDstPixel < nDstXWidth; ++iDstPixel )
1258 {
1259 T nVal;
1260 if( bQuadraticMean )
1261 {
1262 // Cast to double to avoid overflows
1263 // (using std::hypot() is much slower)
1264 nVal = static_cast<T>(std::sqrt(0.25 *
1265 ( SQUARE<double>(pSrcScanlineShifted[0])
1266 + SQUARE<double>(pSrcScanlineShifted[1])
1267 + SQUARE<double>(pSrcScanlineShifted[nChunkXSize])
1268 + SQUARE<double>(pSrcScanlineShifted[1+nChunkXSize]))));
1269 }
1270 else
1271 {
1272 nVal = static_cast<T>(0.25f *
1273 ( pSrcScanlineShifted[0]
1274 + pSrcScanlineShifted[1]
1275 + pSrcScanlineShifted[nChunkXSize]
1276 + pSrcScanlineShifted[1+nChunkXSize]));
1277 }
1278
1279 // No need to compare nVal against tNoDataValue as we are
1280 // in a case where pabyChunkNodataMask == nullptr implies
1281 // the absence of nodata value.
1282 pDstScanline[iDstPixel] = nVal;
1283 pSrcScanlineShifted += 2;
1284 }
1285 }
1286 }
1287 else
1288 {
1289 const double dfBottomWeight =
1290 (nSrcYOff + 1 == nSrcYOff2) ? 1.0 : 1.0 - (dfSrcYOff - nSrcYOff);
1291 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1292 nSrcYOff -= nChunkYOff;
1293 nSrcYOff2 -= nChunkYOff;
1294
1295 double dfTotalWeightFullColumn = dfBottomWeight;
1296 if( nSrcYOff + 1 < nSrcYOff2 )
1297 {
1298 dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1299 dfTotalWeightFullColumn += dfTopWeight;
1300 }
1301
1302 for( int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel )
1303 {
1304 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1305 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1306
1307 double dfTotal = 0;
1308 double dfTotalWeight = 0;
1309 if( pabyChunkNodataMask == nullptr )
1310 {
1311 auto pChunkShifted = pChunk +
1312 static_cast<GPtrDiff_t>(nSrcYOff) *nChunkXSize;
1313 int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1314 double dfWeightY = dfBottomWeight;
1315 while( true )
1316 {
1317 double dfTotalLine;
1318 if( bQuadraticMean )
1319 {
1320 // Left pixel
1321 {
1322 const T val = pChunkShifted[nSrcXOff];
1323 dfTotalLine = SQUARE<double>(val) * pasSrcX[iDstPixel].dfLeftWeight;
1324 }
1325
1326 if( nSrcXOff + 1 < nSrcXOff2 )
1327 {
1328 // Middle pixels
1329 for( int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2; ++iX )
1330 {
1331 const T val = pChunkShifted[iX];
1332 dfTotalLine += SQUARE<double>(val);
1333 }
1334
1335 // Right pixel
1336 {
1337 const T val = pChunkShifted[nSrcXOff2 - 1];
1338 dfTotalLine += SQUARE<double>(val) * pasSrcX[iDstPixel].dfRightWeight;
1339 }
1340 }
1341 }
1342 else
1343 {
1344 // Left pixel
1345 {
1346 const T val = pChunkShifted[nSrcXOff];
1347 dfTotalLine = val * pasSrcX[iDstPixel].dfLeftWeight;
1348 }
1349
1350 if( nSrcXOff + 1 < nSrcXOff2 )
1351 {
1352 // Middle pixels
1353 for( int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2; ++iX )
1354 {
1355 const T val = pChunkShifted[iX];
1356 dfTotalLine += val;
1357 }
1358
1359 // Right pixel
1360 {
1361 const T val = pChunkShifted[nSrcXOff2 - 1];
1362 dfTotalLine += val * pasSrcX[iDstPixel].dfRightWeight;
1363 }
1364 }
1365 }
1366
1367 dfTotal += dfTotalLine * dfWeightY;
1368 --nCounterY;
1369 if( nCounterY < 0 )
1370 break;
1371 pChunkShifted += nChunkXSize;
1372 dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1373 }
1374
1375 dfTotalWeight = pasSrcX[iDstPixel].dfTotalWeightFullLine * dfTotalWeightFullColumn;
1376 }
1377 else
1378 {
1379 GPtrDiff_t nCount = 0;
1380 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
1381 {
1382 const auto pChunkShifted = pChunk +
1383 static_cast<GPtrDiff_t>(iY) *nChunkXSize;
1384
1385 double dfTotalLine = 0;
1386 double dfTotalWeightLine = 0;
1387 // Left pixel
1388 {
1389 const int iX = nSrcXOff;
1390 const T val = pChunkShifted[iX];
1391 if( pabyChunkNodataMask[iX + iY *nChunkXSize] )
1392 {
1393 nCount ++;
1394 const double dfWeightX = pasSrcX[iDstPixel].dfLeftWeight;
1395 dfTotalWeightLine = dfWeightX;
1396 if( bQuadraticMean )
1397 dfTotalLine = SQUARE<double>(val) * dfWeightX;
1398 else
1399 dfTotalLine = val * dfWeightX;
1400 }
1401 }
1402
1403 if( nSrcXOff + 1 < nSrcXOff2 )
1404 {
1405 // Middle pixels
1406 for( int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2; ++iX )
1407 {
1408 const T val = pChunkShifted[iX];
1409 if( pabyChunkNodataMask[iX + iY *nChunkXSize] )
1410 {
1411 nCount ++;
1412 dfTotalWeightLine += 1;
1413 if( bQuadraticMean )
1414 dfTotalLine += SQUARE<double>(val);
1415 else
1416 dfTotalLine += val;
1417 }
1418 }
1419
1420 // Right pixel
1421 {
1422 const int iX = nSrcXOff2 - 1;
1423 const T val = pChunkShifted[iX];
1424 if( pabyChunkNodataMask[iX + iY *nChunkXSize] )
1425 {
1426 nCount ++;
1427 const double dfWeightX = pasSrcX[iDstPixel].dfRightWeight;
1428 dfTotalWeightLine += dfWeightX;
1429 if( bQuadraticMean )
1430 dfTotalLine += SQUARE<double>(val) * dfWeightX;
1431 else
1432 dfTotalLine += val * dfWeightX;
1433 }
1434 }
1435 }
1436
1437 const double dfWeightY =
1438 (iY == nSrcYOff) ? dfBottomWeight :
1439 (iY + 1 == nSrcYOff2) ? dfTopWeight :
1440 1.0;
1441 dfTotal += dfTotalLine * dfWeightY;
1442 dfTotalWeight += dfTotalWeightLine * dfWeightY;
1443 }
1444
1445 if( nCount == 0 ||
1446 (bPropagateNoData && nCount <
1447 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) * (nSrcXOff2 - nSrcXOff)))
1448 {
1449 pDstScanline[iDstPixel] = tNoDataValue;
1450 continue;
1451 }
1452 }
1453 if( eWrkDataType == GDT_Byte ||
1454 eWrkDataType == GDT_UInt16)
1455 {
1456 T nVal;
1457 if( bQuadraticMean )
1458 nVal = ComputeIntegerRMS<T>(dfTotal, dfTotalWeight);
1459 else
1460 nVal = static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1461 if( bHasNoData && nVal == tNoDataValue )
1462 nVal = tReplacementVal;
1463 pDstScanline[iDstPixel] = nVal;
1464 }
1465 else
1466 {
1467 T nVal;
1468 if( bQuadraticMean )
1469 nVal = static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1470 else
1471 nVal = static_cast<T>(dfTotal / dfTotalWeight);
1472 if( bHasNoData && nVal == tNoDataValue )
1473 nVal = tReplacementVal;
1474 pDstScanline[iDstPixel] = nVal;
1475 }
1476 }
1477 }
1478 }
1479 else
1480 {
1481 nSrcYOff -= nChunkYOff;
1482 nSrcYOff2 -= nChunkYOff;
1483
1484 for( int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel )
1485 {
1486 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1487 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1488
1489 GPtrDiff_t nTotalR = 0;
1490 GPtrDiff_t nTotalG = 0;
1491 GPtrDiff_t nTotalB = 0;
1492 GPtrDiff_t nCount = 0;
1493
1494 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
1495 {
1496 for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
1497 {
1498 const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *nChunkXSize];
1499 const unsigned nVal = static_cast<unsigned>(val);
1500 if( nVal < static_cast<unsigned>(nEntryCount) && aEntries[nVal].c4 )
1501 {
1502 if( bQuadraticMean )
1503 {
1504 nTotalR += SQUARE<int>(aEntries[nVal].c1);
1505 nTotalG += SQUARE<int>(aEntries[nVal].c2);
1506 nTotalB += SQUARE<int>(aEntries[nVal].c3);
1507 ++nCount;
1508 }
1509 else
1510 {
1511 nTotalR += aEntries[nVal].c1;
1512 nTotalG += aEntries[nVal].c2;
1513 nTotalB += aEntries[nVal].c3;
1514 ++nCount;
1515 }
1516 }
1517 }
1518 }
1519
1520 if( nCount == 0 ||
1521 (bPropagateNoData && nCount <
1522 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) * (nSrcXOff2 - nSrcXOff)) )
1523 {
1524 pDstScanline[iDstPixel] = tNoDataValue;
1525 }
1526 else
1527 {
1528 int nR, nG, nB;
1529 if( bQuadraticMean )
1530 {
1531 nR = static_cast<int>(sqrt(nTotalR / nCount) + 0.5);
1532 nG = static_cast<int>(sqrt(nTotalG / nCount) + 0.5);
1533 nB = static_cast<int>(sqrt(nTotalB / nCount) + 0.5);
1534 }
1535 else
1536 {
1537 nR = static_cast<int>((nTotalR + nCount / 2) / nCount);
1538 nG = static_cast<int>((nTotalG + nCount / 2) / nCount);
1539 nB = static_cast<int>((nTotalB + nCount / 2) / nCount);
1540 }
1541 pDstScanline[iDstPixel] = static_cast<T>(GDALFindBestEntry(
1542 nEntryCount, aEntries, nR, nG, nB));
1543 }
1544 }
1545 }
1546 }
1547
1548 CPLFree( aEntries );
1549 CPLFree( pasSrcX );
1550
1551 return CE_None;
1552 }
1553
1554 static CPLErr
GDALResampleChunk32R_Average(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,GDALDataType eWrkDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char * pszResampling,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,GDALDataType,bool bPropagateNoData)1555 GDALResampleChunk32R_Average( double dfXRatioDstToSrc, double dfYRatioDstToSrc,
1556 double dfSrcXDelta,
1557 double dfSrcYDelta,
1558 GDALDataType eWrkDataType,
1559 const void * pChunk,
1560 const GByte * pabyChunkNodataMask,
1561 int nChunkXOff, int nChunkXSize,
1562 int nChunkYOff, int nChunkYSize,
1563 int nDstXOff, int nDstXOff2,
1564 int nDstYOff, int nDstYOff2,
1565 GDALRasterBand * poOverview,
1566 void** ppDstBuffer,
1567 GDALDataType* peDstBufferDataType,
1568 const char * pszResampling,
1569 int bHasNoData, float fNoDataValue,
1570 GDALColorTable* poColorTable,
1571 GDALDataType /* eSrcDataType */,
1572 bool bPropagateNoData )
1573 {
1574 if( eWrkDataType == GDT_Byte )
1575 {
1576 *peDstBufferDataType = eWrkDataType;
1577 return GDALResampleChunk32R_AverageT<GByte, int, GDT_Byte>(
1578 dfXRatioDstToSrc, dfYRatioDstToSrc,
1579 dfSrcXDelta, dfSrcYDelta,
1580 static_cast<const GByte *>( pChunk ),
1581 pabyChunkNodataMask,
1582 nChunkXOff, nChunkXSize,
1583 nChunkYOff, nChunkYSize,
1584 nDstXOff, nDstXOff2,
1585 nDstYOff, nDstYOff2,
1586 poOverview,
1587 ppDstBuffer,
1588 pszResampling,
1589 bHasNoData, fNoDataValue,
1590 poColorTable,
1591 bPropagateNoData );
1592 }
1593 else if( eWrkDataType == GDT_UInt16 )
1594 {
1595 *peDstBufferDataType = eWrkDataType;
1596 if( EQUAL(pszResampling, "RMS") )
1597 {
1598 // Use double as accumulation type, because UInt32 could overflow
1599 return GDALResampleChunk32R_AverageT<GUInt16, double, GDT_UInt16>(
1600 dfXRatioDstToSrc, dfYRatioDstToSrc,
1601 dfSrcXDelta, dfSrcYDelta,
1602 static_cast<const GUInt16 *>( pChunk ),
1603 pabyChunkNodataMask,
1604 nChunkXOff, nChunkXSize,
1605 nChunkYOff, nChunkYSize,
1606 nDstXOff, nDstXOff2,
1607 nDstYOff, nDstYOff2,
1608 poOverview,
1609 ppDstBuffer,
1610 pszResampling,
1611 bHasNoData, fNoDataValue,
1612 poColorTable,
1613 bPropagateNoData );
1614 }
1615 else
1616 {
1617 return GDALResampleChunk32R_AverageT<GUInt16, GUInt32, GDT_UInt16>(
1618 dfXRatioDstToSrc, dfYRatioDstToSrc,
1619 dfSrcXDelta, dfSrcYDelta,
1620 static_cast<const GUInt16 *>( pChunk ),
1621 pabyChunkNodataMask,
1622 nChunkXOff, nChunkXSize,
1623 nChunkYOff, nChunkYSize,
1624 nDstXOff, nDstXOff2,
1625 nDstYOff, nDstYOff2,
1626 poOverview,
1627 ppDstBuffer,
1628 pszResampling,
1629 bHasNoData, fNoDataValue,
1630 poColorTable,
1631 bPropagateNoData );
1632 }
1633 }
1634 else if( eWrkDataType == GDT_Float32 )
1635 {
1636 *peDstBufferDataType = eWrkDataType;
1637 return GDALResampleChunk32R_AverageT<float, double, GDT_Float32>(
1638 dfXRatioDstToSrc, dfYRatioDstToSrc,
1639 dfSrcXDelta, dfSrcYDelta,
1640 static_cast<const float *>( pChunk ),
1641 pabyChunkNodataMask,
1642 nChunkXOff, nChunkXSize,
1643 nChunkYOff, nChunkYSize,
1644 nDstXOff, nDstXOff2,
1645 nDstYOff, nDstYOff2,
1646 poOverview,
1647 ppDstBuffer,
1648 pszResampling,
1649 bHasNoData, fNoDataValue,
1650 poColorTable,
1651 bPropagateNoData );
1652 }
1653
1654 CPLAssert(false);
1655 return CE_Failure;
1656 }
1657
1658 /************************************************************************/
1659 /* GDALResampleChunk32R_Gauss() */
1660 /************************************************************************/
1661
1662 static CPLErr
GDALResampleChunk32R_Gauss(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double,double,GDALDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char *,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,GDALDataType,bool)1663 GDALResampleChunk32R_Gauss( double dfXRatioDstToSrc, double dfYRatioDstToSrc,
1664 double /* dfSrcXDelta */,
1665 double /* dfSrcYDelta */,
1666 GDALDataType /* eWrkDataType */,
1667 const void * pChunk,
1668 const GByte * pabyChunkNodataMask,
1669 int nChunkXOff, int nChunkXSize,
1670 int nChunkYOff, int nChunkYSize,
1671 int nDstXOff, int nDstXOff2,
1672 int nDstYOff, int nDstYOff2,
1673 GDALRasterBand * poOverview,
1674 void** ppDstBuffer,
1675 GDALDataType* peDstBufferDataType,
1676 const char * /* pszResampling */,
1677 int bHasNoData, float fNoDataValue,
1678 GDALColorTable* poColorTable,
1679 GDALDataType /* eSrcDataType */,
1680 bool /* bPropagateNoData */ )
1681
1682 {
1683 const float * const pafChunk = static_cast<const float *>( pChunk );
1684
1685 *ppDstBuffer =
1686 VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1687 GDALGetDataTypeSizeBytes(GDT_Float32));
1688 if( *ppDstBuffer == nullptr )
1689 {
1690 return CE_Failure;
1691 }
1692 *peDstBufferDataType = GDT_Float32;
1693 float* const pafDstBuffer = static_cast<float*>(*ppDstBuffer);
1694
1695 /* -------------------------------------------------------------------- */
1696 /* Create the filter kernel and allocate scanline buffer. */
1697 /* -------------------------------------------------------------------- */
1698 int nGaussMatrixDim = 3;
1699 const int *panGaussMatrix;
1700 constexpr int anGaussMatrix3x3[] ={
1701 1, 2, 1,
1702 2, 4, 2,
1703 1, 2, 1
1704 };
1705 constexpr int anGaussMatrix5x5[] = {
1706 1, 4, 6, 4, 1,
1707 4, 16, 24, 16, 4,
1708 6, 24, 36, 24, 6,
1709 4, 16, 24, 16, 4,
1710 1, 4, 6, 4, 1};
1711 constexpr int anGaussMatrix7x7[] = {
1712 1, 6, 15, 20, 15, 6, 1,
1713 6, 36, 90, 120, 90, 36, 6,
1714 15, 90, 225, 300, 225, 90, 15,
1715 20, 120, 300, 400, 300, 120, 20,
1716 15, 90, 225, 300, 225, 90, 15,
1717 6, 36, 90, 120, 90, 36, 6,
1718 1, 6, 15, 20, 15, 6, 1};
1719
1720 const int nOXSize = poOverview->GetXSize();
1721 const int nOYSize = poOverview->GetYSize();
1722 const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1723
1724 // matrix for gauss filter
1725 if(nResYFactor <= 2 )
1726 {
1727 panGaussMatrix = anGaussMatrix3x3;
1728 nGaussMatrixDim=3;
1729 }
1730 else if( nResYFactor <= 4 )
1731 {
1732 panGaussMatrix = anGaussMatrix5x5;
1733 nGaussMatrixDim=5;
1734 }
1735 else
1736 {
1737 panGaussMatrix = anGaussMatrix7x7;
1738 nGaussMatrixDim=7;
1739 }
1740
1741 #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1742 int* panGaussMatrixDup = static_cast<int*>(
1743 CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1744 memcpy(panGaussMatrixDup, panGaussMatrix,
1745 sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1746 panGaussMatrix = panGaussMatrixDup;
1747 #endif
1748
1749 if( !bHasNoData )
1750 fNoDataValue = 0.0f;
1751
1752 int nEntryCount = 0;
1753 GDALColorEntry* aEntries = nullptr;
1754 int nTransparentIdx = -1;
1755 if( poColorTable &&
1756 !ReadColorTableAsArray(poColorTable, nEntryCount, aEntries,
1757 nTransparentIdx) )
1758 {
1759 return CE_Failure;
1760 }
1761
1762 // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1763 // it as nodata value.
1764 if( bHasNoData && fNoDataValue >= 0.0f && fNoDataValue < nEntryCount )
1765 {
1766 if( aEntries == nullptr )
1767 {
1768 CPLError(CE_Failure, CPLE_ObjectNull, "No aEntries");
1769 return CE_Failure;
1770 }
1771 aEntries[static_cast<int>(fNoDataValue)].c4 = 0;
1772 }
1773 // Or if we have no explicit nodata, but a color table entry that is
1774 // transparent, consider it as the nodata value.
1775 else if( !bHasNoData && nTransparentIdx >= 0 )
1776 {
1777 fNoDataValue = static_cast<float>(nTransparentIdx);
1778 }
1779
1780 const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1781 const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1782 const int nDstXWidth = nDstXOff2 - nDstXOff;
1783
1784 /* ==================================================================== */
1785 /* Loop over destination scanlines. */
1786 /* ==================================================================== */
1787 for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
1788 {
1789 int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1790 int nSrcYOff2 =
1791 static_cast<int>(0.5 + (iDstLine+1) * dfYRatioDstToSrc) + 1;
1792
1793 if( nSrcYOff < nChunkYOff )
1794 {
1795 nSrcYOff = nChunkYOff;
1796 nSrcYOff2++;
1797 }
1798
1799 const int iSizeY = nSrcYOff2 - nSrcYOff;
1800 nSrcYOff = nSrcYOff + iSizeY/2 - nGaussMatrixDim/2;
1801 nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1802
1803 if( nSrcYOff2 > nChunkBottomYOff ||
1804 (dfYRatioDstToSrc > 1 && iDstLine == nOYSize-1) )
1805 {
1806 nSrcYOff2 = std::min(nChunkBottomYOff,
1807 nSrcYOff + nGaussMatrixDim);
1808 }
1809
1810 int nYShiftGaussMatrix = 0;
1811 if(nSrcYOff < nChunkYOff)
1812 {
1813 nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1814 nSrcYOff = nChunkYOff;
1815 }
1816
1817 const float * const pafSrcScanline =
1818 pafChunk + ((nSrcYOff-nChunkYOff) * nChunkXSize);
1819 const GByte *pabySrcScanlineNodataMask = nullptr;
1820 if( pabyChunkNodataMask != nullptr )
1821 pabySrcScanlineNodataMask =
1822 pabyChunkNodataMask + ((nSrcYOff-nChunkYOff) * nChunkXSize);
1823
1824 /* -------------------------------------------------------------------- */
1825 /* Loop over destination pixels */
1826 /* -------------------------------------------------------------------- */
1827 float* const pafDstScanline = pafDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1828 for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
1829 {
1830 int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1831 int nSrcXOff2 =
1832 static_cast<int>(0.5 + (iDstPixel+1) * dfXRatioDstToSrc) + 1;
1833
1834 if( nSrcXOff < nChunkXOff )
1835 {
1836 nSrcXOff = nChunkXOff;
1837 nSrcXOff2++;
1838 }
1839
1840 const int iSizeX = nSrcXOff2 - nSrcXOff;
1841 nSrcXOff = nSrcXOff + iSizeX/2 - nGaussMatrixDim/2;
1842 nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1843
1844 if( nSrcXOff2 > nChunkRightXOff ||
1845 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize-1) )
1846 {
1847 nSrcXOff2 = std::min(nChunkRightXOff,
1848 nSrcXOff + nGaussMatrixDim);
1849 }
1850
1851 int nXShiftGaussMatrix = 0;
1852 if(nSrcXOff < nChunkXOff)
1853 {
1854 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1855 nSrcXOff = nChunkXOff;
1856 }
1857
1858 if( poColorTable == nullptr )
1859 {
1860 double dfTotal = 0.0;
1861 GInt64 nCount = 0;
1862 const int *panLineWeight = panGaussMatrix +
1863 nYShiftGaussMatrix * nGaussMatrixDim + nXShiftGaussMatrix;
1864
1865 for( int j=0, iY = nSrcYOff;
1866 iY < nSrcYOff2;
1867 ++iY, ++j, panLineWeight += nGaussMatrixDim )
1868 {
1869 for( int i=0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i )
1870 {
1871 const double val =
1872 pafSrcScanline[iX-nChunkXOff+static_cast<GPtrDiff_t>(iY-nSrcYOff)
1873 * nChunkXSize];
1874 if( pabySrcScanlineNodataMask == nullptr ||
1875 pabySrcScanlineNodataMask[iX - nChunkXOff
1876 +static_cast<GPtrDiff_t>(iY - nSrcYOff)
1877 * nChunkXSize] )
1878 {
1879 const int nWeight = panLineWeight[i];
1880 dfTotal += val * nWeight;
1881 nCount += nWeight;
1882 }
1883 }
1884 }
1885
1886 if( nCount == 0 )
1887 {
1888 pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
1889 }
1890 else
1891 {
1892 pafDstScanline[iDstPixel - nDstXOff] =
1893 static_cast<float>(dfTotal / nCount);
1894 }
1895 }
1896 else
1897 {
1898 GInt64 nTotalR = 0;
1899 GInt64 nTotalG = 0;
1900 GInt64 nTotalB = 0;
1901 GInt64 nTotalWeight = 0;
1902 const int *panLineWeight =
1903 panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1904 nXShiftGaussMatrix;
1905
1906 for( int j=0, iY = nSrcYOff; iY < nSrcYOff2;
1907 ++iY, ++j, panLineWeight += nGaussMatrixDim )
1908 {
1909 for( int i=0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i )
1910 {
1911 const double val =
1912 pafSrcScanline[iX - nChunkXOff +
1913 static_cast<GPtrDiff_t>(iY-nSrcYOff) * nChunkXSize];
1914 int nVal = static_cast<int>(val);
1915 if( nVal >= 0 && nVal < nEntryCount &&
1916 aEntries[nVal].c4 )
1917 {
1918 const int nWeight = panLineWeight[i];
1919 nTotalR += aEntries[nVal].c1 * nWeight;
1920 nTotalG += aEntries[nVal].c2 * nWeight;
1921 nTotalB += aEntries[nVal].c3 * nWeight;
1922 nTotalWeight += nWeight;
1923 }
1924 }
1925 }
1926
1927 if( nTotalWeight == 0 )
1928 {
1929 pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
1930 }
1931 else
1932 {
1933 const int nR =
1934 static_cast<int>((nTotalR + nTotalWeight / 2) / nTotalWeight);
1935 const int nG =
1936 static_cast<int>((nTotalG + nTotalWeight / 2) / nTotalWeight);
1937 const int nB =
1938 static_cast<int>((nTotalB + nTotalWeight / 2) / nTotalWeight);
1939 pafDstScanline[iDstPixel - nDstXOff] =
1940 static_cast<float>( GDALFindBestEntry(
1941 nEntryCount, aEntries, nR, nG, nB ) );
1942 }
1943 }
1944 }
1945 }
1946
1947 CPLFree( aEntries );
1948 #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1949 CPLFree( panGaussMatrixDup );
1950 #endif
1951
1952 return CE_None;
1953 }
1954
1955 /************************************************************************/
1956 /* GDALResampleChunk32R_Mode() */
1957 /************************************************************************/
1958
1959 static CPLErr
GDALResampleChunk32R_Mode(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,GDALDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand *,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char *,int bHasNoData,float fNoDataValue,GDALColorTable * poColorTable,GDALDataType eSrcDataType,bool)1960 GDALResampleChunk32R_Mode( double dfXRatioDstToSrc, double dfYRatioDstToSrc,
1961 double dfSrcXDelta,
1962 double dfSrcYDelta,
1963 GDALDataType /* eWrkDataType */,
1964 const void * pChunk,
1965 const GByte * pabyChunkNodataMask,
1966 int nChunkXOff, int nChunkXSize,
1967 int nChunkYOff, int nChunkYSize,
1968 int nDstXOff, int nDstXOff2,
1969 int nDstYOff, int nDstYOff2,
1970 GDALRasterBand * /* poOverview */,
1971 void** ppDstBuffer,
1972 GDALDataType* peDstBufferDataType,
1973 const char * /* pszResampling */,
1974 int bHasNoData, float fNoDataValue,
1975 GDALColorTable* poColorTable,
1976 GDALDataType eSrcDataType,
1977 bool /* bPropagateNoData */ )
1978
1979 {
1980 const float * const pafChunk = static_cast<const float *>( pChunk );
1981
1982 const int nDstXSize = nDstXOff2 - nDstXOff;
1983 *ppDstBuffer =
1984 VSI_MALLOC3_VERBOSE(nDstXSize, nDstYOff2 - nDstYOff,
1985 GDALGetDataTypeSizeBytes(GDT_Float32));
1986 if( *ppDstBuffer == nullptr )
1987 {
1988 return CE_Failure;
1989 }
1990 *peDstBufferDataType = GDT_Float32;
1991 float* const pafDstBuffer = static_cast<float*>(*ppDstBuffer);
1992
1993 /* -------------------------------------------------------------------- */
1994 /* Create the filter kernel and allocate scanline buffer. */
1995 /* -------------------------------------------------------------------- */
1996
1997 if( !bHasNoData )
1998 fNoDataValue = 0.0f;
1999 int nEntryCount = 0;
2000 GDALColorEntry* aEntries = nullptr;
2001 int nTransparentIdx = -1;
2002 if( poColorTable &&
2003 !ReadColorTableAsArray(poColorTable, nEntryCount,
2004 aEntries, nTransparentIdx) )
2005 {
2006 return CE_Failure;
2007 }
2008
2009 size_t nMaxNumPx = 0;
2010 float *pafVals = nullptr;
2011 int *panSums = nullptr;
2012
2013 const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2014 const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2015 std::vector<int> anVals(256, 0);
2016
2017 /* ==================================================================== */
2018 /* Loop over destination scanlines. */
2019 /* ==================================================================== */
2020 for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
2021 {
2022 double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2023 int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2024 #ifdef only_pixels_with_more_than_10_pct_participation
2025 // When oversampling, don't take into account pixels that have a tiny
2026 // participation in the resulting pixel
2027 if( dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2028 nSrcYOff < nChunkBottomYOff)
2029 nSrcYOff ++;
2030 #endif
2031 if( nSrcYOff < nChunkYOff )
2032 nSrcYOff = nChunkYOff;
2033
2034 double dfSrcYOff2 = dfSrcYDelta + (iDstLine+1) * dfYRatioDstToSrc;
2035 int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2036 #ifdef only_pixels_with_more_than_10_pct_participation
2037 // When oversampling, don't take into account pixels that have a tiny
2038 // participation in the resulting pixel
2039 if( dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2040 nSrcYOff2 > nChunkYOff)
2041 nSrcYOff2 --;
2042 #endif
2043 if( nSrcYOff2 == nSrcYOff )
2044 ++nSrcYOff2;
2045 if( nSrcYOff2 > nChunkBottomYOff )
2046 nSrcYOff2 = nChunkBottomYOff;
2047
2048 const float * const pafSrcScanline =
2049 pafChunk + (static_cast<GPtrDiff_t>(nSrcYOff-nChunkYOff) * nChunkXSize);
2050 const GByte *pabySrcScanlineNodataMask = nullptr;
2051 if( pabyChunkNodataMask != nullptr )
2052 pabySrcScanlineNodataMask =
2053 pabyChunkNodataMask + static_cast<GPtrDiff_t>(nSrcYOff-nChunkYOff) * nChunkXSize;
2054
2055 float* const pafDstScanline = pafDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2056 /* -------------------------------------------------------------------- */
2057 /* Loop over destination pixels */
2058 /* -------------------------------------------------------------------- */
2059 for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
2060 {
2061 double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2062 // Apply some epsilon to avoid numerical precision issues
2063 int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2064 #ifdef only_pixels_with_more_than_10_pct_participation
2065 // When oversampling, don't take into account pixels that have a tiny
2066 // participation in the resulting pixel
2067 if( dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2068 nSrcXOff < nChunkRightXOff)
2069 nSrcXOff ++;
2070 #endif
2071 if( nSrcXOff < nChunkXOff )
2072 nSrcXOff = nChunkXOff;
2073
2074 double dfSrcXOff2 = dfSrcXDelta + (iDstPixel+1)* dfXRatioDstToSrc;
2075 int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2076 #ifdef only_pixels_with_more_than_10_pct_participation
2077 // When oversampling, don't take into account pixels that have a tiny
2078 // participation in the resulting pixel
2079 if( dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2080 nSrcXOff2 > nChunkXOff)
2081 nSrcXOff2 --;
2082 #endif
2083 if( nSrcXOff2 == nSrcXOff )
2084 nSrcXOff2 ++;
2085 if( nSrcXOff2 > nChunkRightXOff )
2086 nSrcXOff2 = nChunkRightXOff;
2087
2088 if( eSrcDataType != GDT_Byte || nEntryCount > 256 )
2089 {
2090 // Not sure how much sense it makes to run a majority
2091 // filter on floating point data, but here it is for the sake
2092 // of compatibility. It won't look right on RGB images by the
2093 // nature of the filter.
2094
2095 if( nSrcYOff2 - nSrcYOff <= 0 ||
2096 nSrcXOff2 - nSrcXOff <= 0 ||
2097 nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2098 static_cast<size_t>(nSrcYOff2-nSrcYOff)*
2099 static_cast<size_t>(nSrcXOff2-nSrcXOff) >
2100 std::numeric_limits<size_t>::max() / sizeof(float) )
2101 {
2102 CPLError(CE_Failure, CPLE_NotSupported,
2103 "Too big downsampling factor");
2104 CPLFree( aEntries );
2105 CPLFree( pafVals );
2106 CPLFree( panSums );
2107 return CE_Failure;
2108 }
2109 const size_t nNumPx = static_cast<size_t>(nSrcYOff2-nSrcYOff)*
2110 static_cast<size_t>(nSrcXOff2-nSrcXOff);
2111 size_t iMaxInd = 0;
2112 size_t iMaxVal = 0;
2113 bool biMaxValdValid = false;
2114
2115 if( pafVals == nullptr || nNumPx > nMaxNumPx )
2116 {
2117 float* pafValsNew = static_cast<float *>(
2118 VSI_REALLOC_VERBOSE(pafVals, nNumPx * sizeof(float)) );
2119 int* panSumsNew = static_cast<int *>(
2120 VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)) );
2121 if( pafValsNew != nullptr )
2122 pafVals = pafValsNew;
2123 if( panSumsNew != nullptr )
2124 panSums = panSumsNew;
2125 if( pafValsNew == nullptr || panSumsNew == nullptr )
2126 {
2127 CPLFree( aEntries );
2128 CPLFree( pafVals );
2129 CPLFree( panSums );
2130 return CE_Failure;
2131 }
2132 nMaxNumPx = nNumPx;
2133 }
2134
2135 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
2136 {
2137 const GPtrDiff_t iTotYOff = static_cast<GPtrDiff_t>(iY-nSrcYOff)*nChunkXSize-nChunkXOff;
2138 for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
2139 {
2140 if( pabySrcScanlineNodataMask == nullptr ||
2141 pabySrcScanlineNodataMask[iX+iTotYOff] )
2142 {
2143 const float fVal = pafSrcScanline[iX+iTotYOff];
2144 size_t i = 0; // Used after for.
2145
2146 // Check array for existing entry.
2147 for( ; i < iMaxInd; ++i )
2148 if( pafVals[i] == fVal
2149 && ++panSums[i] > panSums[iMaxVal] )
2150 {
2151 iMaxVal = i;
2152 biMaxValdValid = true;
2153 break;
2154 }
2155
2156 // Add to arr if entry not already there.
2157 if( i == iMaxInd )
2158 {
2159 pafVals[iMaxInd] = fVal;
2160 panSums[iMaxInd] = 1;
2161
2162 if( !biMaxValdValid )
2163 {
2164 iMaxVal = iMaxInd;
2165 biMaxValdValid = true;
2166 }
2167
2168 ++iMaxInd;
2169 }
2170 }
2171 }
2172 }
2173
2174 if( !biMaxValdValid )
2175 pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
2176 else
2177 pafDstScanline[iDstPixel - nDstXOff] = pafVals[iMaxVal];
2178 }
2179 else // if( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2180 {
2181 // So we go here for a paletted or non-paletted byte band.
2182 // The input values are then between 0 and 255.
2183 int nMaxVal = 0;
2184 int iMaxInd = -1;
2185
2186 // The cost of this zeroing might be high. Perhaps we should just
2187 // use the above generic case, and go to this one if the number
2188 // of source pixels is large enough
2189 std::fill(anVals.begin(), anVals.end(), 0);
2190
2191 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
2192 {
2193 const GPtrDiff_t iTotYOff =
2194 static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize - nChunkXOff;
2195 for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
2196 {
2197 const float val = pafSrcScanline[iX+iTotYOff];
2198 if( bHasNoData == FALSE || val != fNoDataValue )
2199 {
2200 int nVal = static_cast<int>(val);
2201 if( ++anVals[nVal] > nMaxVal)
2202 {
2203 // Sum the density.
2204 // Is it the most common value so far?
2205 iMaxInd = nVal;
2206 nMaxVal = anVals[nVal];
2207 }
2208 }
2209 }
2210 }
2211
2212 if( iMaxInd == -1 )
2213 pafDstScanline[iDstPixel - nDstXOff] = fNoDataValue;
2214 else
2215 pafDstScanline[iDstPixel - nDstXOff] =
2216 static_cast<float>(iMaxInd);
2217 }
2218 }
2219 }
2220
2221 CPLFree( aEntries );
2222 CPLFree( pafVals );
2223 CPLFree( panSums );
2224
2225 return CE_None;
2226 }
2227
2228 /************************************************************************/
2229 /* GDALResampleConvolutionHorizontal() */
2230 /************************************************************************/
2231
GDALResampleConvolutionHorizontal(const T * pChunk,const double * padfWeights,int nSrcPixelCount)2232 template<class T> static inline double GDALResampleConvolutionHorizontal(
2233 const T* pChunk, const double* padfWeights, int nSrcPixelCount )
2234 {
2235 double dfVal1 = 0.0;
2236 double dfVal2 = 0.0;
2237 int i = 0; // Used after for.
2238 for( ; i + 3 < nSrcPixelCount; i += 4 )
2239 {
2240 dfVal1 += pChunk[i] * padfWeights[i];
2241 dfVal1 += pChunk[i+1] * padfWeights[i+1];
2242 dfVal2 += pChunk[i+2] * padfWeights[i+2];
2243 dfVal2 += pChunk[i+3] * padfWeights[i+3];
2244 }
2245 for( ; i < nSrcPixelCount; ++i )
2246 {
2247 dfVal1 += pChunk[i] * padfWeights[i];
2248 }
2249 return dfVal1 + dfVal2;
2250 }
2251
GDALResampleConvolutionHorizontalWithMask(const T * pChunk,const GByte * pabyMask,const double * padfWeights,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2252 template<class T> static inline void GDALResampleConvolutionHorizontalWithMask(
2253 const T* pChunk, const GByte* pabyMask,
2254 const double* padfWeights, int nSrcPixelCount,
2255 double& dfVal, double &dfWeightSum)
2256 {
2257 dfVal = 0;
2258 dfWeightSum = 0;
2259 int i = 0;
2260 for( ; i + 3 < nSrcPixelCount; i += 4 )
2261 {
2262 const double dfWeight0 = padfWeights[i] * pabyMask[i];
2263 const double dfWeight1 = padfWeights[i+1] * pabyMask[i+1];
2264 const double dfWeight2 = padfWeights[i+2] * pabyMask[i+2];
2265 const double dfWeight3 = padfWeights[i+3] * pabyMask[i+3];
2266 dfVal += pChunk[i] * dfWeight0;
2267 dfVal += pChunk[i+1] * dfWeight1;
2268 dfVal += pChunk[i+2] * dfWeight2;
2269 dfVal += pChunk[i+3] * dfWeight3;
2270 dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2271 }
2272 for( ; i < nSrcPixelCount; ++i )
2273 {
2274 const double dfWeight = padfWeights[i] * pabyMask[i];
2275 dfVal += pChunk[i] * dfWeight;
2276 dfWeightSum += dfWeight;
2277 }
2278 }
2279
GDALResampleConvolutionHorizontal_3rows(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeights,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2280 template<class T> static inline void GDALResampleConvolutionHorizontal_3rows(
2281 const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2282 const double* padfWeights, int nSrcPixelCount,
2283 double& dfRes1, double& dfRes2, double& dfRes3)
2284 {
2285 double dfVal1 = 0.0;
2286 double dfVal2 = 0.0;
2287 double dfVal3 = 0.0;
2288 double dfVal4 = 0.0;
2289 double dfVal5 = 0.0;
2290 double dfVal6 = 0.0;
2291 int i = 0; // Used after for.
2292 for( ; i + 3 < nSrcPixelCount; i += 4 )
2293 {
2294 dfVal1 += pChunkRow1[i] * padfWeights[i];
2295 dfVal1 += pChunkRow1[i+1] * padfWeights[i+1];
2296 dfVal2 += pChunkRow1[i+2] * padfWeights[i+2];
2297 dfVal2 += pChunkRow1[i+3] * padfWeights[i+3];
2298 dfVal3 += pChunkRow2[i] * padfWeights[i];
2299 dfVal3 += pChunkRow2[i+1] * padfWeights[i+1];
2300 dfVal4 += pChunkRow2[i+2] * padfWeights[i+2];
2301 dfVal4 += pChunkRow2[i+3] * padfWeights[i+3];
2302 dfVal5 += pChunkRow3[i] * padfWeights[i];
2303 dfVal5 += pChunkRow3[i+1] * padfWeights[i+1];
2304 dfVal6 += pChunkRow3[i+2] * padfWeights[i+2];
2305 dfVal6 += pChunkRow3[i+3] * padfWeights[i+3];
2306 }
2307 for( ; i < nSrcPixelCount; ++i )
2308 {
2309 dfVal1 += pChunkRow1[i] * padfWeights[i];
2310 dfVal3 += pChunkRow2[i] * padfWeights[i];
2311 dfVal5 += pChunkRow3[i] * padfWeights[i];
2312 }
2313 dfRes1 = dfVal1 + dfVal2;
2314 dfRes2 = dfVal3 + dfVal4;
2315 dfRes3 = dfVal5 + dfVal6;
2316 }
2317
2318 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeights,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2319 GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2320 const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2321 const double* padfWeights, int nSrcPixelCount,
2322 double& dfRes1, double& dfRes2, double& dfRes3 )
2323 {
2324 GDALResampleConvolutionHorizontal_3rows(
2325 pChunkRow1, pChunkRow2, pChunkRow3,
2326 padfWeights, nSrcPixelCount,
2327 dfRes1, dfRes2, dfRes3 );
2328 }
2329
2330 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeights,double & dfRes1,double & dfRes2,double & dfRes3)2331 GDALResampleConvolutionHorizontalPixelCount4_3rows(
2332 const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2333 const double* padfWeights,
2334 double& dfRes1, double& dfRes2, double& dfRes3 )
2335 {
2336 GDALResampleConvolutionHorizontal_3rows(
2337 pChunkRow1, pChunkRow2, pChunkRow3,
2338 padfWeights, 4,
2339 dfRes1, dfRes2, dfRes3 );
2340 }
2341
2342 /************************************************************************/
2343 /* GDALResampleConvolutionVertical() */
2344 /************************************************************************/
2345
GDALResampleConvolutionVertical(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount)2346 template<class T> static inline double GDALResampleConvolutionVertical(
2347 const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount )
2348 {
2349 double dfVal1 = 0.0;
2350 double dfVal2 = 0.0;
2351 int i = 0;
2352 int j = 0;
2353 for( ; i + 3 < nSrcLineCount; i+=4, j+=4*nStride)
2354 {
2355 dfVal1 += pChunk[j] * padfWeights[i];
2356 dfVal1 += pChunk[j + nStride] * padfWeights[i+1];
2357 dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i+2];
2358 dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i+3];
2359 }
2360 for( ; i < nSrcLineCount; ++i, j += nStride)
2361 {
2362 dfVal1 += pChunk[j] * padfWeights[i];
2363 }
2364 return dfVal1 + dfVal2;
2365 }
2366
GDALResampleConvolutionVertical_2cols(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount,double & dfRes1,double & dfRes2)2367 template<class T> static inline void GDALResampleConvolutionVertical_2cols(
2368 const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount,
2369 double& dfRes1, double& dfRes2 )
2370 {
2371 double dfVal1 = 0.0;
2372 double dfVal2 = 0.0;
2373 double dfVal3 = 0.0;
2374 double dfVal4 = 0.0;
2375 int i = 0;
2376 int j = 0;
2377 for(;i+3<nSrcLineCount;i+=4, j+=4*nStride)
2378 {
2379 dfVal1 += pChunk[j] * padfWeights[i];
2380 dfVal3 += pChunk[j+1] * padfWeights[i];
2381 dfVal1 += pChunk[j + nStride] * padfWeights[i+1];
2382 dfVal3 += pChunk[j+1 + nStride] * padfWeights[i+1];
2383 dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i+2];
2384 dfVal4 += pChunk[j+1 + 2 * nStride] * padfWeights[i+2];
2385 dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i+3];
2386 dfVal4 += pChunk[j+1 + 3 * nStride] * padfWeights[i+3];
2387 }
2388 for( ; i < nSrcLineCount; ++i, j += nStride )
2389 {
2390 dfVal1 += pChunk[j] * padfWeights[i];
2391 dfVal3 += pChunk[j+1] * padfWeights[i];
2392 }
2393 dfRes1 = dfVal1 + dfVal2;
2394 dfRes2 = dfVal3 + dfVal4;
2395 }
2396
2397 #ifdef USE_SSE2
2398
2399 #ifdef __AVX__
2400 /************************************************************************/
2401 /* GDALResampleConvolutionVertical_16cols<T> */
2402 /************************************************************************/
2403
GDALResampleConvolutionVertical_16cols(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount,float * afDest)2404 template<class T> static inline void GDALResampleConvolutionVertical_16cols(
2405 const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount,
2406 float* afDest )
2407 {
2408 int i = 0;
2409 int j = 0;
2410 XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2411 XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2412 XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2413 XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2414 for(;i+3<nSrcLineCount;i+=4, j+=4*nStride)
2415 {
2416 XMMReg4Double w0 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+0);
2417 XMMReg4Double w1 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+1);
2418 XMMReg4Double w2 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+2);
2419 XMMReg4Double w3 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+3);
2420 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+0*nStride) * w0;
2421 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+0*nStride) * w0;
2422 v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+0*nStride) * w0;
2423 v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+0*nStride) * w0;
2424 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+1*nStride) * w1;
2425 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+1*nStride) * w1;
2426 v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+1*nStride) * w1;
2427 v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+1*nStride) * w1;
2428 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+2*nStride) * w2;
2429 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+2*nStride) * w2;
2430 v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+2*nStride) * w2;
2431 v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+2*nStride) * w2;
2432 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0+3*nStride) * w3;
2433 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4+3*nStride) * w3;
2434 v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8+3*nStride) * w3;
2435 v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12+3*nStride) * w3;
2436 }
2437 for( ; i < nSrcLineCount; ++i, j += nStride )
2438 {
2439 XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights+i);
2440 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+ 0) * w;
2441 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+ 4) * w;
2442 v_acc2 += XMMReg4Double::Load4Val(pChunk+j+ 8) * w;
2443 v_acc3 += XMMReg4Double::Load4Val(pChunk+j+12) * w;
2444 }
2445 v_acc0.Store4Val(afDest);
2446 v_acc1.Store4Val(afDest+4);
2447 v_acc2.Store4Val(afDest+8);
2448 v_acc3.Store4Val(afDest+12);
2449 }
2450
2451 #else
2452
2453
2454 /************************************************************************/
2455 /* GDALResampleConvolutionVertical_8cols<T> */
2456 /************************************************************************/
2457
GDALResampleConvolutionVertical_8cols(const T * pChunk,int nStride,const double * padfWeights,int nSrcLineCount,float * afDest)2458 template<class T> static inline void GDALResampleConvolutionVertical_8cols(
2459 const T* pChunk, int nStride, const double* padfWeights, int nSrcLineCount,
2460 float* afDest )
2461 {
2462 int i = 0;
2463 int j = 0;
2464 XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2465 XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2466 for(;i+3<nSrcLineCount;i+=4, j+=4*nStride)
2467 {
2468 XMMReg4Double w0 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+0);
2469 XMMReg4Double w1 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+1);
2470 XMMReg4Double w2 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+2);
2471 XMMReg4Double w3 = XMMReg4Double::Load1ValHighAndLow(padfWeights+i+3);
2472 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+0*nStride) * w0;
2473 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+0*nStride) * w0;
2474 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+1*nStride) * w1;
2475 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+1*nStride) * w1;
2476 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+2*nStride) * w2;
2477 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+2*nStride) * w2;
2478 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0+3*nStride) * w3;
2479 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4+3*nStride) * w3;
2480 }
2481 for( ; i < nSrcLineCount; ++i, j += nStride )
2482 {
2483 XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights+i);
2484 v_acc0 += XMMReg4Double::Load4Val(pChunk+j+0) * w;
2485 v_acc1 += XMMReg4Double::Load4Val(pChunk+j+4) * w;
2486 }
2487 v_acc0.Store4Val(afDest);
2488 v_acc1.Store4Val(afDest+4);
2489 }
2490
2491 #endif // __AVX__
2492
2493 /************************************************************************/
2494 /* GDALResampleConvolutionHorizontalSSE2<T> */
2495 /************************************************************************/
2496
GDALResampleConvolutionHorizontalSSE2(const T * pChunk,const double * padfWeightsAligned,int nSrcPixelCount)2497 template<class T> static inline double GDALResampleConvolutionHorizontalSSE2(
2498 const T* pChunk, const double* padfWeightsAligned, int nSrcPixelCount )
2499 {
2500 XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2501 XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2502 int i = 0; // Used after for.
2503 for( ; i + 7 < nSrcPixelCount; i += 8 )
2504 {
2505 // Retrieve the pixel & accumulate
2506 const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk+i);
2507 const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk+i+4);
2508 const XMMReg4Double v_weight1 =
2509 XMMReg4Double::Load4ValAligned(padfWeightsAligned+i);
2510 const XMMReg4Double v_weight2 =
2511 XMMReg4Double::Load4ValAligned(padfWeightsAligned+i+4);
2512
2513 v_acc1 += v_pixels1 * v_weight1;
2514 v_acc2 += v_pixels2 * v_weight2;
2515 }
2516
2517 v_acc1 += v_acc2;
2518
2519 double dfVal = v_acc1.GetHorizSum();
2520 for( ; i < nSrcPixelCount; ++i )
2521 {
2522 dfVal += pChunk[i] * padfWeightsAligned[i];
2523 }
2524 return dfVal;
2525 }
2526
2527 /************************************************************************/
2528 /* GDALResampleConvolutionHorizontal<GByte> */
2529 /************************************************************************/
2530
GDALResampleConvolutionHorizontal(const GByte * pChunk,const double * padfWeightsAligned,int nSrcPixelCount)2531 template<> inline double GDALResampleConvolutionHorizontal<GByte>(
2532 const GByte* pChunk, const double* padfWeightsAligned, int nSrcPixelCount )
2533 {
2534 return GDALResampleConvolutionHorizontalSSE2( pChunk, padfWeightsAligned,
2535 nSrcPixelCount );
2536 }
2537
GDALResampleConvolutionHorizontal(const GUInt16 * pChunk,const double * padfWeightsAligned,int nSrcPixelCount)2538 template<> inline double GDALResampleConvolutionHorizontal<GUInt16>(
2539 const GUInt16* pChunk, const double* padfWeightsAligned,
2540 int nSrcPixelCount )
2541 {
2542 return GDALResampleConvolutionHorizontalSSE2( pChunk, padfWeightsAligned,
2543 nSrcPixelCount) ;
2544 }
2545
2546 /************************************************************************/
2547 /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2548 /************************************************************************/
2549
2550 template<class T> static inline void
GDALResampleConvolutionHorizontalWithMaskSSE2(const T * pChunk,const GByte * pabyMask,const double * padfWeightsAligned,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2551 GDALResampleConvolutionHorizontalWithMaskSSE2(
2552 const T* pChunk, const GByte* pabyMask,
2553 const double* padfWeightsAligned, int nSrcPixelCount,
2554 double& dfVal, double &dfWeightSum )
2555 {
2556 int i = 0; // Used after for.
2557 XMMReg4Double v_acc = XMMReg4Double::Zero();
2558 XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2559 for( ; i + 3 < nSrcPixelCount; i += 4 )
2560 {
2561 const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk+i);
2562 const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask+i);
2563 XMMReg4Double v_weight =
2564 XMMReg4Double::Load4ValAligned(padfWeightsAligned+i);
2565 v_weight *= v_mask;
2566 v_acc += v_pixels * v_weight;
2567 v_acc_weight += v_weight;
2568 }
2569
2570 dfVal = v_acc.GetHorizSum();
2571 dfWeightSum = v_acc_weight.GetHorizSum();
2572 for( ; i < nSrcPixelCount; ++i )
2573 {
2574 const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2575 dfVal += pChunk[i] * dfWeight;
2576 dfWeightSum += dfWeight;
2577 }
2578 }
2579
2580 /************************************************************************/
2581 /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2582 /************************************************************************/
2583
GDALResampleConvolutionHorizontalWithMask(const GByte * pChunk,const GByte * pabyMask,const double * padfWeightsAligned,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2584 template<> inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2585 const GByte* pChunk, const GByte* pabyMask,
2586 const double* padfWeightsAligned, int nSrcPixelCount,
2587 double& dfVal, double &dfWeightSum)
2588 {
2589 GDALResampleConvolutionHorizontalWithMaskSSE2(pChunk, pabyMask,
2590 padfWeightsAligned,
2591 nSrcPixelCount,
2592 dfVal, dfWeightSum);
2593 }
2594
GDALResampleConvolutionHorizontalWithMask(const GUInt16 * pChunk,const GByte * pabyMask,const double * padfWeightsAligned,int nSrcPixelCount,double & dfVal,double & dfWeightSum)2595 template<> inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2596 const GUInt16* pChunk, const GByte* pabyMask,
2597 const double* padfWeightsAligned, int nSrcPixelCount,
2598 double& dfVal, double &dfWeightSum )
2599 {
2600 GDALResampleConvolutionHorizontalWithMaskSSE2( pChunk, pabyMask,
2601 padfWeightsAligned,
2602 nSrcPixelCount,
2603 dfVal, dfWeightSum );
2604 }
2605
2606 /************************************************************************/
2607 /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2608 /************************************************************************/
2609
2610 template<class T> static inline void
GDALResampleConvolutionHorizontal_3rows_SSE2(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2611 GDALResampleConvolutionHorizontal_3rows_SSE2(
2612 const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2613 const double* padfWeightsAligned, int nSrcPixelCount,
2614 double& dfRes1, double& dfRes2, double& dfRes3 )
2615 {
2616 XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2617 v_acc2 = XMMReg4Double::Zero(),
2618 v_acc3 = XMMReg4Double::Zero();
2619 int i = 0;
2620 for( ; i + 7 < nSrcPixelCount; i += 8 )
2621 {
2622 // Retrieve the pixel & accumulate.
2623 XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1+i);
2624 XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1+i+4);
2625 const XMMReg4Double v_weight1 =
2626 XMMReg4Double::Load4ValAligned(padfWeightsAligned+i);
2627 const XMMReg4Double v_weight2 =
2628 XMMReg4Double::Load4ValAligned(padfWeightsAligned+i+4);
2629
2630 v_acc1 += v_pixels1 * v_weight1;
2631 v_acc1 += v_pixels2 * v_weight2;
2632
2633 v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2+i);
2634 v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2+i+4);
2635 v_acc2 += v_pixels1 * v_weight1;
2636 v_acc2 += v_pixels2 * v_weight2;
2637
2638 v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3+i);
2639 v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3+i+4);
2640 v_acc3 += v_pixels1 * v_weight1;
2641 v_acc3 += v_pixels2 * v_weight2;
2642 }
2643
2644 dfRes1 = v_acc1.GetHorizSum();
2645 dfRes2 = v_acc2.GetHorizSum();
2646 dfRes3 = v_acc3.GetHorizSum();
2647 for( ; i < nSrcPixelCount; ++i )
2648 {
2649 dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2650 dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2651 dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2652 }
2653 }
2654
2655 /************************************************************************/
2656 /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2657 /************************************************************************/
2658
GDALResampleConvolutionHorizontal_3rows(const GByte * pChunkRow1,const GByte * pChunkRow2,const GByte * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2659 template<> inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2660 const GByte* pChunkRow1, const GByte* pChunkRow2, const GByte* pChunkRow3,
2661 const double* padfWeightsAligned, int nSrcPixelCount,
2662 double& dfRes1, double& dfRes2, double& dfRes3 )
2663 {
2664 GDALResampleConvolutionHorizontal_3rows_SSE2(
2665 pChunkRow1, pChunkRow2, pChunkRow3,
2666 padfWeightsAligned, nSrcPixelCount,
2667 dfRes1, dfRes2, dfRes3 );
2668 }
2669
GDALResampleConvolutionHorizontal_3rows(const GUInt16 * pChunkRow1,const GUInt16 * pChunkRow2,const GUInt16 * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2670 template<> inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2671 const GUInt16* pChunkRow1, const GUInt16* pChunkRow2,
2672 const GUInt16* pChunkRow3,
2673 const double* padfWeightsAligned, int nSrcPixelCount,
2674 double& dfRes1, double& dfRes2, double& dfRes3 )
2675 {
2676 GDALResampleConvolutionHorizontal_3rows_SSE2(
2677 pChunkRow1, pChunkRow2, pChunkRow3,
2678 padfWeightsAligned, nSrcPixelCount,
2679 dfRes1, dfRes2, dfRes3);
2680 }
2681
2682 /************************************************************************/
2683 /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
2684 /************************************************************************/
2685
2686 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2687 GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2688 const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2689 const double* padfWeightsAligned, int nSrcPixelCount,
2690 double& dfRes1, double& dfRes2, double& dfRes3)
2691 {
2692 XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2693 XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2694 XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2695 int i = 0; // Use after for.
2696 for( ; i + 3 < nSrcPixelCount; i += 4)
2697 {
2698 // Retrieve the pixel & accumulate.
2699 const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1+i);
2700 const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2+i);
2701 const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3+i);
2702 const XMMReg4Double v_weight =
2703 XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2704
2705 v_acc1 += v_pixels1 * v_weight;
2706 v_acc2 += v_pixels2 * v_weight;
2707 v_acc3 += v_pixels3 * v_weight;
2708 }
2709
2710 dfRes1 = v_acc1.GetHorizSum();
2711 dfRes2 = v_acc2.GetHorizSum();
2712 dfRes3 = v_acc3.GetHorizSum();
2713
2714 for( ; i < nSrcPixelCount; ++i )
2715 {
2716 dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2717 dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2718 dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2719 }
2720 }
2721
2722 /************************************************************************/
2723 /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
2724 /************************************************************************/
2725
2726 template<> inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows(const GByte * pChunkRow1,const GByte * pChunkRow2,const GByte * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2727 GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
2728 const GByte* pChunkRow1, const GByte* pChunkRow2, const GByte* pChunkRow3,
2729 const double* padfWeightsAligned, int nSrcPixelCount,
2730 double& dfRes1, double& dfRes2, double& dfRes3 )
2731 {
2732 GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2733 pChunkRow1, pChunkRow2, pChunkRow3,
2734 padfWeightsAligned, nSrcPixelCount,
2735 dfRes1, dfRes2, dfRes3 );
2736 }
2737
2738 template<> inline void
GDALResampleConvolutionHorizontalPixelCountLess8_3rows(const GUInt16 * pChunkRow1,const GUInt16 * pChunkRow2,const GUInt16 * pChunkRow3,const double * padfWeightsAligned,int nSrcPixelCount,double & dfRes1,double & dfRes2,double & dfRes3)2739 GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
2740 const GUInt16* pChunkRow1, const GUInt16* pChunkRow2,
2741 const GUInt16* pChunkRow3,
2742 const double* padfWeightsAligned, int nSrcPixelCount,
2743 double& dfRes1, double& dfRes2, double& dfRes3 )
2744 {
2745 GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2746 pChunkRow1, pChunkRow2, pChunkRow3,
2747 padfWeightsAligned, nSrcPixelCount,
2748 dfRes1, dfRes2, dfRes3 );
2749 }
2750
2751 /************************************************************************/
2752 /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
2753 /************************************************************************/
2754
2755 template<class T> static inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(const T * pChunkRow1,const T * pChunkRow2,const T * pChunkRow3,const double * padfWeightsAligned,double & dfRes1,double & dfRes2,double & dfRes3)2756 GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2757 const T* pChunkRow1, const T* pChunkRow2, const T* pChunkRow3,
2758 const double* padfWeightsAligned,
2759 double& dfRes1, double& dfRes2, double& dfRes3)
2760 {
2761 const XMMReg4Double v_weight =
2762 XMMReg4Double::Load4ValAligned(padfWeightsAligned);
2763
2764 // Retrieve the pixel & accumulate.
2765 const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
2766 const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
2767 const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
2768
2769 XMMReg4Double v_acc1 = v_pixels1 * v_weight;
2770 XMMReg4Double v_acc2 = v_pixels2 * v_weight;
2771 XMMReg4Double v_acc3 = v_pixels3 * v_weight;
2772
2773 dfRes1 = v_acc1.GetHorizSum();
2774 dfRes2 = v_acc2.GetHorizSum();
2775 dfRes3 = v_acc3.GetHorizSum();
2776 }
2777
2778 /************************************************************************/
2779 /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
2780 /************************************************************************/
2781
2782 template<> inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows(const GByte * pChunkRow1,const GByte * pChunkRow2,const GByte * pChunkRow3,const double * padfWeightsAligned,double & dfRes1,double & dfRes2,double & dfRes3)2783 GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
2784 const GByte* pChunkRow1, const GByte* pChunkRow2, const GByte* pChunkRow3,
2785 const double* padfWeightsAligned,
2786 double& dfRes1, double& dfRes2, double& dfRes3 )
2787 {
2788 GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2789 pChunkRow1, pChunkRow2, pChunkRow3,
2790 padfWeightsAligned,
2791 dfRes1, dfRes2, dfRes3 );
2792 }
2793
2794 template<> inline void
GDALResampleConvolutionHorizontalPixelCount4_3rows(const GUInt16 * pChunkRow1,const GUInt16 * pChunkRow2,const GUInt16 * pChunkRow3,const double * padfWeightsAligned,double & dfRes1,double & dfRes2,double & dfRes3)2795 GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
2796 const GUInt16* pChunkRow1, const GUInt16* pChunkRow2,
2797 const GUInt16* pChunkRow3,
2798 const double* padfWeightsAligned,
2799 double& dfRes1, double& dfRes2, double& dfRes3 )
2800 {
2801 GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2802 pChunkRow1, pChunkRow2, pChunkRow3,
2803 padfWeightsAligned,
2804 dfRes1, dfRes2, dfRes3 );
2805 }
2806
2807 #endif // USE_SSE2
2808
2809 /************************************************************************/
2810 /* GDALResampleChunk32R_Convolution() */
2811 /************************************************************************/
2812
2813 template<class T> static CPLErr
GDALResampleChunk32R_ConvolutionT(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,const T * pChunk,int nBands,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poDstBand,void * pDstBuffer,int bHasNoData,float fNoDataValue,FilterFuncType pfnFilterFunc,FilterFunc4ValuesType pfnFilterFunc4Values,int nKernelRadius,bool bKernelWithNegativeWeights,float fMaxVal)2814 GDALResampleChunk32R_ConvolutionT( double dfXRatioDstToSrc,
2815 double dfYRatioDstToSrc,
2816 double dfSrcXDelta,
2817 double dfSrcYDelta,
2818 const T * pChunk, int nBands,
2819 const GByte * pabyChunkNodataMask,
2820 int nChunkXOff, int nChunkXSize,
2821 int nChunkYOff, int nChunkYSize,
2822 int nDstXOff, int nDstXOff2,
2823 int nDstYOff, int nDstYOff2,
2824 GDALRasterBand * poDstBand,
2825 void* pDstBuffer,
2826 int bHasNoData,
2827 float fNoDataValue,
2828 FilterFuncType pfnFilterFunc,
2829 FilterFunc4ValuesType pfnFilterFunc4Values,
2830 int nKernelRadius,
2831 bool bKernelWithNegativeWeights,
2832 float fMaxVal )
2833
2834 {
2835 if( !bHasNoData )
2836 fNoDataValue = 0.0f;
2837 const auto dstDataType = poDstBand->GetRasterDataType();
2838 const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
2839 const float fReplacementVal = GetReplacementValueIfNoData(
2840 dstDataType, bHasNoData, fNoDataValue);
2841 // cppcheck-suppress unreadVariable
2842 const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
2843 const auto nNodataValueInt64 = static_cast<GInt64>(fNoDataValue);
2844
2845 // TODO: we should have some generic function to do this.
2846 float fDstMin = -std::numeric_limits<float>::max();
2847 float fDstMax = std::numeric_limits<float>::max();
2848 if( dstDataType == GDT_Byte )
2849 {
2850 fDstMin = std::numeric_limits<GByte>::min();
2851 fDstMax = std::numeric_limits<GByte>::max();
2852 }
2853 else if( dstDataType == GDT_UInt16 )
2854 {
2855 fDstMin = std::numeric_limits<GUInt16>::min();
2856 fDstMax = std::numeric_limits<GUInt16>::max();
2857 }
2858 else if( dstDataType == GDT_Int16 )
2859 {
2860 fDstMin = std::numeric_limits<GInt16>::min();
2861 fDstMax = std::numeric_limits<GInt16>::max();
2862 }
2863 else if( dstDataType == GDT_UInt32 )
2864 {
2865 fDstMin = static_cast<float>(std::numeric_limits<GUInt32>::min());
2866 fDstMax = static_cast<float>(std::numeric_limits<GUInt32>::max());
2867 }
2868 else if( dstDataType == GDT_Int32 )
2869 {
2870 // cppcheck-suppress unreadVariable
2871 fDstMin = static_cast<float>(std::numeric_limits<GInt32>::min());
2872 // cppcheck-suppress unreadVariable
2873 fDstMax = static_cast<float>(std::numeric_limits<GInt32>::max());
2874 }
2875
2876 auto replaceValIfNodata =
2877 [bHasNoData, isIntegerDT, fDstMin, fDstMax, nNodataValueInt64,
2878 fNoDataValue, fReplacementVal](float fVal)
2879 {
2880 if( !bHasNoData )
2881 return fVal;
2882
2883 // Clamp value before comparing to nodata: this is only needed for
2884 // kernels with negative weights (Lanczos)
2885 float fClamped = fVal;
2886 if( fClamped < fDstMin )
2887 fClamped = fDstMin;
2888 else if( fClamped > fDstMax )
2889 fClamped = fDstMax;
2890 if( isIntegerDT )
2891 {
2892 if( nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)) )
2893 {
2894 // Do not use the nodata value
2895 return fReplacementVal;
2896 }
2897 }
2898 else if( fNoDataValue == fClamped )
2899 {
2900 // Do not use the nodata value
2901 return fReplacementVal;
2902 }
2903 return fClamped;
2904 };
2905
2906 /* -------------------------------------------------------------------- */
2907 /* Allocate work buffers. */
2908 /* -------------------------------------------------------------------- */
2909 const int nDstXSize = nDstXOff2 - nDstXOff;
2910 float* pafWrkScanline = nullptr;
2911 if( dstDataType != GDT_Float32 )
2912 {
2913 pafWrkScanline = static_cast<float*>(
2914 VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(float)));
2915 if( pafWrkScanline == nullptr )
2916 return CE_Failure;
2917 }
2918
2919 const double dfXScale = 1.0 / dfXRatioDstToSrc;
2920 const double dfXScaleWeight = ( dfXScale >= 1.0 ) ? 1.0 : dfXScale;
2921 const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
2922 const double dfYScale = 1.0 / dfYRatioDstToSrc;
2923 const double dfYScaleWeight = ( dfYScale >= 1.0 ) ? 1.0 : dfYScale;
2924 const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
2925
2926 // Temporary array to store result of horizontal filter.
2927 double* padfHorizontalFiltered = static_cast<double*>(
2928 VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands) );
2929
2930 // To store convolution coefficients.
2931 double* padfWeights = static_cast<double *>(
2932 VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
2933 static_cast<int>(
2934 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
2935 0.5) * sizeof(double) ) );
2936
2937 GByte* pabyChunkNodataMaskHorizontalFiltered = nullptr;
2938 if( pabyChunkNodataMask )
2939 pabyChunkNodataMaskHorizontalFiltered = static_cast<GByte*>(
2940 VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize) );
2941 if( padfHorizontalFiltered == nullptr ||
2942 padfWeights == nullptr ||
2943 (pabyChunkNodataMask != nullptr &&
2944 pabyChunkNodataMaskHorizontalFiltered == nullptr) )
2945 {
2946 VSIFree(pafWrkScanline);
2947 VSIFree(padfHorizontalFiltered);
2948 VSIFreeAligned(padfWeights);
2949 VSIFree(pabyChunkNodataMaskHorizontalFiltered);
2950 return CE_Failure;
2951 }
2952
2953 /* ==================================================================== */
2954 /* First pass: horizontal filter */
2955 /* ==================================================================== */
2956 const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2957 #ifdef USE_SSE2
2958 bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
2959 #endif
2960 for( int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel )
2961 {
2962 const double dfSrcPixel =
2963 (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
2964 int nSrcPixelStart =
2965 static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
2966 if( nSrcPixelStart < nChunkXOff )
2967 nSrcPixelStart = nChunkXOff;
2968 int nSrcPixelStop =
2969 static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
2970 if( nSrcPixelStop > nChunkRightXOff )
2971 nSrcPixelStop = nChunkRightXOff;
2972 #if 0
2973 if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
2974 {
2975 printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
2976 }
2977 if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
2978 {
2979 printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
2980 }
2981 #endif
2982 const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
2983 double dfWeightSum = 0.0;
2984
2985 // Compute convolution coefficients.
2986 int nSrcPixel = nSrcPixelStart;
2987 double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
2988 for( ; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel+=4)
2989 {
2990 padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
2991 dfX += dfXScaleWeight;
2992 padfWeights[nSrcPixel+1 - nSrcPixelStart] = dfX;
2993 dfX += dfXScaleWeight;
2994 padfWeights[nSrcPixel+2 - nSrcPixelStart] = dfX;
2995 dfX += dfXScaleWeight;
2996 padfWeights[nSrcPixel+3 - nSrcPixelStart] = dfX;
2997 dfX += dfXScaleWeight;
2998 dfWeightSum +=
2999 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3000 }
3001 for( ; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3002 {
3003 const double dfWeight = pfnFilterFunc(dfX);
3004 padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3005 dfWeightSum += dfWeight;
3006 }
3007
3008 const int nHeight = nChunkYSize * nBands;
3009 if( pabyChunkNodataMask == nullptr )
3010 {
3011 if( dfWeightSum != 0 )
3012 {
3013 const double dfInvWeightSum = 1.0 / dfWeightSum;
3014 for( int i = 0; i < nSrcPixelCount; ++i )
3015 padfWeights[i] *= dfInvWeightSum;
3016 }
3017 int iSrcLineOff = 0;
3018 #ifdef USE_SSE2
3019 if( nSrcPixelCount == 4 )
3020 {
3021 for( ; iSrcLineOff+2 < nHeight; iSrcLineOff +=3 )
3022 {
3023 const GPtrDiff_t j =
3024 static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3025 (nSrcPixelStart - nChunkXOff);
3026 double dfVal1 = 0.0;
3027 double dfVal2 = 0.0;
3028 double dfVal3 = 0.0;
3029 GDALResampleConvolutionHorizontalPixelCount4_3rows(
3030 pChunk + j, pChunk + j + nChunkXSize,
3031 pChunk + j + 2 * nChunkXSize,
3032 padfWeights, dfVal1, dfVal2, dfVal3);
3033 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3034 iDstPixel - nDstXOff] = dfVal1;
3035 padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 1) * nDstXSize +
3036 iDstPixel - nDstXOff] = dfVal2;
3037 padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 2) * nDstXSize +
3038 iDstPixel - nDstXOff] = dfVal3;
3039 }
3040 }
3041 else if( bSrcPixelCountLess8 )
3042 {
3043 for( ; iSrcLineOff+2 < nHeight; iSrcLineOff +=3 )
3044 {
3045 const GPtrDiff_t j =
3046 static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3047 (nSrcPixelStart - nChunkXOff);
3048 double dfVal1 = 0.0;
3049 double dfVal2 = 0.0;
3050 double dfVal3 = 0.0;
3051 GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3052 pChunk + j, pChunk + j + nChunkXSize,
3053 pChunk + j + 2 * nChunkXSize,
3054 padfWeights, nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3055 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3056 iDstPixel - nDstXOff] = dfVal1;
3057 padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 1) * nDstXSize +
3058 iDstPixel - nDstXOff] = dfVal2;
3059 padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 2) * nDstXSize +
3060 iDstPixel - nDstXOff] = dfVal3;
3061 }
3062 }
3063 else
3064 #endif
3065 {
3066 for( ; iSrcLineOff+2 < nHeight; iSrcLineOff +=3 )
3067 {
3068 const GPtrDiff_t j =
3069 static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3070 (nSrcPixelStart - nChunkXOff);
3071 double dfVal1 = 0.0;
3072 double dfVal2 = 0.0;
3073 double dfVal3 = 0.0;
3074 GDALResampleConvolutionHorizontal_3rows(
3075 pChunk + j,
3076 pChunk + j + nChunkXSize,
3077 pChunk + j + 2 * nChunkXSize,
3078 padfWeights, nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3079 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3080 iDstPixel - nDstXOff] = dfVal1;
3081 padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 1) * nDstXSize +
3082 iDstPixel - nDstXOff] = dfVal2;
3083 padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) + 2) * nDstXSize +
3084 iDstPixel - nDstXOff] = dfVal3;
3085 }
3086 }
3087 for( ; iSrcLineOff < nHeight; ++iSrcLineOff )
3088 {
3089 const GPtrDiff_t j =
3090 static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize + (nSrcPixelStart - nChunkXOff);
3091 const double dfVal =
3092 GDALResampleConvolutionHorizontal(pChunk + j,
3093 padfWeights, nSrcPixelCount);
3094 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) * nDstXSize +
3095 iDstPixel - nDstXOff] = dfVal;
3096 }
3097 }
3098 else
3099 {
3100 for( int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff )
3101 {
3102 const GPtrDiff_t j =
3103 static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize + (nSrcPixelStart - nChunkXOff);
3104
3105 if( bKernelWithNegativeWeights )
3106 {
3107 int nConsecutiveValid = 0;
3108 int nMaxConsecutiveValid = 0;
3109 for( int k = 0; k < nSrcPixelCount; k++ )
3110 {
3111 if( pabyChunkNodataMask[j + k] )
3112 nConsecutiveValid ++;
3113 else if( nConsecutiveValid )
3114 {
3115 nMaxConsecutiveValid = std::max(nMaxConsecutiveValid,
3116 nConsecutiveValid);
3117 nConsecutiveValid = 0;
3118 }
3119 }
3120 nMaxConsecutiveValid = std::max(nMaxConsecutiveValid,
3121 nConsecutiveValid);
3122 if( nMaxConsecutiveValid < nSrcPixelCount / 2 )
3123 {
3124 const size_t nTempOffset =
3125 static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel - nDstXOff;
3126 padfHorizontalFiltered[nTempOffset] = 0.0;
3127 pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3128 continue;
3129 }
3130 }
3131
3132 double dfVal = 0.0;
3133 GDALResampleConvolutionHorizontalWithMask(
3134 pChunk + j, pabyChunkNodataMask + j,
3135 padfWeights, nSrcPixelCount,
3136 dfVal, dfWeightSum );
3137 const size_t nTempOffset =
3138 static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel - nDstXOff;
3139 if( dfWeightSum > 0.0 )
3140 {
3141 padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3142 pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3143 }
3144 else
3145 {
3146 padfHorizontalFiltered[nTempOffset] = 0.0;
3147 pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3148 }
3149 }
3150 }
3151 }
3152
3153 /* ==================================================================== */
3154 /* Second pass: vertical filter */
3155 /* ==================================================================== */
3156 const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3157
3158 for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
3159 {
3160 float* const pafDstScanline = pafWrkScanline ? pafWrkScanline:
3161 static_cast<float*>(pDstBuffer) + (iDstLine - nDstYOff) * nDstXSize;
3162
3163 const double dfSrcLine =
3164 (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3165 int nSrcLineStart =
3166 static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3167 int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3168 if( nSrcLineStart < nChunkYOff )
3169 nSrcLineStart = nChunkYOff;
3170 if( nSrcLineStop > nChunkBottomYOff )
3171 nSrcLineStop = nChunkBottomYOff;
3172 #if 0
3173 if( nSrcLineStart < nChunkYOff &&
3174 nChunkYOff > 0 )
3175 {
3176 printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3177 }
3178 if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3179 {
3180 printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3181 }
3182 #endif
3183 const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3184 double dfWeightSum = 0.0;
3185
3186 // Compute convolution coefficients.
3187 int nSrcLine = nSrcLineStart; // Used after for.
3188 double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3189 for( ;
3190 nSrcLine + 3 < nSrcLineStop;
3191 nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3192 {
3193 padfWeights[nSrcLine - nSrcLineStart] = dfY;
3194 padfWeights[nSrcLine+1 - nSrcLineStart] = dfY + dfYScaleWeight;
3195 padfWeights[nSrcLine+2 - nSrcLineStart] = dfY + 2 * dfYScaleWeight;
3196 padfWeights[nSrcLine+3 - nSrcLineStart] = dfY + 3 * dfYScaleWeight;
3197 dfWeightSum +=
3198 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3199 }
3200 for( ; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight )
3201 {
3202 const double dfWeight = pfnFilterFunc(dfY);
3203 padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3204 dfWeightSum += dfWeight;
3205 }
3206
3207 if( pabyChunkNodataMask == nullptr )
3208 {
3209 if( dfWeightSum != 0 )
3210 {
3211 const double dfInvWeightSum = 1.0 / dfWeightSum;
3212 for( int i = 0; i < nSrcLineCount; ++i )
3213 padfWeights[i] *= dfInvWeightSum;
3214 }
3215 }
3216
3217 if( pabyChunkNodataMask == nullptr )
3218 {
3219 int iFilteredPixelOff = 0; // Used after for.
3220 // j used after for.
3221 size_t j = (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3222 #ifdef USE_SSE2
3223
3224 #ifdef __AVX__
3225 for( ;
3226 iFilteredPixelOff+15 < nDstXSize;
3227 iFilteredPixelOff += 16, j += 16 )
3228 {
3229 GDALResampleConvolutionVertical_16cols(
3230 padfHorizontalFiltered + j, nDstXSize, padfWeights,
3231 nSrcLineCount, pafDstScanline + iFilteredPixelOff );
3232 if( bHasNoData )
3233 {
3234 for( int k = 0; k < 16; k++ )
3235 {
3236 pafDstScanline[iFilteredPixelOff + k] =
3237 replaceValIfNodata(pafDstScanline[iFilteredPixelOff + k]);
3238 }
3239 }
3240 }
3241 #else
3242 for( ;
3243 iFilteredPixelOff+7 < nDstXSize;
3244 iFilteredPixelOff += 8, j += 8 )
3245 {
3246 GDALResampleConvolutionVertical_8cols(
3247 padfHorizontalFiltered + j, nDstXSize, padfWeights,
3248 nSrcLineCount, pafDstScanline + iFilteredPixelOff );
3249 if( bHasNoData )
3250 {
3251 for( int k = 0; k < 8; k++ )
3252 {
3253 pafDstScanline[iFilteredPixelOff + k] =
3254 replaceValIfNodata(pafDstScanline[iFilteredPixelOff + k]);
3255 }
3256 }
3257 }
3258 #endif
3259
3260 for( ; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++ )
3261 {
3262 const float fVal = static_cast<float>(
3263 GDALResampleConvolutionVertical(
3264 padfHorizontalFiltered + j,
3265 nDstXSize, padfWeights, nSrcLineCount ));
3266 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(fVal);
3267 }
3268 #else
3269 for( ;
3270 iFilteredPixelOff+1 < nDstXSize;
3271 iFilteredPixelOff += 2, j += 2 )
3272 {
3273 double dfVal1 = 0.0;
3274 double dfVal2 = 0.0;
3275 GDALResampleConvolutionVertical_2cols(
3276 padfHorizontalFiltered + j, nDstXSize, padfWeights,
3277 nSrcLineCount, dfVal1, dfVal2 );
3278 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3279 static_cast<float>(dfVal1));
3280 pafDstScanline[iFilteredPixelOff+1] = replaceValIfNodata(
3281 static_cast<float>(dfVal2));
3282 }
3283 if( iFilteredPixelOff < nDstXSize )
3284 {
3285 const double dfVal =
3286 GDALResampleConvolutionVertical(
3287 padfHorizontalFiltered + j,
3288 nDstXSize, padfWeights, nSrcLineCount );
3289 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3290 static_cast<float>(dfVal));
3291 }
3292 #endif
3293 }
3294 else
3295 {
3296 for( int iFilteredPixelOff = 0;
3297 iFilteredPixelOff < nDstXSize;
3298 ++iFilteredPixelOff )
3299 {
3300 double dfVal = 0.0;
3301 dfWeightSum = 0.0;
3302 size_t j = (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize)
3303 + iFilteredPixelOff;
3304 if( bKernelWithNegativeWeights )
3305 {
3306 int nConsecutiveValid = 0;
3307 int nMaxConsecutiveValid = 0;
3308 for(int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3309 {
3310 const double dfWeight =
3311 padfWeights[i]
3312 * pabyChunkNodataMaskHorizontalFiltered[j];
3313 if( pabyChunkNodataMaskHorizontalFiltered[j] )
3314 {
3315 nConsecutiveValid ++;
3316 }
3317 else if( nConsecutiveValid )
3318 {
3319 nMaxConsecutiveValid = std::max(
3320 nMaxConsecutiveValid, nConsecutiveValid);
3321 nConsecutiveValid = 0;
3322 }
3323 dfVal += padfHorizontalFiltered[j] * dfWeight;
3324 dfWeightSum += dfWeight;
3325 }
3326 nMaxConsecutiveValid = std::max(nMaxConsecutiveValid,
3327 nConsecutiveValid);
3328 if( nMaxConsecutiveValid < nSrcLineCount / 2 )
3329 {
3330 pafDstScanline[iFilteredPixelOff] = fNoDataValue;
3331 continue;
3332 }
3333 }
3334 else
3335 {
3336 for(int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3337 {
3338 const double dfWeight =
3339 padfWeights[i]
3340 * pabyChunkNodataMaskHorizontalFiltered[j];
3341 dfVal += padfHorizontalFiltered[j] * dfWeight;
3342 dfWeightSum += dfWeight;
3343 }
3344 }
3345 if( dfWeightSum > 0.0 )
3346 {
3347 pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3348 static_cast<float>(dfVal / dfWeightSum));
3349 }
3350 else
3351 {
3352 pafDstScanline[iFilteredPixelOff] = fNoDataValue;
3353 }
3354 }
3355 }
3356
3357 if( fMaxVal != 0.0f )
3358 {
3359 for( int i = 0; i < nDstXSize; ++i )
3360 {
3361 if( pafDstScanline[i] > fMaxVal )
3362 pafDstScanline[i] = fMaxVal;
3363 }
3364 }
3365
3366 if( pafWrkScanline )
3367 {
3368 GDALCopyWords(pafWrkScanline, GDT_Float32, 4,
3369 static_cast<GByte*>(pDstBuffer) +
3370 static_cast<size_t>(iDstLine - nDstYOff) *
3371 nDstXSize * nDstDataTypeSize,
3372 dstDataType, nDstDataTypeSize,
3373 nDstXSize);
3374 }
3375 }
3376
3377 VSIFree(pafWrkScanline);
3378 VSIFreeAligned( padfWeights );
3379 VSIFree( padfHorizontalFiltered );
3380 VSIFree( pabyChunkNodataMaskHorizontalFiltered );
3381
3382 return CE_None;
3383 }
3384
GDALResampleChunk32R_Convolution(double dfXRatioDstToSrc,double dfYRatioDstToSrc,double dfSrcXDelta,double dfSrcYDelta,GDALDataType eWrkDataType,const void * pChunk,const GByte * pabyChunkNodataMask,int nChunkXOff,int nChunkXSize,int nChunkYOff,int nChunkYSize,int nDstXOff,int nDstXOff2,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char * pszResampling,int bHasNoData,float fNoDataValue,GDALColorTable *,GDALDataType,bool)3385 static CPLErr GDALResampleChunk32R_Convolution(
3386 double dfXRatioDstToSrc, double dfYRatioDstToSrc,
3387 double dfSrcXDelta,
3388 double dfSrcYDelta,
3389 GDALDataType eWrkDataType,
3390 const void * pChunk,
3391 const GByte * pabyChunkNodataMask,
3392 int nChunkXOff, int nChunkXSize,
3393 int nChunkYOff, int nChunkYSize,
3394 int nDstXOff, int nDstXOff2,
3395 int nDstYOff, int nDstYOff2,
3396 GDALRasterBand * poOverview,
3397 void** ppDstBuffer,
3398 GDALDataType* peDstBufferDataType,
3399 const char * pszResampling,
3400 int bHasNoData, float fNoDataValue,
3401 GDALColorTable* /* poColorTable_unused */,
3402 GDALDataType /* eSrcDataType */,
3403 bool /* bPropagateNoData */ )
3404 {
3405 GDALResampleAlg eResample;
3406 bool bKernelWithNegativeWeights = false;
3407 if( EQUAL(pszResampling, "BILINEAR") )
3408 eResample = GRA_Bilinear;
3409 else if( EQUAL(pszResampling, "CUBIC") )
3410 eResample = GRA_Cubic;
3411 else if( EQUAL(pszResampling, "CUBICSPLINE") )
3412 eResample = GRA_CubicSpline;
3413 else if( EQUAL(pszResampling, "LANCZOS") )
3414 {
3415 eResample = GRA_Lanczos;
3416 bKernelWithNegativeWeights = true;
3417 }
3418 else
3419 {
3420 CPLAssert(false);
3421 return CE_Failure;
3422 }
3423 const int nKernelRadius = GWKGetFilterRadius(eResample);
3424 FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3425 const FilterFunc4ValuesType pfnFilterFunc4Values =
3426 GWKGetFilterFunc4Values(eResample);
3427
3428 float fMaxVal = 0.f;
3429 // Cubic, etc... can have overshoots, so make sure we clamp values to the
3430 // maximum value if NBITS is set.
3431 const char* pszNBITS =
3432 poOverview->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
3433 GDALDataType eBandDT = poOverview->GetRasterDataType();
3434 if( eResample != GRA_Bilinear && pszNBITS != nullptr &&
3435 (eBandDT == GDT_Byte || eBandDT == GDT_UInt16 ||
3436 eBandDT == GDT_UInt32) )
3437 {
3438 int nBits = atoi(pszNBITS);
3439 if( nBits == GDALGetDataTypeSize(eBandDT) )
3440 nBits = 0;
3441 if( nBits > 0 && nBits < 32 )
3442 fMaxVal = static_cast<float>((1U << nBits) -1);
3443 }
3444
3445 *ppDstBuffer =
3446 VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff,
3447 nDstYOff2 - nDstYOff,
3448 GDALGetDataTypeSizeBytes(eBandDT));
3449 if( *ppDstBuffer == nullptr )
3450 {
3451 return CE_Failure;
3452 }
3453 *peDstBufferDataType = eBandDT;
3454
3455 if( eWrkDataType == GDT_Byte )
3456 return GDALResampleChunk32R_ConvolutionT<GByte>(
3457 dfXRatioDstToSrc, dfYRatioDstToSrc,
3458 dfSrcXDelta, dfSrcYDelta,
3459 static_cast<const GByte *>( pChunk ),
3460 1,
3461 pabyChunkNodataMask,
3462 nChunkXOff, nChunkXSize,
3463 nChunkYOff, nChunkYSize,
3464 nDstXOff, nDstXOff2,
3465 nDstYOff, nDstYOff2,
3466 poOverview,
3467 *ppDstBuffer,
3468 bHasNoData, fNoDataValue,
3469 pfnFilterFunc,
3470 pfnFilterFunc4Values,
3471 nKernelRadius,
3472 bKernelWithNegativeWeights,
3473 fMaxVal );
3474 else if( eWrkDataType == GDT_UInt16 )
3475 return GDALResampleChunk32R_ConvolutionT<GUInt16>(
3476 dfXRatioDstToSrc, dfYRatioDstToSrc,
3477 dfSrcXDelta, dfSrcYDelta,
3478 static_cast<const GUInt16 *>( pChunk ),
3479 1,
3480 pabyChunkNodataMask,
3481 nChunkXOff, nChunkXSize,
3482 nChunkYOff, nChunkYSize,
3483 nDstXOff, nDstXOff2,
3484 nDstYOff, nDstYOff2,
3485 poOverview,
3486 *ppDstBuffer,
3487 bHasNoData, fNoDataValue,
3488 pfnFilterFunc,
3489 pfnFilterFunc4Values,
3490 nKernelRadius,
3491 bKernelWithNegativeWeights,
3492 fMaxVal );
3493 else if( eWrkDataType == GDT_Float32 )
3494 return GDALResampleChunk32R_ConvolutionT<float>(
3495 dfXRatioDstToSrc, dfYRatioDstToSrc,
3496 dfSrcXDelta, dfSrcYDelta,
3497 static_cast<const float *>( pChunk ),
3498 1,
3499 pabyChunkNodataMask,
3500 nChunkXOff, nChunkXSize,
3501 nChunkYOff, nChunkYSize,
3502 nDstXOff, nDstXOff2,
3503 nDstYOff, nDstYOff2,
3504 poOverview,
3505 *ppDstBuffer,
3506 bHasNoData, fNoDataValue,
3507 pfnFilterFunc,
3508 pfnFilterFunc4Values,
3509 nKernelRadius,
3510 bKernelWithNegativeWeights,
3511 fMaxVal );
3512
3513 CPLAssert(false);
3514 return CE_Failure;
3515 }
3516
3517 /************************************************************************/
3518 /* GDALResampleChunkC32R() */
3519 /************************************************************************/
3520
3521 static CPLErr
GDALResampleChunkC32R(int nSrcWidth,int nSrcHeight,const float * pafChunk,int nChunkYOff,int nChunkYSize,int nDstYOff,int nDstYOff2,GDALRasterBand * poOverview,void ** ppDstBuffer,GDALDataType * peDstBufferDataType,const char * pszResampling)3522 GDALResampleChunkC32R( int nSrcWidth, int nSrcHeight,
3523 const float * pafChunk, int nChunkYOff, int nChunkYSize,
3524 int nDstYOff, int nDstYOff2,
3525 GDALRasterBand * poOverview,
3526 void** ppDstBuffer,
3527 GDALDataType* peDstBufferDataType,
3528 const char * pszResampling )
3529
3530 {
3531 enum Method
3532 {
3533 NEAR,
3534 AVERAGE,
3535 AVERAGE_MAGPHASE,
3536 RMS,
3537 };
3538
3539 Method eMethod = NEAR;
3540 if( STARTS_WITH_CI(pszResampling, "NEAR") )
3541 {
3542 eMethod = NEAR;
3543 }
3544 else if( EQUAL(pszResampling, "AVERAGE_MAGPHASE") )
3545 {
3546 eMethod = AVERAGE_MAGPHASE;
3547 }
3548 else if( EQUAL(pszResampling, "RMS") )
3549 {
3550 eMethod = RMS;
3551 }
3552 else if( STARTS_WITH_CI(pszResampling, "AVER") )
3553 {
3554 eMethod = AVERAGE;
3555 }
3556 else
3557 {
3558 CPLError(CE_Failure, CPLE_NotSupported,
3559 "Unsupported resampling method %s for GDALResampleChunkC32R",
3560 pszResampling);
3561 return CE_Failure;
3562 }
3563
3564 const int nOXSize = poOverview->GetXSize();
3565 *ppDstBuffer =
3566 VSI_MALLOC3_VERBOSE(nOXSize,
3567 nDstYOff2 - nDstYOff,
3568 GDALGetDataTypeSizeBytes(GDT_CFloat32));
3569 if( *ppDstBuffer == nullptr )
3570 {
3571 return CE_Failure;
3572 }
3573 float* const pafDstBuffer = static_cast<float*>(*ppDstBuffer);
3574 *peDstBufferDataType = GDT_CFloat32;
3575
3576 const int nOYSize = poOverview->GetYSize();
3577 const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3578 const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3579
3580 /* ==================================================================== */
3581 /* Loop over destination scanlines. */
3582 /* ==================================================================== */
3583 for( int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine )
3584 {
3585 int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3586 if( nSrcYOff < nChunkYOff )
3587 nSrcYOff = nChunkYOff;
3588
3589 int nSrcYOff2 = static_cast<int>(0.5 + (iDstLine+1) * dfYRatioDstToSrc);
3590 if( nSrcYOff2 == nSrcYOff )
3591 nSrcYOff2 ++;
3592
3593 if( nSrcYOff2 > nSrcHeight || iDstLine == nOYSize-1 )
3594 {
3595 if( nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff )
3596 nSrcYOff = nSrcHeight - 1;
3597 nSrcYOff2 = nSrcHeight;
3598 }
3599 if( nSrcYOff2 > nChunkYOff + nChunkYSize )
3600 nSrcYOff2 = nChunkYOff + nChunkYSize;
3601
3602 const float * const pafSrcScanline =
3603 pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3604 float* const pafDstScanline =
3605 pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3606
3607 /* -------------------------------------------------------------------- */
3608 /* Loop over destination pixels */
3609 /* -------------------------------------------------------------------- */
3610 for( int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel )
3611 {
3612 int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3613 int nSrcXOff2 = static_cast<int>(
3614 0.5 + (iDstPixel+1) * dfXRatioDstToSrc);
3615 if( nSrcXOff2 == nSrcXOff )
3616 nSrcXOff2 ++;
3617 if( nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize-1 )
3618 {
3619 if( nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0 )
3620 nSrcXOff = nSrcWidth - 1;
3621 nSrcXOff2 = nSrcWidth;
3622 }
3623
3624 if( eMethod == NEAR )
3625 {
3626 pafDstScanline[iDstPixel*2] = pafSrcScanline[nSrcXOff*2];
3627 pafDstScanline[iDstPixel*2+1] = pafSrcScanline[nSrcXOff*2+1];
3628 }
3629 else if( eMethod == AVERAGE_MAGPHASE )
3630 {
3631 double dfTotalR = 0.0;
3632 double dfTotalI = 0.0;
3633 double dfTotalM = 0.0;
3634 int nCount = 0;
3635
3636 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
3637 {
3638 for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
3639 {
3640 const double dfR =
3641 pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2];
3642 const double dfI =
3643 pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2+1];
3644 dfTotalR += dfR;
3645 dfTotalI += dfI;
3646 dfTotalM += std::hypot(dfR, dfI);
3647 ++nCount;
3648 }
3649 }
3650
3651 CPLAssert( nCount > 0 );
3652 if( nCount == 0 )
3653 {
3654 pafDstScanline[iDstPixel*2] = 0.0;
3655 pafDstScanline[iDstPixel*2+1] = 0.0;
3656 }
3657 else
3658 {
3659 pafDstScanline[iDstPixel*2 ] =
3660 static_cast<float>(dfTotalR/nCount);
3661 pafDstScanline[iDstPixel*2+1] =
3662 static_cast<float>(dfTotalI/nCount);
3663 const double dfM = std::hypot(pafDstScanline[iDstPixel*2],
3664 pafDstScanline[iDstPixel*2+1]);
3665 const double dfDesiredM = dfTotalM / nCount;
3666 double dfRatio = 1.0;
3667 if( dfM != 0.0 )
3668 dfRatio = dfDesiredM / dfM;
3669
3670 pafDstScanline[iDstPixel*2] *=
3671 static_cast<float>(dfRatio);
3672 pafDstScanline[iDstPixel*2+1] *=
3673 static_cast<float>(dfRatio);
3674 }
3675 }
3676 else if( eMethod == RMS )
3677 {
3678 double dfTotalR = 0.0;
3679 double dfTotalI = 0.0;
3680 int nCount = 0;
3681
3682 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
3683 {
3684 for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
3685 {
3686 const double dfR = pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2];
3687 const double dfI = pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2+1];
3688
3689 dfTotalR += SQUARE(dfR);
3690 dfTotalI += SQUARE(dfI);
3691
3692 ++nCount;
3693 }
3694 }
3695
3696 CPLAssert( nCount > 0 );
3697 if( nCount == 0 )
3698 {
3699 pafDstScanline[iDstPixel*2] = 0.0;
3700 pafDstScanline[iDstPixel*2+1] = 0.0;
3701 }
3702 else
3703 {
3704 /* compute RMS */
3705 pafDstScanline[iDstPixel*2 ] = static_cast<float>(sqrt(dfTotalR/nCount));
3706 pafDstScanline[iDstPixel*2+1] = static_cast<float>(sqrt(dfTotalI/nCount));
3707 }
3708 }
3709 else if( eMethod == AVERAGE )
3710 {
3711 double dfTotalR = 0.0;
3712 double dfTotalI = 0.0;
3713 int nCount = 0;
3714
3715 for( int iY = nSrcYOff; iY < nSrcYOff2; ++iY )
3716 {
3717 for( int iX = nSrcXOff; iX < nSrcXOff2; ++iX )
3718 {
3719 // TODO(schwehr): Maybe use std::complex?
3720 dfTotalR +=
3721 pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2];
3722 dfTotalI +=
3723 pafSrcScanline[iX*2+static_cast<GPtrDiff_t>(iY-nSrcYOff)*nSrcWidth*2+1];
3724 ++nCount;
3725 }
3726 }
3727
3728 CPLAssert( nCount > 0 );
3729 if( nCount == 0 )
3730 {
3731 pafDstScanline[iDstPixel*2] = 0.0;
3732 pafDstScanline[iDstPixel*2+1] = 0.0;
3733 }
3734 else
3735 {
3736 pafDstScanline[iDstPixel*2 ] =
3737 static_cast<float>(dfTotalR/nCount);
3738 pafDstScanline[iDstPixel*2+1] =
3739 static_cast<float>(dfTotalI/nCount);
3740 }
3741 }
3742 }
3743 }
3744
3745 return CE_None;
3746 }
3747
3748 /************************************************************************/
3749 /* GDALRegenerateCascadingOverviews() */
3750 /* */
3751 /* Generate a list of overviews in order from largest to */
3752 /* smallest, computing each from the next larger. */
3753 /************************************************************************/
3754
3755 static CPLErr
GDALRegenerateCascadingOverviews(GDALRasterBand * poSrcBand,int nOverviews,GDALRasterBand ** papoOvrBands,const char * pszResampling,GDALProgressFunc pfnProgress,void * pProgressData)3756 GDALRegenerateCascadingOverviews(
3757 GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
3758 const char * pszResampling,
3759 GDALProgressFunc pfnProgress, void * pProgressData )
3760
3761 {
3762 /* -------------------------------------------------------------------- */
3763 /* First, we must put the overviews in order from largest to */
3764 /* smallest. */
3765 /* -------------------------------------------------------------------- */
3766 for( int i = 0; i < nOverviews-1; ++i )
3767 {
3768 for( int j = 0; j < nOverviews - i - 1; ++j )
3769 {
3770 if( papoOvrBands[j]->GetXSize()
3771 * static_cast<float>( papoOvrBands[j]->GetYSize() ) <
3772 papoOvrBands[j+1]->GetXSize()
3773 * static_cast<float>( papoOvrBands[j+1]->GetYSize() ) )
3774 {
3775 GDALRasterBand *poTempBand = papoOvrBands[j];
3776 papoOvrBands[j] = papoOvrBands[j+1];
3777 papoOvrBands[j+1] = poTempBand;
3778 }
3779 }
3780 }
3781
3782 /* -------------------------------------------------------------------- */
3783 /* Count total pixels so we can prepare appropriate scaled */
3784 /* progress functions. */
3785 /* -------------------------------------------------------------------- */
3786 double dfTotalPixels = 0.0;
3787
3788 for( int i = 0; i < nOverviews; ++i )
3789 {
3790 dfTotalPixels += papoOvrBands[i]->GetXSize()
3791 * static_cast<double>( papoOvrBands[i]->GetYSize() );
3792 }
3793
3794 /* -------------------------------------------------------------------- */
3795 /* Generate all the bands. */
3796 /* -------------------------------------------------------------------- */
3797 double dfPixelsProcessed = 0.0;
3798
3799 for( int i = 0; i < nOverviews; ++i )
3800 {
3801 GDALRasterBand *poBaseBand = poSrcBand;
3802 if( i != 0 )
3803 poBaseBand = papoOvrBands[i-1];
3804
3805 double dfPixels =
3806 papoOvrBands[i]->GetXSize()
3807 * static_cast<double>( papoOvrBands[i]->GetYSize() );
3808
3809 void *pScaledProgressData =
3810 GDALCreateScaledProgress(
3811 dfPixelsProcessed / dfTotalPixels,
3812 (dfPixelsProcessed + dfPixels) / dfTotalPixels,
3813 pfnProgress, pProgressData );
3814
3815 const CPLErr eErr =
3816 GDALRegenerateOverviews(
3817 poBaseBand,
3818 1,
3819 reinterpret_cast<GDALRasterBandH *>( papoOvrBands ) + i,
3820 pszResampling,
3821 GDALScaledProgress,
3822 pScaledProgressData );
3823 GDALDestroyScaledProgress( pScaledProgressData );
3824
3825 if( eErr != CE_None )
3826 return eErr;
3827
3828 dfPixelsProcessed += dfPixels;
3829
3830 // Only do the bit2grayscale promotion on the base band.
3831 if( STARTS_WITH_CI( pszResampling,
3832 "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */) )
3833 pszResampling = "AVERAGE";
3834 }
3835
3836 return CE_None;
3837 }
3838
3839 /************************************************************************/
3840 /* GDALGetResampleFunction() */
3841 /************************************************************************/
3842
GDALGetResampleFunction(const char * pszResampling,int * pnRadius)3843 GDALResampleFunction GDALGetResampleFunction( const char* pszResampling,
3844 int* pnRadius )
3845 {
3846 if( pnRadius ) *pnRadius = 0;
3847 if( STARTS_WITH_CI(pszResampling, "NEAR") )
3848 return GDALResampleChunk32R_Near;
3849 else if( STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") )
3850 return GDALResampleChunk32R_Average;
3851 else if( STARTS_WITH_CI(pszResampling, "GAUSS") )
3852 {
3853 if( pnRadius ) *pnRadius = 1;
3854 return GDALResampleChunk32R_Gauss;
3855 }
3856 else if( STARTS_WITH_CI(pszResampling, "MODE") )
3857 return GDALResampleChunk32R_Mode;
3858 else if( EQUAL(pszResampling,"CUBIC") )
3859 {
3860 if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_Cubic);
3861 return GDALResampleChunk32R_Convolution;
3862 }
3863 else if( EQUAL(pszResampling,"CUBICSPLINE") )
3864 {
3865 if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
3866 return GDALResampleChunk32R_Convolution;
3867 }
3868 else if( EQUAL(pszResampling,"LANCZOS") )
3869 {
3870 if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
3871 return GDALResampleChunk32R_Convolution;
3872 }
3873 else if( EQUAL(pszResampling,"BILINEAR") )
3874 {
3875 if( pnRadius ) *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
3876 return GDALResampleChunk32R_Convolution;
3877 }
3878 else
3879 {
3880 CPLError(
3881 CE_Failure, CPLE_AppDefined,
3882 "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
3883 pszResampling );
3884 return nullptr;
3885 }
3886 }
3887
3888 /************************************************************************/
3889 /* GDALGetOvrWorkDataType() */
3890 /************************************************************************/
3891
GDALGetOvrWorkDataType(const char * pszResampling,GDALDataType eSrcDataType)3892 GDALDataType GDALGetOvrWorkDataType( const char* pszResampling,
3893 GDALDataType eSrcDataType )
3894 {
3895 if( (STARTS_WITH_CI(pszResampling, "NEAR") ||
3896 STARTS_WITH_CI(pszResampling, "AVER") ||
3897 EQUAL(pszResampling, "RMS") ||
3898 EQUAL(pszResampling, "CUBIC") ||
3899 EQUAL(pszResampling, "CUBICSPLINE") ||
3900 EQUAL(pszResampling, "LANCZOS") ||
3901 EQUAL(pszResampling, "BILINEAR")) &&
3902 eSrcDataType == GDT_Byte )
3903 {
3904 return GDT_Byte;
3905 }
3906 else if( (STARTS_WITH_CI(pszResampling, "NEAR") ||
3907 STARTS_WITH_CI(pszResampling, "AVER") ||
3908 EQUAL(pszResampling, "RMS") ||
3909 EQUAL(pszResampling, "CUBIC") ||
3910 EQUAL(pszResampling, "CUBICSPLINE") ||
3911 EQUAL(pszResampling, "LANCZOS") ||
3912 EQUAL(pszResampling, "BILINEAR")) &&
3913 eSrcDataType == GDT_UInt16 )
3914 {
3915 return GDT_UInt16;
3916 }
3917
3918 return GDT_Float32;
3919 }
3920
3921
3922 namespace {
3923 // Structure to hold a pointer to free with CPLFree()
3924 struct PointerHolder
3925 {
3926 void* ptr = nullptr;
3927
PointerHolder__anon9571b05b0211::PointerHolder3928 explicit PointerHolder(void* ptrIn): ptr(ptrIn) {}
~PointerHolder__anon9571b05b0211::PointerHolder3929 ~PointerHolder() { CPLFree(ptr); }
3930 PointerHolder(const PointerHolder&) = delete;
3931 PointerHolder& operator=(const PointerHolder&) = delete;
3932 };
3933 }
3934
3935 /************************************************************************/
3936 /* GDALRegenerateOverviews() */
3937 /************************************************************************/
3938
3939 /**
3940 * \brief Generate downsampled overviews.
3941 *
3942 * This function will generate one or more overview images from a base image
3943 * using the requested downsampling algorithm. Its primary use is for
3944 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
3945 * used to generate downsampled images in one file from another outside the
3946 * overview architecture.
3947 *
3948 * The output bands need to exist in advance.
3949 *
3950 * The full set of resampling algorithms is documented in
3951 * GDALDataset::BuildOverviews().
3952 *
3953 * This function will honour properly NODATA_VALUES tuples (special dataset
3954 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
3955 * considered as the nodata value and not each value of the triplet
3956 * independently per band.
3957 *
3958 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
3959 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
3960 * overview computation.
3961 *
3962 * @param hSrcBand the source (base level) band.
3963 * @param nOverviewCount the number of downsampled bands being generated.
3964 * @param pahOvrBands the list of downsampled bands to be generated.
3965 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
3966 * @param pfnProgress progress report function.
3967 * @param pProgressData progress function callback data.
3968 * @return CE_None on success or CE_Failure on failure.
3969 */
3970 CPLErr
GDALRegenerateOverviews(GDALRasterBandH hSrcBand,int nOverviewCount,GDALRasterBandH * pahOvrBands,const char * pszResampling,GDALProgressFunc pfnProgress,void * pProgressData)3971 GDALRegenerateOverviews( GDALRasterBandH hSrcBand,
3972 int nOverviewCount, GDALRasterBandH *pahOvrBands,
3973 const char * pszResampling,
3974 GDALProgressFunc pfnProgress, void * pProgressData )
3975
3976 {
3977 GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle( hSrcBand );
3978 GDALRasterBand **papoOvrBands =
3979 reinterpret_cast<GDALRasterBand **>( pahOvrBands );
3980
3981 if( pfnProgress == nullptr )
3982 pfnProgress = GDALDummyProgress;
3983
3984 if( EQUAL(pszResampling,"NONE") )
3985 return CE_None;
3986
3987 int nKernelRadius = 0;
3988 GDALResampleFunction pfnResampleFn
3989 = GDALGetResampleFunction(pszResampling, &nKernelRadius);
3990
3991 if( pfnResampleFn == nullptr )
3992 return CE_Failure;
3993
3994 /* -------------------------------------------------------------------- */
3995 /* Check color tables... */
3996 /* -------------------------------------------------------------------- */
3997 GDALColorTable* poColorTable = nullptr;
3998
3999 if( (STARTS_WITH_CI(pszResampling, "AVER")
4000 || EQUAL(pszResampling, "RMS")
4001 || STARTS_WITH_CI(pszResampling, "MODE")
4002 || STARTS_WITH_CI(pszResampling, "GAUSS")) &&
4003 poSrcBand->GetColorInterpretation() == GCI_PaletteIndex )
4004 {
4005 poColorTable = poSrcBand->GetColorTable();
4006 if( poColorTable != nullptr )
4007 {
4008 if( poColorTable->GetPaletteInterpretation() != GPI_RGB )
4009 {
4010 CPLError(
4011 CE_Warning, CPLE_AppDefined,
4012 "Computing overviews on palette index raster bands "
4013 "with a palette whose color interpretation is not RGB "
4014 "will probably lead to unexpected results." );
4015 poColorTable = nullptr;
4016 }
4017 }
4018 else
4019 {
4020 CPLError( CE_Warning, CPLE_AppDefined,
4021 "Computing overviews on palette index raster bands "
4022 "without a palette will probably lead to unexpected "
4023 "results." );
4024 }
4025 }
4026 // Not ready yet
4027 else if( (EQUAL(pszResampling, "CUBIC") ||
4028 EQUAL(pszResampling, "CUBICSPLINE") ||
4029 EQUAL(pszResampling, "LANCZOS") ||
4030 EQUAL(pszResampling, "BILINEAR") )
4031 && poSrcBand->GetColorInterpretation() == GCI_PaletteIndex )
4032 {
4033 CPLError( CE_Warning, CPLE_AppDefined,
4034 "Computing %s overviews on palette index raster bands "
4035 "will probably lead to unexpected results.", pszResampling );
4036 }
4037
4038 // If we have a nodata mask and we are doing something more complicated
4039 // than nearest neighbouring, we have to fetch to nodata mask.
4040
4041 GDALRasterBand* poMaskBand = nullptr;
4042 bool bUseNoDataMask = false;
4043 bool bCanUseCascaded = true;
4044
4045 if( !STARTS_WITH_CI(pszResampling, "NEAR") )
4046 {
4047 int nMaskFlags;
4048 // Special case if we are the alpha band. We want it to be considered
4049 // as the mask band to avoid alpha=0 to be taken into account in average
4050 // computation.
4051 if( poSrcBand->GetColorInterpretation() == GCI_AlphaBand )
4052 {
4053 poMaskBand = poSrcBand;
4054 nMaskFlags = GMF_ALPHA | GMF_PER_DATASET;
4055 }
4056 // Same as above for mask band. I'd wish we had a better way of conveying this !
4057 else if( CPLTestBool(CPLGetConfigOption(
4058 "GDAL_REGENERATED_BAND_IS_MASK", "NO")) )
4059 {
4060 poMaskBand = poSrcBand;
4061 nMaskFlags = GMF_PER_DATASET;
4062 }
4063 else
4064 {
4065 poMaskBand = poSrcBand->GetMaskBand();
4066 nMaskFlags = poSrcBand->GetMaskFlags();
4067 bCanUseCascaded = (nMaskFlags == GMF_NODATA ||
4068 nMaskFlags == GMF_ALL_VALID);
4069 }
4070
4071 bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4072 }
4073
4074 /* -------------------------------------------------------------------- */
4075 /* If we are operating on multiple overviews, and using */
4076 /* averaging, lets do them in cascading order to reduce the */
4077 /* amount of computation. */
4078 /* -------------------------------------------------------------------- */
4079
4080 // In case the mask made be computed from another band of the dataset,
4081 // we can't use cascaded generation, as the computation of the overviews
4082 // of the band used for the mask band may not have yet occurred (#3033).
4083 if( (STARTS_WITH_CI(pszResampling, "AVER") ||
4084 STARTS_WITH_CI(pszResampling, "GAUSS") ||
4085 EQUAL(pszResampling, "RMS") ||
4086 EQUAL(pszResampling, "CUBIC") ||
4087 EQUAL(pszResampling, "CUBICSPLINE") ||
4088 EQUAL(pszResampling, "LANCZOS") ||
4089 EQUAL(pszResampling, "BILINEAR")) && nOverviewCount > 1
4090 && bCanUseCascaded )
4091 return GDALRegenerateCascadingOverviews( poSrcBand,
4092 nOverviewCount, papoOvrBands,
4093 pszResampling,
4094 pfnProgress,
4095 pProgressData );
4096
4097 /* -------------------------------------------------------------------- */
4098 /* Setup one horizontal swath to read from the raw buffer. */
4099 /* -------------------------------------------------------------------- */
4100 int nFRXBlockSize = 0;
4101 int nFRYBlockSize = 0;
4102 poSrcBand->GetBlockSize( &nFRXBlockSize, &nFRYBlockSize );
4103
4104 int nFullResYChunk = 0;
4105 if( nFRYBlockSize < 16 || nFRYBlockSize > 256 )
4106 nFullResYChunk = 64;
4107 else
4108 nFullResYChunk = nFRYBlockSize;
4109
4110 // Only configurable for debug / testing
4111 const char* pszChunkYSize = CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4112 if( pszChunkYSize )
4113 {
4114 // coverity[tainted_data]
4115 nFullResYChunk = atoi(pszChunkYSize);
4116 }
4117
4118 const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4119 const GDALDataType eWrkDataType = GDALDataTypeIsComplex( eSrcDataType )?
4120 GDT_CFloat32 :
4121 GDALGetOvrWorkDataType( pszResampling, eSrcDataType );
4122
4123 const int nWidth = poSrcBand->GetXSize();
4124 const int nHeight = poSrcBand->GetYSize();
4125
4126 int nMaxOvrFactor = 1;
4127 for( int iOverview = 0; iOverview < nOverviewCount; ++iOverview )
4128 {
4129 const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4130 const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4131 nMaxOvrFactor = std::max(
4132 nMaxOvrFactor,
4133 static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5) );
4134 nMaxOvrFactor = std::max(
4135 nMaxOvrFactor,
4136 static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5) );
4137 }
4138 // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff + nFullResYChunk) / nMaxOvrFactor)
4139 nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
4140 const int nMaxChunkYSizeQueried =
4141 nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4142
4143 int bHasNoData = FALSE;
4144 const float fNoDataValue =
4145 static_cast<float>( poSrcBand->GetNoDataValue(&bHasNoData) );
4146 const bool bPropagateNoData =
4147 CPLTestBool( CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO") );
4148
4149 // Structure describing a resampling job
4150 struct OvrJob
4151 {
4152 // Buffers to free when job is finished
4153 std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4154 std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4155 std::unique_ptr<PointerHolder> oDstBufferHolder{};
4156
4157 // Input parameters of pfnResampleFn
4158 GDALResampleFunction pfnResampleFn = nullptr;
4159 double dfXRatioDstToSrc{};
4160 double dfYRatioDstToSrc{};
4161 GDALDataType eWrkDataType = GDT_Unknown;
4162 const void * pChunk = nullptr;
4163 const GByte * pabyChunkNodataMask = nullptr;
4164 int nWidth = 0;
4165 int nHeight = 0;
4166 int nChunkYOff= 0;
4167 int nChunkYSize = 0;
4168 int nDstWidth = 0;
4169 int nDstYOff = 0;
4170 int nDstYOff2 = 0;
4171 GDALRasterBand* poDstBand = nullptr;
4172 const char * pszResampling = nullptr;
4173 int bHasNoData = 0;
4174 float fNoDataValue = 0.0f;
4175 GDALColorTable* poColorTable = nullptr;
4176 GDALDataType eSrcDataType = GDT_Unknown;
4177 bool bPropagateNoData = false;
4178
4179 // Output values of resampling function
4180 CPLErr eErr = CE_Failure;
4181 void* pDstBuffer = nullptr;
4182 GDALDataType eDstBufferDataType = GDT_Unknown;
4183
4184 // Synchronization
4185 bool bFinished = false;
4186 std::mutex mutex{};
4187 std::condition_variable cv{};
4188 };
4189
4190 // Thread function to resample
4191 const auto JobResampleFunc = [](void* pData)
4192 {
4193 OvrJob* poJob = static_cast<OvrJob*>(pData);
4194
4195 if( poJob->eWrkDataType != GDT_CFloat32 )
4196 {
4197 poJob->eErr = poJob->pfnResampleFn(
4198 poJob->dfXRatioDstToSrc,
4199 poJob->dfYRatioDstToSrc,
4200 0.0, 0.0,
4201 poJob->eWrkDataType,
4202 poJob->pChunk,
4203 poJob->pabyChunkNodataMask,
4204 0,
4205 poJob->nWidth,
4206 poJob->nChunkYOff,
4207 poJob->nChunkYSize,
4208 0, poJob->nDstWidth,
4209 poJob->nDstYOff,
4210 poJob->nDstYOff2,
4211 poJob->poDstBand,
4212 &(poJob->pDstBuffer),
4213 &(poJob->eDstBufferDataType),
4214 poJob->pszResampling,
4215 poJob->bHasNoData,
4216 poJob->fNoDataValue,
4217 poJob->poColorTable,
4218 poJob->eSrcDataType,
4219 poJob->bPropagateNoData);
4220 }
4221 else
4222 {
4223 poJob->eErr = GDALResampleChunkC32R(
4224 poJob->nWidth,
4225 poJob->nHeight,
4226 static_cast<const float*>(poJob->pChunk),
4227 poJob->nChunkYOff,
4228 poJob->nChunkYSize,
4229 poJob->nDstYOff,
4230 poJob->nDstYOff2,
4231 poJob->poDstBand,
4232 &(poJob->pDstBuffer),
4233 &(poJob->eDstBufferDataType),
4234 poJob->pszResampling);
4235 }
4236
4237 poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
4238
4239 {
4240 std::lock_guard<std::mutex> guard(poJob->mutex);
4241 poJob->bFinished = true;
4242 poJob->cv.notify_one();
4243 }
4244 };
4245
4246 // Function to write resample data to target band
4247 const auto WriteJobData = [](const OvrJob* poJob)
4248 {
4249 return poJob->poDstBand->RasterIO( GF_Write,
4250 0,
4251 poJob->nDstYOff,
4252 poJob->nDstWidth,
4253 poJob->nDstYOff2 - poJob->nDstYOff,
4254 poJob->pDstBuffer,
4255 poJob->nDstWidth,
4256 poJob->nDstYOff2 - poJob->nDstYOff,
4257 poJob->eDstBufferDataType,
4258 0, 0, nullptr );
4259 };
4260
4261 // Wait for completion of oldest job and serialize it
4262 const auto WaitAndFinalizeOldestJob = [WriteJobData](
4263 std::list<std::unique_ptr<OvrJob>>& jobList)
4264 {
4265 auto poOldestJob = jobList.front().get();
4266 {
4267 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4268 while( !poOldestJob->bFinished )
4269 {
4270 poOldestJob->cv.wait(oGuard);
4271 }
4272 }
4273 CPLErr l_eErr = poOldestJob->eErr;
4274 if( l_eErr == CE_None )
4275 {
4276 l_eErr = WriteJobData(poOldestJob);
4277 }
4278
4279 jobList.pop_front();
4280 return l_eErr;
4281 };
4282
4283 // Queue of jobs
4284 std::list<std::unique_ptr<OvrJob>> jobList;
4285
4286 GByte *pabyChunkNodataMask = nullptr;
4287 void *pChunk = nullptr;
4288
4289 const char* pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4290 const int nThreads = std::max(1, std::min(128,
4291 EQUAL(pszThreads, "ALL_CPUS") ? CPLGetNumCPUs() : atoi(pszThreads)));
4292 auto poThreadPool = nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4293 auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue() :
4294 std::unique_ptr<CPLJobQueue>(nullptr);
4295
4296 /* -------------------------------------------------------------------- */
4297 /* Loop over image operating on chunks. */
4298 /* -------------------------------------------------------------------- */
4299 int nChunkYOff = 0;
4300 CPLErr eErr = CE_None;
4301
4302 for( nChunkYOff = 0;
4303 nChunkYOff < nHeight && eErr == CE_None;
4304 nChunkYOff += nFullResYChunk )
4305 {
4306 if( !pfnProgress( nChunkYOff / static_cast<double>( nHeight ),
4307 nullptr, pProgressData ) )
4308 {
4309 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
4310 eErr = CE_Failure;
4311 }
4312
4313 if( nFullResYChunk + nChunkYOff > nHeight )
4314 nFullResYChunk = nHeight - nChunkYOff;
4315
4316 int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4317 int nChunkYSizeQueried =
4318 nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4319 if( nChunkYOffQueried < 0 )
4320 {
4321 nChunkYSizeQueried += nChunkYOffQueried;
4322 nChunkYOffQueried = 0;
4323 }
4324 if( nChunkYOffQueried + nChunkYSizeQueried > nHeight )
4325 nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4326
4327 // Avoid accumulating too many tasks and exhaust RAM
4328 // Try to complete already finished jobs
4329 while( eErr == CE_None && !jobList.empty() )
4330 {
4331 auto poOldestJob = jobList.front().get();
4332 {
4333 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4334 if( !poOldestJob->bFinished )
4335 {
4336 break;
4337 }
4338 }
4339 eErr = poOldestJob->eErr;
4340 if( eErr == CE_None )
4341 {
4342 eErr = WriteJobData(poOldestJob);
4343 }
4344
4345 jobList.pop_front();
4346 }
4347
4348 // And in case we have saturated the number of threads,
4349 // wait for completion of tasks to go below the threshold.
4350 while( eErr == CE_None &&
4351 jobList.size() >= static_cast<size_t>(nThreads) )
4352 {
4353 eErr = WaitAndFinalizeOldestJob(jobList);
4354 }
4355
4356 // (Re)allocate buffers if needed
4357 if( pChunk == nullptr )
4358 {
4359 pChunk = VSI_MALLOC3_VERBOSE(
4360 GDALGetDataTypeSizeBytes(eWrkDataType), nMaxChunkYSizeQueried, nWidth );
4361 }
4362 if( bUseNoDataMask && pabyChunkNodataMask == nullptr )
4363 {
4364 pabyChunkNodataMask = static_cast<GByte*>(
4365 VSI_MALLOC2_VERBOSE( nMaxChunkYSizeQueried, nWidth ));
4366 }
4367
4368 if( pChunk == nullptr || (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4369 {
4370 CPLFree(pChunk);
4371 CPLFree(pabyChunkNodataMask);
4372 return CE_Failure;
4373 }
4374
4375 // Read chunk.
4376 if( eErr == CE_None )
4377 eErr = poSrcBand->RasterIO(
4378 GF_Read, 0, nChunkYOffQueried, nWidth, nChunkYSizeQueried,
4379 pChunk, nWidth, nChunkYSizeQueried, eWrkDataType,
4380 0, 0, nullptr );
4381 if( eErr == CE_None && bUseNoDataMask )
4382 eErr = poMaskBand->RasterIO(
4383 GF_Read, 0, nChunkYOffQueried, nWidth, nChunkYSizeQueried,
4384 pabyChunkNodataMask, nWidth, nChunkYSizeQueried, GDT_Byte,
4385 0, 0, nullptr );
4386
4387 // Special case to promote 1bit data to 8bit 0/255 values.
4388 if( EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE") )
4389 {
4390 if( eWrkDataType == GDT_Float32 )
4391 {
4392 float* pafChunk = static_cast<float*>(pChunk);
4393 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4394 {
4395 if( pafChunk[i] == 1.0 )
4396 pafChunk[i] = 255.0;
4397 }
4398 }
4399 else if( eWrkDataType == GDT_Byte )
4400 {
4401 GByte* pabyChunk = static_cast<GByte*>(pChunk);
4402 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4403 {
4404 if( pabyChunk[i] == 1 )
4405 pabyChunk[i] = 255;
4406 }
4407 }
4408 else if( eWrkDataType == GDT_UInt16 )
4409 {
4410 GUInt16* pasChunk = static_cast<GUInt16*>(pChunk);
4411 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4412 {
4413 if( pasChunk[i] == 1 )
4414 pasChunk[i] = 255;
4415 }
4416 }
4417 else {
4418 CPLAssert(false);
4419 }
4420 }
4421 else if( EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE") )
4422 {
4423 if( eWrkDataType == GDT_Float32 )
4424 {
4425 float* pafChunk = static_cast<float*>(pChunk);
4426 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4427 {
4428 if( pafChunk[i] == 1.0 )
4429 pafChunk[i] = 0.0;
4430 else if( pafChunk[i] == 0.0 )
4431 pafChunk[i] = 255.0;
4432 }
4433 }
4434 else if( eWrkDataType == GDT_Byte )
4435 {
4436 GByte* pabyChunk = static_cast<GByte*>(pChunk);
4437 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4438 {
4439 if( pabyChunk[i] == 1 )
4440 pabyChunk[i] = 0;
4441 else if( pabyChunk[i] == 0 )
4442 pabyChunk[i] = 255;
4443 }
4444 }
4445 else if( eWrkDataType == GDT_UInt16 )
4446 {
4447 GUInt16* pasChunk = static_cast<GUInt16*>(pChunk);
4448 for( GPtrDiff_t i = 0; i < static_cast<GPtrDiff_t>(nChunkYSizeQueried)*nWidth; i++)
4449 {
4450 if( pasChunk[i] == 1 )
4451 pasChunk[i] = 0;
4452 else if( pasChunk[i] == 0 )
4453 pasChunk[i] = 255;
4454 }
4455 }
4456 else {
4457 CPLAssert(false);
4458 }
4459 }
4460
4461 std::shared_ptr<PointerHolder> oSrcBufferHolder(
4462 new PointerHolder(poJobQueue ? pChunk : nullptr));
4463 std::shared_ptr<PointerHolder> oSrcMaskBufferHolder(
4464 new PointerHolder(poJobQueue ? pabyChunkNodataMask : nullptr));
4465
4466 for( int iOverview = 0;
4467 iOverview < nOverviewCount && eErr == CE_None;
4468 ++iOverview )
4469 {
4470 GDALRasterBand* poDstBand = papoOvrBands[iOverview];
4471 const int nDstWidth = poDstBand->GetXSize();
4472 const int nDstHeight = poDstBand->GetYSize();
4473
4474 const double dfXRatioDstToSrc =
4475 static_cast<double>(nWidth) / nDstWidth;
4476 const double dfYRatioDstToSrc =
4477 static_cast<double>(nHeight) / nDstHeight;
4478
4479 /* -------------------------------------------------------------------- */
4480 /* Figure out the line to start writing to, and the first line */
4481 /* to not write to. In theory this approach should ensure that */
4482 /* every output line will be written if all input chunks are */
4483 /* processed. */
4484 /* -------------------------------------------------------------------- */
4485 int nDstYOff = static_cast<int>(0.5 + nChunkYOff/dfYRatioDstToSrc);
4486 if( nDstYOff == nDstHeight )
4487 continue;
4488 int nDstYOff2 = static_cast<int>(
4489 0.5 + (nChunkYOff+nFullResYChunk)/dfYRatioDstToSrc);
4490
4491 if( nChunkYOff + nFullResYChunk == nHeight )
4492 nDstYOff2 = nDstHeight;
4493 #if DEBUG_VERBOSE
4494 CPLDebug(
4495 "GDAL",
4496 "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
4497 0, nChunkYOffQueried, nWidth, nChunkYSizeQueried,
4498 0, nDstYOff, nDstWidth, nDstYOff2 - nDstYOff );
4499 #endif
4500
4501 auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
4502 poJob->pfnResampleFn = pfnResampleFn;
4503 poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
4504 poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
4505 poJob->eWrkDataType = eWrkDataType;
4506 poJob->pChunk = pChunk;
4507 poJob->pabyChunkNodataMask = pabyChunkNodataMask;
4508 poJob->nWidth = nWidth;
4509 poJob->nHeight = nHeight;
4510 poJob->nChunkYOff = nChunkYOffQueried;
4511 poJob->nChunkYSize = nChunkYSizeQueried;
4512 poJob->nDstWidth = nDstWidth;
4513 poJob->nDstYOff = nDstYOff;
4514 poJob->nDstYOff2 = nDstYOff2;
4515 poJob->poDstBand = poDstBand;
4516 poJob->pszResampling = pszResampling;
4517 poJob->bHasNoData = bHasNoData;
4518 poJob->fNoDataValue = fNoDataValue;
4519 poJob->poColorTable = poColorTable;
4520 poJob->eSrcDataType = eSrcDataType;
4521 poJob->bPropagateNoData = bPropagateNoData;
4522
4523 if( poJobQueue )
4524 {
4525 poJob->oSrcMaskBufferHolder = oSrcMaskBufferHolder;
4526 poJob->oSrcBufferHolder = oSrcBufferHolder;
4527 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
4528 jobList.emplace_back(std::move(poJob));
4529 }
4530 else
4531 {
4532 JobResampleFunc(poJob.get());
4533 eErr = poJob->eErr;
4534 if( eErr == CE_None )
4535 {
4536 eErr = WriteJobData(poJob.get());
4537 }
4538 }
4539 }
4540
4541 if( poJobQueue )
4542 {
4543 pChunk = nullptr;
4544 pabyChunkNodataMask = nullptr;
4545 }
4546 }
4547
4548 VSIFree( pChunk );
4549 VSIFree( pabyChunkNodataMask );
4550
4551 // Wait for all pending jobs to complete
4552 while( !jobList.empty() )
4553 {
4554 const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
4555 if( l_eErr != CE_None && eErr == CE_None )
4556 eErr = l_eErr;
4557 }
4558
4559 /* -------------------------------------------------------------------- */
4560 /* Renormalized overview mean / stddev if needed. */
4561 /* -------------------------------------------------------------------- */
4562 if( eErr == CE_None && EQUAL(pszResampling,"AVERAGE_MP") )
4563 {
4564 GDALOverviewMagnitudeCorrection(
4565 poSrcBand,
4566 nOverviewCount,
4567 reinterpret_cast<GDALRasterBandH *>( papoOvrBands ),
4568 GDALDummyProgress, nullptr );
4569 }
4570
4571 /* -------------------------------------------------------------------- */
4572 /* It can be important to flush out data to overviews. */
4573 /* -------------------------------------------------------------------- */
4574 for( int iOverview = 0;
4575 eErr == CE_None && iOverview < nOverviewCount;
4576 ++iOverview )
4577 {
4578 eErr = papoOvrBands[iOverview]->FlushCache();
4579 }
4580
4581 if( eErr == CE_None )
4582 pfnProgress( 1.0, nullptr, pProgressData );
4583
4584 return eErr;
4585 }
4586
4587 /************************************************************************/
4588 /* GDALRegenerateOverviewsMultiBand() */
4589 /************************************************************************/
4590
4591 /**
4592 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
4593 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
4594 *
4595 * This function will generate one or more overview images from a base
4596 * image using the requested downsampling algorithm. Its primary use
4597 * is for generating overviews via GDALDataset::BuildOverviews(), but it
4598 * can also be used to generate downsampled images in one file from another
4599 * outside the overview architecture.
4600 *
4601 * The output bands need to exist in advance and share the same characteristics
4602 * (type, dimensions)
4603 *
4604 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
4605 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
4606 *
4607 * It does not support color tables or complex data types.
4608 *
4609 * The pseudo-algorithm used by the function is :
4610 * for each overview
4611 * iterate on lines of the source by a step of deltay
4612 * iterate on columns of the source by a step of deltax
4613 * read the source data of size deltax * deltay for all the bands
4614 * generate the corresponding overview block for all the bands
4615 *
4616 * This function will honour properly NODATA_VALUES tuples (special dataset
4617 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4618 * considered as the nodata value and not each value of the triplet
4619 * independently per band.
4620 *
4621 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4622 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4623 * overview computation.
4624 *
4625 * @param nBands the number of bands, size of papoSrcBands and size of
4626 * first dimension of papapoOverviewBands
4627 * @param papoSrcBands the list of source bands to downsample
4628 * @param nOverviews the number of downsampled overview levels being generated.
4629 * @param papapoOverviewBands bidimension array of bands. First dimension is
4630 * indexed by nBands. Second dimension is indexed by
4631 * nOverviews.
4632 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
4633 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
4634 * @param pfnProgress progress report function.
4635 * @param pProgressData progress function callback data.
4636 * @return CE_None on success or CE_Failure on failure.
4637 */
4638
4639 CPLErr
GDALRegenerateOverviewsMultiBand(int nBands,GDALRasterBand ** papoSrcBands,int nOverviews,GDALRasterBand *** papapoOverviewBands,const char * pszResampling,GDALProgressFunc pfnProgress,void * pProgressData)4640 GDALRegenerateOverviewsMultiBand( int nBands, GDALRasterBand** papoSrcBands,
4641 int nOverviews,
4642 GDALRasterBand*** papapoOverviewBands,
4643 const char * pszResampling,
4644 GDALProgressFunc pfnProgress,
4645 void * pProgressData )
4646 {
4647 if( pfnProgress == nullptr )
4648 pfnProgress = GDALDummyProgress;
4649
4650 if( EQUAL(pszResampling,"NONE") )
4651 return CE_None;
4652
4653 // Sanity checks.
4654 if( !STARTS_WITH_CI(pszResampling, "NEAR") &&
4655 !EQUAL(pszResampling, "RMS") &&
4656 !EQUAL(pszResampling, "AVERAGE") &&
4657 !EQUAL(pszResampling, "GAUSS") &&
4658 !EQUAL(pszResampling, "CUBIC") &&
4659 !EQUAL(pszResampling, "CUBICSPLINE") &&
4660 !EQUAL(pszResampling, "LANCZOS") &&
4661 !EQUAL(pszResampling, "BILINEAR") )
4662 {
4663 CPLError(
4664 CE_Failure, CPLE_NotSupported,
4665 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
4666 "not supported", pszResampling );
4667 return CE_Failure;
4668 }
4669
4670 int nKernelRadius = 0;
4671 GDALResampleFunction pfnResampleFn =
4672 GDALGetResampleFunction(pszResampling, &nKernelRadius);
4673 if( pfnResampleFn == nullptr )
4674 return CE_Failure;
4675
4676 const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
4677 const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
4678 GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
4679 for( int iBand = 1; iBand < nBands; ++iBand )
4680 {
4681 if( papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
4682 papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight )
4683 {
4684 CPLError(
4685 CE_Failure, CPLE_NotSupported,
4686 "GDALRegenerateOverviewsMultiBand: all the source bands must "
4687 "have the same dimensions" );
4688 return CE_Failure;
4689 }
4690 if( papoSrcBands[iBand]->GetRasterDataType() != eDataType )
4691 {
4692 CPLError(
4693 CE_Failure, CPLE_NotSupported,
4694 "GDALRegenerateOverviewsMultiBand: all the source bands must "
4695 "have the same data type" );
4696 return CE_Failure;
4697 }
4698 }
4699
4700 for( int iOverview = 0; iOverview < nOverviews; ++iOverview )
4701 {
4702 const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
4703 const int nDstHeight = papapoOverviewBands[0][iOverview]->GetYSize();
4704 for( int iBand = 1; iBand < nBands; ++iBand )
4705 {
4706 if( papapoOverviewBands[iBand][iOverview]->GetXSize() !=
4707 nDstWidth ||
4708 papapoOverviewBands[iBand][iOverview]->GetYSize()
4709 != nDstHeight )
4710 {
4711 CPLError(
4712 CE_Failure, CPLE_NotSupported,
4713 "GDALRegenerateOverviewsMultiBand: all the overviews bands "
4714 "of the same level must have the same dimensions" );
4715 return CE_Failure;
4716 }
4717 if( papapoOverviewBands[iBand][iOverview]->GetRasterDataType() !=
4718 eDataType )
4719 {
4720 CPLError(
4721 CE_Failure, CPLE_NotSupported,
4722 "GDALRegenerateOverviewsMultiBand: all the overviews bands "
4723 "must have the same data type as the source bands" );
4724 return CE_Failure;
4725 }
4726 }
4727 }
4728
4729 // First pass to compute the total number of pixels to read.
4730 double dfTotalPixelCount = 0;
4731 for( int iOverview = 0; iOverview < nOverviews; ++iOverview )
4732 {
4733 int nSrcWidth = nToplevelSrcWidth;
4734 int nSrcHeight = nToplevelSrcHeight;
4735 const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
4736 // Try to use previous level of overview as the source to compute
4737 // the next level.
4738 if( iOverview > 0 &&
4739 papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstWidth )
4740 {
4741 nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
4742 nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
4743 }
4744
4745 dfTotalPixelCount += static_cast<double>(nSrcWidth) * nSrcHeight;
4746 }
4747
4748 const GDALDataType eWrkDataType =
4749 GDALGetOvrWorkDataType(pszResampling, eDataType);
4750
4751 // I'd wish we had a better way of conveying this !
4752 const bool bIsMask = CPLTestBool(CPLGetConfigOption(
4753 "GDAL_REGENERATED_BAND_IS_MASK", "NO"));
4754
4755 // If we have a nodata mask and we are doing something more complicated
4756 // than nearest neighbouring, we have to fetch to nodata mask.
4757 const bool bUseNoDataMask =
4758 !STARTS_WITH_CI(pszResampling, "NEAR") &&
4759 (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
4760
4761 int* const pabHasNoData = static_cast<int *>(
4762 VSI_MALLOC_VERBOSE(nBands * sizeof(int)) );
4763 float* const pafNoDataValue = static_cast<float *>(
4764 VSI_MALLOC_VERBOSE(nBands * sizeof(float)) );
4765 if( pabHasNoData == nullptr || pafNoDataValue == nullptr )
4766 {
4767 CPLFree(pabHasNoData);
4768 CPLFree(pafNoDataValue);
4769 return CE_Failure;
4770 }
4771
4772 for( int iBand = 0; iBand < nBands; ++iBand )
4773 {
4774 pabHasNoData[iBand] = FALSE;
4775 pafNoDataValue[iBand] = static_cast<float>(
4776 papoSrcBands[iBand]->GetNoDataValue(&pabHasNoData[iBand]) );
4777 }
4778 const bool bPropagateNoData =
4779 CPLTestBool( CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO") );
4780
4781 const char* pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4782 const int nThreads = std::max(1, std::min(128,
4783 EQUAL(pszThreads, "ALL_CPUS") ? CPLGetNumCPUs() : atoi(pszThreads)));
4784 auto poThreadPool = nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4785 auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue() :
4786 std::unique_ptr<CPLJobQueue>(nullptr);
4787
4788 // Only configurable for debug / testing
4789 const int nChunkMaxSize =
4790 atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4791
4792 // Second pass to do the real job.
4793 double dfCurPixelCount = 0;
4794 CPLErr eErr = CE_None;
4795 for( int iOverview = 0;
4796 iOverview < nOverviews && eErr == CE_None;
4797 ++iOverview )
4798 {
4799 int iSrcOverview = -1; // -1 means the source bands.
4800
4801 int nDstChunkXSize = 0;
4802 int nDstChunkYSize = 0;
4803 papapoOverviewBands[0][iOverview]->GetBlockSize( &nDstChunkXSize,
4804 &nDstChunkYSize );
4805
4806 const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
4807 const int nDstHeight = papapoOverviewBands[0][iOverview]->GetYSize();
4808
4809 // Try to use previous level of overview as the source to compute
4810 // the next level.
4811 int nSrcWidth = nToplevelSrcWidth;
4812 int nSrcHeight = nToplevelSrcHeight;
4813 if( iOverview > 0 &&
4814 papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstWidth )
4815 {
4816 nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
4817 nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
4818 iSrcOverview = iOverview - 1;
4819 }
4820
4821 const double dfXRatioDstToSrc =
4822 static_cast<double>(nSrcWidth) / nDstWidth;
4823 const double dfYRatioDstToSrc =
4824 static_cast<double>(nSrcHeight) / nDstHeight;
4825
4826 int nOvrFactor = std::max( static_cast<int>(0.5 + dfXRatioDstToSrc),
4827 static_cast<int>(0.5 + dfYRatioDstToSrc) );
4828 if( nOvrFactor == 0 ) nOvrFactor = 1;
4829
4830 // Try to extend the chunk size so that the memory needed to acquire
4831 // source pixels goes up to 10 MB.
4832 // This can help for drivers that support multi-threaded reading
4833 const int nFullResYChunk =
4834 2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
4835 const int nFullResYChunkQueried =
4836 nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
4837 while( nDstChunkXSize < nDstWidth )
4838 {
4839 const int nFullResXChunk =
4840 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
4841
4842 const int nFullResXChunkQueried =
4843 nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
4844
4845 if( static_cast<GIntBig>(nFullResXChunkQueried) *
4846 nFullResYChunkQueried * nBands *
4847 GDALGetDataTypeSizeBytes(eWrkDataType) > nChunkMaxSize )
4848 {
4849 break;
4850 }
4851
4852 nDstChunkXSize *= 2;
4853 }
4854 nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
4855
4856 const int nFullResXChunk =
4857 2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
4858 const int nFullResXChunkQueried =
4859 nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
4860
4861 // Structure describing a resampling job
4862 struct OvrJob
4863 {
4864 // Buffers to free when job is finished
4865 std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
4866 std::unique_ptr<PointerHolder> oSrcBufferHolder{};
4867 std::unique_ptr<PointerHolder> oDstBufferHolder{};
4868
4869 // Input parameters of pfnResampleFn
4870 GDALResampleFunction pfnResampleFn = nullptr;
4871 double dfXRatioDstToSrc{};
4872 double dfYRatioDstToSrc{};
4873 GDALDataType eWrkDataType = GDT_Unknown;
4874 const void * pChunk = nullptr;
4875 const GByte * pabyChunkNodataMask = nullptr;
4876 int nChunkXOff = 0;
4877 int nChunkXSize = 0;
4878 int nChunkYOff = 0;
4879 int nChunkYSize = 0;
4880 int nDstXOff = 0;
4881 int nDstXOff2 = 0;
4882 int nDstYOff = 0;
4883 int nDstYOff2 = 0;
4884 GDALRasterBand* poOverview = nullptr;
4885 const char * pszResampling = nullptr;
4886 int bHasNoData = 0;
4887 float fNoDataValue = 0.0f;
4888 GDALDataType eSrcDataType = GDT_Unknown;
4889 bool bPropagateNoData = false;
4890
4891 // Output values of resampling function
4892 CPLErr eErr = CE_Failure;
4893 void* pDstBuffer = nullptr;
4894 GDALDataType eDstBufferDataType = GDT_Unknown;
4895
4896 // Synchronization
4897 bool bFinished = false;
4898 std::mutex mutex{};
4899 std::condition_variable cv{};
4900 };
4901
4902 // Thread function to resample
4903 const auto JobResampleFunc = [](void* pData)
4904 {
4905 OvrJob* poJob = static_cast<OvrJob*>(pData);
4906
4907 poJob->eErr = poJob->pfnResampleFn(
4908 poJob->dfXRatioDstToSrc,
4909 poJob->dfYRatioDstToSrc,
4910 0.0, 0.0,
4911 poJob->eWrkDataType,
4912 poJob->pChunk,
4913 poJob->pabyChunkNodataMask,
4914 poJob->nChunkXOff,
4915 poJob->nChunkXSize,
4916 poJob->nChunkYOff,
4917 poJob->nChunkYSize,
4918 poJob->nDstXOff,
4919 poJob->nDstXOff2,
4920 poJob->nDstYOff,
4921 poJob->nDstYOff2,
4922 poJob->poOverview,
4923 &(poJob->pDstBuffer),
4924 &(poJob->eDstBufferDataType),
4925 poJob->pszResampling,
4926 poJob->bHasNoData,
4927 poJob->fNoDataValue,
4928 nullptr,
4929 poJob->eSrcDataType,
4930 poJob->bPropagateNoData);
4931
4932 poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
4933
4934 {
4935 std::lock_guard<std::mutex> guard(poJob->mutex);
4936 poJob->bFinished = true;
4937 poJob->cv.notify_one();
4938 }
4939 };
4940
4941 // Function to write resample data to target band
4942 const auto WriteJobData = [](const OvrJob* poJob)
4943 {
4944 return poJob->poOverview->RasterIO(
4945 GF_Write,
4946 poJob->nDstXOff,
4947 poJob->nDstYOff,
4948 poJob->nDstXOff2 - poJob->nDstXOff,
4949 poJob->nDstYOff2 - poJob->nDstYOff,
4950 poJob->pDstBuffer,
4951 poJob->nDstXOff2 - poJob->nDstXOff,
4952 poJob->nDstYOff2 - poJob->nDstYOff,
4953 poJob->eDstBufferDataType,
4954 0, 0, nullptr );
4955 };
4956
4957 // Wait for completion of oldest job and serialize it
4958 const auto WaitAndFinalizeOldestJob = [WriteJobData](
4959 std::list<std::unique_ptr<OvrJob>>& jobList)
4960 {
4961 auto poOldestJob = jobList.front().get();
4962 {
4963 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4964 while( !poOldestJob->bFinished )
4965 {
4966 poOldestJob->cv.wait(oGuard);
4967 }
4968 }
4969 CPLErr l_eErr = poOldestJob->eErr;
4970 if( l_eErr == CE_None )
4971 {
4972 l_eErr = WriteJobData(poOldestJob);
4973 }
4974
4975 jobList.pop_front();
4976 return l_eErr;
4977 };
4978
4979 // Queue of jobs
4980 std::list<std::unique_ptr<OvrJob>> jobList;
4981
4982 std::vector<void*> apaChunk(nBands);
4983 std::vector<GByte*> apabyChunkNoDataMask(nBands);
4984
4985 int nDstYOff = 0;
4986 // Iterate on destination overview, block by block.
4987 for( nDstYOff = 0;
4988 nDstYOff < nDstHeight && eErr == CE_None;
4989 nDstYOff += nDstChunkYSize )
4990 {
4991 int nDstYCount;
4992 if( nDstYOff + nDstChunkYSize <= nDstHeight )
4993 nDstYCount = nDstChunkYSize;
4994 else
4995 nDstYCount = nDstHeight - nDstYOff;
4996
4997 int nChunkYOff =
4998 static_cast<int>(nDstYOff * dfYRatioDstToSrc);
4999 int nChunkYOff2 =
5000 static_cast<int>(
5001 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc) );
5002 if( nChunkYOff2 > nSrcHeight || nDstYOff + nDstYCount == nDstHeight)
5003 nChunkYOff2 = nSrcHeight;
5004 int nYCount = nChunkYOff2 - nChunkYOff;
5005 CPLAssert(nYCount <= nFullResYChunk);
5006
5007 int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5008 int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
5009 if( nChunkYOffQueried < 0 )
5010 {
5011 nChunkYSizeQueried += nChunkYOffQueried;
5012 nChunkYOffQueried = 0;
5013 }
5014 if( nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight )
5015 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5016 CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5017
5018 if( !pfnProgress( dfCurPixelCount / dfTotalPixelCount,
5019 nullptr, pProgressData ) )
5020 {
5021 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5022 eErr = CE_Failure;
5023 }
5024
5025 int nDstXOff = 0;
5026 // Iterate on destination overview, block by block.
5027 for( nDstXOff = 0;
5028 nDstXOff < nDstWidth && eErr == CE_None;
5029 nDstXOff += nDstChunkXSize )
5030 {
5031 int nDstXCount = 0;
5032 if( nDstXOff + nDstChunkXSize <= nDstWidth )
5033 nDstXCount = nDstChunkXSize;
5034 else
5035 nDstXCount = nDstWidth - nDstXOff;
5036
5037 int nChunkXOff =
5038 static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5039 int nChunkXOff2 =
5040 static_cast<int>(
5041 ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc) );
5042 if( nChunkXOff2 > nSrcWidth ||
5043 nDstXOff + nDstXCount == nDstWidth )
5044 nChunkXOff2 = nSrcWidth;
5045 const int nXCount = nChunkXOff2 - nChunkXOff;
5046 CPLAssert(nXCount <= nFullResXChunk);
5047
5048 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5049 int nChunkXSizeQueried =
5050 nXCount + 2 * nKernelRadius * nOvrFactor;
5051 if( nChunkXOffQueried < 0 )
5052 {
5053 nChunkXSizeQueried += nChunkXOffQueried;
5054 nChunkXOffQueried = 0;
5055 }
5056 if( nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth )
5057 nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5058 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5059 #if DEBUG_VERBOSE
5060 CPLDebug(
5061 "GDAL",
5062 "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5063 nChunkXOffQueried, nChunkYOffQueried, nChunkXSizeQueried, nChunkYSizeQueried,
5064 nDstXOff, nDstYOff, nDstXCount, nDstYCount );
5065 #endif
5066
5067 // Avoid accumulating too many tasks and exhaust RAM
5068
5069 // Try to complete already finished jobs
5070 while( eErr == CE_None && !jobList.empty() )
5071 {
5072 auto poOldestJob = jobList.front().get();
5073 {
5074 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5075 if( !poOldestJob->bFinished )
5076 {
5077 break;
5078 }
5079 }
5080 eErr = poOldestJob->eErr;
5081 if( eErr == CE_None )
5082 {
5083 eErr = WriteJobData(poOldestJob);
5084 }
5085
5086 jobList.pop_front();
5087 }
5088
5089 // And in case we have saturated the number of threads,
5090 // wait for completion of tasks to go below the threshold.
5091 while( eErr == CE_None &&
5092 jobList.size() >= static_cast<size_t>(nThreads) )
5093 {
5094 eErr = WaitAndFinalizeOldestJob(jobList);
5095 }
5096
5097 // (Re)allocate buffers if needed
5098 for( int iBand = 0; iBand < nBands; ++iBand )
5099 {
5100 if( apaChunk[iBand] == nullptr )
5101 {
5102 apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
5103 nFullResXChunkQueried,
5104 nFullResYChunkQueried,
5105 GDALGetDataTypeSizeBytes(eWrkDataType) );
5106 if( apaChunk[iBand] == nullptr )
5107 {
5108 eErr = CE_Failure;
5109 }
5110 }
5111 if( bUseNoDataMask && apabyChunkNoDataMask[iBand] == nullptr )
5112 {
5113 apabyChunkNoDataMask[iBand] = static_cast<GByte *>(
5114 VSI_MALLOC2_VERBOSE( nFullResXChunkQueried,
5115 nFullResYChunkQueried ) );
5116 if( apabyChunkNoDataMask[iBand] == nullptr )
5117 {
5118 eErr = CE_Failure;
5119 }
5120 }
5121 }
5122
5123 // Read the source buffers for all the bands.
5124 for( int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand )
5125 {
5126 GDALRasterBand* poSrcBand = nullptr;
5127 if( iSrcOverview == -1 )
5128 poSrcBand = papoSrcBands[iBand];
5129 else
5130 poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5131 eErr = poSrcBand->RasterIO(
5132 GF_Read,
5133 nChunkXOffQueried, nChunkYOffQueried,
5134 nChunkXSizeQueried, nChunkYSizeQueried,
5135 apaChunk[iBand],
5136 nChunkXSizeQueried, nChunkYSizeQueried,
5137 eWrkDataType, 0, 0, nullptr );
5138
5139 if( bUseNoDataMask && eErr == CE_None )
5140 {
5141 const bool bUseSrcAsMaskBand = bIsMask || papoSrcBands[iBand]->GetColorInterpretation() == GCI_AlphaBand;
5142 auto poMaskBand = bUseSrcAsMaskBand ? poSrcBand : poSrcBand->GetMaskBand();
5143 eErr = poMaskBand->RasterIO(
5144 GF_Read,
5145 nChunkXOffQueried, nChunkYOffQueried,
5146 nChunkXSizeQueried, nChunkYSizeQueried,
5147 apabyChunkNoDataMask[iBand],
5148 nChunkXSizeQueried, nChunkYSizeQueried,
5149 GDT_Byte, 0, 0, nullptr );
5150 }
5151 }
5152
5153 // Compute the resulting overview block.
5154 for( int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand )
5155 {
5156 auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
5157 poJob->pfnResampleFn = pfnResampleFn;
5158 poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
5159 poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
5160 poJob->eWrkDataType = eWrkDataType;
5161 poJob->pChunk = apaChunk[iBand];
5162 poJob->pabyChunkNodataMask = apabyChunkNoDataMask[iBand];
5163 poJob->nChunkXOff = nChunkXOffQueried;
5164 poJob->nChunkXSize = nChunkXSizeQueried;
5165 poJob->nChunkYOff = nChunkYOffQueried;
5166 poJob->nChunkYSize = nChunkYSizeQueried;
5167 poJob->nDstXOff = nDstXOff;
5168 poJob->nDstXOff2 = nDstXOff + nDstXCount;
5169 poJob->nDstYOff = nDstYOff;
5170 poJob->nDstYOff2 = nDstYOff + nDstYCount;
5171 poJob->poOverview = papapoOverviewBands[iBand][iOverview];
5172 poJob->pszResampling = pszResampling;
5173 poJob->bHasNoData = pabHasNoData[iBand];
5174 poJob->fNoDataValue = pafNoDataValue[iBand];
5175 poJob->eSrcDataType = eDataType;
5176 poJob->bPropagateNoData = bPropagateNoData;
5177
5178 if( poJobQueue )
5179 {
5180 poJob->oSrcMaskBufferHolder.reset(
5181 new PointerHolder(apabyChunkNoDataMask[iBand]));
5182 apabyChunkNoDataMask[iBand] = nullptr;
5183
5184 poJob->oSrcBufferHolder.reset(
5185 new PointerHolder(apaChunk[iBand]));
5186 apaChunk[iBand] = nullptr;
5187
5188 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5189 jobList.emplace_back(std::move(poJob));
5190 }
5191 else
5192 {
5193 JobResampleFunc(poJob.get());
5194 eErr = poJob->eErr;
5195 if( eErr == CE_None )
5196 {
5197 eErr = WriteJobData(poJob.get());
5198 }
5199 }
5200 }
5201 }
5202
5203 dfCurPixelCount += static_cast<double>(nYCount) * nSrcWidth;
5204 }
5205
5206 // Wait for all pending jobs to complete
5207 while( !jobList.empty() )
5208 {
5209 const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5210 if( l_eErr != CE_None && eErr == CE_None )
5211 eErr = l_eErr;
5212 }
5213
5214 // Flush the data to overviews.
5215 for( int iBand = 0; iBand < nBands; ++iBand )
5216 {
5217 CPLFree(apaChunk[iBand]);
5218 papapoOverviewBands[iBand][iOverview]->FlushCache();
5219
5220 CPLFree(apabyChunkNoDataMask[iBand]);
5221 }
5222 }
5223
5224 CPLFree(pabHasNoData);
5225 CPLFree(pafNoDataValue);
5226
5227 if( eErr == CE_None )
5228 pfnProgress( 1.0, nullptr, pProgressData );
5229
5230 return eErr;
5231 }
5232
5233 /************************************************************************/
5234 /* GDALComputeBandStats() */
5235 /************************************************************************/
5236
5237 /** Undocumented
5238 * @param hSrcBand undocumented.
5239 * @param nSampleStep undocumented.
5240 * @param pdfMean undocumented.
5241 * @param pdfStdDev undocumented.
5242 * @param pfnProgress undocumented.
5243 * @param pProgressData undocumented.
5244 * @return undocumented
5245 */
5246 CPLErr CPL_STDCALL
GDALComputeBandStats(GDALRasterBandH hSrcBand,int nSampleStep,double * pdfMean,double * pdfStdDev,GDALProgressFunc pfnProgress,void * pProgressData)5247 GDALComputeBandStats( GDALRasterBandH hSrcBand,
5248 int nSampleStep,
5249 double *pdfMean, double *pdfStdDev,
5250 GDALProgressFunc pfnProgress,
5251 void *pProgressData )
5252
5253 {
5254 VALIDATE_POINTER1( hSrcBand, "GDALComputeBandStats", CE_Failure );
5255
5256 GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle( hSrcBand );
5257
5258 if( pfnProgress == nullptr )
5259 pfnProgress = GDALDummyProgress;
5260
5261 const int nWidth = poSrcBand->GetXSize();
5262 const int nHeight = poSrcBand->GetYSize();
5263
5264 if( nSampleStep >= nHeight || nSampleStep < 1 )
5265 nSampleStep = 1;
5266
5267 GDALDataType eWrkType = GDT_Unknown;
5268 float *pafData = nullptr;
5269 GDALDataType eType = poSrcBand->GetRasterDataType();
5270 const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
5271 if( bComplex )
5272 {
5273 pafData = static_cast<float *>(
5274 VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)) );
5275 eWrkType = GDT_CFloat32;
5276 }
5277 else
5278 {
5279 pafData = static_cast<float *>(
5280 VSI_MALLOC_VERBOSE(nWidth * sizeof(float)) );
5281 eWrkType = GDT_Float32;
5282 }
5283
5284 if( nWidth == 0 || pafData == nullptr )
5285 {
5286 VSIFree(pafData);
5287 return CE_Failure;
5288 }
5289
5290 /* -------------------------------------------------------------------- */
5291 /* Loop over all sample lines. */
5292 /* -------------------------------------------------------------------- */
5293 double dfSum = 0.0;
5294 double dfSum2 = 0.0;
5295 int iLine = 0;
5296 GIntBig nSamples = 0;
5297
5298 do
5299 {
5300 if( !pfnProgress( iLine / static_cast<double>( nHeight ),
5301 nullptr, pProgressData ) )
5302 {
5303 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5304 CPLFree( pafData );
5305 return CE_Failure;
5306 }
5307
5308 const CPLErr eErr =
5309 poSrcBand->RasterIO( GF_Read, 0, iLine, nWidth, 1,
5310 pafData, nWidth, 1, eWrkType,
5311 0, 0, nullptr );
5312 if( eErr != CE_None )
5313 {
5314 CPLFree( pafData );
5315 return eErr;
5316 }
5317
5318 for( int iPixel = 0; iPixel < nWidth; ++iPixel )
5319 {
5320 float fValue = 0.0f;
5321
5322 if( bComplex )
5323 {
5324 // Compute the magnitude of the complex value.
5325 fValue = std::hypot(pafData[iPixel*2],
5326 pafData[iPixel*2+1]);
5327 }
5328 else
5329 {
5330 fValue = pafData[iPixel];
5331 }
5332
5333 dfSum += fValue;
5334 dfSum2 += fValue * fValue;
5335 }
5336
5337 nSamples += nWidth;
5338 iLine += nSampleStep;
5339 } while( iLine < nHeight );
5340
5341 if( !pfnProgress( 1.0, nullptr, pProgressData ) )
5342 {
5343 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5344 CPLFree( pafData );
5345 return CE_Failure;
5346 }
5347
5348 /* -------------------------------------------------------------------- */
5349 /* Produce the result values. */
5350 /* -------------------------------------------------------------------- */
5351 if( pdfMean != nullptr )
5352 *pdfMean = dfSum / nSamples;
5353
5354 if( pdfStdDev != nullptr )
5355 {
5356 const double dfMean = dfSum / nSamples;
5357
5358 *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
5359 }
5360
5361 CPLFree( pafData );
5362
5363 return CE_None;
5364 }
5365
5366 /************************************************************************/
5367 /* GDALOverviewMagnitudeCorrection() */
5368 /* */
5369 /* Correct the mean and standard deviation of the overviews of */
5370 /* the given band to match the base layer approximately. */
5371 /************************************************************************/
5372
5373 /** Undocumented
5374 * @param hBaseBand undocumented.
5375 * @param nOverviewCount undocumented.
5376 * @param pahOverviews undocumented.
5377 * @param pfnProgress undocumented.
5378 * @param pProgressData undocumented.
5379 * @return undocumented
5380 */
5381 CPLErr
GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,int nOverviewCount,GDALRasterBandH * pahOverviews,GDALProgressFunc pfnProgress,void * pProgressData)5382 GDALOverviewMagnitudeCorrection( GDALRasterBandH hBaseBand,
5383 int nOverviewCount,
5384 GDALRasterBandH *pahOverviews,
5385 GDALProgressFunc pfnProgress,
5386 void *pProgressData )
5387
5388 {
5389 VALIDATE_POINTER1( hBaseBand, "GDALOverviewMagnitudeCorrection",
5390 CE_Failure );
5391
5392 /* -------------------------------------------------------------------- */
5393 /* Compute mean/stddev for source raster. */
5394 /* -------------------------------------------------------------------- */
5395 double dfOrigMean = 0.0;
5396 double dfOrigStdDev = 0.0;
5397 {
5398 const CPLErr eErr
5399 = GDALComputeBandStats( hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
5400 pfnProgress, pProgressData );
5401
5402 if( eErr != CE_None )
5403 return eErr;
5404 }
5405
5406 /* -------------------------------------------------------------------- */
5407 /* Loop on overview bands. */
5408 /* -------------------------------------------------------------------- */
5409 for( int iOverview = 0; iOverview < nOverviewCount; ++iOverview )
5410 {
5411 GDALRasterBand *poOverview = GDALRasterBand::FromHandle(
5412 pahOverviews[iOverview]);
5413 double dfOverviewMean, dfOverviewStdDev;
5414
5415 const CPLErr eErr =
5416 GDALComputeBandStats( pahOverviews[iOverview], 1,
5417 &dfOverviewMean, &dfOverviewStdDev,
5418 pfnProgress, pProgressData );
5419
5420 if( eErr != CE_None )
5421 return eErr;
5422
5423 double dfGain = 1.0;
5424 if( dfOrigStdDev >= 0.0001 )
5425 dfGain = dfOrigStdDev / dfOverviewStdDev;
5426
5427 /* -------------------------------------------------------------------- */
5428 /* Apply gain and offset. */
5429 /* -------------------------------------------------------------------- */
5430 const int nWidth = poOverview->GetXSize();
5431 const int nHeight = poOverview->GetYSize();
5432
5433 GDALDataType eWrkType = GDT_Unknown;
5434 float *pafData = nullptr;
5435 const GDALDataType eType = poOverview->GetRasterDataType();
5436 const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
5437 if( bComplex )
5438 {
5439 pafData = static_cast<float *>(
5440 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)) );
5441 eWrkType = GDT_CFloat32;
5442 }
5443 else
5444 {
5445 pafData = static_cast<float *>(
5446 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)) );
5447 eWrkType = GDT_Float32;
5448 }
5449
5450 if( pafData == nullptr )
5451 {
5452 return CE_Failure;
5453 }
5454
5455 for( int iLine = 0; iLine < nHeight; ++iLine )
5456 {
5457 if( !pfnProgress( iLine / static_cast<double>( nHeight ),
5458 nullptr, pProgressData ) )
5459 {
5460 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5461 CPLFree( pafData );
5462 return CE_Failure;
5463 }
5464
5465 if( poOverview->RasterIO( GF_Read, 0, iLine, nWidth, 1,
5466 pafData, nWidth, 1, eWrkType,
5467 0, 0, nullptr ) != CE_None )
5468 {
5469 CPLFree( pafData );
5470 return CE_Failure;
5471 }
5472
5473 for( int iPixel = 0; iPixel < nWidth; ++iPixel )
5474 {
5475 if( bComplex )
5476 {
5477 pafData[iPixel*2] *= static_cast<float>( dfGain );
5478 pafData[iPixel*2+1] *= static_cast<float>( dfGain );
5479 }
5480 else
5481 {
5482 pafData[iPixel] = static_cast<float>(
5483 (pafData[iPixel] - dfOverviewMean)
5484 * dfGain + dfOrigMean );
5485 }
5486 }
5487
5488 if( poOverview->RasterIO( GF_Write, 0, iLine, nWidth, 1,
5489 pafData, nWidth, 1, eWrkType,
5490 0, 0, nullptr ) != CE_None )
5491 {
5492 CPLFree( pafData );
5493 return CE_Failure;
5494 }
5495 }
5496
5497 if( !pfnProgress( 1.0, nullptr, pProgressData ) )
5498 {
5499 CPLError( CE_Failure, CPLE_UserInterrupt, "User terminated" );
5500 CPLFree( pafData );
5501 return CE_Failure;
5502 }
5503
5504 CPLFree( pafData );
5505 }
5506
5507 return CE_None;
5508 }
5509