1 /******************************************************************************
2 *
3 * Project: GDAL Gridding API.
4 * Purpose: Implementation of GDAL scattered data gridder.
5 * Author: Even Rouault, <even dot rouault at spatialys.com>
6 *
7 ******************************************************************************
8 * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a
11 * copy of this software and associated documentation files (the "Software"),
12 * to deal in the Software without restriction, including without limitation
13 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 * and/or sell copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included
18 * in all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 * DEALINGS IN THE SOFTWARE.
27 ****************************************************************************/
28
29 #include "gdalgrid.h"
30 #include "gdalgrid_priv.h"
31
32 #ifdef HAVE_AVX_AT_COMPILE_TIME
33 #include <immintrin.h>
34
35 CPL_CVSID("$Id: gdalgridavx.cpp 355b41831cd2685c85d1aabe5b95665a2c6e99b7 2019-06-19 17:07:04 +0200 Even Rouault $")
36
37 /************************************************************************/
38 /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX() */
39 /************************************************************************/
40
41 #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
42
43 CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(const void * poOptions,GUInt32 nPoints,CPL_UNUSED const double * unused_padfX,CPL_UNUSED const double * unused_padfY,CPL_UNUSED const double * unused_padfZ,double dfXPoint,double dfYPoint,double * pdfValue,void * hExtraParamsIn)44 GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
45 const void *poOptions,
46 GUInt32 nPoints,
47 CPL_UNUSED const double *unused_padfX,
48 CPL_UNUSED const double *unused_padfY,
49 CPL_UNUSED const double *unused_padfZ,
50 double dfXPoint, double dfYPoint,
51 double *pdfValue,
52 void* hExtraParamsIn )
53 {
54 size_t i = 0;
55 GDALGridExtraParameters* psExtraParams = static_cast<GDALGridExtraParameters*>(hExtraParamsIn);
56 const float* pafX = psExtraParams->pafX;
57 const float* pafY = psExtraParams->pafY;
58 const float* pafZ = psExtraParams->pafZ;
59
60 const float fEpsilon = 0.0000000000001f;
61 const float fXPoint = static_cast<float>(dfXPoint);
62 const float fYPoint = static_cast<float>(dfYPoint);
63 const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
64 const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
65 const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
66 __m256 ymm_nominator = _mm256_setzero_ps();
67 __m256 ymm_denominator = _mm256_setzero_ps();
68 int mask = 0;
69
70 #undef LOOP_SIZE
71 #if defined(__x86_64) || defined(_M_X64)
72 /* This would also work in 32bit mode, but there are only 8 XMM registers */
73 /* whereas we have 16 for 64bit */
74 #define LOOP_SIZE 16
75 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
76 for( i = 0; i < nPointsRound; i += LOOP_SIZE )
77 {
78 __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */
79 __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
80 __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */
81 __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
82 __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
83 _mm256_mul_ps(ymm_ry, ymm_ry));
84 __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
85 _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
86 __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
87 __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
88 ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
89 _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
90 ymm_nominator = _mm256_add_ps(ymm_nominator,
91 _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
92 ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */
93 ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
94 mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
95 (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8);
96 if( mask )
97 break;
98 }
99 #else
100 #define LOOP_SIZE 8
101 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
102 for( i = 0; i < nPointsRound; i += LOOP_SIZE )
103 {
104 __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(const_cast<float*>(pafX) + i), ymm_x); /* rx = pafX[i] - fXPoint */
105 __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(const_cast<float*>(pafY) + i), ymm_y); /* ry = pafY[i] - fYPoint */
106 __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
107 _mm256_mul_ps(ymm_ry, ymm_ry));
108 __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
109 ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
110 _mm256_mul_ps(ymm_invr2, _mm256_load_ps(const_cast<float*>(pafZ) + i)));
111 ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */
112 mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
113 if( mask )
114 break;
115 }
116 #endif
117
118 // Find which i triggered r2 < fEpsilon.
119 if( mask )
120 {
121 for( int j = 0; j < LOOP_SIZE; j++ )
122 {
123 if( mask & (1 << j) )
124 {
125 (*pdfValue) = (pafZ)[i + j];
126
127 // GCC and MSVC need explicit zeroing.
128 #if !defined(__clang__)
129 _mm256_zeroupper();
130 #endif
131 return CE_None;
132 }
133 }
134 }
135 #undef LOOP_SIZE
136
137 // Get back nominator and denominator values for YMM registers.
138 float afNominator[8];
139 float afDenominator[8];
140 _mm256_storeu_ps(afNominator, ymm_nominator);
141 _mm256_storeu_ps(afDenominator, ymm_denominator);
142
143 // MSVC doesn't emit AVX afterwards but may use SSE, so clear
144 // upper bits. Other compilers will continue using AVX for the
145 // below floating points operations.
146 #if defined(_MSC_FULL_VER)
147 _mm256_zeroupper();
148 #endif
149
150 float fNominator = afNominator[0] + afNominator[1] +
151 afNominator[2] + afNominator[3] +
152 afNominator[4] + afNominator[5] +
153 afNominator[6] + afNominator[7];
154 float fDenominator = afDenominator[0] + afDenominator[1] +
155 afDenominator[2] + afDenominator[3] +
156 afDenominator[4] + afDenominator[5] +
157 afDenominator[6] + afDenominator[7];
158
159 // Do the few remaining loop iterations.
160 for( ; i < nPoints; i++ )
161 {
162 const float fRX = pafX[i] - fXPoint;
163 const float fRY = pafY[i] - fYPoint;
164 const float fR2 =
165 fRX * fRX + fRY * fRY;
166
167 // If the test point is close to the grid node, use the point
168 // value directly as a node value to avoid singularity.
169 if( fR2 < 0.0000000000001 )
170 {
171 break;
172 }
173 else
174 {
175 const float fInvR2 = 1.0f / fR2;
176 fNominator += fInvR2 * pafZ[i];
177 fDenominator += fInvR2;
178 }
179 }
180
181 if( i != nPoints )
182 {
183 (*pdfValue) = pafZ[i];
184 }
185 else
186 if( fDenominator == 0.0 )
187 {
188 (*pdfValue) =
189 static_cast<const GDALGridInverseDistanceToAPowerOptions*>(poOptions)->dfNoDataValue;
190 }
191 else
192 (*pdfValue) = fNominator / fDenominator;
193
194 // GCC needs explicit zeroing.
195 #if defined(__GNUC__) && !defined(__clang__)
196 _mm256_zeroupper();
197 #endif
198
199 return CE_None;
200 }
201
202 #endif /* HAVE_AVX_AT_COMPILE_TIME */
203