1 /******************************************************************************
2  *
3  * Project:  GDAL Gridding API.
4  * Purpose:  Implementation of GDAL scattered data gridder.
5  * Author:   Even Rouault, <even dot rouault at spatialys.com>
6  *
7  ******************************************************************************
8  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a
11  * copy of this software and associated documentation files (the "Software"),
12  * to deal in the Software without restriction, including without limitation
13  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14  * and/or sell copies of the Software, and to permit persons to whom the
15  * Software is furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included
18  * in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26  * DEALINGS IN THE SOFTWARE.
27  ****************************************************************************/
28 
29 #include "gdalgrid.h"
30 #include "gdalgrid_priv.h"
31 
32 #ifdef HAVE_AVX_AT_COMPILE_TIME
33 #include <immintrin.h>
34 
35 CPL_CVSID("$Id: gdalgridavx.cpp 355b41831cd2685c85d1aabe5b95665a2c6e99b7 2019-06-19 17:07:04 +0200 Even Rouault $")
36 
37 /************************************************************************/
38 /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */
39 /************************************************************************/
40 
41 #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
42 
43 CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(const void * poOptions,GUInt32 nPoints,CPL_UNUSED const double * unused_padfX,CPL_UNUSED const double * unused_padfY,CPL_UNUSED const double * unused_padfZ,double dfXPoint,double dfYPoint,double * pdfValue,void * hExtraParamsIn)44 GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
45     const void *poOptions,
46     GUInt32 nPoints,
47     CPL_UNUSED const double *unused_padfX,
48     CPL_UNUSED const double *unused_padfY,
49     CPL_UNUSED const double *unused_padfZ,
50     double dfXPoint, double dfYPoint,
51     double *pdfValue,
52     void* hExtraParamsIn )
53 {
54     size_t i = 0;
55     GDALGridExtraParameters* psExtraParams = static_cast<GDALGridExtraParameters*>(hExtraParamsIn);
56     const float* pafX = psExtraParams->pafX;
57     const float* pafY = psExtraParams->pafY;
58     const float* pafZ = psExtraParams->pafZ;
59 
60     const float fEpsilon = 0.0000000000001f;
61     const float fXPoint = static_cast<float>(dfXPoint);
62     const float fYPoint = static_cast<float>(dfYPoint);
63     const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
64     const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
65     const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
66     __m256 ymm_nominator = _mm256_setzero_ps();
67     __m256 ymm_denominator = _mm256_setzero_ps();
68     int mask = 0;
69 
70 #undef LOOP_SIZE
71 #if defined(__x86_64) || defined(_M_X64)
72     /* This would also work in 32bit mode, but there are only 8 XMM registers */
73     /* whereas we have 16 for 64bit */
74 #define LOOP_SIZE   16
75     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
76     for( i = 0; i < nPointsRound; i += LOOP_SIZE )
77     {
78         __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x);            /* rx = pafX[i] - fXPoint */
79         __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
80         __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y);            /* ry = pafY[i] - fYPoint */
81         __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
82         __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),               /* r2 = rx * rx + ry * ry */
83                                    _mm256_mul_ps(ymm_ry, ymm_ry));
84         __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
85                                      _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
86         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                               /* invr2 = 1.0f / r2 */
87         __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
88         ymm_nominator = _mm256_add_ps(ymm_nominator,                            /* nominator += invr2 * pafZ[i] */
89                             _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
90         ymm_nominator = _mm256_add_ps(ymm_nominator,
91                             _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
92         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);           /* denominator += invr2 */
93         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
94         mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) |           /* if( r2 < fEpsilon) */
95               (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8);
96         if( mask )
97             break;
98     }
99 #else
100 #define LOOP_SIZE   8
101     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
102     for( i = 0; i < nPointsRound; i += LOOP_SIZE )
103     {
104         __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(const_cast<float*>(pafX) + i), ymm_x);           /* rx = pafX[i] - fXPoint */
105         __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(const_cast<float*>(pafY) + i), ymm_y);           /* ry = pafY[i] - fYPoint */
106         __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),              /* r2 = rx * rx + ry * ry */
107                                    _mm256_mul_ps(ymm_ry, ymm_ry));
108         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                              /* invr2 = 1.0f / r2 */
109         ymm_nominator = _mm256_add_ps(ymm_nominator,                           /* nominator += invr2 * pafZ[i] */
110                             _mm256_mul_ps(ymm_invr2, _mm256_load_ps(const_cast<float*>(pafZ) + i)));
111         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);           /* denominator += invr2 */
112         mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS));            /* if( r2 < fEpsilon) */
113         if( mask )
114             break;
115     }
116 #endif
117 
118     // Find which i triggered r2 < fEpsilon.
119     if( mask )
120     {
121         for( int j = 0; j < LOOP_SIZE; j++ )
122         {
123             if( mask & (1 << j) )
124             {
125                 (*pdfValue) = (pafZ)[i + j];
126 
127                 // GCC and MSVC need explicit zeroing.
128 #if !defined(__clang__)
129                 _mm256_zeroupper();
130 #endif
131                 return CE_None;
132             }
133         }
134     }
135 #undef LOOP_SIZE
136 
137     // Get back nominator and denominator values for YMM registers.
138     float afNominator[8];
139     float afDenominator[8];
140     _mm256_storeu_ps(afNominator, ymm_nominator);
141     _mm256_storeu_ps(afDenominator, ymm_denominator);
142 
143     // MSVC doesn't emit AVX afterwards but may use SSE, so clear
144     // upper bits.  Other compilers will continue using AVX for the
145     // below floating points operations.
146 #if defined(_MSC_FULL_VER)
147     _mm256_zeroupper();
148 #endif
149 
150     float fNominator = afNominator[0] + afNominator[1] +
151                        afNominator[2] + afNominator[3] +
152                        afNominator[4] + afNominator[5] +
153                        afNominator[6] + afNominator[7];
154     float fDenominator = afDenominator[0] + afDenominator[1] +
155                          afDenominator[2] + afDenominator[3] +
156                          afDenominator[4] + afDenominator[5] +
157                          afDenominator[6] + afDenominator[7];
158 
159     // Do the few remaining loop iterations.
160     for( ; i < nPoints; i++ )
161     {
162         const float fRX = pafX[i] - fXPoint;
163         const float fRY = pafY[i] - fYPoint;
164         const float fR2 =
165             fRX * fRX + fRY * fRY;
166 
167         // If the test point is close to the grid node, use the point
168         // value directly as a node value to avoid singularity.
169         if( fR2 < 0.0000000000001 )
170         {
171             break;
172         }
173         else
174         {
175             const float fInvR2 = 1.0f / fR2;
176             fNominator += fInvR2 * pafZ[i];
177             fDenominator += fInvR2;
178         }
179     }
180 
181     if( i != nPoints )
182     {
183         (*pdfValue) = pafZ[i];
184     }
185     else
186     if( fDenominator == 0.0 )
187     {
188         (*pdfValue) =
189             static_cast<const GDALGridInverseDistanceToAPowerOptions*>(poOptions)->dfNoDataValue;
190     }
191     else
192         (*pdfValue) = fNominator / fDenominator;
193 
194     // GCC needs explicit zeroing.
195 #if defined(__GNUC__) && !defined(__clang__)
196     _mm256_zeroupper();
197 #endif
198 
199     return CE_None;
200 }
201 
202 #endif /* HAVE_AVX_AT_COMPILE_TIME */
203