1 /******************************************************************************
2  *
3  * Project:  GDAL Gridding API.
4  * Purpose:  Implementation of GDAL scattered data gridder.
5  * Author:   Even Rouault, <even dot rouault at spatialys.com>
6  *
7  ******************************************************************************
8  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a
11  * copy of this software and associated documentation files (the "Software"),
12  * to deal in the Software without restriction, including without limitation
13  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14  * and/or sell copies of the Software, and to permit persons to whom the
15  * Software is furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included
18  * in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26  * DEALINGS IN THE SOFTWARE.
27  ****************************************************************************/
28 
29 #include "gdalgrid.h"
30 #include "gdalgrid_priv.h"
31 
32 #ifdef HAVE_SSE_AT_COMPILE_TIME
33 #include <xmmintrin.h>
34 
35 CPL_CVSID("$Id: gdalgridsse.cpp 355b41831cd2685c85d1aabe5b95665a2c6e99b7 2019-06-19 17:07:04 +0200 Even Rouault $")
36 
37 /************************************************************************/
38 /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE()     */
39 /************************************************************************/
40 
41 CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(const void * poOptions,GUInt32 nPoints,CPL_UNUSED const double * unused_padfX,CPL_UNUSED const double * unused_padfY,CPL_UNUSED const double * unused_padfZ,double dfXPoint,double dfYPoint,double * pdfValue,void * hExtraParamsIn)42 GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
43                                         const void *poOptions,
44                                         GUInt32 nPoints,
45                                         CPL_UNUSED const double *unused_padfX,
46                                         CPL_UNUSED const double *unused_padfY,
47                                         CPL_UNUSED const double *unused_padfZ,
48                                         double dfXPoint, double dfYPoint,
49                                         double *pdfValue,
50                                         void* hExtraParamsIn )
51 {
52     size_t i = 0;
53     GDALGridExtraParameters* psExtraParams =
54         static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
55     const float* pafX = psExtraParams->pafX;
56     const float* pafY = psExtraParams->pafY;
57     const float* pafZ = psExtraParams->pafZ;
58 
59     const float fEpsilon = 0.0000000000001f;
60     const float fXPoint = static_cast<float>(dfXPoint);
61     const float fYPoint = static_cast<float>(dfYPoint);
62     const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon));
63     const __m128 xmm_x = _mm_load1_ps(const_cast<float*>(&fXPoint));
64     const __m128 xmm_y = _mm_load1_ps(const_cast<float*>(&fYPoint));
65     __m128 xmm_nominator = _mm_setzero_ps();
66     __m128 xmm_denominator = _mm_setzero_ps();
67     int mask = 0;
68 
69 #if defined(__x86_64) || defined(_M_X64)
70     // This would also work in 32bit mode, but there are only 8 XMM registers
71     // whereas we have 16 for 64bit.
72     const size_t LOOP_SIZE = 8;
73     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
74     for( i = 0; i < nPointsRound; i += LOOP_SIZE )
75     {
76         // rx = pafX[i] - fXPoint
77         __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);
78         __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
79         // ry = pafY[i] - fYPoint
80         __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);
81         __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
82         // r2 = rx * rx + ry * ry
83         __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),
84                                    _mm_mul_ps(xmm_ry, xmm_ry));
85         __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
86                                      _mm_mul_ps(xmm_ry_4, xmm_ry_4));
87         // invr2 = 1.0f / r2
88         __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);
89         __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
90         // nominator += invr2 * pafZ[i]
91         xmm_nominator = _mm_add_ps(xmm_nominator,
92                             _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
93         xmm_nominator = _mm_add_ps(xmm_nominator,
94                             _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
95         // denominator += invr2
96         xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);
97         xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
98         // if( r2 < fEpsilon)
99         mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |
100               (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
101         if( mask )
102             break;
103     }
104 #else
105 #define LOOP_SIZE   4
106     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
107     for( i = 0; i < nPointsRound; i += LOOP_SIZE )
108     {
109         __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);           /* rx = pafX[i] - fXPoint */
110         __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);           /* ry = pafY[i] - fYPoint */
111         __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),              /* r2 = rx * rx + ry * ry */
112                                    _mm_mul_ps(xmm_ry, xmm_ry));
113         __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);                              /* invr2 = 1.0f / r2 */
114         xmm_nominator = _mm_add_ps(xmm_nominator,                           /* nominator += invr2 * pafZ[i] */
115                             _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
116         xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);           /* denominator += invr2 */
117         mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small));            /* if( r2 < fEpsilon) */
118         if( mask )
119             break;
120     }
121 #endif
122 
123     // Find which i triggered r2 < fEpsilon.
124     if( mask )
125     {
126         for( size_t j = 0; j < LOOP_SIZE; j++ )
127         {
128             if( mask & (1 << j) )
129             {
130                 (*pdfValue) = (pafZ)[i + j];
131                 return CE_None;
132             }
133         }
134     }
135 
136     // Get back nominator and denominator values for XMM registers.
137     float afNominator[4];
138     float afDenominator[4];
139     _mm_storeu_ps(afNominator, xmm_nominator);
140     _mm_storeu_ps(afDenominator, xmm_denominator);
141 
142     float fNominator = afNominator[0] + afNominator[1] +
143                        afNominator[2] + afNominator[3];
144     float fDenominator = afDenominator[0] + afDenominator[1] +
145                          afDenominator[2] + afDenominator[3];
146 
147     /* Do the few remaining loop iterations */
148     for( ; i < nPoints; i++ )
149     {
150         const float fRX = pafX[i] - fXPoint;
151         const float fRY = pafY[i] - fYPoint;
152         const float fR2 =
153             fRX * fRX + fRY * fRY;
154 
155         // If the test point is close to the grid node, use the point
156         // value directly as a node value to avoid singularity.
157         if( fR2 < 0.0000000000001 )
158         {
159             break;
160         }
161         else
162         {
163             const float fInvR2 = 1.0f / fR2;
164             fNominator += fInvR2 * pafZ[i];
165             fDenominator += fInvR2;
166         }
167     }
168 
169     if( i != nPoints )
170     {
171         (*pdfValue) = pafZ[i];
172     }
173     else
174     if( fDenominator == 0.0 )
175     {
176         (*pdfValue) =
177             static_cast<const GDALGridInverseDistanceToAPowerOptions*>(poOptions)->dfNoDataValue;
178     }
179     else
180     {
181         (*pdfValue) = fNominator / fDenominator;
182     }
183 
184     return CE_None;
185 }
186 
187 #endif /* HAVE_SSE_AT_COMPILE_TIME */
188