1 /******************************************************************************
2 *
3 * Project: GDAL Gridding API.
4 * Purpose: Implementation of GDAL scattered data gridder.
5 * Author: Even Rouault, <even dot rouault at spatialys.com>
6 *
7 ******************************************************************************
8 * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a
11 * copy of this software and associated documentation files (the "Software"),
12 * to deal in the Software without restriction, including without limitation
13 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 * and/or sell copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included
18 * in all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 * DEALINGS IN THE SOFTWARE.
27 ****************************************************************************/
28
29 #include "gdalgrid.h"
30 #include "gdalgrid_priv.h"
31
32 #ifdef HAVE_SSE_AT_COMPILE_TIME
33 #include <xmmintrin.h>
34
35 CPL_CVSID("$Id: gdalgridsse.cpp 355b41831cd2685c85d1aabe5b95665a2c6e99b7 2019-06-19 17:07:04 +0200 Even Rouault $")
36
37 /************************************************************************/
38 /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */
39 /************************************************************************/
40
41 CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(const void * poOptions,GUInt32 nPoints,CPL_UNUSED const double * unused_padfX,CPL_UNUSED const double * unused_padfY,CPL_UNUSED const double * unused_padfZ,double dfXPoint,double dfYPoint,double * pdfValue,void * hExtraParamsIn)42 GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
43 const void *poOptions,
44 GUInt32 nPoints,
45 CPL_UNUSED const double *unused_padfX,
46 CPL_UNUSED const double *unused_padfY,
47 CPL_UNUSED const double *unused_padfZ,
48 double dfXPoint, double dfYPoint,
49 double *pdfValue,
50 void* hExtraParamsIn )
51 {
52 size_t i = 0;
53 GDALGridExtraParameters* psExtraParams =
54 static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
55 const float* pafX = psExtraParams->pafX;
56 const float* pafY = psExtraParams->pafY;
57 const float* pafZ = psExtraParams->pafZ;
58
59 const float fEpsilon = 0.0000000000001f;
60 const float fXPoint = static_cast<float>(dfXPoint);
61 const float fYPoint = static_cast<float>(dfYPoint);
62 const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon));
63 const __m128 xmm_x = _mm_load1_ps(const_cast<float*>(&fXPoint));
64 const __m128 xmm_y = _mm_load1_ps(const_cast<float*>(&fYPoint));
65 __m128 xmm_nominator = _mm_setzero_ps();
66 __m128 xmm_denominator = _mm_setzero_ps();
67 int mask = 0;
68
69 #if defined(__x86_64) || defined(_M_X64)
70 // This would also work in 32bit mode, but there are only 8 XMM registers
71 // whereas we have 16 for 64bit.
72 const size_t LOOP_SIZE = 8;
73 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
74 for( i = 0; i < nPointsRound; i += LOOP_SIZE )
75 {
76 // rx = pafX[i] - fXPoint
77 __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);
78 __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
79 // ry = pafY[i] - fYPoint
80 __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);
81 __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
82 // r2 = rx * rx + ry * ry
83 __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),
84 _mm_mul_ps(xmm_ry, xmm_ry));
85 __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
86 _mm_mul_ps(xmm_ry_4, xmm_ry_4));
87 // invr2 = 1.0f / r2
88 __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);
89 __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
90 // nominator += invr2 * pafZ[i]
91 xmm_nominator = _mm_add_ps(xmm_nominator,
92 _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
93 xmm_nominator = _mm_add_ps(xmm_nominator,
94 _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
95 // denominator += invr2
96 xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);
97 xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
98 // if( r2 < fEpsilon)
99 mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |
100 (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
101 if( mask )
102 break;
103 }
104 #else
105 #define LOOP_SIZE 4
106 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
107 for( i = 0; i < nPointsRound; i += LOOP_SIZE )
108 {
109 __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */
110 __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */
111 __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
112 _mm_mul_ps(xmm_ry, xmm_ry));
113 __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
114 xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
115 _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
116 xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
117 mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */
118 if( mask )
119 break;
120 }
121 #endif
122
123 // Find which i triggered r2 < fEpsilon.
124 if( mask )
125 {
126 for( size_t j = 0; j < LOOP_SIZE; j++ )
127 {
128 if( mask & (1 << j) )
129 {
130 (*pdfValue) = (pafZ)[i + j];
131 return CE_None;
132 }
133 }
134 }
135
136 // Get back nominator and denominator values for XMM registers.
137 float afNominator[4];
138 float afDenominator[4];
139 _mm_storeu_ps(afNominator, xmm_nominator);
140 _mm_storeu_ps(afDenominator, xmm_denominator);
141
142 float fNominator = afNominator[0] + afNominator[1] +
143 afNominator[2] + afNominator[3];
144 float fDenominator = afDenominator[0] + afDenominator[1] +
145 afDenominator[2] + afDenominator[3];
146
147 /* Do the few remaining loop iterations */
148 for( ; i < nPoints; i++ )
149 {
150 const float fRX = pafX[i] - fXPoint;
151 const float fRY = pafY[i] - fYPoint;
152 const float fR2 =
153 fRX * fRX + fRY * fRY;
154
155 // If the test point is close to the grid node, use the point
156 // value directly as a node value to avoid singularity.
157 if( fR2 < 0.0000000000001 )
158 {
159 break;
160 }
161 else
162 {
163 const float fInvR2 = 1.0f / fR2;
164 fNominator += fInvR2 * pafZ[i];
165 fDenominator += fInvR2;
166 }
167 }
168
169 if( i != nPoints )
170 {
171 (*pdfValue) = pafZ[i];
172 }
173 else
174 if( fDenominator == 0.0 )
175 {
176 (*pdfValue) =
177 static_cast<const GDALGridInverseDistanceToAPowerOptions*>(poOptions)->dfNoDataValue;
178 }
179 else
180 {
181 (*pdfValue) = fNominator / fDenominator;
182 }
183
184 return CE_None;
185 }
186
187 #endif /* HAVE_SSE_AT_COMPILE_TIME */
188