/******************************************************************************
 * $Id: gdalsse_priv.h 207e8bcf055703689d991e1278fe08478cfd7956 2020-12-19 17:03:27 +0100 Even Rouault $
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64 bit processors because they are guaranteed to have SSE2 */
/* This could also be used on 32 bit, but we would need a runtime check */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

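/* The GDALCopy*ToXMM helpers below load a small number of bytes from a */
/* possibly unaligned address into the low bytes of an XMM register, using */
/* memcpy on CPUs that require aligned accesses and a direct dereference */
/* elsewhere. */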
static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
#endif
}

static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
#endif
}

static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}

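/* GDALCopyXMMToInt32/GDALCopyXMMToInt64 are called by the Store* methods */
/* below but were missing from this excerpt; the following is a sketch that */
/* mirrors the GDALCopyXMMToInt16 pattern above. */
static inline void GDALCopyXMMToInt32(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i = _mm_cvtsi128_si32(xmm);
    memcpy(pDest, &i, 4);
#else
    *static_cast<GInt32*>(pDest) = _mm_cvtsi128_si32(xmm);
#endif
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i = _mm_cvtsi128_si64(xmm);
    memcpy(pDest, &i, 8);
#else
    *static_cast<GInt64*>(pDest) = _mm_cvtsi128_si64(xmm);
#endif
}
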
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val)) {}
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
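        /* Zero-extend uint8 -> int32 in two unpack-with-zero steps */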
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i,_mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store2Val(unsigned char* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);  /* Pack int32 to int16 with signed saturation */
        tmp = _mm_packus_epi16(tmp, tmp); /* Pack int16 to uint8 with unsigned saturation */
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
    }

    inline void Store2Val(unsigned short* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
    }

    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
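
/* Usage sketch (illustrative, not part of the original header; nCount and
   padfData are placeholder names): accumulate the sum of an even-sized
   double array two lanes at a time.

       XMMReg2Double oAcc = XMMReg2Double::Zero();
       for( int i = 0; i < nCount; i += 2 )
           oAcc += XMMReg2Double::Load2Val(padfData + i);
       const double dfSum = oAcc.GetHorizSum();
*/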

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}
    XMMReg2Double(double val) { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

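    /* Note: the all-ones comparison masks produced above read back as NaN
       doubles, and NaN tests true in a boolean context, so Ternary can
       branch on the mask values directly. */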
    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double () const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION) */

#ifdef __AVX__

#include <immintrin.h>

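/* Note: AVX implies SSE4.1 support, so the _mm_cvtepu8_epi32 style */
/* intrinsics can be used unconditionally in this branch. */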
class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd()) {}
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

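    /* The SSSE3 shuffle below gathers byte 0 of each 32-bit lane (control
       bytes 0, 4, 8, 12); unlike the packs/packus sequence left commented
       out, it does not saturate, so values are assumed to fit in a byte. */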
    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        //xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        //xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
    }

    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);   // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
    }
};

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char* ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(low.xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(high.xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
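        /* Combine the low int32 pair of each register into a single vector */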
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow), _mm_castsi128_ps(tmpHigh), _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
#endif
    }

    inline void Store4Val(unsigned short* ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);   // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
        xmm0 = _mm_packs_epi32( xmm0, xmm0 );
        xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
#endif
    }

    inline void Store4Val(float* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void Store4Val(double* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr+16);
    }
};

#endif

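/* Usage sketch (illustrative, not part of the original header; nCount and
   pafData are placeholder names): sum a float array four lanes at a time,
   with whichever XMMReg4Double implementation was selected above.

       XMMReg4Double oSum = XMMReg4Double::Zero();
       for( int i = 0; i < nCount; i += 4 )
           oSum += XMMReg4Double::Load4Val(pafData + i);
       const double dfTotal = oSum.GetHorizSum();
*/
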
#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */