1 /******************************************************************************
2 * $Id: gdalsse_priv.h 207e8bcf055703689d991e1278fe08478cfd7956 2020-12-19 17:03:27 +0100 Even Rouault $
3 *
4 * Project: GDAL
5 * Purpose: SSE2 helper
6 * Author: Even Rouault <even dot rouault at spatialys dot com>
7 *
8 ******************************************************************************
9 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10 *
11 * Permission is hereby granted, free of charge, to any person obtaining a
12 * copy of this software and associated documentation files (the "Software"),
13 * to deal in the Software without restriction, including without limitation
14 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 * and/or sell copies of the Software, and to permit persons to whom the
16 * Software is furnished to do so, subject to the following conditions:
17 *
18 * The above copyright notice and this permission notice shall be included
19 * in all copies or substantial portions of the Software.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 * DEALINGS IN THE SOFTWARE.
28 ****************************************************************************/
29
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32
33 #ifndef DOXYGEN_SKIP
34
35 #include "cpl_port.h"
36
37 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
38 /* Could possibly be used too on 32bit, but we would need to check at runtime */
39 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
40
41 /* Requires SSE2 */
42 #include <emmintrin.h>
43 #include <string.h>
44
45 #ifdef __SSE4_1__
46 #include <smmintrin.h>
47 #endif
48
49 #include "gdal_priv_templates.hpp"
50
/** Load 2 bytes from a possibly unaligned address into the low 16 bits of
 *  an XMM register (remaining 112 bits zeroed).
 *
 *  memcpy() is used unconditionally instead of dereferencing a casted
 *  pointer: the cast-and-dereference form is undefined behaviour on
 *  misaligned input in C++, and optimizing compilers turn this memcpy()
 *  into a single load anyway, so the CPL_CPU_REQUIRES_ALIGNED_ACCESS
 *  distinction is unnecessary.
 */
static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}
61
GDALCopyInt32ToXMM(const void * ptr)62 static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
63 {
64 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
65 GInt32 i;
66 memcpy(&i, ptr, 4);
67 return _mm_cvtsi32_si128(i);
68 #else
69 return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
70 #endif
71 }
72
GDALCopyInt64ToXMM(const void * ptr)73 static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
74 {
75 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
76 GInt64 i;
77 memcpy(&i, ptr, 8);
78 return _mm_cvtsi64_si128(i);
79 #else
80 return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
81 #endif
82 }
83
GDALCopyXMMToInt16(const __m128i xmm,void * pDest)84 static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
85 {
86 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
87 GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
88 memcpy(pDest, &i, 2);
89 #else
90 *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
91 #endif
92 }
93
/** Thin wrapper around a 128-bit SSE2 register holding 2 packed doubles.
 *
 *  Element 0 is the "low" double, element 1 the "high" double. Comparison
 *  operations produce per-element all-ones / all-zeros bit masks (SSE2
 *  semantics), suitable for use with And()/Ternary(). A software emulation
 *  of this same interface exists for non-SSE2 builds.
 */
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    // Intentionally leaves xmm uninitialized: callers load before use.
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    // Loads val in the low element; high element is zeroed by _mm_load_sd.
    XMMReg2Double(double val): xmm(_mm_load_sd (&val)) {}
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    /** Return a register with both elements set to 0.0. */
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    /** Broadcast *ptr into both elements. */
    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    /** Load 2 doubles from a possibly unaligned address. */
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Load 2 floats and widen them to double. */
    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Load 2 doubles from a 16-byte aligned address. */
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    /** Load 2 unsigned bytes and convert them to double. */
    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Load 2 signed 16-bit values and convert them to double. */
    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Load 2 unsigned 16-bit values and convert them to double. */
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Per-element equality: each element of the result is all-ones if
     *  equal, all-zeros otherwise. */
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    /** Per-element inequality mask (all-ones where not equal). */
    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    /** Per-element greater-than mask (all-ones where expr1 > expr2). */
    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    /** Bitwise AND of the two registers (typically mask & value). */
    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    /** Per-element select: where cond's bits are set take true_expr,
     *  elsewhere false_expr. cond is expected to be a full-width mask as
     *  produced by Equals/NotEquals/Greater. */
    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd (cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    /** Per-element minimum. */
    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    /** Broadcast *ptr into both elements. */
    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    /** Load 2 doubles (unaligned). */
    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    /** Load 2 doubles (16-byte aligned). */
    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    /** Load 2 floats (as one 64-bit chunk) and widen to double. */
    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    /** Load 2 unsigned bytes, zero-extend to int32, convert to double. */
    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        // Zero-extend u8 -> u16 -> u32 by interleaving with zeros.
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    /** Load 2 signed 16-bit values, sign-extend to int32, convert to double. */
    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        // Sign-extension without SSE4.1: duplicate each 16-bit lane into
        // the upper half of a 32-bit lane, then arithmetic-shift it back.
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    /** Load 2 unsigned 16-bit values, zero-extend, convert to double. */
    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i,_mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    /** Load 4 unsigned bytes into two registers (first 2 -> low, next 2 -> high). */
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        // Move int32 elements 2,3 into positions 0,1 before converting.
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    /** Load 4 signed 16-bit values into two registers. */
    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /** Load 4 unsigned 16-bit values into two registers. */
    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /** Load 4 doubles into two registers. */
    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /** Load 4 floats into two registers, widening to double. */
    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    /** Set both elements to 0.0. */
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    /** Return low + high as a scalar double. */
    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    /** Store 2 doubles to a possibly unaligned address. */
    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    /** Store 2 doubles to a 16-byte aligned address. */
    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    /** Narrow to 2 floats and store them (8 bytes). */
    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    /** Round-to-nearest (via +0.5 then truncation; assumes non-negative
     *  values) and store as 2 unsigned bytes with saturation. */
    inline void Store2Val(unsigned char* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);   // int32 -> int16 (signed saturation)
        tmp = _mm_packus_epi16(tmp, tmp);  // int16 -> uint8 (unsigned saturation)
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
    }

    /** Round-to-nearest (via +0.5; assumes values fit uint16) and store as
     *  2 unsigned 16-bit values. */
    inline void Store2Val(unsigned short* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
    }

    /** Dump the raw 16 bytes of the register (used for comparison masks). */
    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
    }

    /** Return the low element as a scalar. */
    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
404
405 #else
406
407 #ifndef NO_WARN_USE_SSE2_EMULATION
408 #warning "Software emulation of SSE2 !"
409 #endif
410
411 class XMMReg2Double
412 {
413 public:
414 double low;
415 double high;
416
XMMReg2Double()417 XMMReg2Double() {}
XMMReg2Double(double val)418 XMMReg2Double(double val) { low = val; high = 0.0; }
XMMReg2Double(const XMMReg2Double & other)419 XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}
420
Zero()421 static inline XMMReg2Double Zero()
422 {
423 XMMReg2Double reg;
424 reg.Zeroize();
425 return reg;
426 }
427
Load1ValHighAndLow(const double * ptr)428 static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
429 {
430 XMMReg2Double reg;
431 reg.nsLoad1ValHighAndLow(ptr);
432 return reg;
433 }
434
Equals(const XMMReg2Double & expr1,const XMMReg2Double & expr2)435 static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
436 {
437 XMMReg2Double reg;
438
439 if (expr1.low == expr2.low)
440 memset(&(reg.low), 0xFF, sizeof(double));
441 else
442 reg.low = 0;
443
444 if (expr1.high == expr2.high)
445 memset(&(reg.high), 0xFF, sizeof(double));
446 else
447 reg.high = 0;
448
449 return reg;
450 }
451
NotEquals(const XMMReg2Double & expr1,const XMMReg2Double & expr2)452 static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
453 {
454 XMMReg2Double reg;
455
456 if (expr1.low != expr2.low)
457 memset(&(reg.low), 0xFF, sizeof(double));
458 else
459 reg.low = 0;
460
461 if (expr1.high != expr2.high)
462 memset(&(reg.high), 0xFF, sizeof(double));
463 else
464 reg.high = 0;
465
466 return reg;
467 }
468
Greater(const XMMReg2Double & expr1,const XMMReg2Double & expr2)469 static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
470 {
471 XMMReg2Double reg;
472
473 if (expr1.low > expr2.low)
474 memset(&(reg.low), 0xFF, sizeof(double));
475 else
476 reg.low = 0;
477
478 if (expr1.high > expr2.high)
479 memset(&(reg.high), 0xFF, sizeof(double));
480 else
481 reg.high = 0;
482
483 return reg;
484 }
485
And(const XMMReg2Double & expr1,const XMMReg2Double & expr2)486 static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
487 {
488 XMMReg2Double reg;
489 int low1[2], high1[2];
490 int low2[2], high2[2];
491 memcpy(low1, &expr1.low, sizeof(double));
492 memcpy(high1, &expr1.high, sizeof(double));
493 memcpy(low2, &expr2.low, sizeof(double));
494 memcpy(high2, &expr2.high, sizeof(double));
495 low1[0] &= low2[0];
496 low1[1] &= low2[1];
497 high1[0] &= high2[0];
498 high1[1] &= high2[1];
499 memcpy(®.low, low1, sizeof(double));
500 memcpy(®.high, high1, sizeof(double));
501 return reg;
502 }
503
Ternary(const XMMReg2Double & cond,const XMMReg2Double & true_expr,const XMMReg2Double & false_expr)504 static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
505 {
506 XMMReg2Double reg;
507 if( cond.low )
508 reg.low = true_expr.low;
509 else
510 reg.low = false_expr.low;
511 if( cond.high )
512 reg.high = true_expr.high;
513 else
514 reg.high = false_expr.high;
515 return reg;
516 }
517
Min(const XMMReg2Double & expr1,const XMMReg2Double & expr2)518 static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
519 {
520 XMMReg2Double reg;
521 reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
522 reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
523 return reg;
524 }
525
Load2Val(const double * ptr)526 static inline XMMReg2Double Load2Val(const double* ptr)
527 {
528 XMMReg2Double reg;
529 reg.nsLoad2Val(ptr);
530 return reg;
531 }
532
Load2ValAligned(const double * ptr)533 static inline XMMReg2Double Load2ValAligned(const double* ptr)
534 {
535 XMMReg2Double reg;
536 reg.nsLoad2ValAligned(ptr);
537 return reg;
538 }
539
Load2Val(const float * ptr)540 static inline XMMReg2Double Load2Val(const float* ptr)
541 {
542 XMMReg2Double reg;
543 reg.nsLoad2Val(ptr);
544 return reg;
545 }
546
Load2Val(const unsigned char * ptr)547 static inline XMMReg2Double Load2Val(const unsigned char* ptr)
548 {
549 XMMReg2Double reg;
550 reg.nsLoad2Val(ptr);
551 return reg;
552 }
553
Load2Val(const short * ptr)554 static inline XMMReg2Double Load2Val(const short* ptr)
555 {
556 XMMReg2Double reg;
557 reg.nsLoad2Val(ptr);
558 return reg;
559 }
560
Load2Val(const unsigned short * ptr)561 static inline XMMReg2Double Load2Val(const unsigned short* ptr)
562 {
563 XMMReg2Double reg;
564 reg.nsLoad2Val(ptr);
565 return reg;
566 }
567
nsLoad1ValHighAndLow(const double * ptr)568 inline void nsLoad1ValHighAndLow(const double* ptr)
569 {
570 low = ptr[0];
571 high = ptr[0];
572 }
573
nsLoad2Val(const double * ptr)574 inline void nsLoad2Val(const double* ptr)
575 {
576 low = ptr[0];
577 high = ptr[1];
578 }
579
nsLoad2ValAligned(const double * ptr)580 inline void nsLoad2ValAligned(const double* ptr)
581 {
582 low = ptr[0];
583 high = ptr[1];
584 }
585
nsLoad2Val(const float * ptr)586 inline void nsLoad2Val(const float* ptr)
587 {
588 low = ptr[0];
589 high = ptr[1];
590 }
591
nsLoad2Val(const unsigned char * ptr)592 inline void nsLoad2Val(const unsigned char* ptr)
593 {
594 low = ptr[0];
595 high = ptr[1];
596 }
597
nsLoad2Val(const short * ptr)598 inline void nsLoad2Val(const short* ptr)
599 {
600 low = ptr[0];
601 high = ptr[1];
602 }
603
nsLoad2Val(const unsigned short * ptr)604 inline void nsLoad2Val(const unsigned short* ptr)
605 {
606 low = ptr[0];
607 high = ptr[1];
608 }
609
Load4Val(const unsigned char * ptr,XMMReg2Double & low,XMMReg2Double & high)610 static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
611 {
612 low.low = ptr[0];
613 low.high = ptr[1];
614 high.low = ptr[2];
615 high.high = ptr[3];
616 }
617
Load4Val(const short * ptr,XMMReg2Double & low,XMMReg2Double & high)618 static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
619 {
620 low.nsLoad2Val(ptr);
621 high.nsLoad2Val(ptr+2);
622 }
623
Load4Val(const unsigned short * ptr,XMMReg2Double & low,XMMReg2Double & high)624 static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
625 {
626 low.nsLoad2Val(ptr);
627 high.nsLoad2Val(ptr+2);
628 }
629
Load4Val(const double * ptr,XMMReg2Double & low,XMMReg2Double & high)630 static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
631 {
632 low.nsLoad2Val(ptr);
633 high.nsLoad2Val(ptr+2);
634 }
635
Load4Val(const float * ptr,XMMReg2Double & low,XMMReg2Double & high)636 static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
637 {
638 low.nsLoad2Val(ptr);
639 high.nsLoad2Val(ptr+2);
640 }
641
Zeroize()642 inline void Zeroize()
643 {
644 low = 0.0;
645 high = 0.0;
646 }
647
648 inline XMMReg2Double& operator= (const XMMReg2Double& other)
649 {
650 low = other.low;
651 high = other.high;
652 return *this;
653 }
654
655 inline XMMReg2Double& operator+= (const XMMReg2Double& other)
656 {
657 low += other.low;
658 high += other.high;
659 return *this;
660 }
661
662 inline XMMReg2Double& operator*= (const XMMReg2Double& other)
663 {
664 low *= other.low;
665 high *= other.high;
666 return *this;
667 }
668
669 inline XMMReg2Double operator+ (const XMMReg2Double& other) const
670 {
671 XMMReg2Double ret;
672 ret.low = low + other.low;
673 ret.high = high + other.high;
674 return ret;
675 }
676
677 inline XMMReg2Double operator- (const XMMReg2Double& other) const
678 {
679 XMMReg2Double ret;
680 ret.low = low - other.low;
681 ret.high = high - other.high;
682 return ret;
683 }
684
685 inline XMMReg2Double operator* (const XMMReg2Double& other) const
686 {
687 XMMReg2Double ret;
688 ret.low = low * other.low;
689 ret.high = high * other.high;
690 return ret;
691 }
692
693 inline XMMReg2Double operator/ (const XMMReg2Double& other) const
694 {
695 XMMReg2Double ret;
696 ret.low = low / other.low;
697 ret.high = high / other.high;
698 return ret;
699 }
700
GetHorizSum()701 inline double GetHorizSum() const
702 {
703 return low + high;
704 }
705
Store2Val(double * ptr)706 inline void Store2Val(double* ptr) const
707 {
708 ptr[0] = low;
709 ptr[1] = high;
710 }
711
Store2ValAligned(double * ptr)712 inline void Store2ValAligned(double* ptr) const
713 {
714 ptr[0] = low;
715 ptr[1] = high;
716 }
717
Store2Val(float * ptr)718 inline void Store2Val(float* ptr) const
719 {
720 ptr[0] = low;
721 ptr[1] = high;
722 }
723
Store2Val(unsigned char * ptr)724 void Store2Val(unsigned char* ptr) const
725 {
726 ptr[0] = (unsigned char)(low + 0.5);
727 ptr[1] = (unsigned char)(high + 0.5);
728 }
729
Store2Val(unsigned short * ptr)730 void Store2Val(unsigned short* ptr) const
731 {
732 ptr[0] = (GUInt16)(low + 0.5);
733 ptr[1] = (GUInt16)(high + 0.5);
734 }
735
StoreMask(unsigned char * ptr)736 inline void StoreMask(unsigned char* ptr) const
737 {
738 memcpy(ptr, &low, 8);
739 memcpy(ptr + 8, &high, 8);
740 }
741
742 inline operator double () const
743 {
744 return low;
745 }
746 };
747
748 #endif /* defined(__x86_64) || defined(_M_X64) */
749
750 #ifdef __AVX__
751
752 #include <immintrin.h>
753
/** AVX implementation: wrapper around a 256-bit register holding 4 packed
 *  doubles. Mirrors the XMMReg2Double interface but processes 4 values at
 *  a time. Comparison operations yield per-element all-ones/all-zeros
 *  masks usable with And()/Ternary(). */
class XMMReg4Double
{
  public:
    __m256d ymm;

    // Unlike XMMReg2Double, the default constructor zero-initializes.
    XMMReg4Double(): ymm(_mm256_setzero_pd()) {}
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}

    /** Return a register with all 4 elements set to 0.0. */
    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    /** Set all 4 elements to 0.0. */
    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    /** Broadcast *ptr into all 4 elements. */
    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    /** Load 4 unsigned bytes and convert them to double. */
    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);   // u8 -> i32 (SSE4.1, implied by AVX)
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    /** Load 4 signed 16-bit values and convert them to double. */
    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);  // sign-extend i16 -> i32
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    /** Load 4 unsigned 16-bit values and convert them to double. */
    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);  // zero-extend u16 -> i32
        ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
    }

    /** Load 4 doubles from a possibly unaligned address. */
    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    /** Load 4 doubles from a 32-byte aligned address. */
    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    /** Load 4 floats and widen them to double. */
    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
    }

    /** Per-element equality mask (ordered, non-signaling compare). */
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    /** Per-element inequality mask (ordered, non-signaling compare). */
    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    /** Per-element greater-than mask (ordered, non-signaling compare). */
    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    /** Bitwise AND of the two registers (typically mask & value). */
    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    /** Per-element select: where cond's bits are set take true_expr,
     *  elsewhere false_expr. */
    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd (cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    /** Per-element minimum. */
    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    /** Add the 2 doubles of an XMMReg2Double to the lower half of this
     *  register; the upper half is unchanged (adds zeros to it). */
    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    /** Return the sum of the 4 elements as a scalar double. */
    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        // hadd gives pairwise sums within each 128-bit lane; the cross-lane
        // permute + add combines the two lanes.
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    /** Round-to-nearest (via +0.5 then truncation; assumes non-negative
     *  values) and store as 4 unsigned bytes. */
    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        //xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        //xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        // Gather byte 0 of each int32 lane into the low 4 bytes.
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); //  SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
    }

    /** Round-to-nearest (via +0.5) and store as 4 unsigned 16-bit values
     *  with unsigned saturation. */
    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);   // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    /** Narrow to 4 floats and store them (16 bytes, unaligned OK). */
    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps (ymm));
    }

    /** Store 4 doubles to a possibly unaligned address. */
    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    /** Dump the raw 32 bytes of the register (used for comparison masks). */
    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
    }
};
999
1000 #else
1001
1002 class XMMReg4Double
1003 {
1004 public:
1005 XMMReg2Double low, high;
1006
1007 #if defined(__GNUC__)
1008 #pragma GCC diagnostic push
1009 #pragma GCC diagnostic ignored "-Weffc++"
1010 #endif
1011 /* coverity[uninit_member] */
1012 XMMReg4Double() = default;
1013 #if defined(__GNUC__)
1014 #pragma GCC diagnostic pop
1015 #endif
1016
XMMReg4Double(const XMMReg4Double & other)1017 XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
1018
Zero()1019 static inline XMMReg4Double Zero()
1020 {
1021 XMMReg4Double reg;
1022 reg.low.Zeroize();
1023 reg.high.Zeroize();
1024 return reg;
1025 }
1026
Load1ValHighAndLow(const double * ptr)1027 static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
1028 {
1029 XMMReg4Double reg;
1030 reg.low.nsLoad1ValHighAndLow(ptr);
1031 reg.high = reg.low;
1032 return reg;
1033 }
1034
Load4Val(const unsigned char * ptr)1035 static inline XMMReg4Double Load4Val(const unsigned char* ptr)
1036 {
1037 XMMReg4Double reg;
1038 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1039 return reg;
1040 }
1041
Load4Val(const short * ptr)1042 static inline XMMReg4Double Load4Val(const short* ptr)
1043 {
1044 XMMReg4Double reg;
1045 reg.low.nsLoad2Val(ptr);
1046 reg.high.nsLoad2Val(ptr+2);
1047 return reg;
1048 }
1049
Load4Val(const unsigned short * ptr)1050 static inline XMMReg4Double Load4Val(const unsigned short* ptr)
1051 {
1052 XMMReg4Double reg;
1053 reg.low.nsLoad2Val(ptr);
1054 reg.high.nsLoad2Val(ptr+2);
1055 return reg;
1056 }
1057
Load4Val(const double * ptr)1058 static inline XMMReg4Double Load4Val(const double* ptr)
1059 {
1060 XMMReg4Double reg;
1061 reg.low.nsLoad2Val(ptr);
1062 reg.high.nsLoad2Val(ptr+2);
1063 return reg;
1064 }
1065
Load4ValAligned(const double * ptr)1066 static inline XMMReg4Double Load4ValAligned(const double* ptr)
1067 {
1068 XMMReg4Double reg;
1069 reg.low.nsLoad2ValAligned(ptr);
1070 reg.high.nsLoad2ValAligned(ptr+2);
1071 return reg;
1072 }
1073
Load4Val(const float * ptr)1074 static inline XMMReg4Double Load4Val(const float* ptr)
1075 {
1076 XMMReg4Double reg;
1077 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1078 return reg;
1079 }
1080
Equals(const XMMReg4Double & expr1,const XMMReg4Double & expr2)1081 static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1082 {
1083 XMMReg4Double reg;
1084 reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1085 reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1086 return reg;
1087 }
1088
NotEquals(const XMMReg4Double & expr1,const XMMReg4Double & expr2)1089 static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1090 {
1091 XMMReg4Double reg;
1092 reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1093 reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1094 return reg;
1095 }
1096
Greater(const XMMReg4Double & expr1,const XMMReg4Double & expr2)1097 static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1098 {
1099 XMMReg4Double reg;
1100 reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1101 reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1102 return reg;
1103 }
1104
And(const XMMReg4Double & expr1,const XMMReg4Double & expr2)1105 static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1106 {
1107 XMMReg4Double reg;
1108 reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1109 reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1110 return reg;
1111 }
1112
Ternary(const XMMReg4Double & cond,const XMMReg4Double & true_expr,const XMMReg4Double & false_expr)1113 static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
1114 {
1115 XMMReg4Double reg;
1116 reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1117 reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1118 return reg;
1119 }
1120
Min(const XMMReg4Double & expr1,const XMMReg4Double & expr2)1121 static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1122 {
1123 XMMReg4Double reg;
1124 reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1125 reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1126 return reg;
1127 }
1128
1129 inline XMMReg4Double& operator= (const XMMReg4Double& other)
1130 {
1131 low = other.low;
1132 high = other.high;
1133 return *this;
1134 }
1135
1136 inline XMMReg4Double& operator+= (const XMMReg4Double& other)
1137 {
1138 low += other.low;
1139 high += other.high;
1140 return *this;
1141 }
1142
1143 inline XMMReg4Double& operator*= (const XMMReg4Double& other)
1144 {
1145 low *= other.low;
1146 high *= other.high;
1147 return *this;
1148 }
1149
1150 inline XMMReg4Double operator+ (const XMMReg4Double& other) const
1151 {
1152 XMMReg4Double ret;
1153 ret.low = low + other.low;
1154 ret.high = high + other.high;
1155 return ret;
1156 }
1157
1158 inline XMMReg4Double operator- (const XMMReg4Double& other) const
1159 {
1160 XMMReg4Double ret;
1161 ret.low = low - other.low;
1162 ret.high = high - other.high;
1163 return ret;
1164 }
1165
1166 inline XMMReg4Double operator* (const XMMReg4Double& other) const
1167 {
1168 XMMReg4Double ret;
1169 ret.low = low * other.low;
1170 ret.high = high * other.high;
1171 return ret;
1172 }
1173
1174 inline XMMReg4Double operator/ (const XMMReg4Double& other) const
1175 {
1176 XMMReg4Double ret;
1177 ret.low = low / other.low;
1178 ret.high = high / other.high;
1179 return ret;
1180 }
1181
AddToLow(const XMMReg2Double & other)1182 void AddToLow( const XMMReg2Double& other )
1183 {
1184 low += other;
1185 }
1186
GetHorizSum()1187 inline double GetHorizSum() const
1188 {
1189 return (low + high).GetHorizSum();
1190 }
1191
Store4Val(unsigned char * ptr)1192 inline void Store4Val(unsigned char* ptr) const
1193 {
1194 #ifdef USE_SSE2_EMULATION
1195 low.Store2Val(ptr);
1196 high.Store2Val(ptr+2);
1197 #else
1198 __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(low.xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1199 __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(high.xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1200 auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow), _mm_castsi128_ps(tmpHigh), _MM_SHUFFLE(1, 0, 1, 0)));
1201 tmp = _mm_packs_epi32(tmp, tmp);
1202 tmp = _mm_packus_epi16(tmp, tmp);
1203 GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
1204 #endif
1205 }
1206
Store4Val(unsigned short * ptr)1207 inline void Store4Val(unsigned short* ptr) const
1208 {
1209 #if 1
1210 low.Store2Val(ptr);
1211 high.Store2Val(ptr+2);
1212 #else
1213 __m128i xmm0 = _mm_cvtpd_epi32 (low.xmm);
1214 __m128i xmm1 = _mm_cvtpd_epi32 (high.xmm);
1215 xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
1216 #if __SSE4_1__
1217 xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
1218 #else
1219 xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
1220 xmm0 = _mm_packs_epi32( xmm0, xmm0 );
1221 xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
1222 #endif
1223 GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
1224 #endif
1225 }
1226
Store4Val(float * ptr)1227 inline void Store4Val(float* ptr) const
1228 {
1229 low.Store2Val(ptr);
1230 high.Store2Val(ptr+2);
1231 }
1232
Store4Val(double * ptr)1233 inline void Store4Val(double* ptr) const
1234 {
1235 low.Store2Val(ptr);
1236 high.Store2Val(ptr+2);
1237 }
1238
StoreMask(unsigned char * ptr)1239 inline void StoreMask(unsigned char* ptr) const
1240 {
1241 low.StoreMask(ptr);
1242 high.StoreMask(ptr+16);
1243 }
1244
1245 };
1246
1247 #endif
1248
1249 #endif /* #ifndef DOXYGEN_SKIP */
1250
1251 #endif /* GDALSSE_PRIV_H_INCLUDED */
1252