#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_SSE_MEMORY_H
#define _NPY_SIMD_SSE_MEMORY_H

#include "misc.h"

/***************************
 * load/store
 ***************************/
// stream load
#ifdef NPY_HAVE_SSE41
    #define npyv__loads(PTR) _mm_stream_load_si128((__m128i *)(PTR))
#else
    #define npyv__loads(PTR) _mm_load_si128((const __m128i *)(PTR))
#endif
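/*
 * Note: with SSE4.1, npyv__loads is a non-temporal load hint (mainly
 * beneficial on write-combining memory); like the plain aligned fallback,
 * it requires PTR to be 16-byte aligned. A minimal usage sketch, with a
 * hypothetical buffer:
 *
 *   npy_int32 NPY_DECL_ALIGNED(16) buf[4] = {1, 2, 3, 4};
 *   __m128i v = npyv__loads(buf); // streaming load when SSE4.1 is available
 */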
#define NPYV_IMPL_SSE_MEM_INT(CTYPE, SFX) \
    NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \
    { return _mm_loadu_si128((const __m128i*)ptr); } \
    NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \
    { return _mm_load_si128((const __m128i*)ptr); } \
    NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \
    { return npyv__loads(ptr); } \
    NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \
    { return _mm_loadl_epi64((const __m128i*)ptr); } \
    NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \
    { _mm_storeu_si128((__m128i*)ptr, vec); } \
    NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \
    { _mm_store_si128((__m128i*)ptr, vec); } \
    NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \
    { _mm_stream_si128((__m128i*)ptr, vec); } \
    NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \
    { _mm_storel_epi64((__m128i *)ptr, vec); } \
    NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \
    { _mm_storel_epi64((__m128i *)ptr, _mm_unpackhi_epi64(vec, vec)); }

NPYV_IMPL_SSE_MEM_INT(npy_uint8,  u8)
NPYV_IMPL_SSE_MEM_INT(npy_int8,   s8)
NPYV_IMPL_SSE_MEM_INT(npy_uint16, u16)
NPYV_IMPL_SSE_MEM_INT(npy_int16,  s16)
NPYV_IMPL_SSE_MEM_INT(npy_uint32, u32)
NPYV_IMPL_SSE_MEM_INT(npy_int32,  s32)
NPYV_IMPL_SSE_MEM_INT(npy_uint64, u64)
NPYV_IMPL_SSE_MEM_INT(npy_int64,  s64)
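/*
 * For reference, each instantiation above generates the full contiguous
 * load/store set for one lane type. A sketch of the `u8` expansion
 * (signatures only; nothing new is defined here):
 *
 *   npyv_u8 npyv_load_u8 (const npy_uint8 *ptr); // unaligned load
 *   npyv_u8 npyv_loada_u8(const npy_uint8 *ptr); // aligned load
 *   npyv_u8 npyv_loads_u8(const npy_uint8 *ptr); // stream load
 *   npyv_u8 npyv_loadl_u8(const npy_uint8 *ptr); // load lower 64 bits
 *   void npyv_store_u8 (npy_uint8 *ptr, npyv_u8 vec); // and storea/stores/
 *   void npyv_storel_u8(npy_uint8 *ptr, npyv_u8 vec); // storel/storeh
 */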

// unaligned load
#define npyv_load_f32 _mm_loadu_ps
#define npyv_load_f64 _mm_loadu_pd
// aligned load
#define npyv_loada_f32 _mm_load_ps
#define npyv_loada_f64 _mm_load_pd
// load lower part
#define npyv_loadl_f32(PTR) _mm_castsi128_ps(npyv_loadl_u32((const npy_uint32*)(PTR)))
#define npyv_loadl_f64(PTR) _mm_castsi128_pd(npyv_loadl_u32((const npy_uint32*)(PTR)))
// stream load
#define npyv_loads_f32(PTR) _mm_castsi128_ps(npyv__loads(PTR))
#define npyv_loads_f64(PTR) _mm_castsi128_pd(npyv__loads(PTR))
// unaligned store
#define npyv_store_f32 _mm_storeu_ps
#define npyv_store_f64 _mm_storeu_pd
// aligned store
#define npyv_storea_f32 _mm_store_ps
#define npyv_storea_f64 _mm_store_pd
// stream store
#define npyv_stores_f32 _mm_stream_ps
#define npyv_stores_f64 _mm_stream_pd
// store lower part
#define npyv_storel_f32(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castps_si128(VEC))
#define npyv_storel_f64(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castpd_si128(VEC))
// store higher part
#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC))
#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC))
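/*
 * Usage sketch (illustrative; `src` and `dst` are hypothetical buffers):
 * a contiguous single-precision round trip through the unaligned entry
 * points.
 *
 *   float src[4] = {1.0f, 2.0f, 3.0f, 4.0f}, dst[4];
 *   npyv_f32 v = npyv_load_f32(src);
 *   npyv_store_f32(dst, v); // dst now holds {1, 2, 3, 4}
 */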
/***************************
 * Non-contiguous Load
 ***************************/
//// 32
NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
{
    __m128i a = _mm_cvtsi32_si128(*ptr);
#ifdef NPY_HAVE_SSE41
    a = _mm_insert_epi32(a, ptr[stride],   1);
    a = _mm_insert_epi32(a, ptr[stride*2], 2);
    a = _mm_insert_epi32(a, ptr[stride*3], 3);
#else
    __m128i a1 = _mm_cvtsi32_si128(ptr[stride]);
    __m128i a2 = _mm_cvtsi32_si128(ptr[stride*2]);
    __m128i a3 = _mm_cvtsi32_si128(ptr[stride*3]);
    a = _mm_unpacklo_epi32(a, a1);
    a = _mm_unpacklo_epi64(a, _mm_unpacklo_epi32(a2, a3));
#endif
    return a;
}
NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
{ return npyv_loadn_s32((const npy_int32*)ptr, stride); }
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
{ return _mm_castsi128_ps(npyv_loadn_s32((const npy_int32*)ptr, stride)); }
//// 64
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return _mm_loadh_pd(npyv_loadl_f64(ptr), ptr + stride); }
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
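/*
 * The strided loaders above gather one lane every `stride` elements,
 * e.g. (illustrative sketch; `buf` is hypothetical):
 *
 *   npy_int32 buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 *   npyv_s32 v = npyv_loadn_s32(buf, 2); // lanes: {0, 2, 4, 6}
 */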
/***************************
 * Non-contiguous Store
 ***************************/
//// 32
NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
{
    ptr[stride * 0] = _mm_cvtsi128_si32(a);
#ifdef NPY_HAVE_SSE41
    ptr[stride * 1] = _mm_extract_epi32(a, 1);
    ptr[stride * 2] = _mm_extract_epi32(a, 2);
    ptr[stride * 3] = _mm_extract_epi32(a, 3);
#else
    ptr[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
    ptr[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
    ptr[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
#endif
}
NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
{ npyv_storen_s32((npy_int32*)ptr, stride, _mm_castps_si128(a)); }
//// 64
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{
    _mm_storel_pd(ptr, a);
    _mm_storeh_pd(ptr + stride, a);
}
NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
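/*
 * Conversely, the strided stores scatter one lane every `stride`
 * elements (illustrative sketch; `out` is hypothetical):
 *
 *   npy_int32 out[8] = {0};
 *   npyv_storen_s32(out, 2, npyv_setall_s32(7)); // out: {7,0,7,0,7,0,7,0}
 */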

/*********************************
 * Partial Load
 *********************************/
#if defined(__clang__) && __clang_major__ > 7
/**
 * Clang >= 8 performs aggressive optimizations that tend to
 * zero the upper lanes of a vector even when we try to
 * fill them with a certain scalar, which may lead to
 * division-by-zero errors.
 */
#define NPYV__CLANG_ZEROUPPER
#endif
//// 32
NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane > 3) {
        return npyv_load_s32(ptr);
    }
    npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
    for (npy_uint64 i = 0; i < nlane; ++i) {
        data[i] = ptr[i];
    }
    return npyv_loada_s32(data);
#else
    #ifndef NPY_HAVE_SSE41
        const short *wptr = (const short*)ptr;
    #endif
    const __m128i vfill = npyv_setall_s32(fill);
    __m128i a;
    switch(nlane) {
    case 2:
        return _mm_castpd_si128(
            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
        );
#ifdef NPY_HAVE_SSE41
    case 1:
        return _mm_insert_epi32(vfill, ptr[0], 0);
    case 3:
        a = _mm_loadl_epi64((const __m128i*)ptr);
        a = _mm_insert_epi32(a, ptr[2], 2);
        a = _mm_insert_epi32(a, fill,   3);
        return a;
#else
    case 1:
        a = _mm_insert_epi16(vfill, wptr[0], 0);
        return _mm_insert_epi16(a, wptr[1], 1);
    case 3:
        a = _mm_loadl_epi64((const __m128i*)ptr);
        a = _mm_unpacklo_epi64(a, vfill);
        a = _mm_insert_epi16(a, wptr[4], 4);
        a = _mm_insert_epi16(a, wptr[5], 5);
        return a;
#endif // NPY_HAVE_SSE41
    default:
        return npyv_load_s32(ptr);
    }
#endif
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
    assert(nlane > 0);
    switch(nlane) {
    case 1:
        return _mm_cvtsi32_si128(*ptr);
    case 2:
        return _mm_loadl_epi64((const __m128i*)ptr);
    case 3:;
        npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
#ifdef NPY_HAVE_SSE41
        return _mm_insert_epi32(a, ptr[2], 2);
#else
        return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
#endif
    default:
        return npyv_load_s32(ptr);
    }
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane <= 2) {
        npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
        for (npy_uint64 i = 0; i < nlane; ++i) {
            data[i] = ptr[i];
        }
        return npyv_loada_s64(data);
    }
#else
    if (nlane == 1) {
        const __m128i vfill = npyv_setall_s64(fill);
        return _mm_castpd_si128(
            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
        );
    }
#endif
    return npyv_load_s64(ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
    assert(nlane > 0);
    if (nlane == 1) {
        return _mm_loadl_epi64((const __m128i*)ptr);
    }
    return npyv_load_s64(ptr);
}
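/*
 * Typical tail handling (an illustrative sketch; `num`, `den`, and `n`
 * with 0 < n <= 4 are hypothetical): pad a divisor tail with 1 so the
 * unused lanes never divide by zero, while additive tails can use the
 * cheaper zero-padded variant.
 *
 *   npyv_s32 b = npyv_load_till_s32 (den, n, 1); // lanes >= n hold 1
 *   npyv_s32 a = npyv_load_tillz_s32(num, n);    // lanes >= n hold 0
 */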
/*********************************
 * Non-contiguous partial load
 *********************************/
//// 32
NPY_FINLINE npyv_s32
npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane > 3) {
        return npyv_loadn_s32(ptr, stride);
    }
    npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
    for (npy_uint64 i = 0; i < nlane; ++i) {
        data[i] = ptr[stride*i];
    }
    return npyv_loada_s32(data);
#else
    __m128i vfill = npyv_setall_s32(fill);
    #ifndef NPY_HAVE_SSE41
        const short *wptr = (const short*)ptr;
    #endif
    switch(nlane) {
#ifdef NPY_HAVE_SSE41
    case 3:
        vfill = _mm_insert_epi32(vfill, ptr[stride*2], 2);
        // fall through
    case 2:
        vfill = _mm_insert_epi32(vfill, ptr[stride], 1);
        // fall through
    case 1:
        vfill = _mm_insert_epi32(vfill, ptr[0], 0);
        break;
#else
    case 3:
        vfill = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ptr[stride*2]), vfill);
        // fall through
    case 2:
        vfill = _mm_unpacklo_epi64(_mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*ptr), _mm_cvtsi32_si128(ptr[stride])
        ), vfill);
        break;
    case 1:
        vfill = _mm_insert_epi16(vfill, wptr[0], 0);
        vfill = _mm_insert_epi16(vfill, wptr[1], 1);
        break;
#endif // NPY_HAVE_SSE41
    default:
        return npyv_loadn_s32(ptr, stride);
    } // switch
    return vfill;
#endif
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
{
    assert(nlane > 0);
    switch(nlane) {
    case 1:
        return _mm_cvtsi32_si128(ptr[0]);
    case 2:;
        npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
#ifdef NPY_HAVE_SSE41
        return _mm_insert_epi32(a, ptr[stride], 1);
#else
        return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
#endif // NPY_HAVE_SSE41
    case 3:;
        a = _mm_cvtsi32_si128(ptr[0]);
#ifdef NPY_HAVE_SSE41
        a = _mm_insert_epi32(a, ptr[stride],   1);
        a = _mm_insert_epi32(a, ptr[stride*2], 2);
        return a;
#else
        a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
        a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
        return a;
#endif // NPY_HAVE_SSE41
    default:
        return npyv_loadn_s32(ptr, stride);
    }
}
//// 64
NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane <= 2) {
        npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
        for (npy_uint64 i = 0; i < nlane; ++i) {
            data[i] = ptr[i*stride];
        }
        return npyv_loada_s64(data);
    }
#else
    if (nlane == 1) {
        const __m128i vfill = npyv_setall_s64(fill);
        return _mm_castpd_si128(
            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
        );
    }
#endif
    return npyv_loadn_s64(ptr, stride);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{
    assert(nlane > 0);
    if (nlane == 1) {
        return _mm_loadl_epi64((const __m128i*)ptr);
    }
    return npyv_loadn_s64(ptr, stride);
}
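/*
 * The strided variants follow the same convention (sketch; `ptr`,
 * `stride`, and `n` are hypothetical): gather `n` lanes that sit
 * `stride` elements apart and pad the remainder with `fill` or zero.
 *
 *   npyv_s32 v = npyv_loadn_till_s32(ptr, stride, n, 1);
 */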
/*********************************
 * Partial store
 *********************************/
//// 32
NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
{
    assert(nlane > 0);
    switch(nlane) {
    case 1:
        *ptr = _mm_cvtsi128_si32(a);
        break;
    case 2:
        _mm_storel_epi64((__m128i *)ptr, a);
        break;
    case 3:
        _mm_storel_epi64((__m128i *)ptr, a);
#ifdef NPY_HAVE_SSE41
        ptr[2] = _mm_extract_epi32(a, 2);
#else
        ptr[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
#endif
        break;
    default:
        npyv_store_s32(ptr, a);
    }
}
//// 64
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
    assert(nlane > 0);
    if (nlane == 1) {
        _mm_storel_epi64((__m128i *)ptr, a);
        return;
    }
    npyv_store_s64(ptr, a);
}
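/*
 * Combined with the partial loads above, this covers a remainder loop
 * without masking (sketch; `src`, `dst`, and `n` are hypothetical):
 *
 *   npyv_s32 v = npyv_load_tillz_s32(src, n);
 *   npyv_store_till_s32(dst, n, v); // writes exactly n elements of dst
 */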
/*********************************
 * Non-contiguous partial store
 *********************************/
//// 32
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
    assert(nlane > 0);
    switch(nlane) {
#ifdef NPY_HAVE_SSE41
    default:
        ptr[stride*3] = _mm_extract_epi32(a, 3);
        // fall through
    case 3:
        ptr[stride*2] = _mm_extract_epi32(a, 2);
        // fall through
    case 2:
        ptr[stride*1] = _mm_extract_epi32(a, 1);
        // fall through
#else
    default:
        ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
        // fall through
    case 3:
        ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
        // fall through
    case 2:
        ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
        // fall through
#endif
    case 1:
        ptr[stride*0] = _mm_cvtsi128_si32(a);
        break;
    }
}
//// 64
NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
{
    assert(nlane > 0);
    if (nlane == 1) {
        _mm_storel_epi64((__m128i *)ptr, a);
        return;
    }
    npyv_storen_s64(ptr, stride, a);
}
/*****************************************************************
 * Implement partial load/store for u32/f32/u64/f64... via casting
 *****************************************************************/
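/*
 * The union pun below copies the fill value bit-for-bit into the matching
 * signed-integer lane type, so a float fill (including NaN payloads and
 * signed zeros) survives the detour through the s32/s64 implementations
 * without triggering a value-changing conversion.
 */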
#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
    NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
    { \
        union { \
            npyv_lanetype_##F_SFX from_##F_SFX; \
            npyv_lanetype_##T_SFX to_##T_SFX; \
        } pun = {.from_##F_SFX = fill}; \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
        )); \
    } \
    NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
     npyv_lanetype_##F_SFX fill) \
    { \
        union { \
            npyv_lanetype_##F_SFX from_##F_SFX; \
            npyv_lanetype_##T_SFX to_##T_SFX; \
        } pun = {.from_##F_SFX = fill}; \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
        )); \
    } \
    NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
    { \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
            (const npyv_lanetype_##T_SFX *)ptr, nlane \
        )); \
    } \
    NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
    { \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
        )); \
    } \
    NPY_FINLINE void npyv_store_till_##F_SFX \
    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
    { \
        npyv_store_till_##T_SFX( \
            (npyv_lanetype_##T_SFX *)ptr, nlane, \
            npyv_reinterpret_##T_SFX##_##F_SFX(a) \
        ); \
    } \
    NPY_FINLINE void npyv_storen_till_##F_SFX \
    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
    { \
        npyv_storen_till_##T_SFX( \
            (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
            npyv_reinterpret_##T_SFX##_##F_SFX(a) \
        ); \
    }

NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)

#endif // _NPY_SIMD_SSE_MEMORY_H