#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_SSE_MEMORY_H
#define _NPY_SIMD_SSE_MEMORY_H

#include "misc.h"

/***************************
 * load/store
 ***************************/
// stream load
#ifdef NPY_HAVE_SSE41
    #define npyv__loads(PTR) _mm_stream_load_si128((__m128i *)(PTR))
#else
    #define npyv__loads(PTR) _mm_load_si128((const __m128i *)(PTR))
#endif
#define NPYV_IMPL_SSE_MEM_INT(CTYPE, SFX)                                    \
    NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr)                 \
    { return _mm_loadu_si128((const __m128i*)ptr); }                         \
    NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr)                \
    { return _mm_load_si128((const __m128i*)ptr); }                          \
    NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr)                \
    { return npyv__loads(ptr); }                                             \
    NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr)                \
    { return _mm_loadl_epi64((const __m128i*)ptr); }                         \
    NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec)            \
    { _mm_storeu_si128((__m128i*)ptr, vec); }                                \
    NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec)           \
    { _mm_store_si128((__m128i*)ptr, vec); }                                 \
    NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec)           \
    { _mm_stream_si128((__m128i*)ptr, vec); }                                \
    NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec)           \
    { _mm_storel_epi64((__m128i *)ptr, vec); }                               \
    NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec)           \
    { _mm_storel_epi64((__m128i *)ptr, _mm_unpackhi_epi64(vec, vec)); }

NPYV_IMPL_SSE_MEM_INT(npy_uint8,  u8)
NPYV_IMPL_SSE_MEM_INT(npy_int8,   s8)
NPYV_IMPL_SSE_MEM_INT(npy_uint16, u16)
NPYV_IMPL_SSE_MEM_INT(npy_int16,  s16)
NPYV_IMPL_SSE_MEM_INT(npy_uint32, u32)
NPYV_IMPL_SSE_MEM_INT(npy_int32,  s32)
NPYV_IMPL_SSE_MEM_INT(npy_uint64, u64)
NPYV_IMPL_SSE_MEM_INT(npy_int64,  s64)
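// Illustrative sketch (not part of this header): the macro above generates the
// contiguous load/store entry points per lane type. For s32, a caller might use
// them roughly like this (`buf` is a placeholder name):
//
//     npy_int32 buf[4] = {1, 2, 3, 4};
//     npyv_s32 v = npyv_load_s32(buf);   // unaligned 128-bit load
//     npyv_storel_s32(buf, v);           // store only the lower 64 bits (two lanes)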

// unaligned load
#define npyv_load_f32 _mm_loadu_ps
#define npyv_load_f64 _mm_loadu_pd
// aligned load
#define npyv_loada_f32 _mm_load_ps
#define npyv_loada_f64 _mm_load_pd
// load lower part
#define npyv_loadl_f32(PTR) _mm_castsi128_ps(npyv_loadl_u32((const npy_uint32*)(PTR)))
#define npyv_loadl_f64(PTR) _mm_castsi128_pd(npyv_loadl_u32((const npy_uint32*)(PTR)))
// stream load
#define npyv_loads_f32(PTR) _mm_castsi128_ps(npyv__loads(PTR))
#define npyv_loads_f64(PTR) _mm_castsi128_pd(npyv__loads(PTR))
// unaligned store
#define npyv_store_f32 _mm_storeu_ps
#define npyv_store_f64 _mm_storeu_pd
// aligned store
#define npyv_storea_f32 _mm_store_ps
#define npyv_storea_f64 _mm_store_pd
// stream store
#define npyv_stores_f32 _mm_stream_ps
#define npyv_stores_f64 _mm_stream_pd
// store lower part
#define npyv_storel_f32(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castps_si128(VEC))
#define npyv_storel_f64(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castpd_si128(VEC))
// store higher part
#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC))
#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC))
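// Illustrative sketch (not part of this header): npyv_storel_f64/npyv_storeh_f64
// split a vector into its two halves, e.g. (`in`, `lo`, `hi` are placeholders):
//
//     double in[2] = {1.0, 2.0}, lo[1], hi[1];
//     npyv_f64 v = npyv_load_f64(in);
//     npyv_storel_f64(lo, v);   // lo[0] == 1.0
//     npyv_storeh_f64(hi, v);   // hi[0] == 2.0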
/***************************
 * Non-contiguous Load
 ***************************/
//// 32
NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
{
    __m128i a = _mm_cvtsi32_si128(*ptr);
#ifdef NPY_HAVE_SSE41
    a = _mm_insert_epi32(a, ptr[stride],   1);
    a = _mm_insert_epi32(a, ptr[stride*2], 2);
    a = _mm_insert_epi32(a, ptr[stride*3], 3);
#else
    __m128i a1 = _mm_cvtsi32_si128(ptr[stride]);
    __m128i a2 = _mm_cvtsi32_si128(ptr[stride*2]);
    __m128i a3 = _mm_cvtsi32_si128(ptr[stride*3]);
    a = _mm_unpacklo_epi32(a, a1);
    a = _mm_unpacklo_epi64(a, _mm_unpacklo_epi32(a2, a3));
#endif
    return a;
}
NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
{ return npyv_loadn_s32((const npy_int32*)ptr, stride); }
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
{ return _mm_castsi128_ps(npyv_loadn_s32((const npy_int32*)ptr, stride)); }
//// 64
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return _mm_loadh_pd(npyv_loadl_f64(ptr), ptr + stride); }
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
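// Illustrative note (not part of this header): the stride of the non-contiguous
// load/store routines is measured in lanes (elements), not bytes. Gathering every
// other element of an int32 array would look roughly like this (`buf` is a
// placeholder):
//
//     npy_int32 buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//     npyv_s32 v = npyv_loadn_s32(buf, 2);   // lanes: {0, 2, 4, 6}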
/***************************
 * Non-contiguous Store
 ***************************/
//// 32
NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
{
    ptr[stride * 0] = _mm_cvtsi128_si32(a);
#ifdef NPY_HAVE_SSE41
    ptr[stride * 1] = _mm_extract_epi32(a, 1);
    ptr[stride * 2] = _mm_extract_epi32(a, 2);
    ptr[stride * 3] = _mm_extract_epi32(a, 3);
#else
    ptr[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
    ptr[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
    ptr[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
#endif
}
NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
{ npyv_storen_s32((npy_int32*)ptr, stride, _mm_castps_si128(a)); }
//// 64
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{
    _mm_storel_pd(ptr, a);
    _mm_storeh_pd(ptr + stride, a);
}
NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }

/*********************************
 * Partial Load
 *********************************/
#if defined(__clang__) && __clang_major__ > 7
    /**
     * Clang >= 8 performs aggressive optimizations that tend to
     * zero the bits of the upper half of vectors even
     * when we try to fill them up with certain scalars,
     * which may lead to zero-division errors.
    */
    #define NPYV__CLANG_ZEROUPPER
#endif
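// Illustrative note (not part of this header): the `fill` argument of the
// partial loads below lets the caller give the unused tail lanes a harmless
// value. For instance, code that divides by a partially loaded vector might
// pass fill=1 so the inactive lanes never divide by zero (`divisors` and
// `remaining` are placeholder names):
//
//     npyv_s32 d = npyv_load_till_s32(divisors, remaining, 1);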
//// 32
NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane > 3) {
        return npyv_load_s32(ptr);
    }
    npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
    for (npy_uint64 i = 0; i < nlane; ++i) {
        data[i] = ptr[i];
    }
    return npyv_loada_s32(data);
#else
    #ifndef NPY_HAVE_SSE41
        const short *wptr = (const short*)ptr;
    #endif
    const __m128i vfill = npyv_setall_s32(fill);
    __m128i a;
    switch(nlane) {
    case 2:
        return _mm_castpd_si128(
            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
        );
    #ifdef NPY_HAVE_SSE41
        case 1:
            return _mm_insert_epi32(vfill, ptr[0], 0);
        case 3:
            a = _mm_loadl_epi64((const __m128i*)ptr);
            a = _mm_insert_epi32(a, ptr[2], 2);
            a = _mm_insert_epi32(a, fill, 3);
            return a;
    #else
        case 1:
            a = _mm_insert_epi16(vfill, wptr[0], 0);
            return _mm_insert_epi16(a, wptr[1], 1);
        case 3:
            a = _mm_loadl_epi64((const __m128i*)ptr);
            a = _mm_unpacklo_epi64(a, vfill);
            a = _mm_insert_epi16(a, wptr[4], 4);
            a = _mm_insert_epi16(a, wptr[5], 5);
            return a;
    #endif // NPY_HAVE_SSE41
        default:
            return npyv_load_s32(ptr);
        }
#endif
}
// fill the remaining lanes with zero
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
    assert(nlane > 0);
    switch(nlane) {
    case 1:
        return _mm_cvtsi32_si128(*ptr);
    case 2:
        return _mm_loadl_epi64((const __m128i*)ptr);
    case 3:;
        npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
    #ifdef NPY_HAVE_SSE41
        return _mm_insert_epi32(a, ptr[2], 2);
    #else
        return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
    #endif
    default:
        return npyv_load_s32(ptr);
    }
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane <= 2) {
        npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
        for (npy_uint64 i = 0; i < nlane; ++i) {
            data[i] = ptr[i];
        }
        return npyv_loada_s64(data);
    }
#else
    if (nlane == 1) {
        const __m128i vfill = npyv_setall_s64(fill);
        return _mm_castpd_si128(
            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
        );
    }
#endif
    return npyv_load_s64(ptr);
}
// fill the remaining lanes with zero
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
    assert(nlane > 0);
    if (nlane == 1) {
        return _mm_loadl_epi64((const __m128i*)ptr);
    }
    return npyv_load_s64(ptr);
}
/*********************************
 * Non-contiguous partial load
 *********************************/
//// 32
NPY_FINLINE npyv_s32
npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane > 3) {
        return npyv_loadn_s32(ptr, stride);
    }
    npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
    for (npy_uint64 i = 0; i < nlane; ++i) {
        data[i] = ptr[stride*i];
    }
    return npyv_loada_s32(data);
#else
    __m128i vfill = npyv_setall_s32(fill);
    #ifndef NPY_HAVE_SSE41
        const short *wptr = (const short*)ptr;
    #endif
    switch(nlane) {
    #ifdef NPY_HAVE_SSE41
        case 3:
            vfill = _mm_insert_epi32(vfill, ptr[stride*2], 2);
            // fallthrough
        case 2:
            vfill = _mm_insert_epi32(vfill, ptr[stride], 1);
            // fallthrough
        case 1:
            vfill = _mm_insert_epi32(vfill, ptr[0], 0);
            break;
    #else
        case 3:
            vfill = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ptr[stride*2]), vfill);
            // fallthrough
        case 2:
            vfill = _mm_unpacklo_epi64(_mm_unpacklo_epi32(
                _mm_cvtsi32_si128(*ptr), _mm_cvtsi32_si128(ptr[stride])
            ), vfill);
            break;
        case 1:
            vfill = _mm_insert_epi16(vfill, wptr[0], 0);
            vfill = _mm_insert_epi16(vfill, wptr[1], 1);
            break;
    #endif // NPY_HAVE_SSE41
    default:
        return npyv_loadn_s32(ptr, stride);
    } // switch
    return vfill;
#endif
}
// fill the remaining lanes with zero
NPY_FINLINE npyv_s32
npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
{
    assert(nlane > 0);
    switch(nlane) {
    case 1:
        return _mm_cvtsi32_si128(ptr[0]);
    case 2:;
        npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
#ifdef NPY_HAVE_SSE41
        return _mm_insert_epi32(a, ptr[stride], 1);
#else
        return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
#endif // NPY_HAVE_SSE41
    case 3:;
        a = _mm_cvtsi32_si128(ptr[0]);
#ifdef NPY_HAVE_SSE41
        a = _mm_insert_epi32(a, ptr[stride], 1);
        a = _mm_insert_epi32(a, ptr[stride*2], 2);
        return a;
#else
        a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
        a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
        return a;
#endif // NPY_HAVE_SSE41
    default:
        return npyv_loadn_s32(ptr, stride);
    }
}
//// 64
NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
    assert(nlane > 0);
#ifdef NPYV__CLANG_ZEROUPPER
    if (nlane <= 2) {
        npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
        for (npy_uint64 i = 0; i < nlane; ++i) {
            data[i] = ptr[i*stride];
        }
        return npyv_loada_s64(data);
    }
#else
    if (nlane == 1) {
        const __m128i vfill = npyv_setall_s64(fill);
        return _mm_castpd_si128(
            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
        );
    }
#endif
    return npyv_loadn_s64(ptr, stride);
}
// fill the remaining lanes with zero
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{
    assert(nlane > 0);
    if (nlane == 1) {
        return _mm_loadl_epi64((const __m128i*)ptr);
    }
    return npyv_loadn_s64(ptr, stride);
}
/*********************************
 * Partial store
 *********************************/
//// 32
NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
{
    assert(nlane > 0);
    switch(nlane) {
    case 1:
        *ptr = _mm_cvtsi128_si32(a);
        break;
    case 2:
        _mm_storel_epi64((__m128i *)ptr, a);
        break;
    case 3:
        _mm_storel_epi64((__m128i *)ptr, a);
    #ifdef NPY_HAVE_SSE41
        ptr[2] = _mm_extract_epi32(a, 2);
    #else
        ptr[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
    #endif
        break;
    default:
        npyv_store_s32(ptr, a);
    }
}
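// Illustrative sketch (not part of this header): the partial load/store pair is
// typically used to handle the tail of an array whose length is not a multiple
// of the vector width, roughly along these lines (`src`, `dst`, `len` are
// placeholder names):
//
//     for (npy_intp i = 0; i < len; i += npyv_nlanes_s32) {
//         npy_uintp n = (npy_uintp)(len - i);
//         if (n > npyv_nlanes_s32) {
//             n = npyv_nlanes_s32;
//         }
//         npyv_s32 v = npyv_load_till_s32(src + i, n, 0);
//         // ... lane-wise computation on v ...
//         npyv_store_till_s32(dst + i, n, v);
//     }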
//// 64
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
    assert(nlane > 0);
    if (nlane == 1) {
        _mm_storel_epi64((__m128i *)ptr, a);
        return;
    }
    npyv_store_s64(ptr, a);
}
/*********************************
 * Non-contiguous partial store
 *********************************/
//// 32
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
    assert(nlane > 0);
    switch(nlane) {
#ifdef NPY_HAVE_SSE41
    default:
        ptr[stride*3] = _mm_extract_epi32(a, 3);
        // fallthrough
    case 3:
        ptr[stride*2] = _mm_extract_epi32(a, 2);
        // fallthrough
    case 2:
        ptr[stride*1] = _mm_extract_epi32(a, 1);
        // fallthrough
#else
    default:
        ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
        // fallthrough
    case 3:
        ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
        // fallthrough
    case 2:
        ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
        // fallthrough
#endif
    case 1:
        ptr[stride*0] = _mm_cvtsi128_si32(a);
        break;
    }
}
//// 64
NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
{
    assert(nlane > 0);
    if (nlane == 1) {
        _mm_storel_epi64((__m128i *)ptr, a);
        return;
    }
    npyv_storen_s64(ptr, stride, a);
}
/*****************************************************************
 * Implement partial load/store for u32/f32/u64/f64... via casting
 *****************************************************************/
#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES(F_SFX, T_SFX)                                      \
    NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX                                         \
    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill)         \
    {                                                                                       \
        union {                                                                             \
            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
        } pun = {.from_##F_SFX = fill};                                                     \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX(                   \
            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX                       \
        ));                                                                                 \
    }                                                                                       \
    NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX                                        \
    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,                    \
     npyv_lanetype_##F_SFX fill)                                                            \
    {                                                                                       \
        union {                                                                             \
            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
        } pun = {.from_##F_SFX = fill};                                                     \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX(                  \
            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX               \
        ));                                                                                 \
    }                                                                                       \
    NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX                                        \
    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                                     \
    {                                                                                       \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX(                  \
            (const npyv_lanetype_##T_SFX *)ptr, nlane                                       \
        ));                                                                                 \
    }                                                                                       \
    NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX                                       \
    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)                    \
    {                                                                                       \
        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX(                 \
            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                               \
        ));                                                                                 \
    }                                                                                       \
    NPY_FINLINE void npyv_store_till_##F_SFX                                                \
    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                           \
    {                                                                                       \
        npyv_store_till_##T_SFX(                                                            \
            (npyv_lanetype_##T_SFX *)ptr, nlane,                                            \
            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
        );                                                                                  \
    }                                                                                       \
    NPY_FINLINE void npyv_storen_till_##F_SFX                                               \
    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)          \
    {                                                                                       \
        npyv_storen_till_##T_SFX(                                                           \
            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                                    \
            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
        );                                                                                  \
    }

NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
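// Illustrative note (not part of this header): the union in the macro above
// bit-casts the `fill` scalar rather than converting its value, so the float
// variants forward the exact bit pattern of `fill` to the integer
// implementations. For f32 the expansion behaves roughly like:
//
//     union { float from_f32; npy_int32 to_s32; } pun = {.from_f32 = fill};
//     npyv_f32 r = npyv_reinterpret_f32_s32(
//         npyv_load_till_s32((const npy_int32*)ptr, nlane, pun.to_s32));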

#endif // _NPY_SIMD_SSE_MEMORY_H