/**
 * Copyright 2014-2016 Andreas Schäfer
 *
 * Distributed under the Boost Software License, Version 1.0. (See accompanying
 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
 */
7 
8 #ifndef FLAT_ARRAY_SHORT_VEC_HPP
9 #define FLAT_ARRAY_SHORT_VEC_HPP
10 
11 namespace LibFlatArray {
12 
/**
 * Forward declaration of the short vector class template, parameterized
 * by its element type (CARGO) and its number of elements (ARITY).
 *
 * Only the declaration is given here so that the free functions below
 * can refer to the template. NOTE(review): the definitions are
 * presumably supplied by the detail headers included at the end of this
 * file -- confirm.
 */
template<typename CARGO, int ARITY>
class short_vec;
15 
16 template<typename CARGO, int ARITY>
operator +(CARGO a,const short_vec<CARGO,ARITY> & b)17 inline short_vec<CARGO, ARITY> operator+(CARGO a, const short_vec<CARGO, ARITY>& b)
18 {
19     return short_vec<CARGO, ARITY>(a) + b;
20 }
21 
22 template<typename CARGO, int ARITY>
operator -(CARGO a,const short_vec<CARGO,ARITY> & b)23 inline short_vec<CARGO, ARITY> operator-(CARGO a, const short_vec<CARGO, ARITY>& b)
24 {
25     return short_vec<CARGO, ARITY>(a) - b;
26 }
27 
28 template<typename CARGO, int ARITY>
operator *(CARGO a,const short_vec<CARGO,ARITY> & b)29 inline short_vec<CARGO, ARITY> operator*(CARGO a, const short_vec<CARGO, ARITY>& b)
30 {
31     return short_vec<CARGO, ARITY>(a) * b;
32 }
33 
34 template<typename CARGO, int ARITY>
operator /(CARGO a,const short_vec<CARGO,ARITY> & b)35 inline short_vec<CARGO, ARITY> operator/(CARGO a, const short_vec<CARGO, ARITY>& b)
36 {
37     return short_vec<CARGO, ARITY>(a) / b;
38 }
39 
40 template<typename CARGO, int ARITY >
any(const short_vec<CARGO,ARITY> & vec)41 inline bool any(const short_vec<CARGO, ARITY>& vec)
42 {
43     return vec.any();
44 }
45 
/**
 * Overload of any() for a plain unsigned bit mask.
 *
 * @param mask bit mask to test
 * @return     true if at least one bit of @a mask is set, false if
 *             @a mask is zero
 */
inline bool any(unsigned mask)
{
    return mask != 0;
}
50 
/**
 * Overload of any() for an unsigned short bit mask.
 *
 * @param mask bit mask to test
 * @return     true if at least one bit of @a mask is set, false if
 *             @a mask is zero
 */
inline bool any(unsigned short mask)
{
    return mask != 0;
}
55 
/**
 * Overload of any() for an unsigned char bit mask.
 *
 * @param mask bit mask to test
 * @return     true if at least one bit of @a mask is set, false if
 *             @a mask is zero
 */
inline bool any(unsigned char mask)
{
    return mask != 0;
}
60 
61 template<typename CARGO, int ARITY >
get(const short_vec<CARGO,ARITY> & vec,const int i)62 inline CARGO get(const short_vec<CARGO, ARITY>& vec, const int i)
63 {
64     return vec.get(i);
65 }
66 
/**
 * Overload of get() for a plain unsigned bit mask: reads a single bit.
 *
 * @param mask bit mask to read from
 * @param i    zero-based bit position, counted from the least
 *             significant bit; must be non-negative and smaller than
 *             the bit width of unsigned, as the shift is undefined
 *             otherwise
 * @return     true if bit @a i of @a mask is set
 */
inline bool get(unsigned mask, const int i)
{
    return ((mask >> i) & 1u) != 0u;
}
71 
/**
 * Overload of get() for an unsigned short bit mask: reads a single bit.
 *
 * @param mask bit mask to read from
 * @param i    zero-based bit position, counted from the least
 *             significant bit. The shift is carried out on the
 *             integer-promoted mask, so @a i must be non-negative and
 *             smaller than the bit width of the promoted type.
 * @return     true if bit @a i of @a mask is set
 */
inline bool get(unsigned short mask, const int i)
{
    return ((mask >> i) & 1) != 0;
}
76 
/**
 * Overload of get() for an unsigned char bit mask: reads a single bit.
 *
 * @param mask bit mask to read from
 * @param i    zero-based bit position, counted from the least
 *             significant bit. The shift is carried out on the
 *             integer-promoted mask, so @a i must be non-negative and
 *             smaller than the bit width of the promoted type.
 * @return     true if bit @a i of @a mask is set
 */
inline bool get(unsigned char mask, const int i)
{
    return ((mask >> i) & 1) != 0;
}
81 
/**
 * Container for a set of empty tag types, one per vectorization
 * back-end named in this header (scalar code, the x86 SSE/AVX family,
 * CUDA, QPX, MIC and NEON).
 *
 * The types carry no data and no behavior; only their identity matters.
 * NOTE(review): presumably they are used to select a short_vec
 * implementation at compile time -- the users are not visible in this
 * file, confirm there.
 */
class short_vec_strategy
{
public:
    class scalar {};

    class avx {};

    class avx2 {};

    class avx512f {};

    class cuda {};

    class qpx {};

    class sse {};

    class sse2 {};

    class sse4_1 {};

    class mic {};

    class neon {};
};
118 
119 }
120 
// Numeric identifiers for the vector instruction sets this header can
// select. LIBFLATARRAY_WIDEST_VECTOR_ISA is defined to one of these
// below, so it can be compared against them in preprocessor conditions.
#define LIBFLATARRAY_SCALAR        10
#define LIBFLATARRAY_QPX           11
#define LIBFLATARRAY_ARM_NEON      12
#define LIBFLATARRAY_MIC           13
#define LIBFLATARRAY_AVX512F       14
#define LIBFLATARRAY_AVX           15
#define LIBFLATARRAY_AVX2          16
#define LIBFLATARRAY_SSE           17
#define LIBFLATARRAY_SSE2          18
#define LIBFLATARRAY_SSE4_1        19

// Pick LIBFLATARRAY_WIDEST_VECTOR_ISA from the compiler's predefined
// feature macros.
#ifdef __CUDA_ARCH__
// Use only scalar short_vec implementations on CUDA devices:
#  define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_SCALAR
#else
// for IBM Blue Gene/Q's QPX, which is mutually exclusive to
// Intel/AMD's AVX/SSE or ARM's NEON ISAs:
#  ifdef __VECTOR4DOUBLE__
#    define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_QPX
#  endif

// Ditto for ARM NEON:
#  ifdef __ARM_NEON__
#    define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_ARM_NEON
#  endif

// Only the case of the IBM PC is complicated. No thanks to you,
// history!
//
// This chain is skipped when LIBFLATARRAY_WIDEST_VECTOR_ISA is already
// defined (by one of the branches above or before this point).
// Otherwise the first feature macro that is defined wins; the order
// runs from the widest ISA to the narrowest.
#  ifndef LIBFLATARRAY_WIDEST_VECTOR_ISA
#    if defined(__MIC__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_MIC
#    elif defined(__AVX512F__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_AVX512F
#    elif defined(__AVX2__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_AVX2
#    elif defined(__AVX__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_AVX
#    elif defined(__SSE4_1__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_SSE4_1
#    elif defined(__SSE2__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_SSE2
#    elif defined(__SSE__)
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_SSE
#    else
// fallback: the scalar implementation always works and still yields
// code that's easy to vectorize for the compiler:
#      define LIBFLATARRAY_WIDEST_VECTOR_ISA LIBFLATARRAY_SCALAR
#    endif
#  endif

#endif
184 
185 #include <sstream>
186 
187 #include <libflatarray/detail/short_vec_avx512_double_8.hpp>
188 #include <libflatarray/detail/short_vec_avx512_double_16.hpp>
189 #include <libflatarray/detail/short_vec_avx512_double_32.hpp>
190 
191 #include <libflatarray/detail/short_vec_avx512_float_16.hpp>
192 #include <libflatarray/detail/short_vec_avx512_float_32.hpp>
193 
194 #include <libflatarray/detail/short_vec_avx_double_4.hpp>
195 #include <libflatarray/detail/short_vec_avx_double_8.hpp>
196 #include <libflatarray/detail/short_vec_avx_double_16.hpp>
197 #include <libflatarray/detail/short_vec_avx_double_32.hpp>
198 
199 #include <libflatarray/detail/short_vec_avx_float_8.hpp>
200 #include <libflatarray/detail/short_vec_avx_float_16.hpp>
201 #include <libflatarray/detail/short_vec_avx_float_32.hpp>
202 
203 #include <libflatarray/detail/short_vec_scalar_double_1.hpp>
204 #include <libflatarray/detail/short_vec_scalar_double_2.hpp>
205 #include <libflatarray/detail/short_vec_scalar_double_4.hpp>
206 #include <libflatarray/detail/short_vec_scalar_double_8.hpp>
207 #include <libflatarray/detail/short_vec_scalar_double_16.hpp>
208 #include <libflatarray/detail/short_vec_scalar_double_32.hpp>
209 
210 #include <libflatarray/detail/short_vec_scalar_float_1.hpp>
211 #include <libflatarray/detail/short_vec_scalar_float_2.hpp>
212 #include <libflatarray/detail/short_vec_scalar_float_4.hpp>
213 #include <libflatarray/detail/short_vec_scalar_float_8.hpp>
214 #include <libflatarray/detail/short_vec_scalar_float_16.hpp>
215 #include <libflatarray/detail/short_vec_scalar_float_32.hpp>
216 
217 #include <libflatarray/detail/short_vec_scalar_int_1.hpp>
218 #include <libflatarray/detail/short_vec_scalar_int_2.hpp>
219 #include <libflatarray/detail/short_vec_scalar_int_4.hpp>
220 #include <libflatarray/detail/short_vec_scalar_int_8.hpp>
221 #include <libflatarray/detail/short_vec_scalar_int_16.hpp>
222 #include <libflatarray/detail/short_vec_scalar_int_32.hpp>
223 
224 #include <libflatarray/detail/short_vec_sse_int_4.hpp>
225 #include <libflatarray/detail/short_vec_sse_int_8.hpp>
226 #include <libflatarray/detail/short_vec_sse_int_16.hpp>
227 #include <libflatarray/detail/short_vec_sse_int_32.hpp>
228 
229 #include <libflatarray/detail/short_vec_avx_int_8.hpp>
230 #include <libflatarray/detail/short_vec_avx_int_16.hpp>
231 #include <libflatarray/detail/short_vec_avx_int_32.hpp>
232 
233 #include <libflatarray/detail/short_vec_avx512_int_16.hpp>
234 #include <libflatarray/detail/short_vec_avx512_int_32.hpp>
235 
236 #include <libflatarray/detail/short_vec_sse_double_2.hpp>
237 #include <libflatarray/detail/short_vec_sse_double_4.hpp>
238 #include <libflatarray/detail/short_vec_sse_double_8.hpp>
239 #include <libflatarray/detail/short_vec_sse_double_16.hpp>
240 #include <libflatarray/detail/short_vec_sse_double_32.hpp>
241 
242 #include <libflatarray/detail/short_vec_sse_float_4.hpp>
243 #include <libflatarray/detail/short_vec_sse_float_8.hpp>
244 #include <libflatarray/detail/short_vec_sse_float_16.hpp>
245 #include <libflatarray/detail/short_vec_sse_float_32.hpp>
246 
247 #include <libflatarray/detail/short_vec_qpx_double_4.hpp>
248 #include <libflatarray/detail/short_vec_qpx_double_8.hpp>
249 #include <libflatarray/detail/short_vec_qpx_double_16.hpp>
250 #include <libflatarray/detail/short_vec_qpx_double_32.hpp>
251 
252 #include <libflatarray/detail/short_vec_neon_float_4.hpp>
253 #include <libflatarray/detail/short_vec_neon_float_8.hpp>
254 #include <libflatarray/detail/short_vec_neon_float_16.hpp>
255 #include <libflatarray/detail/short_vec_neon_float_32.hpp>
256 
257 #include <libflatarray/detail/short_vec_mic_double_8.hpp>
258 #include <libflatarray/detail/short_vec_mic_double_16.hpp>
259 #include <libflatarray/detail/short_vec_mic_double_32.hpp>
260 
261 #include <libflatarray/detail/short_vec_mic_float_16.hpp>
262 #include <libflatarray/detail/short_vec_mic_float_32.hpp>
263 
264 #endif
265