1 //  (C) Copyright John Maddock 2006.
2 //  Use, modification and distribution are subject to the
3 //  Boost Software License, Version 1.0. (See accompanying file
4 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5 
6 #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
7 #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
8 
9 #ifdef _MSC_VER
10 #pragma once
11 #endif
12 
13 #include <emmintrin.h>
14 
15 #if defined(__GNUC__) || defined(__PGI) || defined(__SUNPRO_CC)
16 #define ALIGN16 __attribute__((__aligned__(16)))
17 #else
18 #define ALIGN16 __declspec(align(16))
19 #endif
20 
21 namespace boost{ namespace math{ namespace lanczos{
22 
23 template <>
lanczos_sum(const double & x)24 inline double lanczos13m53::lanczos_sum<double>(const double& x)
25 {
26    static const ALIGN16 double coeff[26] = {
27       static_cast<double>(2.506628274631000270164908177133837338626L),
28       static_cast<double>(1u),
29       static_cast<double>(210.8242777515793458725097339207133627117L),
30       static_cast<double>(66u),
31       static_cast<double>(8071.672002365816210638002902272250613822L),
32       static_cast<double>(1925u),
33       static_cast<double>(186056.2653952234950402949897160456992822L),
34       static_cast<double>(32670u),
35       static_cast<double>(2876370.628935372441225409051620849613599L),
36       static_cast<double>(357423u),
37       static_cast<double>(31426415.58540019438061423162831820536287L),
38       static_cast<double>(2637558u),
39       static_cast<double>(248874557.8620541565114603864132294232163L),
40       static_cast<double>(13339535u),
41       static_cast<double>(1439720407.311721673663223072794912393972L),
42       static_cast<double>(45995730u),
43       static_cast<double>(6039542586.35202800506429164430729792107L),
44       static_cast<double>(105258076u),
45       static_cast<double>(17921034426.03720969991975575445893111267L),
46       static_cast<double>(150917976u),
47       static_cast<double>(35711959237.35566804944018545154716670596L),
48       static_cast<double>(120543840u),
49       static_cast<double>(42919803642.64909876895789904700198885093L),
50       static_cast<double>(39916800u),
51       static_cast<double>(23531376880.41075968857200767445163675473L),
52       static_cast<double>(0u)
53    };
54 
55    static const double lim = 4.31965e+25; // By experiment, the largest x for which the SSE2 code does not go bad.
56 
57    if (x > lim)
58    {
59       double z = 1 / x;
60       return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]);
61    }
62 
63    __m128d vx = _mm_load1_pd(&x);
64    __m128d sum_even = _mm_load_pd(coeff);
65    __m128d sum_odd = _mm_load_pd(coeff+2);
66    __m128d nc_odd, nc_even;
67    __m128d vx2 = _mm_mul_pd(vx, vx);
68 
69    sum_even = _mm_mul_pd(sum_even, vx2);
70    nc_even = _mm_load_pd(coeff + 4);
71    sum_odd = _mm_mul_pd(sum_odd, vx2);
72    nc_odd = _mm_load_pd(coeff + 6);
73    sum_even = _mm_add_pd(sum_even, nc_even);
74    sum_odd = _mm_add_pd(sum_odd, nc_odd);
75 
76    sum_even = _mm_mul_pd(sum_even, vx2);
77    nc_even = _mm_load_pd(coeff + 8);
78    sum_odd = _mm_mul_pd(sum_odd, vx2);
79    nc_odd = _mm_load_pd(coeff + 10);
80    sum_even = _mm_add_pd(sum_even, nc_even);
81    sum_odd = _mm_add_pd(sum_odd, nc_odd);
82 
83    sum_even = _mm_mul_pd(sum_even, vx2);
84    nc_even = _mm_load_pd(coeff + 12);
85    sum_odd = _mm_mul_pd(sum_odd, vx2);
86    nc_odd = _mm_load_pd(coeff + 14);
87    sum_even = _mm_add_pd(sum_even, nc_even);
88    sum_odd = _mm_add_pd(sum_odd, nc_odd);
89 
90    sum_even = _mm_mul_pd(sum_even, vx2);
91    nc_even = _mm_load_pd(coeff + 16);
92    sum_odd = _mm_mul_pd(sum_odd, vx2);
93    nc_odd = _mm_load_pd(coeff + 18);
94    sum_even = _mm_add_pd(sum_even, nc_even);
95    sum_odd = _mm_add_pd(sum_odd, nc_odd);
96 
97    sum_even = _mm_mul_pd(sum_even, vx2);
98    nc_even = _mm_load_pd(coeff + 20);
99    sum_odd = _mm_mul_pd(sum_odd, vx2);
100    nc_odd = _mm_load_pd(coeff + 22);
101    sum_even = _mm_add_pd(sum_even, nc_even);
102    sum_odd = _mm_add_pd(sum_odd, nc_odd);
103 
104    sum_even = _mm_mul_pd(sum_even, vx2);
105    nc_even = _mm_load_pd(coeff + 24);
106    sum_odd = _mm_mul_pd(sum_odd, vx);
107    sum_even = _mm_add_pd(sum_even, nc_even);
108    sum_even = _mm_add_pd(sum_even, sum_odd);
109 
110 
111    double ALIGN16 t[2];
112    _mm_store_pd(t, sum_even);
113 
114    return t[0] / t[1];
115 }
116 
117 template <>
lanczos_sum_expG_scaled(const double & x)118 inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
119 {
120    static const ALIGN16 double coeff[26] = {
121          static_cast<double>(0.006061842346248906525783753964555936883222L),
122          static_cast<double>(1u),
123          static_cast<double>(0.5098416655656676188125178644804694509993L),
124          static_cast<double>(66u),
125          static_cast<double>(19.51992788247617482847860966235652136208L),
126          static_cast<double>(1925u),
127          static_cast<double>(449.9445569063168119446858607650988409623L),
128          static_cast<double>(32670u),
129          static_cast<double>(6955.999602515376140356310115515198987526L),
130          static_cast<double>(357423u),
131          static_cast<double>(75999.29304014542649875303443598909137092L),
132          static_cast<double>(2637558u),
133          static_cast<double>(601859.6171681098786670226533699352302507L),
134          static_cast<double>(13339535u),
135          static_cast<double>(3481712.15498064590882071018964774556468L),
136          static_cast<double>(45995730u),
137          static_cast<double>(14605578.08768506808414169982791359218571L),
138          static_cast<double>(105258076u),
139          static_cast<double>(43338889.32467613834773723740590533316085L),
140          static_cast<double>(150917976u),
141          static_cast<double>(86363131.28813859145546927288977868422342L),
142          static_cast<double>(120543840u),
143          static_cast<double>(103794043.1163445451906271053616070238554L),
144          static_cast<double>(39916800u),
145          static_cast<double>(56906521.91347156388090791033559122686859L),
146          static_cast<double>(0u)
147    };
148 
149    static const double lim = 4.76886e+25; // By experiment, the largest x for which the SSE2 code does not go bad.
150 
151    if (x > lim)
152    {
153       double z = 1 / x;
154       return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]);
155    }
156 
157    __m128d vx = _mm_load1_pd(&x);
158    __m128d sum_even = _mm_load_pd(coeff);
159    __m128d sum_odd = _mm_load_pd(coeff+2);
160    __m128d nc_odd, nc_even;
161    __m128d vx2 = _mm_mul_pd(vx, vx);
162 
163    sum_even = _mm_mul_pd(sum_even, vx2);
164    nc_even = _mm_load_pd(coeff + 4);
165    sum_odd = _mm_mul_pd(sum_odd, vx2);
166    nc_odd = _mm_load_pd(coeff + 6);
167    sum_even = _mm_add_pd(sum_even, nc_even);
168    sum_odd = _mm_add_pd(sum_odd, nc_odd);
169 
170    sum_even = _mm_mul_pd(sum_even, vx2);
171    nc_even = _mm_load_pd(coeff + 8);
172    sum_odd = _mm_mul_pd(sum_odd, vx2);
173    nc_odd = _mm_load_pd(coeff + 10);
174    sum_even = _mm_add_pd(sum_even, nc_even);
175    sum_odd = _mm_add_pd(sum_odd, nc_odd);
176 
177    sum_even = _mm_mul_pd(sum_even, vx2);
178    nc_even = _mm_load_pd(coeff + 12);
179    sum_odd = _mm_mul_pd(sum_odd, vx2);
180    nc_odd = _mm_load_pd(coeff + 14);
181    sum_even = _mm_add_pd(sum_even, nc_even);
182    sum_odd = _mm_add_pd(sum_odd, nc_odd);
183 
184    sum_even = _mm_mul_pd(sum_even, vx2);
185    nc_even = _mm_load_pd(coeff + 16);
186    sum_odd = _mm_mul_pd(sum_odd, vx2);
187    nc_odd = _mm_load_pd(coeff + 18);
188    sum_even = _mm_add_pd(sum_even, nc_even);
189    sum_odd = _mm_add_pd(sum_odd, nc_odd);
190 
191    sum_even = _mm_mul_pd(sum_even, vx2);
192    nc_even = _mm_load_pd(coeff + 20);
193    sum_odd = _mm_mul_pd(sum_odd, vx2);
194    nc_odd = _mm_load_pd(coeff + 22);
195    sum_even = _mm_add_pd(sum_even, nc_even);
196    sum_odd = _mm_add_pd(sum_odd, nc_odd);
197 
198    sum_even = _mm_mul_pd(sum_even, vx2);
199    nc_even = _mm_load_pd(coeff + 24);
200    sum_odd = _mm_mul_pd(sum_odd, vx);
201    sum_even = _mm_add_pd(sum_even, nc_even);
202    sum_even = _mm_add_pd(sum_even, sum_odd);
203 
204 
205    double ALIGN16 t[2];
206    _mm_store_pd(t, sum_even);
207 
208    return t[0] / t[1];
209 }
210 
211 #ifdef _MSC_VER
212 
213 static_assert(sizeof(double) == sizeof(long double), "sizeof(long double) != sizeof(double) is not supported");
214 
215 template <>
lanczos_sum(const long double & x)216 inline long double lanczos13m53::lanczos_sum<long double>(const long double& x)
217 {
218    return lanczos_sum<double>(static_cast<double>(x));
219 }
220 template <>
lanczos_sum_expG_scaled(const long double & x)221 inline long double lanczos13m53::lanczos_sum_expG_scaled<long double>(const long double& x)
222 {
223    return lanczos_sum_expG_scaled<double>(static_cast<double>(x));
224 }
225 #endif
226 
227 } // namespace lanczos
228 } // namespace math
229 } // namespace boost
230 
231 #undef ALIGN16
232 
#endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
234 
235 
236 
237 
238 
239