1 /////////////////////////////////////////////////////////////////////////////
2 //  einspline:  a library for creating and evaluating B-splines            //
3 //  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
4 //                                                                         //
5 //  This program is free software; you can redistribute it and/or modify   //
6 //  it under the terms of the GNU General Public License as published by   //
7 //  the Free Software Foundation; either version 2 of the License, or      //
8 //  (at your option) any later version.                                    //
9 //                                                                         //
10 //  This program is distributed in the hope that it will be useful,        //
11 //  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
12 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
13 //  GNU General Public License for more details.                           //
14 //                                                                         //
15 //  You should have received a copy of the GNU General Public License      //
16 //  along with this program; if not, write to the Free Software            //
17 //  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
18 //  Boston, MA  02110-1301  USA                                            //
19 /////////////////////////////////////////////////////////////////////////////
20 
21 //#include "config.h"
22 
23 #include "local_definitions.h"
24 
25 /*****************/
26 /*   SSE Data    */
27 /*****************/
28 
29 #ifdef _XOPEN_SOURCE
30 #undef _XOPEN_SOURCE
31 #endif
32 
33 #define _XOPEN_SOURCE 600
34 
35 #ifndef __USE_XOPEN2K
36   #define __USE_XOPEN2K
37 #endif
38 #include <stdlib.h>
39 
40 #ifdef HAVE_SSE
41 #include <xmmintrin.h>
42 
43 // Single-precision version of matrices
44 __m128 *restrict A_s = (__m128 *)0;
45 // There is a problem with alignment of global variables in shared
46 // libraries on 32-bit machines.
47 // __m128  A0, A1, A2, A3, dA0, dA1, dA2, dA3, d2A0, d2A1, d2A2, d2A3;
48 #endif
49 
50 #ifdef HAVE_SSE2
51 // Double-precision version of matrices
52 #include <emmintrin.h>
53 __m128d *restrict A_d = (__m128d *)0;
54 
55 // There is a problem with alignment of global variables in shared
56 // libraries on 32-bit machines.
57 //__m128d A0_01, A0_23, A1_01, A1_23, A2_01, A2_23, A3_01, A3_23,
58 //  dA0_01, dA0_23, dA1_01, dA1_23, dA2_01, dA2_23, dA3_01, dA3_23,
59 //  d2A0_01, d2A0_23, d2A1_01, d2A1_23, d2A2_01, d2A2_23, d2A3_01, d2A3_23;
60 #endif
61 
init_sse_data()62 void init_sse_data()
63 {
64 #ifdef HAVE_SSE
65   if (A_s == 0) {
66     posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12));
67     A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
68     A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
69     A_s[1]  = _mm_setr_ps ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );
70     A_s[2]  = _mm_setr_ps ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );
71     A_s[3]  = _mm_setr_ps ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );
72     A_s[4]  = _mm_setr_ps ( -0.5,  1.0, -0.5, 0.0  );
73     A_s[5]  = _mm_setr_ps (  0.0, -2.0,  1.5, 0.0  );
74     A_s[6]  = _mm_setr_ps (  0.5,  1.0, -1.5, 0.0  );
75     A_s[7]  = _mm_setr_ps (  0.0,  0.0,  0.5, 0.0  );
76     A_s[8]  = _mm_setr_ps (  1.0, -1.0,  0.0, 0.0  );
77     A_s[9]  = _mm_setr_ps ( -2.0,  3.0,  0.0, 0.0  );
78     A_s[10] = _mm_setr_ps (  1.0, -3.0,  0.0, 0.0  );
79     A_s[11] = _mm_setr_ps (  0.0,  1.0,  0.0, 0.0  );
80   }
81 
82 #endif
83 #ifdef HAVE_SSE2
84   if (A_d == 0) {
85     posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*24));
86     A_d[ 0] = _mm_setr_pd (  3.0/6.0, -1.0/6.0 );
87     A_d[ 1] = _mm_setr_pd (  1.0/6.0, -3.0/6.0 );
88     A_d[ 2] = _mm_setr_pd ( -6.0/6.0,  3.0/6.0 );
89     A_d[ 3] = _mm_setr_pd (  4.0/6.0,  0.0/6.0 );
90     A_d[ 4] = _mm_setr_pd (  3.0/6.0, -3.0/6.0 );
91     A_d[ 5] = _mm_setr_pd (  1.0/6.0,  3.0/6.0 );
92     A_d[ 6] = _mm_setr_pd (  0.0/6.0,  1.0/6.0 );
93     A_d[ 7] = _mm_setr_pd (  0.0/6.0,  0.0/6.0 );
94     A_d[ 8] = _mm_setr_pd ( -0.5,  0.0 );
95     A_d[ 9] = _mm_setr_pd ( -0.5,  1.0 );
96     A_d[10] = _mm_setr_pd (  1.5,  0.0 );
97     A_d[11] = _mm_setr_pd (  0.0, -2.0 );
98     A_d[12] = _mm_setr_pd ( -1.5,  0.0 );
99     A_d[13] = _mm_setr_pd (  0.5,  1.0 );
100     A_d[14] = _mm_setr_pd (  0.5,  0.0 );
101     A_d[15] = _mm_setr_pd (  0.0,  0.0 );
102     A_d[16] = _mm_setr_pd (  0.0,  0.0 );
103     A_d[17] = _mm_setr_pd (  1.0, -1.0 );
104     A_d[18] = _mm_setr_pd (  0.0,  0.0 );
105     A_d[19] = _mm_setr_pd ( -2.0,  3.0 );
106     A_d[20] = _mm_setr_pd (  0.0,  0.0 );
107     A_d[21] = _mm_setr_pd (  1.0, -3.0 );
108     A_d[22] = _mm_setr_pd (  0.0,  0.0 );
109     A_d[23] = _mm_setr_pd (  0.0,  1.0 );
110   }
111 #endif
112 }
113 
114 
115 #ifdef USE_ALTIVEC
116 vector float A0   = (vector float) ( -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0);
117 vector float A1   = (vector float) (  3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0);
118 vector float A2   = (vector float) ( -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0);
119 vector float A3   = (vector float) (  1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0);
120 /* vector float A0   = (vector float) ( -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0); */
121 /* vector float A1   = (vector float) (  3.0/6.0, -6.0/6.0,  3.0/6.0, 0.0/6.0); */
122 /* vector float A2   = (vector float) ( -3.0/6.0,  0.0/6.0,  3.0/6.0, 0.0/6.0); */
123 /* vector float A3   = (vector float) (  1.0/6.0,  4.0/6.0,  1.0/6.0, 0.0/6.0); */
124 /* vector float A0   = (vector float) ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0); */
125 /* vector float A1   = (vector float) ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0); */
126 /* vector float A2   = (vector float) ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0); */
127 /* vector float A3   = (vector float) ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0); */
128 vector float dA0  = (vector float) ( 0.0, -0.5,  1.0, -0.5 );
129 vector float dA1  = (vector float) ( 0.0,  1.5, -2.0,  0.0 );
130 vector float dA2  = (vector float) ( 0.0, -1.5,  1.0,  0.5 );
131 vector float dA3  = (vector float) ( 0.0,  0.5,  0.0,  0.0 );
132 vector float d2A0 = (vector float) ( 0.0,  0.0, -1.0,  1.0 );
133 vector float d2A1 = (vector float) ( 0.0,  0.0,  3.0, -2.0 );
134 vector float d2A2 = (vector float) ( 0.0,  0.0, -3.0,  1.0 );
135 vector float d2A3 = (vector float) ( 0.0,  0.0,  1.0,  0.0 );
136 #endif
137 
138 /*****************/
139 /* Standard Data */
140 /*****************/
141 
142 //////////////////////
143 // Single precision //
144 //////////////////////
145 const float A44f[16] =
146   { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
147      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
148     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
149      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0 };
150 const float* restrict Af = A44f;
151 
152 const float dA44f[16] =
153   {  0.0, -0.5,  1.0, -0.5,
154      0.0,  1.5, -2.0,  0.0,
155      0.0, -1.5,  1.0,  0.5,
156      0.0,  0.5,  0.0,  0.0 };
157 const float* restrict dAf = dA44f;
158 
159 const float d2A44f[16] =
160   {  0.0, 0.0, -1.0,  1.0,
161      0.0, 0.0,  3.0, -2.0,
162      0.0, 0.0, -3.0,  1.0,
163      0.0, 0.0,  1.0,  0.0 };
164 const float* restrict d2Af = d2A44f;
165 
166 const float d3A44f[16] =
167   {  0.0, 0.0,  0.0, -1.0,
168      0.0, 0.0,  0.0,  3.0,
169      0.0, 0.0,  0.0, -3.0,
170      0.0, 0.0,  0.0,  1.0};
171 const float* restrict d3Af = d3A44f;
172 
173 //////////////////////
174 // Double precision //
175 //////////////////////
176 const double A44d[16] =
177   { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
178      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
179     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
180      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0 };
181 const double* restrict Ad = A44d;
182 
183 const double dA44d[16] =
184   {  0.0, -0.5,  1.0, -0.5,
185      0.0,  1.5, -2.0,  0.0,
186      0.0, -1.5,  1.0,  0.5,
187      0.0,  0.5,  0.0,  0.0 };
188 const double* restrict dAd = dA44d;
189 
190 const double d2A44d[16] =
191   {  0.0, 0.0, -1.0,  1.0,
192      0.0, 0.0,  3.0, -2.0,
193      0.0, 0.0, -3.0,  1.0,
194      0.0, 0.0,  1.0,  0.0 };
195 const double* restrict d2Ad = d2A44d;
196 
197 const double d3A44d[16] =
198   {  0.0, 0.0,  0.0, -1.0,
199      0.0, 0.0,  0.0,  3.0,
200      0.0, 0.0,  0.0, -3.0,
201      0.0, 0.0,  0.0,  1.0};
202 const double* restrict d3Ad = d3A44d;
203