1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name(s) of the copyright holder(s) nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "pmmintrin.h"
36 #include "blis.h"
37
38
// Overlay type for a 128-bit SSE register holding two packed doubles.
// Allows the same value to be accessed either as a whole vector (v)
// or element-wise (d[0], d[1]) without pointer-cast aliasing.
typedef union
{
	__m128d v;
	double  d[2];
} v2df_t;
44
45
// Fused axpy2v kernel for the real double-precision domain, optimized for
// SSE3 (penryn):
//
//     z := z + alpha * x + beta * y
//
// conjx/conjy are ignored (conjugation is a no-op for real operands).
// alpha and beta point to scalars; x, y, z are vectors of length n with
// strides incx, incy, incz.
//
// This kernel uses aligned vector loads/stores, so it falls back to the
// reference implementation (queried from cntx) whenever any stride is
// non-unit or the pointers cannot be brought into mutual 16-byte alignment.
void bli_daxpy2v_penryn_int
     (
       conj_t           conjx,
       conj_t           conjy,
       dim_t            n,
       double* restrict alpha,
       double* restrict beta,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       double* restrict z, inc_t incz,
       cntx_t* restrict cntx
     )
{
	double* restrict alpha_cast = alpha;
	double* restrict beta_cast  = beta;
	double* restrict x_cast     = x;
	double* restrict y_cast     = y;
	double* restrict z_cast     = z;
	dim_t            i;

	// Each __m128d holds two doubles; the main loop is unrolled 4x,
	// so each iteration processes 8 elements.
	const dim_t      n_elem_per_reg = 2;
	const dim_t      n_iter_unroll  = 4;

	dim_t            n_pre;   // scalar elements peeled off to reach alignment
	dim_t            n_run;   // number of unrolled vector iterations
	dim_t            n_left;  // scalar elements remaining after the vector loop

	double* restrict x1;
	double* restrict y1;
	double* restrict z1;
	double           alphac, betac, x1c, y1c;

	v2df_t           alphav, betav;
	v2df_t           x1v, y1v, z1v;
	v2df_t           x2v, y2v, z2v;

	bool             use_ref = FALSE;


	// Nothing to do for an empty vector.
	if ( bli_zero_dim1( n ) ) return;

	n_pre = 0;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( incx != 1 || incy != 1 || incz != 1 )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) ||
	          bli_is_unaligned_to( ( siz_t )y, 16 ) ||
	          bli_is_unaligned_to( ( siz_t )z, 16 ) )
	{
		use_ref = TRUE;

		// Special case: if ALL THREE pointers are 16-byte misaligned (by
		// the same 8 bytes, since doubles are 8-byte aligned), processing
		// a single scalar element first brings all of them into alignment
		// simultaneously, so we can still use the vector path.
		if ( bli_is_unaligned_to( ( siz_t )x, 16 ) &&
		     bli_is_unaligned_to( ( siz_t )y, 16 ) &&
		     bli_is_unaligned_to( ( siz_t )z, 16 ) )
		{
			use_ref = FALSE;
			n_pre   = 1;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref )
	{
		daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx );

		f
		(
		  conjx,
		  conjy,
		  n,
		  alpha,
		  beta,
		  x, incx,
		  y, incy,
		  z, incz,
		  cntx
		);
		return;
	}


	n_run  = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
	n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );

	alphac = *alpha_cast;
	betac  = *beta_cast;

	x1 = x_cast;
	y1 = y_cast;
	z1 = z_cast;

	// Peel off one scalar element, if needed, to align the vector loop.
	if ( n_pre == 1 )
	{
		x1c = *x1;
		y1c = *y1;

		*z1 += alphac * x1c +
		       betac  * y1c;

		x1 += incx;
		y1 += incy;
		z1 += incz;
	}

	// Broadcast each scalar into both lanes of a vector register.
	alphav.v = _mm_loaddup_pd( ( double* )alpha_cast );
	betav.v  = _mm_loaddup_pd( ( double* )beta_cast );

	// Main vector loop: 8 elements per iteration, software-pipelined
	// across two register sets (z1v/x1v/y1v and z2v/x2v/y2v) so loads
	// for the next pair overlap with the FMA-and-store of the current one.
	for ( i = 0; i < n_run; ++i )
	{
		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );

		z2v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg );
		x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
		y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );

		z1v.v += alphav.v * x1v.v;
		z1v.v += betav.v  * y1v.v;

		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );

		z1v.v = _mm_load_pd( ( double* )z1 + 2*n_elem_per_reg );
		x1v.v = _mm_load_pd( ( double* )x1 + 2*n_elem_per_reg );
		y1v.v = _mm_load_pd( ( double* )y1 + 2*n_elem_per_reg );

		z2v.v += alphav.v * x2v.v;
		z2v.v += betav.v  * y2v.v;

		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v );

		z2v.v = _mm_load_pd( ( double* )z1 + 3*n_elem_per_reg );
		x2v.v = _mm_load_pd( ( double* )x1 + 3*n_elem_per_reg );
		y2v.v = _mm_load_pd( ( double* )y1 + 3*n_elem_per_reg );

		z1v.v += alphav.v * x1v.v;
		z1v.v += betav.v  * y1v.v;

		_mm_store_pd( ( double* )(z1 + 2*n_elem_per_reg ), z1v.v );

		z2v.v += alphav.v * x2v.v;
		z2v.v += betav.v  * y2v.v;

		_mm_store_pd( ( double* )(z1 + 3*n_elem_per_reg ), z2v.v );

		x1 += n_elem_per_reg * n_iter_unroll;
		y1 += n_elem_per_reg * n_iter_unroll;
		z1 += n_elem_per_reg * n_iter_unroll;
	}

	// Scalar cleanup loop for the remaining (< 8) elements.
	if ( n_left > 0 )
	{
		for ( i = 0; i < n_left; ++i )
		{
			x1c = *x1;
			y1c = *y1;

			*z1 += alphac * x1c +
			       betac  * y1c;

			x1 += incx;
			y1 += incy;
			z1 += incz;
		}
	}
}
256