1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "pmmintrin.h"
36 #include "blis.h"
37 
38 
39 typedef union
40 {
41 	__m128d v;
42 	double  d[2];
43 } v2df_t;
44 
45 
bli_daxpy2v_penryn_int(conj_t conjx,conj_t conjy,dim_t n,double * restrict alpha,double * restrict beta,double * restrict x,inc_t incx,double * restrict y,inc_t incy,double * restrict z,inc_t incz,cntx_t * restrict cntx)46 void bli_daxpy2v_penryn_int
47      (
48        conj_t           conjx,
49        conj_t           conjy,
50        dim_t            n,
51        double* restrict alpha,
52        double* restrict beta,
53        double* restrict x, inc_t incx,
54        double* restrict y, inc_t incy,
55        double* restrict z, inc_t incz,
56        cntx_t* restrict cntx
57      )
58 {
59 	double*  restrict alpha_cast  = alpha;
60 	double*  restrict beta_cast   = beta;
61 	double*  restrict x_cast      = x;
62 	double*  restrict y_cast      = y;
63 	double*  restrict z_cast      = z;
64 	dim_t             i;
65 
66 	const dim_t       n_elem_per_reg = 2;
67 	const dim_t       n_iter_unroll  = 4;
68 
69 	dim_t             n_pre;
70 	dim_t             n_run;
71 	dim_t             n_left;
72 
73 	double*  restrict x1;
74 	double*  restrict y1;
75 	double*  restrict z1;
76 	double            alphac, betac, x1c, y1c;
77 
78 	v2df_t            alphav, betav;
79 	v2df_t            x1v, y1v, z1v;
80 	v2df_t            x2v, y2v, z2v;
81 
82 	bool              use_ref = FALSE;
83 
84 
85 	if ( bli_zero_dim1( n ) ) return;
86 
87 	n_pre = 0;
88 
89 	// If there is anything that would interfere with our use of aligned
90 	// vector loads/stores, call the reference implementation.
91 	if ( incx != 1 || incy != 1 || incz != 1 )
92 	{
93 		use_ref = TRUE;
94 	}
95 	else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) ||
96 	          bli_is_unaligned_to( ( siz_t )y, 16 ) ||
97 	          bli_is_unaligned_to( ( siz_t )z, 16 ) )
98 	{
99 		use_ref = TRUE;
100 
101 		if ( bli_is_unaligned_to( ( siz_t )x, 16 ) &&
102 		     bli_is_unaligned_to( ( siz_t )y, 16 ) &&
103 		     bli_is_unaligned_to( ( siz_t )z, 16 ) )
104 		{
105 			use_ref = FALSE;
106 			n_pre   = 1;
107 		}
108 	}
109 
110 	// Call the reference implementation if needed.
111 	if ( use_ref == TRUE )
112 	{
113 		daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx );
114 
115 		f
116 		(
117 		  conjx,
118 		  conjy,
119 		  n,
120 		  alpha,
121 		  beta,
122 		  x, incx,
123 		  y, incy,
124 		  z, incz,
125 		  cntx
126 		);
127 		return;
128 	}
129 
130 
131 	n_run       = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
132 	n_left      = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
133 
134 	alphac = *alpha_cast;
135 	betac  = *beta_cast;
136 
137 	x1 = x_cast;
138 	y1 = y_cast;
139 	z1 = z_cast;
140 
141 	if ( n_pre == 1 )
142 	{
143 		x1c = *x1;
144 		y1c = *y1;
145 
146 		*z1 += alphac * x1c +
147 		       betac  * y1c;
148 
149 		x1 += incx;
150 		y1 += incy;
151 		z1 += incz;
152 	}
153 
154 	alphav.v = _mm_loaddup_pd( ( double* )alpha_cast );
155 	betav.v  = _mm_loaddup_pd( ( double* )beta_cast );
156 
157 	for ( i = 0; i < n_run; ++i )
158 	{
159 /*
160 		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
161 		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
162 		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );
163 
164 		z1v.v += alphav.v * x1v.v;
165 		z1v.v += betav.v  * y1v.v;
166 
167 		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );
168 
169 		z1v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg );
170 		x1v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
171 		y1v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );
172 
173 		z1v.v += alphav.v * x1v.v;
174 		z1v.v += betav.v  * y1v.v;
175 
176 		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z1v.v );
177 */
178 /*
179 		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
180 		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
181 		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );
182 
183 		z2v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg );
184 		x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
185 		y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );
186 
187 		z1v.v += alphav.v * x1v.v;
188 		z1v.v += betav.v  * y1v.v;
189 
190 		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );
191 
192 		z2v.v += alphav.v * x2v.v;
193 		z2v.v += betav.v  * y2v.v;
194 
195 		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v );
196 */
197 		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
198 		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
199 		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );
200 
201 		z2v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg );
202 		x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
203 		y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );
204 
205 		z1v.v += alphav.v * x1v.v;
206 		z1v.v += betav.v  * y1v.v;
207 
208 		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );
209 
210 		z1v.v = _mm_load_pd( ( double* )z1 + 2*n_elem_per_reg );
211 		x1v.v = _mm_load_pd( ( double* )x1 + 2*n_elem_per_reg );
212 		y1v.v = _mm_load_pd( ( double* )y1 + 2*n_elem_per_reg );
213 
214 		z2v.v += alphav.v * x2v.v;
215 		z2v.v += betav.v  * y2v.v;
216 
217 		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v );
218 
219 		z2v.v = _mm_load_pd( ( double* )z1 + 3*n_elem_per_reg );
220 		x2v.v = _mm_load_pd( ( double* )x1 + 3*n_elem_per_reg );
221 		y2v.v = _mm_load_pd( ( double* )y1 + 3*n_elem_per_reg );
222 
223 		z1v.v += alphav.v * x1v.v;
224 		z1v.v += betav.v  * y1v.v;
225 
226 		_mm_store_pd( ( double* )(z1 + 2*n_elem_per_reg ), z1v.v );
227 
228 		z2v.v += alphav.v * x2v.v;
229 		z2v.v += betav.v  * y2v.v;
230 
231 		_mm_store_pd( ( double* )(z1 + 3*n_elem_per_reg ), z2v.v );
232 
233 
234 
235 		x1 += n_elem_per_reg * n_iter_unroll;
236 		y1 += n_elem_per_reg * n_iter_unroll;
237 		z1 += n_elem_per_reg * n_iter_unroll;
238 	}
239 
240 	if ( n_left > 0 )
241 	{
242 		for ( i = 0; i < n_left; ++i )
243 		{
244 			x1c = *x1;
245 			y1c = *y1;
246 
247 			*z1 += alphac * x1c +
248 			       betac  * y1c;
249 
250 			x1 += incx;
251 			y1 += incy;
252 			z1 += incz;
253 		}
254 	}
255 }
256