1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "pmmintrin.h"
36 #include "blis.h"
37 
38 
39 typedef union
40 {
41     __m128d v;
42     double  d[2];
43 } v2df_t;
44 
45 
bli_daxpyf_penryn_int(conj_t conja,conj_t conjx,dim_t m,dim_t b_n,double * restrict alpha,double * restrict a,inc_t inca,inc_t lda,double * restrict x,inc_t incx,double * restrict y,inc_t incy,cntx_t * restrict cntx)46 void bli_daxpyf_penryn_int
47      (
48        conj_t           conja,
49        conj_t           conjx,
50        dim_t            m,
51        dim_t            b_n,
52        double* restrict alpha,
53        double* restrict a, inc_t inca, inc_t lda,
54        double* restrict x, inc_t incx,
55        double* restrict y, inc_t incy,
56        cntx_t* restrict cntx
57      )
58 {
59 	double*  restrict alpha_cast = alpha;
60 	double*  restrict a_cast = a;
61 	double*  restrict x_cast = x;
62 	double*  restrict y_cast = y;
63 	dim_t             i;
64 
65 	const dim_t       n_elem_per_reg = 2;
66 	const dim_t       n_iter_unroll  = 2;
67 
68 	dim_t             m_pre;
69 	dim_t             m_run;
70 	dim_t             m_left;
71 
72     double*  restrict a0;
73     double*  restrict a1;
74     double*  restrict a2;
75     double*  restrict a3;
76     double*  restrict y0;
77     double            a0c, a1c, a2c, a3c;
78     double            chi0, chi1, chi2, chi3;
79 
80 	v2df_t            a00v, a01v, a02v, a03v, y0v;
81 	v2df_t            a10v, a11v, a12v, a13v, y1v;
82 	v2df_t            chi0v, chi1v, chi2v, chi3v;
83 
84 	bool              use_ref = FALSE;
85 
86 
87 	if ( bli_zero_dim2( m, b_n ) ) return;
88 
89 	m_pre = 0;
90 
91 	// If there is anything that would interfere with our use of aligned
92 	// vector loads/stores, call the reference implementation.
93 	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
94 	{
95 		use_ref = TRUE;
96 	}
97 	else if ( inca != 1 || incx != 1 || incy != 1 ||
98 	          bli_is_unaligned_to( ( siz_t )(lda*sizeof(double)), 16 ) )
99 	{
100 		use_ref = TRUE;
101 	}
102 	else if ( bli_is_unaligned_to( ( siz_t )a, 16 ) ||
103 	          bli_is_unaligned_to( ( siz_t )y, 16 ) )
104 	{
105 		use_ref = TRUE;
106 
107 		if ( bli_is_unaligned_to( ( siz_t )a, 16 ) &&
108 		     bli_is_unaligned_to( ( siz_t )y, 16 ) )
109 		{
110 			use_ref = FALSE;
111 			m_pre   = 1;
112 		}
113 	}
114 
115 	// Call the reference implementation if needed.
116 	if ( use_ref == TRUE )
117 	{
118 		daxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
119 
120 		f
121 		(
122 		  conja,
123 		  conjx,
124 		  m,
125 		  b_n,
126 		  alpha_cast,
127 		  a_cast, inca, lda,
128 		  x_cast, incx,
129 		  y_cast, incy,
130 		  cntx
131 		);
132 		return;
133 	}
134 
135 
136 	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
137 	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
138 
139 	a0   = a_cast + 0*lda;
140 	a1   = a_cast + 1*lda;
141 	a2   = a_cast + 2*lda;
142 	a3   = a_cast + 3*lda;
143 	y0   = y_cast;
144 
145 	chi0 = *(x_cast + 0*incx);
146 	chi1 = *(x_cast + 1*incx);
147 	chi2 = *(x_cast + 2*incx);
148 	chi3 = *(x_cast + 3*incx);
149 
150 	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
151 	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
152 	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
153 	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );
154 
155 	if ( m_pre == 1 )
156 	{
157 		a0c = *a0;
158 		a1c = *a1;
159 		a2c = *a2;
160 		a3c = *a3;
161 
162 		*y0 += chi0 * a0c +
163 		       chi1 * a1c +
164 		       chi2 * a2c +
165 		       chi3 * a3c;
166 
167 		a0 += inca;
168 		a1 += inca;
169 		a2 += inca;
170 		a3 += inca;
171 		y0 += incy;
172 	}
173 
174 	chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
175 	chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
176 	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
177 	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );
178 
179 	for ( i = 0; i < m_run; ++i )
180 	{
181 		y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );
182 
183 		a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
184 		a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );
185 
186 		y0v.v += chi0v.v * a00v.v;
187 		y0v.v += chi1v.v * a01v.v;
188 
189 		a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
190 		a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );
191 
192 		y0v.v += chi2v.v * a02v.v;
193 		y0v.v += chi3v.v * a03v.v;
194 
195 		_mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );
196 
197 
198 		y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );
199 
200 		a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
201 		a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );
202 
203 		y1v.v += chi0v.v * a10v.v;
204 		y1v.v += chi1v.v * a11v.v;
205 
206 		a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
207 		a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );
208 
209 		y1v.v += chi2v.v * a12v.v;
210 		y1v.v += chi3v.v * a13v.v;
211 
212 		_mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );
213 
214 
215 		a0 += n_elem_per_reg * n_iter_unroll;
216 		a1 += n_elem_per_reg * n_iter_unroll;
217 		a2 += n_elem_per_reg * n_iter_unroll;
218 		a3 += n_elem_per_reg * n_iter_unroll;
219 		y0 += n_elem_per_reg * n_iter_unroll;
220 	}
221 
222 	if ( m_left > 0 )
223 	{
224 		for ( i = 0; i < m_left; ++i )
225 		{
226 			a0c = *a0;
227 			a1c = *a1;
228 			a2c = *a2;
229 			a3c = *a3;
230 
231 			*y0 += chi0 * a0c +
232 			       chi1 * a1c +
233 			       chi2 * a2c +
234 			       chi3 * a3c;
235 
236 			a0 += inca;
237 			a1 += inca;
238 			a2 += inca;
239 			a3 += inca;
240 			y0 += incy;
241 		}
242 	}
243 }
244 
245