/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "immintrin.h"
#include "blis.h"


/* Union data structure to access AVX registers.
   One 256-bit AVX register holds 8 SP elements. */
typedef union
{
	__m256  v;
	float   f[8] __attribute__((aligned(64)));
} v8sf_t;

/* Union data structure to access AVX registers.
   One 256-bit AVX register holds 4 DP elements. */
typedef union
{
	__m256d v;
	double  d[4] __attribute__((aligned(64)));
} v4df_t;
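
/* Illustrative sketch (these unions are not referenced by the kernels in this
   file): the union overlay allows individual lanes of a vector register to be
   read or written as scalars, e.g.:

       v4df_t t;
       t.v = _mm256_set1_pd( 1.0 );   // all four DP lanes now hold 1.0
       double lane0 = t.d[0];         // scalar access to the first lane
*/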

// -----------------------------------------------------------------------------

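/*
   bli_saxpyv_zen_int10 computes y := y + alpha * x for single-precision real
   vectors using 256-bit AVX/FMA intrinsics. For unit strides, the main loop
   unrolls across 10 ymm registers (80 elements per iteration), followed by
   progressively smaller fringe loops and a final scalar cleanup loop; non-unit
   strides are handled entirely in scalar code. The conjx argument is unused,
   since conjugation is a no-op for real vectors.
*/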
void bli_saxpyv_zen_int10
     (
       conj_t           conjx,
       dim_t            n,
       float*  restrict alpha,
       float*  restrict x, inc_t incx,
       float*  restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
	const dim_t      n_elem_per_reg = 8;

	dim_t            i;

	float*  restrict x0;
	float*  restrict y0;

	__m256           alphav;
	__m256           xv[10];
	__m256           yv[10];
	__m256           zv[10];

	// If the vector dimension is zero, or if alpha is zero, return early.
	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;

	// Initialize local pointers.
	x0 = x;
	y0 = y;

	if ( incx == 1 && incy == 1 )
	{
		// Broadcast the alpha scalar to all elements of a vector register.
		alphav = _mm256_broadcast_ss( alpha );

		for ( i = 0; (i + 79) < n; i += 80 )
		{
			// Each iteration processes 80 elements using 10 FMA instructions.
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
			yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
			yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
			zv[5] = _mm256_fmadd_ps( xv[5], alphav, yv[5] );
			zv[6] = _mm256_fmadd_ps( xv[6], alphav, yv[6] );
			zv[7] = _mm256_fmadd_ps( xv[7], alphav, yv[7] );
			zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] );
			zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
			_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] );
			_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] );
			_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] );
			_mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] );
			_mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] );

			x0 += 10*n_elem_per_reg;
			y0 += 10*n_elem_per_reg;
		}

		for ( ; (i + 39) < n; i += 40 )
		{
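			// Each iteration processes 40 elements using 5 FMA instructions.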
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );

			x0 += 5*n_elem_per_reg;
			y0 += 5*n_elem_per_reg;
		}

		for ( ; (i + 31) < n; i += 32 )
		{
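			// Each iteration processes 32 elements using 4 FMA instructions.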
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );

			x0 += 4*n_elem_per_reg;
			y0 += 4*n_elem_per_reg;
		}

		for ( ; (i + 15) < n; i += 16 )
		{
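			// Each iteration processes 16 elements using 2 FMA instructions.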
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );

			x0 += 2*n_elem_per_reg;
			y0 += 2*n_elem_per_reg;
		}

		for ( ; (i + 7) < n; i += 8 )
		{
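			// Each iteration processes 8 elements using 1 FMA instruction.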
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );

			x0 += 1*n_elem_per_reg;
			y0 += 1*n_elem_per_reg;
		}

		// Issue the vzeroupper instruction to clear the upper lanes of the ymm
		// registers. This avoids a performance penalty caused by false
		// dependencies when transitioning from AVX to SSE instructions (which
		// may occur as soon as the n_left cleanup loop below if BLIS is
		// compiled with -mfpmath=sse).
		_mm256_zeroupper();

		for ( ; (i + 0) < n; i += 1 )
		{
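			// Scalar cleanup loop: process any remaining elements one at a time.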
			*y0 += (*alpha) * (*x0);

			x0 += 1;
			y0 += 1;
		}
	}
	else
	{
		const float alphac = *alpha;

		for ( i = 0; i < n; ++i )
		{
			const float x0c = *x0;

			*y0 += alphac * x0c;

			x0 += incx;
			y0 += incy;
		}
	}
}

// -----------------------------------------------------------------------------

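/*
   bli_daxpyv_zen_int10 computes y := y + alpha * x for double-precision real
   vectors using 256-bit AVX/FMA intrinsics. For unit strides, the main loop
   unrolls across 10 ymm registers (40 elements per iteration), followed by
   progressively smaller fringe loops and a final scalar cleanup loop; non-unit
   strides are handled entirely in scalar code. The conjx argument is unused,
   since conjugation is a no-op for real vectors.
*/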
void bli_daxpyv_zen_int10
     (
       conj_t           conjx,
       dim_t            n,
       double* restrict alpha,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
	const dim_t      n_elem_per_reg = 4;

	dim_t            i;

	double* restrict x0;
	double* restrict y0;

	__m256d          alphav;
	__m256d          xv[10];
	__m256d          yv[10];
	__m256d          zv[10];

	// If the vector dimension is zero, or if alpha is zero, return early.
	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;

	// Initialize local pointers.
	x0 = x;
	y0 = y;

	if ( incx == 1 && incy == 1 )
	{
		// Broadcast the alpha scalar to all elements of a vector register.
		alphav = _mm256_broadcast_sd( alpha );

		for ( i = 0; (i + 39) < n; i += 40 )
		{
			// Each iteration processes 40 elements using 10 FMA instructions.
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
			yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
			yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
			zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] );
			zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] );
			zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] );
			zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] );
			zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
			_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] );
			_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] );
			_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] );
			_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] );
			_mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] );

			x0 += 10*n_elem_per_reg;
			y0 += 10*n_elem_per_reg;
		}

		for ( ; (i + 19) < n; i += 20 )
		{
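			// Each iteration processes 20 elements using 5 FMA instructions.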
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );

			x0 += 5*n_elem_per_reg;
			y0 += 5*n_elem_per_reg;
		}

		for ( ; (i + 15) < n; i += 16 )
		{
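			// Each iteration processes 16 elements using 4 FMA instructions.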
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );

			x0 += 4*n_elem_per_reg;
			y0 += 4*n_elem_per_reg;
		}

		for ( ; (i + 7) < n; i += 8 )
		{
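			// Each iteration processes 8 elements using 2 FMA instructions.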
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );

			x0 += 2*n_elem_per_reg;
			y0 += 2*n_elem_per_reg;
		}

		for ( ; (i + 3) < n; i += 4 )
		{
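			// Each iteration processes 4 elements using 1 FMA instruction.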
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );

			x0 += 1*n_elem_per_reg;
			y0 += 1*n_elem_per_reg;
		}

		// Issue the vzeroupper instruction to clear the upper lanes of the ymm
		// registers. This avoids a performance penalty caused by false
		// dependencies when transitioning from AVX to SSE instructions (which
		// may occur as soon as the n_left cleanup loop below if BLIS is
		// compiled with -mfpmath=sse).
		_mm256_zeroupper();

		for ( ; i < n; i += 1 )
		{
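			// Scalar cleanup loop: process any remaining elements one at a time.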
			*y0 += (*alpha) * (*x0);

			y0 += 1;
			x0 += 1;
		}
	}
	else
	{
		const double alphac = *alpha;

		for ( i = 0; i < n; ++i )
		{
			const double x0c = *x0;

			*y0 += alphac * x0c;

			x0 += incx;
			y0 += incy;
		}
	}
}