/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "immintrin.h"
#include "blis.h"


/* Union data structure to access AVX registers.
   One 256-bit AVX register holds 8 SP elements. */
typedef union
{
	__m256  v;
	float   f[8] __attribute__((aligned(64)));
} v8sf_t;

/* Union data structure to access AVX registers.
   One 256-bit AVX register holds 4 DP elements. */
typedef union
{
	__m256d v;
	double  d[4] __attribute__((aligned(64)));
} v4df_t;
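
/* A minimal usage sketch for these unions (illustrative only): writing a
   vector through the .v member exposes its lanes through the array member.

     v4df_t t;
     t.v = _mm256_set1_pd( 1.0 );
     // t.d[0] through t.d[3] now each hold 1.0.

   Note that the kernels in this file operate on __m256/__m256d values
   directly and do not index individual lanes. */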

// -----------------------------------------------------------------------------

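// Computes y := alpha * x + y for single-precision real vectors. Note that
// conjx has no effect in the real domain and that cntx is not referenced by
// this kernel.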
void bli_saxpyv_zen_int10
     (
       conj_t           conjx,
       dim_t            n,
       float*  restrict alpha,
       float*  restrict x, inc_t incx,
       float*  restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
	const dim_t      n_elem_per_reg = 8;

	dim_t            i;

	float*  restrict x0;
	float*  restrict y0;

	__m256           alphav;
	__m256           xv[10];
	__m256           yv[10];
	__m256           zv[10];

	// If the vector dimension is zero, or if alpha is zero, return early.
	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;

	// Initialize local pointers.
	x0 = x;
	y0 = y;

	if ( incx == 1 && incy == 1 )
	{
		// Broadcast the alpha scalar to all elements of a vector register.
		alphav = _mm256_broadcast_ss( alpha );

		for ( i = 0; (i + 79) < n; i += 80 )
		{
			// 80 elements will be processed per loop; 10 FMAs will run per loop.
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
			yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
			yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
			zv[5] = _mm256_fmadd_ps( xv[5], alphav, yv[5] );
			zv[6] = _mm256_fmadd_ps( xv[6], alphav, yv[6] );
			zv[7] = _mm256_fmadd_ps( xv[7], alphav, yv[7] );
			zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] );
			zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
			_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] );
			_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] );
			_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] );
			_mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] );
			_mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] );

			x0 += 10*n_elem_per_reg;
			y0 += 10*n_elem_per_reg;
		}

		for ( ; (i + 39) < n; i += 40 )
		{
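			// 40 elements will be processed per loop; 5 FMAs will run per loop.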
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );

			x0 += 5*n_elem_per_reg;
			y0 += 5*n_elem_per_reg;
		}

		for ( ; (i + 31) < n; i += 32 )
		{
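			// 32 elements will be processed per loop; 4 FMAs will run per loop.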
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );

			x0 += 4*n_elem_per_reg;
			y0 += 4*n_elem_per_reg;
		}

		for ( ; (i + 15) < n; i += 16 )
		{
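			// 16 elements will be processed per loop; 2 FMAs will run per loop.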
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );

			x0 += 2*n_elem_per_reg;
			y0 += 2*n_elem_per_reg;
		}

		for ( ; (i + 7) < n; i += 8 )
		{
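			// 8 elements will be processed per loop; 1 FMA will run per loop.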
			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );

			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );

			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );

			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );

			x0 += 1*n_elem_per_reg;
			y0 += 1*n_elem_per_reg;
		}

		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
		// This avoids a performance penalty caused by false dependencies when
		// transitioning from AVX to SSE instructions (which may occur as soon
		// as the n_left cleanup loop below if BLIS is compiled with
		// -mfpmath=sse).
		_mm256_zeroupper();

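		// Scalar cleanup: process the remaining (n % 8) elements one at a time.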
		for ( ; i < n; i += 1 )
		{
			*y0 += (*alpha) * (*x0);

			x0 += 1;
			y0 += 1;
		}
	}
	else
	{
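		// Non-unit strides: fall back to a simple scalar loop.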
		const float alphac = *alpha;

		for ( i = 0; i < n; ++i )
		{
			const float x0c = *x0;

			*y0 += alphac * x0c;

			x0 += incx;
			y0 += incy;
		}
	}
}

// -----------------------------------------------------------------------------

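// Computes y := alpha * x + y for double-precision real vectors. As above,
// conjx has no effect in the real domain and cntx is not referenced.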
void bli_daxpyv_zen_int10
     (
       conj_t           conjx,
       dim_t            n,
       double* restrict alpha,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
	const dim_t      n_elem_per_reg = 4;

	dim_t            i;

	double* restrict x0;
	double* restrict y0;

	__m256d          alphav;
	__m256d          xv[10];
	__m256d          yv[10];
	__m256d          zv[10];

	// If the vector dimension is zero, or if alpha is zero, return early.
	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;

	// Initialize local pointers.
	x0 = x;
	y0 = y;

	if ( incx == 1 && incy == 1 )
	{
		// Broadcast the alpha scalar to all elements of a vector register.
		alphav = _mm256_broadcast_sd( alpha );

		for ( i = 0; (i + 39) < n; i += 40 )
		{
			// 40 elements will be processed per loop; 10 FMAs will run per loop.
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
			yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
			yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
			zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] );
			zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] );
			zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] );
			zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] );
			zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
			_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] );
			_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] );
			_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] );
			_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] );
			_mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] );

			x0 += 10*n_elem_per_reg;
			y0 += 10*n_elem_per_reg;
		}

		for ( ; (i + 19) < n; i += 20 )
		{
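			// 20 elements will be processed per loop; 5 FMAs will run per loop.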
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );

			x0 += 5*n_elem_per_reg;
			y0 += 5*n_elem_per_reg;
		}

		for ( ; (i + 15) < n; i += 16 )
		{
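			// 16 elements will be processed per loop; 4 FMAs will run per loop.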
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );

			x0 += 4*n_elem_per_reg;
			y0 += 4*n_elem_per_reg;
		}

		for ( ; (i + 7) < n; i += 8 )
		{
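			// 8 elements will be processed per loop; 2 FMAs will run per loop.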
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );

			x0 += 2*n_elem_per_reg;
			y0 += 2*n_elem_per_reg;
		}

		for ( ; (i + 3) < n; i += 4 )
		{
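			// 4 elements will be processed per loop; 1 FMA will run per loop.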
			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );

			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );

			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );

			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );

			x0 += 1*n_elem_per_reg;
			y0 += 1*n_elem_per_reg;
		}

		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
		// This avoids a performance penalty caused by false dependencies when
		// transitioning from AVX to SSE instructions (which may occur as soon
		// as the n_left cleanup loop below if BLIS is compiled with
		// -mfpmath=sse).
		_mm256_zeroupper();

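		// Scalar cleanup: process the remaining (n % 4) elements one at a time.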
		for ( ; i < n; i += 1 )
		{
			*y0 += (*alpha) * (*x0);

			x0 += 1;
			y0 += 1;
		}
	}
	else
	{
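		// Non-unit strides: fall back to a simple scalar loop.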
		const double alphac = *alpha;

		for ( i = 0; i < n; ++i )
		{
			const double x0c = *x0;

			*y0 += alphac * x0c;

			x0 += incx;
			y0 += incy;
		}
	}
}