1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_cgemv2_c_s(enum blas_order_type order,enum blas_trans_type trans,int m,int n,const void * alpha,const void * a,int lda,const float * head_x,const float * tail_x,int incx,const void * beta,void * y,int incy)3 void BLAS_cgemv2_c_s(enum blas_order_type order, enum blas_trans_type trans,
4 		     int m, int n, const void *alpha, const void *a, int lda,
5 		     const float *head_x, const float *tail_x, int incx,
6 		     const void *beta, void *y, int incy)
7 
8 /*
9  * Purpose
10  * =======
11  *
12  * Computes y = alpha * op(A) * head_x + alpha * op(A) * tail_x + beta * y,
13  * where A is a general matrix.
14  *
15  * Arguments
16  * =========
17  *
18  * order        (input) blas_order_type
19  *              Order of A; row or column major
20  *
21  * trans        (input) blas_trans_type
22  *              Transpose of A: no trans, trans, or conjugate trans
23  *
24  * m            (input) int
25  *              Dimension of A
26  *
27  * n            (input) int
28  *              Dimension of A and the length of vector x and z
29  *
30  * alpha        (input) const void*
31  *
32  * A            (input) const void*
33  *
34  * lda          (input) int
35  *              Leading dimension of A
36  *
37  * head_x
38  * tail_x       (input) const float*
39  *
40  * incx         (input) int
41  *              The stride for vector x.
42  *
43  * beta         (input) const void*
44  *
45  * y            (input) const void*
46  *
47  * incy         (input) int
48  *              The stride for vector y.
49  *
50  */
51 {
52   static const char routine_name[] = "BLAS_cgemv2_c_s";
53 
54   int i, j;
55   int iy, jx, kx, ky;
56   int lenx, leny;
57   int ai, aij;
58   int incai, incaij;
59 
60   const float *a_i = (float *) a;
61   const float *head_x_i = head_x;
62   const float *tail_x_i = tail_x;
63   float *y_i = (float *) y;
64   float *alpha_i = (float *) alpha;
65   float *beta_i = (float *) beta;
66   float a_elem[2];
67   float x_elem;
68   float y_elem[2];
69   float prod[2];
70   float sum[2];
71   float sum2[2];
72   float tmp1[2];
73   float tmp2[2];
74 
75 
76   /* all error calls */
77   if (m < 0)
78     BLAS_error(routine_name, -3, m, 0);
79   else if (n <= 0)
80     BLAS_error(routine_name, -4, n, 0);
81   else if (incx == 0)
82     BLAS_error(routine_name, -10, incx, 0);
83   else if (incy == 0)
84     BLAS_error(routine_name, -13, incy, 0);
85 
86   if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
87     lenx = n;
88     leny = m;
89     incai = lda;
90     incaij = 1;
91   } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
92     lenx = m;
93     leny = n;
94     incai = 1;
95     incaij = lda;
96   } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
97     lenx = n;
98     leny = m;
99     incai = 1;
100     incaij = lda;
101   } else {			/* colmajor and blas_trans */
102     lenx = m;
103     leny = n;
104     incai = lda;
105     incaij = 1;
106   }
107 
108   if (lda < leny)
109     BLAS_error(routine_name, -7, lda, NULL);
110 
111 
112 
113 
114   incy *= 2;
115   incai *= 2;
116   incaij *= 2;
117 
118   if (incx > 0)
119     kx = 0;
120   else
121     kx = (1 - lenx) * incx;
122   if (incy > 0)
123     ky = 0;
124   else
125     ky = (1 - leny) * incy;
126 
127   /* No extra-precision needed for alpha = 0 */
128   if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
129     if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
130       iy = ky;
131       for (i = 0; i < leny; i++) {
132 	y_i[iy] = 0.0;
133 	y_i[iy + 1] = 0.0;
134 	iy += incy;
135       }
136     } else if (!(beta_i[0] == 0.0 && beta_i[1] == 0.0)) {
137       iy = ky;
138       for (i = 0; i < leny; i++) {
139 	y_elem[0] = y_i[iy];
140 	y_elem[1] = y_i[iy + 1];
141 	{
142 	  tmp1[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
143 	  tmp1[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
144 	}
145 
146 	y_i[iy] = tmp1[0];
147 	y_i[iy + 1] = tmp1[1];
148 	iy += incy;
149       }
150     }
151   } else {			/* alpha != 0 */
152     if (trans == blas_conj_trans) {
153 
154       /* if beta = 0, we can save m multiplies:
155          y = alpha*A*head_x + alpha*A*tail_x  */
156       if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
157 	if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
158 	  /* save m more multiplies if alpha = 1 */
159 	  ai = 0;
160 	  iy = ky;
161 	  for (i = 0; i < leny; i++) {
162 	    sum[0] = sum[1] = 0.0;
163 	    sum2[0] = sum2[1] = 0.0;
164 	    aij = ai;
165 	    jx = kx;
166 	    for (j = 0; j < lenx; j++) {
167 	      a_elem[0] = a_i[aij];
168 	      a_elem[1] = a_i[aij + 1];
169 	      a_elem[1] = -a_elem[1];
170 	      x_elem = head_x_i[jx];
171 	      {
172 		prod[0] = a_elem[0] * x_elem;
173 		prod[1] = a_elem[1] * x_elem;
174 	      }
175 	      sum[0] = sum[0] + prod[0];
176 	      sum[1] = sum[1] + prod[1];
177 	      x_elem = tail_x_i[jx];
178 	      {
179 		prod[0] = a_elem[0] * x_elem;
180 		prod[1] = a_elem[1] * x_elem;
181 	      }
182 	      sum2[0] = sum2[0] + prod[0];
183 	      sum2[1] = sum2[1] + prod[1];
184 	      aij += incaij;
185 	      jx += incx;
186 	    }
187 	    sum[0] = sum[0] + sum2[0];
188 	    sum[1] = sum[1] + sum2[1];
189 	    y_i[iy] = sum[0];
190 	    y_i[iy + 1] = sum[1];
191 	    ai += incai;
192 	    iy += incy;
193 	  }			/* end for */
194 	} else {		/* alpha != 1 */
195 	  ai = 0;
196 	  iy = ky;
197 	  for (i = 0; i < leny; i++) {
198 	    sum[0] = sum[1] = 0.0;
199 	    sum2[0] = sum2[1] = 0.0;
200 	    aij = ai;
201 	    jx = kx;
202 	    for (j = 0; j < lenx; j++) {
203 	      a_elem[0] = a_i[aij];
204 	      a_elem[1] = a_i[aij + 1];
205 	      a_elem[1] = -a_elem[1];
206 	      x_elem = head_x_i[jx];
207 	      {
208 		prod[0] = a_elem[0] * x_elem;
209 		prod[1] = a_elem[1] * x_elem;
210 	      }
211 	      sum[0] = sum[0] + prod[0];
212 	      sum[1] = sum[1] + prod[1];
213 	      x_elem = tail_x_i[jx];
214 	      {
215 		prod[0] = a_elem[0] * x_elem;
216 		prod[1] = a_elem[1] * x_elem;
217 	      }
218 	      sum2[0] = sum2[0] + prod[0];
219 	      sum2[1] = sum2[1] + prod[1];
220 	      aij += incaij;
221 	      jx += incx;
222 	    }
223 	    {
224 	      tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
225 	      tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
226 	    }
227 
228 	    {
229 	      tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
230 	      tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
231 	    }
232 
233 	    tmp1[0] = tmp1[0] + tmp2[0];
234 	    tmp1[1] = tmp1[1] + tmp2[1];
235 	    y_i[iy] = tmp1[0];
236 	    y_i[iy + 1] = tmp1[1];
237 	    ai += incai;
238 	    iy += incy;
239 	  }
240 	}
241       } else {			/* beta != 0 */
242 	if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
243 	  /* save m multiplies if alpha = 1 */
244 	  ai = 0;
245 	  iy = ky;
246 	  for (i = 0; i < leny; i++) {
247 	    sum[0] = sum[1] = 0.0;;
248 	    sum2[0] = sum2[1] = 0.0;;
249 	    aij = ai;
250 	    jx = kx;
251 	    for (j = 0; j < lenx; j++) {
252 	      a_elem[0] = a_i[aij];
253 	      a_elem[1] = a_i[aij + 1];
254 	      a_elem[1] = -a_elem[1];
255 	      x_elem = head_x_i[jx];
256 	      {
257 		prod[0] = a_elem[0] * x_elem;
258 		prod[1] = a_elem[1] * x_elem;
259 	      }
260 	      sum[0] = sum[0] + prod[0];
261 	      sum[1] = sum[1] + prod[1];
262 	      x_elem = tail_x_i[jx];
263 	      {
264 		prod[0] = a_elem[0] * x_elem;
265 		prod[1] = a_elem[1] * x_elem;
266 	      }
267 	      sum2[0] = sum2[0] + prod[0];
268 	      sum2[1] = sum2[1] + prod[1];
269 	      aij += incaij;
270 	      jx += incx;
271 	    }
272 	    sum[0] = sum[0] + sum2[0];
273 	    sum[1] = sum[1] + sum2[1];
274 	    y_elem[0] = y_i[iy];
275 	    y_elem[1] = y_i[iy + 1];
276 	    {
277 	      tmp1[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
278 	      tmp1[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
279 	    }
280 
281 	    tmp2[0] = sum[0] + tmp1[0];
282 	    tmp2[1] = sum[1] + tmp1[1];
283 	    y_i[iy] = tmp2[0];
284 	    y_i[iy + 1] = tmp2[1];
285 	    ai += incai;
286 	    iy += incy;
287 	  }
288 	} else {		/* alpha != 1, the most general form:
289 				   y = alpha*A*head_x + alpha*A*tail_x + beta*y */
290 	  ai = 0;
291 	  iy = ky;
292 	  for (i = 0; i < leny; i++) {
293 	    sum[0] = sum[1] = 0.0;;
294 	    sum2[0] = sum2[1] = 0.0;;
295 	    aij = ai;
296 	    jx = kx;
297 	    for (j = 0; j < lenx; j++) {
298 	      a_elem[0] = a_i[aij];
299 	      a_elem[1] = a_i[aij + 1];
300 	      a_elem[1] = -a_elem[1];
301 	      x_elem = head_x_i[jx];
302 	      {
303 		prod[0] = a_elem[0] * x_elem;
304 		prod[1] = a_elem[1] * x_elem;
305 	      }
306 	      sum[0] = sum[0] + prod[0];
307 	      sum[1] = sum[1] + prod[1];
308 	      x_elem = tail_x_i[jx];
309 	      {
310 		prod[0] = a_elem[0] * x_elem;
311 		prod[1] = a_elem[1] * x_elem;
312 	      }
313 	      sum2[0] = sum2[0] + prod[0];
314 	      sum2[1] = sum2[1] + prod[1];
315 	      aij += incaij;
316 	      jx += incx;
317 	    }
318 	    {
319 	      tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
320 	      tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
321 	    }
322 
323 	    {
324 	      tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
325 	      tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
326 	    }
327 
328 	    tmp1[0] = tmp1[0] + tmp2[0];
329 	    tmp1[1] = tmp1[1] + tmp2[1];
330 	    y_elem[0] = y_i[iy];
331 	    y_elem[1] = y_i[iy + 1];
332 	    {
333 	      tmp2[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
334 	      tmp2[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
335 	    }
336 
337 	    tmp1[0] = tmp1[0] + tmp2[0];
338 	    tmp1[1] = tmp1[1] + tmp2[1];
339 	    y_i[iy] = tmp1[0];
340 	    y_i[iy + 1] = tmp1[1];
341 	    ai += incai;
342 	    iy += incy;
343 	  }
344 	}
345       }
346 
347     } else {
348 
349       /* if beta = 0, we can save m multiplies:
350          y = alpha*A*head_x + alpha*A*tail_x  */
351       if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
352 	if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
353 	  /* save m more multiplies if alpha = 1 */
354 	  ai = 0;
355 	  iy = ky;
356 	  for (i = 0; i < leny; i++) {
357 	    sum[0] = sum[1] = 0.0;
358 	    sum2[0] = sum2[1] = 0.0;
359 	    aij = ai;
360 	    jx = kx;
361 	    for (j = 0; j < lenx; j++) {
362 	      a_elem[0] = a_i[aij];
363 	      a_elem[1] = a_i[aij + 1];
364 
365 	      x_elem = head_x_i[jx];
366 	      {
367 		prod[0] = a_elem[0] * x_elem;
368 		prod[1] = a_elem[1] * x_elem;
369 	      }
370 	      sum[0] = sum[0] + prod[0];
371 	      sum[1] = sum[1] + prod[1];
372 	      x_elem = tail_x_i[jx];
373 	      {
374 		prod[0] = a_elem[0] * x_elem;
375 		prod[1] = a_elem[1] * x_elem;
376 	      }
377 	      sum2[0] = sum2[0] + prod[0];
378 	      sum2[1] = sum2[1] + prod[1];
379 	      aij += incaij;
380 	      jx += incx;
381 	    }
382 	    sum[0] = sum[0] + sum2[0];
383 	    sum[1] = sum[1] + sum2[1];
384 	    y_i[iy] = sum[0];
385 	    y_i[iy + 1] = sum[1];
386 	    ai += incai;
387 	    iy += incy;
388 	  }			/* end for */
389 	} else {		/* alpha != 1 */
390 	  ai = 0;
391 	  iy = ky;
392 	  for (i = 0; i < leny; i++) {
393 	    sum[0] = sum[1] = 0.0;
394 	    sum2[0] = sum2[1] = 0.0;
395 	    aij = ai;
396 	    jx = kx;
397 	    for (j = 0; j < lenx; j++) {
398 	      a_elem[0] = a_i[aij];
399 	      a_elem[1] = a_i[aij + 1];
400 
401 	      x_elem = head_x_i[jx];
402 	      {
403 		prod[0] = a_elem[0] * x_elem;
404 		prod[1] = a_elem[1] * x_elem;
405 	      }
406 	      sum[0] = sum[0] + prod[0];
407 	      sum[1] = sum[1] + prod[1];
408 	      x_elem = tail_x_i[jx];
409 	      {
410 		prod[0] = a_elem[0] * x_elem;
411 		prod[1] = a_elem[1] * x_elem;
412 	      }
413 	      sum2[0] = sum2[0] + prod[0];
414 	      sum2[1] = sum2[1] + prod[1];
415 	      aij += incaij;
416 	      jx += incx;
417 	    }
418 	    {
419 	      tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
420 	      tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
421 	    }
422 
423 	    {
424 	      tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
425 	      tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
426 	    }
427 
428 	    tmp1[0] = tmp1[0] + tmp2[0];
429 	    tmp1[1] = tmp1[1] + tmp2[1];
430 	    y_i[iy] = tmp1[0];
431 	    y_i[iy + 1] = tmp1[1];
432 	    ai += incai;
433 	    iy += incy;
434 	  }
435 	}
436       } else {			/* beta != 0 */
437 	if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
438 	  /* save m multiplies if alpha = 1 */
439 	  ai = 0;
440 	  iy = ky;
441 	  for (i = 0; i < leny; i++) {
442 	    sum[0] = sum[1] = 0.0;;
443 	    sum2[0] = sum2[1] = 0.0;;
444 	    aij = ai;
445 	    jx = kx;
446 	    for (j = 0; j < lenx; j++) {
447 	      a_elem[0] = a_i[aij];
448 	      a_elem[1] = a_i[aij + 1];
449 
450 	      x_elem = head_x_i[jx];
451 	      {
452 		prod[0] = a_elem[0] * x_elem;
453 		prod[1] = a_elem[1] * x_elem;
454 	      }
455 	      sum[0] = sum[0] + prod[0];
456 	      sum[1] = sum[1] + prod[1];
457 	      x_elem = tail_x_i[jx];
458 	      {
459 		prod[0] = a_elem[0] * x_elem;
460 		prod[1] = a_elem[1] * x_elem;
461 	      }
462 	      sum2[0] = sum2[0] + prod[0];
463 	      sum2[1] = sum2[1] + prod[1];
464 	      aij += incaij;
465 	      jx += incx;
466 	    }
467 	    sum[0] = sum[0] + sum2[0];
468 	    sum[1] = sum[1] + sum2[1];
469 	    y_elem[0] = y_i[iy];
470 	    y_elem[1] = y_i[iy + 1];
471 	    {
472 	      tmp1[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
473 	      tmp1[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
474 	    }
475 
476 	    tmp2[0] = sum[0] + tmp1[0];
477 	    tmp2[1] = sum[1] + tmp1[1];
478 	    y_i[iy] = tmp2[0];
479 	    y_i[iy + 1] = tmp2[1];
480 	    ai += incai;
481 	    iy += incy;
482 	  }
483 	} else {		/* alpha != 1, the most general form:
484 				   y = alpha*A*head_x + alpha*A*tail_x + beta*y */
485 	  ai = 0;
486 	  iy = ky;
487 	  for (i = 0; i < leny; i++) {
488 	    sum[0] = sum[1] = 0.0;;
489 	    sum2[0] = sum2[1] = 0.0;;
490 	    aij = ai;
491 	    jx = kx;
492 	    for (j = 0; j < lenx; j++) {
493 	      a_elem[0] = a_i[aij];
494 	      a_elem[1] = a_i[aij + 1];
495 
496 	      x_elem = head_x_i[jx];
497 	      {
498 		prod[0] = a_elem[0] * x_elem;
499 		prod[1] = a_elem[1] * x_elem;
500 	      }
501 	      sum[0] = sum[0] + prod[0];
502 	      sum[1] = sum[1] + prod[1];
503 	      x_elem = tail_x_i[jx];
504 	      {
505 		prod[0] = a_elem[0] * x_elem;
506 		prod[1] = a_elem[1] * x_elem;
507 	      }
508 	      sum2[0] = sum2[0] + prod[0];
509 	      sum2[1] = sum2[1] + prod[1];
510 	      aij += incaij;
511 	      jx += incx;
512 	    }
513 	    {
514 	      tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
515 	      tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
516 	    }
517 
518 	    {
519 	      tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
520 	      tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
521 	    }
522 
523 	    tmp1[0] = tmp1[0] + tmp2[0];
524 	    tmp1[1] = tmp1[1] + tmp2[1];
525 	    y_elem[0] = y_i[iy];
526 	    y_elem[1] = y_i[iy + 1];
527 	    {
528 	      tmp2[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
529 	      tmp2[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
530 	    }
531 
532 	    tmp1[0] = tmp1[0] + tmp2[0];
533 	    tmp1[1] = tmp1[1] + tmp2[1];
534 	    y_i[iy] = tmp1[0];
535 	    y_i[iy + 1] = tmp1[1];
536 	    ai += incai;
537 	    iy += incy;
538 	  }
539 	}
540       }
541 
542     }
543   }
544 
545 
546 
547 }
548