1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_cgemv2_c_s(enum blas_order_type order,enum blas_trans_type trans,int m,int n,const void * alpha,const void * a,int lda,const float * head_x,const float * tail_x,int incx,const void * beta,void * y,int incy)3 void BLAS_cgemv2_c_s(enum blas_order_type order, enum blas_trans_type trans,
4 int m, int n, const void *alpha, const void *a, int lda,
5 const float *head_x, const float *tail_x, int incx,
6 const void *beta, void *y, int incy)
7
8 /*
9 * Purpose
10 * =======
11 *
12 * Computes y = alpha * op(A) * head_x + alpha * op(A) * tail_x + beta * y,
13 * where A is a general matrix.
14 *
15 * Arguments
16 * =========
17 *
18 * order (input) blas_order_type
19 * Order of A; row or column major
20 *
21 * trans (input) blas_trans_type
22 * Transpose of A: no trans, trans, or conjugate trans
23 *
24 * m (input) int
25 * Dimension of A
26 *
27 * n (input) int
28 * Dimension of A and the length of vector x and z
29 *
30 * alpha (input) const void*
31 *
32 * A (input) const void*
33 *
34 * lda (input) int
35 * Leading dimension of A
36 *
37 * head_x
38 * tail_x (input) const float*
39 *
40 * incx (input) int
41 * The stride for vector x.
42 *
43 * beta (input) const void*
44 *
45 * y (input) const void*
46 *
47 * incy (input) int
48 * The stride for vector y.
49 *
50 */
51 {
52 static const char routine_name[] = "BLAS_cgemv2_c_s";
53
54 int i, j;
55 int iy, jx, kx, ky;
56 int lenx, leny;
57 int ai, aij;
58 int incai, incaij;
59
60 const float *a_i = (float *) a;
61 const float *head_x_i = head_x;
62 const float *tail_x_i = tail_x;
63 float *y_i = (float *) y;
64 float *alpha_i = (float *) alpha;
65 float *beta_i = (float *) beta;
66 float a_elem[2];
67 float x_elem;
68 float y_elem[2];
69 float prod[2];
70 float sum[2];
71 float sum2[2];
72 float tmp1[2];
73 float tmp2[2];
74
75
76 /* all error calls */
77 if (m < 0)
78 BLAS_error(routine_name, -3, m, 0);
79 else if (n <= 0)
80 BLAS_error(routine_name, -4, n, 0);
81 else if (incx == 0)
82 BLAS_error(routine_name, -10, incx, 0);
83 else if (incy == 0)
84 BLAS_error(routine_name, -13, incy, 0);
85
86 if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
87 lenx = n;
88 leny = m;
89 incai = lda;
90 incaij = 1;
91 } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
92 lenx = m;
93 leny = n;
94 incai = 1;
95 incaij = lda;
96 } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
97 lenx = n;
98 leny = m;
99 incai = 1;
100 incaij = lda;
101 } else { /* colmajor and blas_trans */
102 lenx = m;
103 leny = n;
104 incai = lda;
105 incaij = 1;
106 }
107
108 if (lda < leny)
109 BLAS_error(routine_name, -7, lda, NULL);
110
111
112
113
114 incy *= 2;
115 incai *= 2;
116 incaij *= 2;
117
118 if (incx > 0)
119 kx = 0;
120 else
121 kx = (1 - lenx) * incx;
122 if (incy > 0)
123 ky = 0;
124 else
125 ky = (1 - leny) * incy;
126
127 /* No extra-precision needed for alpha = 0 */
128 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
129 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
130 iy = ky;
131 for (i = 0; i < leny; i++) {
132 y_i[iy] = 0.0;
133 y_i[iy + 1] = 0.0;
134 iy += incy;
135 }
136 } else if (!(beta_i[0] == 0.0 && beta_i[1] == 0.0)) {
137 iy = ky;
138 for (i = 0; i < leny; i++) {
139 y_elem[0] = y_i[iy];
140 y_elem[1] = y_i[iy + 1];
141 {
142 tmp1[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
143 tmp1[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
144 }
145
146 y_i[iy] = tmp1[0];
147 y_i[iy + 1] = tmp1[1];
148 iy += incy;
149 }
150 }
151 } else { /* alpha != 0 */
152 if (trans == blas_conj_trans) {
153
154 /* if beta = 0, we can save m multiplies:
155 y = alpha*A*head_x + alpha*A*tail_x */
156 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
157 if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
158 /* save m more multiplies if alpha = 1 */
159 ai = 0;
160 iy = ky;
161 for (i = 0; i < leny; i++) {
162 sum[0] = sum[1] = 0.0;
163 sum2[0] = sum2[1] = 0.0;
164 aij = ai;
165 jx = kx;
166 for (j = 0; j < lenx; j++) {
167 a_elem[0] = a_i[aij];
168 a_elem[1] = a_i[aij + 1];
169 a_elem[1] = -a_elem[1];
170 x_elem = head_x_i[jx];
171 {
172 prod[0] = a_elem[0] * x_elem;
173 prod[1] = a_elem[1] * x_elem;
174 }
175 sum[0] = sum[0] + prod[0];
176 sum[1] = sum[1] + prod[1];
177 x_elem = tail_x_i[jx];
178 {
179 prod[0] = a_elem[0] * x_elem;
180 prod[1] = a_elem[1] * x_elem;
181 }
182 sum2[0] = sum2[0] + prod[0];
183 sum2[1] = sum2[1] + prod[1];
184 aij += incaij;
185 jx += incx;
186 }
187 sum[0] = sum[0] + sum2[0];
188 sum[1] = sum[1] + sum2[1];
189 y_i[iy] = sum[0];
190 y_i[iy + 1] = sum[1];
191 ai += incai;
192 iy += incy;
193 } /* end for */
194 } else { /* alpha != 1 */
195 ai = 0;
196 iy = ky;
197 for (i = 0; i < leny; i++) {
198 sum[0] = sum[1] = 0.0;
199 sum2[0] = sum2[1] = 0.0;
200 aij = ai;
201 jx = kx;
202 for (j = 0; j < lenx; j++) {
203 a_elem[0] = a_i[aij];
204 a_elem[1] = a_i[aij + 1];
205 a_elem[1] = -a_elem[1];
206 x_elem = head_x_i[jx];
207 {
208 prod[0] = a_elem[0] * x_elem;
209 prod[1] = a_elem[1] * x_elem;
210 }
211 sum[0] = sum[0] + prod[0];
212 sum[1] = sum[1] + prod[1];
213 x_elem = tail_x_i[jx];
214 {
215 prod[0] = a_elem[0] * x_elem;
216 prod[1] = a_elem[1] * x_elem;
217 }
218 sum2[0] = sum2[0] + prod[0];
219 sum2[1] = sum2[1] + prod[1];
220 aij += incaij;
221 jx += incx;
222 }
223 {
224 tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
225 tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
226 }
227
228 {
229 tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
230 tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
231 }
232
233 tmp1[0] = tmp1[0] + tmp2[0];
234 tmp1[1] = tmp1[1] + tmp2[1];
235 y_i[iy] = tmp1[0];
236 y_i[iy + 1] = tmp1[1];
237 ai += incai;
238 iy += incy;
239 }
240 }
241 } else { /* beta != 0 */
242 if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
243 /* save m multiplies if alpha = 1 */
244 ai = 0;
245 iy = ky;
246 for (i = 0; i < leny; i++) {
247 sum[0] = sum[1] = 0.0;;
248 sum2[0] = sum2[1] = 0.0;;
249 aij = ai;
250 jx = kx;
251 for (j = 0; j < lenx; j++) {
252 a_elem[0] = a_i[aij];
253 a_elem[1] = a_i[aij + 1];
254 a_elem[1] = -a_elem[1];
255 x_elem = head_x_i[jx];
256 {
257 prod[0] = a_elem[0] * x_elem;
258 prod[1] = a_elem[1] * x_elem;
259 }
260 sum[0] = sum[0] + prod[0];
261 sum[1] = sum[1] + prod[1];
262 x_elem = tail_x_i[jx];
263 {
264 prod[0] = a_elem[0] * x_elem;
265 prod[1] = a_elem[1] * x_elem;
266 }
267 sum2[0] = sum2[0] + prod[0];
268 sum2[1] = sum2[1] + prod[1];
269 aij += incaij;
270 jx += incx;
271 }
272 sum[0] = sum[0] + sum2[0];
273 sum[1] = sum[1] + sum2[1];
274 y_elem[0] = y_i[iy];
275 y_elem[1] = y_i[iy + 1];
276 {
277 tmp1[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
278 tmp1[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
279 }
280
281 tmp2[0] = sum[0] + tmp1[0];
282 tmp2[1] = sum[1] + tmp1[1];
283 y_i[iy] = tmp2[0];
284 y_i[iy + 1] = tmp2[1];
285 ai += incai;
286 iy += incy;
287 }
288 } else { /* alpha != 1, the most general form:
289 y = alpha*A*head_x + alpha*A*tail_x + beta*y */
290 ai = 0;
291 iy = ky;
292 for (i = 0; i < leny; i++) {
293 sum[0] = sum[1] = 0.0;;
294 sum2[0] = sum2[1] = 0.0;;
295 aij = ai;
296 jx = kx;
297 for (j = 0; j < lenx; j++) {
298 a_elem[0] = a_i[aij];
299 a_elem[1] = a_i[aij + 1];
300 a_elem[1] = -a_elem[1];
301 x_elem = head_x_i[jx];
302 {
303 prod[0] = a_elem[0] * x_elem;
304 prod[1] = a_elem[1] * x_elem;
305 }
306 sum[0] = sum[0] + prod[0];
307 sum[1] = sum[1] + prod[1];
308 x_elem = tail_x_i[jx];
309 {
310 prod[0] = a_elem[0] * x_elem;
311 prod[1] = a_elem[1] * x_elem;
312 }
313 sum2[0] = sum2[0] + prod[0];
314 sum2[1] = sum2[1] + prod[1];
315 aij += incaij;
316 jx += incx;
317 }
318 {
319 tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
320 tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
321 }
322
323 {
324 tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
325 tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
326 }
327
328 tmp1[0] = tmp1[0] + tmp2[0];
329 tmp1[1] = tmp1[1] + tmp2[1];
330 y_elem[0] = y_i[iy];
331 y_elem[1] = y_i[iy + 1];
332 {
333 tmp2[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
334 tmp2[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
335 }
336
337 tmp1[0] = tmp1[0] + tmp2[0];
338 tmp1[1] = tmp1[1] + tmp2[1];
339 y_i[iy] = tmp1[0];
340 y_i[iy + 1] = tmp1[1];
341 ai += incai;
342 iy += incy;
343 }
344 }
345 }
346
347 } else {
348
349 /* if beta = 0, we can save m multiplies:
350 y = alpha*A*head_x + alpha*A*tail_x */
351 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
352 if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
353 /* save m more multiplies if alpha = 1 */
354 ai = 0;
355 iy = ky;
356 for (i = 0; i < leny; i++) {
357 sum[0] = sum[1] = 0.0;
358 sum2[0] = sum2[1] = 0.0;
359 aij = ai;
360 jx = kx;
361 for (j = 0; j < lenx; j++) {
362 a_elem[0] = a_i[aij];
363 a_elem[1] = a_i[aij + 1];
364
365 x_elem = head_x_i[jx];
366 {
367 prod[0] = a_elem[0] * x_elem;
368 prod[1] = a_elem[1] * x_elem;
369 }
370 sum[0] = sum[0] + prod[0];
371 sum[1] = sum[1] + prod[1];
372 x_elem = tail_x_i[jx];
373 {
374 prod[0] = a_elem[0] * x_elem;
375 prod[1] = a_elem[1] * x_elem;
376 }
377 sum2[0] = sum2[0] + prod[0];
378 sum2[1] = sum2[1] + prod[1];
379 aij += incaij;
380 jx += incx;
381 }
382 sum[0] = sum[0] + sum2[0];
383 sum[1] = sum[1] + sum2[1];
384 y_i[iy] = sum[0];
385 y_i[iy + 1] = sum[1];
386 ai += incai;
387 iy += incy;
388 } /* end for */
389 } else { /* alpha != 1 */
390 ai = 0;
391 iy = ky;
392 for (i = 0; i < leny; i++) {
393 sum[0] = sum[1] = 0.0;
394 sum2[0] = sum2[1] = 0.0;
395 aij = ai;
396 jx = kx;
397 for (j = 0; j < lenx; j++) {
398 a_elem[0] = a_i[aij];
399 a_elem[1] = a_i[aij + 1];
400
401 x_elem = head_x_i[jx];
402 {
403 prod[0] = a_elem[0] * x_elem;
404 prod[1] = a_elem[1] * x_elem;
405 }
406 sum[0] = sum[0] + prod[0];
407 sum[1] = sum[1] + prod[1];
408 x_elem = tail_x_i[jx];
409 {
410 prod[0] = a_elem[0] * x_elem;
411 prod[1] = a_elem[1] * x_elem;
412 }
413 sum2[0] = sum2[0] + prod[0];
414 sum2[1] = sum2[1] + prod[1];
415 aij += incaij;
416 jx += incx;
417 }
418 {
419 tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
420 tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
421 }
422
423 {
424 tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
425 tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
426 }
427
428 tmp1[0] = tmp1[0] + tmp2[0];
429 tmp1[1] = tmp1[1] + tmp2[1];
430 y_i[iy] = tmp1[0];
431 y_i[iy + 1] = tmp1[1];
432 ai += incai;
433 iy += incy;
434 }
435 }
436 } else { /* beta != 0 */
437 if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
438 /* save m multiplies if alpha = 1 */
439 ai = 0;
440 iy = ky;
441 for (i = 0; i < leny; i++) {
442 sum[0] = sum[1] = 0.0;;
443 sum2[0] = sum2[1] = 0.0;;
444 aij = ai;
445 jx = kx;
446 for (j = 0; j < lenx; j++) {
447 a_elem[0] = a_i[aij];
448 a_elem[1] = a_i[aij + 1];
449
450 x_elem = head_x_i[jx];
451 {
452 prod[0] = a_elem[0] * x_elem;
453 prod[1] = a_elem[1] * x_elem;
454 }
455 sum[0] = sum[0] + prod[0];
456 sum[1] = sum[1] + prod[1];
457 x_elem = tail_x_i[jx];
458 {
459 prod[0] = a_elem[0] * x_elem;
460 prod[1] = a_elem[1] * x_elem;
461 }
462 sum2[0] = sum2[0] + prod[0];
463 sum2[1] = sum2[1] + prod[1];
464 aij += incaij;
465 jx += incx;
466 }
467 sum[0] = sum[0] + sum2[0];
468 sum[1] = sum[1] + sum2[1];
469 y_elem[0] = y_i[iy];
470 y_elem[1] = y_i[iy + 1];
471 {
472 tmp1[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
473 tmp1[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
474 }
475
476 tmp2[0] = sum[0] + tmp1[0];
477 tmp2[1] = sum[1] + tmp1[1];
478 y_i[iy] = tmp2[0];
479 y_i[iy + 1] = tmp2[1];
480 ai += incai;
481 iy += incy;
482 }
483 } else { /* alpha != 1, the most general form:
484 y = alpha*A*head_x + alpha*A*tail_x + beta*y */
485 ai = 0;
486 iy = ky;
487 for (i = 0; i < leny; i++) {
488 sum[0] = sum[1] = 0.0;;
489 sum2[0] = sum2[1] = 0.0;;
490 aij = ai;
491 jx = kx;
492 for (j = 0; j < lenx; j++) {
493 a_elem[0] = a_i[aij];
494 a_elem[1] = a_i[aij + 1];
495
496 x_elem = head_x_i[jx];
497 {
498 prod[0] = a_elem[0] * x_elem;
499 prod[1] = a_elem[1] * x_elem;
500 }
501 sum[0] = sum[0] + prod[0];
502 sum[1] = sum[1] + prod[1];
503 x_elem = tail_x_i[jx];
504 {
505 prod[0] = a_elem[0] * x_elem;
506 prod[1] = a_elem[1] * x_elem;
507 }
508 sum2[0] = sum2[0] + prod[0];
509 sum2[1] = sum2[1] + prod[1];
510 aij += incaij;
511 jx += incx;
512 }
513 {
514 tmp1[0] = sum[0] * alpha_i[0] - sum[1] * alpha_i[1];
515 tmp1[1] = sum[0] * alpha_i[1] + sum[1] * alpha_i[0];
516 }
517
518 {
519 tmp2[0] = sum2[0] * alpha_i[0] - sum2[1] * alpha_i[1];
520 tmp2[1] = sum2[0] * alpha_i[1] + sum2[1] * alpha_i[0];
521 }
522
523 tmp1[0] = tmp1[0] + tmp2[0];
524 tmp1[1] = tmp1[1] + tmp2[1];
525 y_elem[0] = y_i[iy];
526 y_elem[1] = y_i[iy + 1];
527 {
528 tmp2[0] = y_elem[0] * beta_i[0] - y_elem[1] * beta_i[1];
529 tmp2[1] = y_elem[0] * beta_i[1] + y_elem[1] * beta_i[0];
530 }
531
532 tmp1[0] = tmp1[0] + tmp2[0];
533 tmp1[1] = tmp1[1] + tmp2[1];
534 y_i[iy] = tmp1[0];
535 y_i[iy + 1] = tmp1[1];
536 ai += incai;
537 iy += incy;
538 }
539 }
540 }
541
542 }
543 }
544
545
546
547 }
548