1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_cge_sum_mv_c_s(enum blas_order_type order,int m,int n,const void * alpha,const void * a,int lda,const float * x,int incx,const void * beta,const void * b,int ldb,void * y,int incy)3 void BLAS_cge_sum_mv_c_s(enum blas_order_type order, int m, int n,
4 const void *alpha, const void *a, int lda,
5 const float *x, int incx,
6 const void *beta, const void *b, int ldb,
7 void *y, int incy)
8
9 /*
10 * Purpose
11 * =======
12 *
13 * Computes y = alpha * A * x + beta * B * y,
14 * where A, B are general matricies.
15 *
16 * Arguments
17 * =========
18 *
19 * order (input) blas_order_type
20 * Order of A; row or column major
21 *
22 * m (input) int
23 * Row Dimension of A, B, length of output vector y
24 *
25 * n (input) int
26 * Column Dimension of A, B and the length of vector x
27 *
28 * alpha (input) const void*
29 *
30 * A (input) const void*
31 *
32 * lda (input) int
33 * Leading dimension of A
34 *
35 * x (input) const float*
36 *
37 * incx (input) int
38 * The stride for vector x.
39 *
40 * beta (input) const void*
41 *
42 * b (input) const void*
43 *
44 * ldb (input) int
45 * Leading dimension of B
46 *
47 * y (input/output) void*
48 *
49 * incy (input) int
50 * The stride for vector y.
51 *
52 */
53 {
54 /* Routine name */
55 static const char routine_name[] = "BLAS_cge_sum_mv_c_s";
56 int i, j;
57 int xi, yi;
58 int x_starti, y_starti, incxi, incyi;
59 int lda_min;
60 int ai;
61 int incai;
62 int aij;
63 int incaij;
64 int bi;
65 int incbi;
66 int bij;
67 int incbij;
68
69 const float *a_i = (float *) a;
70 const float *b_i = (float *) b;
71 const float *x_i = x;
72 float *y_i = (float *) y;
73 float *alpha_i = (float *) alpha;
74 float *beta_i = (float *) beta;
75 float a_elem[2];
76 float b_elem[2];
77 float x_elem;
78 float prod[2];
79 float sumA[2];
80 float sumB[2];
81 float tmp1[2];
82 float tmp2[2];
83
84
85
86 /* m is number of rows */
87 /* n is number of columns */
88
89 if (m == 0 || n == 0)
90 return;
91
92
93 /* all error calls */
94 if (order == blas_rowmajor) {
95 lda_min = n;
96 incai = lda; /* row stride */
97 incbi = ldb;
98 incbij = incaij = 1; /* column stride */
99 } else if (order == blas_colmajor) {
100 lda_min = m;
101 incai = incbi = 1; /*row stride */
102 incaij = lda; /* column stride */
103 incbij = ldb;
104 } else {
105 /* error, order not blas_colmajor not blas_rowmajor */
106 BLAS_error(routine_name, -1, order, 0);
107 return;
108 }
109
110 if (m < 0)
111 BLAS_error(routine_name, -2, m, 0);
112 else if (n < 0)
113 BLAS_error(routine_name, -3, n, 0);
114 if (lda < lda_min)
115 BLAS_error(routine_name, -6, lda, 0);
116 else if (ldb < lda_min)
117 BLAS_error(routine_name, -11, ldb, 0);
118 else if (incx == 0)
119 BLAS_error(routine_name, -8, incx, 0);
120 else if (incy == 0)
121 BLAS_error(routine_name, -13, incy, 0);
122
123 incxi = incx;
124 incyi = incy;
125
126 incyi *= 2;
127 incai *= 2;
128 incaij *= 2;
129 incbi *= 2;
130 incbij *= 2;
131
132 if (incxi > 0)
133 x_starti = 0;
134 else
135 x_starti = (1 - n) * incxi;
136
137 if (incyi > 0)
138 y_starti = 0;
139 else
140 y_starti = (1 - m) * incyi;
141
142
143
144 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
145 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
146 /* alpha, beta are 0.0 */
147 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
148 y_i[yi] = 0.0;
149 y_i[yi + 1] = 0.0;
150 }
151 } else if ((beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
152 /* alpha is 0.0, beta is 1.0 */
153
154
155 bi = 0;
156 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
157
158 sumB[0] = sumB[1] = 0.0;
159 bij = bi;
160 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
161 x_elem = x_i[xi];
162
163 b_elem[0] = b_i[bij];
164 b_elem[1] = b_i[bij + 1];
165 {
166 prod[0] = b_elem[0] * x_elem;
167 prod[1] = b_elem[1] * x_elem;
168 }
169 sumB[0] = sumB[0] + prod[0];
170 sumB[1] = sumB[1] + prod[1];
171 bij += incbij;
172 }
173 /* now put the result into y_i */
174 y_i[yi] = sumB[0];
175 y_i[yi + 1] = sumB[1];
176
177 bi += incbi;
178 }
179 } else {
180 /* alpha is 0.0, beta not 1.0 nor 0.0 */
181
182
183 bi = 0;
184 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
185
186 sumB[0] = sumB[1] = 0.0;
187 bij = bi;
188 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
189 x_elem = x_i[xi];
190
191 b_elem[0] = b_i[bij];
192 b_elem[1] = b_i[bij + 1];
193 {
194 prod[0] = b_elem[0] * x_elem;
195 prod[1] = b_elem[1] * x_elem;
196 }
197 sumB[0] = sumB[0] + prod[0];
198 sumB[1] = sumB[1] + prod[1];
199 bij += incbij;
200 }
201 /* now put the result into y_i */
202 {
203 tmp1[0] = sumB[0] * beta_i[0] - sumB[1] * beta_i[1];
204 tmp1[1] = sumB[0] * beta_i[1] + sumB[1] * beta_i[0];
205 }
206
207 y_i[yi] = tmp1[0];
208 y_i[yi + 1] = tmp1[1];
209
210 bi += incbi;
211 }
212 }
213 } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
214 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
215 /* alpha is 1.0, beta is 0.0 */
216
217 ai = 0;
218
219 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
220 sumA[0] = sumA[1] = 0.0;
221 aij = ai;
222
223 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
224 x_elem = x_i[xi];
225 a_elem[0] = a_i[aij];
226 a_elem[1] = a_i[aij + 1];
227 {
228 prod[0] = a_elem[0] * x_elem;
229 prod[1] = a_elem[1] * x_elem;
230 }
231 sumA[0] = sumA[0] + prod[0];
232 sumA[1] = sumA[1] + prod[1];
233 aij += incaij;
234
235 }
236 /* now put the result into y_i */
237 y_i[yi] = sumA[0];
238 y_i[yi + 1] = sumA[1];
239 ai += incai;
240
241 }
242 } else if ((beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
243 /* alpha is 1.0, beta is 1.0 */
244
245 ai = 0;
246 bi = 0;
247 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
248 sumA[0] = sumA[1] = 0.0;
249 aij = ai;
250 sumB[0] = sumB[1] = 0.0;
251 bij = bi;
252 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
253 x_elem = x_i[xi];
254 a_elem[0] = a_i[aij];
255 a_elem[1] = a_i[aij + 1];
256 {
257 prod[0] = a_elem[0] * x_elem;
258 prod[1] = a_elem[1] * x_elem;
259 }
260 sumA[0] = sumA[0] + prod[0];
261 sumA[1] = sumA[1] + prod[1];
262 aij += incaij;
263 b_elem[0] = b_i[bij];
264 b_elem[1] = b_i[bij + 1];
265 {
266 prod[0] = b_elem[0] * x_elem;
267 prod[1] = b_elem[1] * x_elem;
268 }
269 sumB[0] = sumB[0] + prod[0];
270 sumB[1] = sumB[1] + prod[1];
271 bij += incbij;
272 }
273 /* now put the result into y_i */
274 tmp1[0] = sumA[0];
275 tmp1[1] = sumA[1];
276 tmp2[0] = sumB[0];
277 tmp2[1] = sumB[1];
278 tmp1[0] = tmp1[0] + tmp2[0];
279 tmp1[1] = tmp1[1] + tmp2[1];
280 y_i[yi] = tmp1[0];
281 y_i[yi + 1] = tmp1[1];
282 ai += incai;
283 bi += incbi;
284 }
285 } else {
286 /* alpha is 1.0, beta is other */
287
288 ai = 0;
289 bi = 0;
290 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
291 sumA[0] = sumA[1] = 0.0;
292 aij = ai;
293 sumB[0] = sumB[1] = 0.0;
294 bij = bi;
295 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
296 x_elem = x_i[xi];
297 a_elem[0] = a_i[aij];
298 a_elem[1] = a_i[aij + 1];
299 {
300 prod[0] = a_elem[0] * x_elem;
301 prod[1] = a_elem[1] * x_elem;
302 }
303 sumA[0] = sumA[0] + prod[0];
304 sumA[1] = sumA[1] + prod[1];
305 aij += incaij;
306 b_elem[0] = b_i[bij];
307 b_elem[1] = b_i[bij + 1];
308 {
309 prod[0] = b_elem[0] * x_elem;
310 prod[1] = b_elem[1] * x_elem;
311 }
312 sumB[0] = sumB[0] + prod[0];
313 sumB[1] = sumB[1] + prod[1];
314 bij += incbij;
315 }
316 /* now put the result into y_i */
317 tmp1[0] = sumA[0];
318 tmp1[1] = sumA[1];
319 {
320 tmp2[0] = sumB[0] * beta_i[0] - sumB[1] * beta_i[1];
321 tmp2[1] = sumB[0] * beta_i[1] + sumB[1] * beta_i[0];
322 }
323
324 tmp1[0] = tmp1[0] + tmp2[0];
325 tmp1[1] = tmp1[1] + tmp2[1];
326 y_i[yi] = tmp1[0];
327 y_i[yi + 1] = tmp1[1];
328 ai += incai;
329 bi += incbi;
330 }
331 }
332 } else {
333 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
334 /* alpha is other, beta is 0.0 */
335
336 ai = 0;
337
338 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
339 sumA[0] = sumA[1] = 0.0;
340 aij = ai;
341
342 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
343 x_elem = x_i[xi];
344 a_elem[0] = a_i[aij];
345 a_elem[1] = a_i[aij + 1];
346 {
347 prod[0] = a_elem[0] * x_elem;
348 prod[1] = a_elem[1] * x_elem;
349 }
350 sumA[0] = sumA[0] + prod[0];
351 sumA[1] = sumA[1] + prod[1];
352 aij += incaij;
353
354 }
355 /* now put the result into y_i */
356 {
357 tmp1[0] = sumA[0] * alpha_i[0] - sumA[1] * alpha_i[1];
358 tmp1[1] = sumA[0] * alpha_i[1] + sumA[1] * alpha_i[0];
359 }
360
361 y_i[yi] = tmp1[0];
362 y_i[yi + 1] = tmp1[1];
363 ai += incai;
364
365 }
366 } else if ((beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
367 /* alpha is other, beta is 1.0 */
368
369 ai = 0;
370 bi = 0;
371 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
372 sumA[0] = sumA[1] = 0.0;
373 aij = ai;
374 sumB[0] = sumB[1] = 0.0;
375 bij = bi;
376 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
377 x_elem = x_i[xi];
378 a_elem[0] = a_i[aij];
379 a_elem[1] = a_i[aij + 1];
380 {
381 prod[0] = a_elem[0] * x_elem;
382 prod[1] = a_elem[1] * x_elem;
383 }
384 sumA[0] = sumA[0] + prod[0];
385 sumA[1] = sumA[1] + prod[1];
386 aij += incaij;
387 b_elem[0] = b_i[bij];
388 b_elem[1] = b_i[bij + 1];
389 {
390 prod[0] = b_elem[0] * x_elem;
391 prod[1] = b_elem[1] * x_elem;
392 }
393 sumB[0] = sumB[0] + prod[0];
394 sumB[1] = sumB[1] + prod[1];
395 bij += incbij;
396 }
397 /* now put the result into y_i */
398 {
399 tmp1[0] = sumA[0] * alpha_i[0] - sumA[1] * alpha_i[1];
400 tmp1[1] = sumA[0] * alpha_i[1] + sumA[1] * alpha_i[0];
401 }
402
403 tmp2[0] = sumB[0];
404 tmp2[1] = sumB[1];
405 tmp1[0] = tmp1[0] + tmp2[0];
406 tmp1[1] = tmp1[1] + tmp2[1];
407 y_i[yi] = tmp1[0];
408 y_i[yi + 1] = tmp1[1];
409 ai += incai;
410 bi += incbi;
411 }
412 } else {
413 /* most general form, alpha, beta are other */
414
415 ai = 0;
416 bi = 0;
417 for (i = 0, yi = y_starti; i < m; i++, yi += incyi) {
418 sumA[0] = sumA[1] = 0.0;
419 aij = ai;
420 sumB[0] = sumB[1] = 0.0;
421 bij = bi;
422 for (j = 0, xi = x_starti; j < n; j++, xi += incxi) {
423 x_elem = x_i[xi];
424 a_elem[0] = a_i[aij];
425 a_elem[1] = a_i[aij + 1];
426 {
427 prod[0] = a_elem[0] * x_elem;
428 prod[1] = a_elem[1] * x_elem;
429 }
430 sumA[0] = sumA[0] + prod[0];
431 sumA[1] = sumA[1] + prod[1];
432 aij += incaij;
433 b_elem[0] = b_i[bij];
434 b_elem[1] = b_i[bij + 1];
435 {
436 prod[0] = b_elem[0] * x_elem;
437 prod[1] = b_elem[1] * x_elem;
438 }
439 sumB[0] = sumB[0] + prod[0];
440 sumB[1] = sumB[1] + prod[1];
441 bij += incbij;
442 }
443 /* now put the result into y_i */
444 {
445 tmp1[0] = sumA[0] * alpha_i[0] - sumA[1] * alpha_i[1];
446 tmp1[1] = sumA[0] * alpha_i[1] + sumA[1] * alpha_i[0];
447 }
448
449 {
450 tmp2[0] = sumB[0] * beta_i[0] - sumB[1] * beta_i[1];
451 tmp2[1] = sumB[0] * beta_i[1] + sumB[1] * beta_i[0];
452 }
453
454 tmp1[0] = tmp1[0] + tmp2[0];
455 tmp1[1] = tmp1[1] + tmp2[1];
456 y_i[yi] = tmp1[0];
457 y_i[yi + 1] = tmp1[1];
458 ai += incai;
459 bi += incbi;
460 }
461 }
462 }
463
464
465 } /* end BLAS_cge_sum_mv_c_s */
466