1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_zgemv2_d_d(enum blas_order_type order,enum blas_trans_type trans,int m,int n,const void * alpha,const double * a,int lda,const double * head_x,const double * tail_x,int incx,const void * beta,void * y,int incy)3 void BLAS_zgemv2_d_d(enum blas_order_type order, enum blas_trans_type trans,
4 		     int m, int n, const void *alpha, const double *a,
5 		     int lda, const double *head_x, const double *tail_x,
6 		     int incx, const void *beta, void *y, int incy)
7 
8 /*
9  * Purpose
10  * =======
11  *
12  * Computes y = alpha * op(A) * head_x + alpha * op(A) * tail_x + beta * y,
13  * where A is a general matrix.
14  *
15  * Arguments
16  * =========
17  *
18  * order        (input) blas_order_type
19  *              Order of A; row or column major
20  *
21  * trans        (input) blas_trans_type
22  *              Transpose of A: no trans, trans, or conjugate trans
23  *
24  * m            (input) int
25  *              Dimension of A
26  *
27  * n            (input) int
28  *              Dimension of A and the length of vector x and z
29  *
30  * alpha        (input) const void*
31  *
32  * A            (input) const double*
33  *
34  * lda          (input) int
35  *              Leading dimension of A
36  *
37  * head_x
38  * tail_x       (input) const double*
39  *
40  * incx         (input) int
41  *              The stride for vector x.
42  *
43  * beta         (input) const void*
44  *
45  * y            (input) const void*
46  *
47  * incy         (input) int
48  *              The stride for vector y.
49  *
50  */
51 {
52   static const char routine_name[] = "BLAS_zgemv2_d_d";
53 
54   int i, j;
55   int iy, jx, kx, ky;
56   int lenx, leny;
57   int ai, aij;
58   int incai, incaij;
59 
60   const double *a_i = a;
61   const double *head_x_i = head_x;
62   const double *tail_x_i = tail_x;
63   double *y_i = (double *) y;
64   double *alpha_i = (double *) alpha;
65   double *beta_i = (double *) beta;
66   double a_elem;
67   double x_elem;
68   double y_elem[2];
69   double prod;
70   double sum;
71   double sum2;
72   double tmp1[2];
73   double tmp2[2];
74 
75 
76   /* all error calls */
77   if (m < 0)
78     BLAS_error(routine_name, -3, m, 0);
79   else if (n <= 0)
80     BLAS_error(routine_name, -4, n, 0);
81   else if (incx == 0)
82     BLAS_error(routine_name, -10, incx, 0);
83   else if (incy == 0)
84     BLAS_error(routine_name, -13, incy, 0);
85 
86   if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
87     lenx = n;
88     leny = m;
89     incai = lda;
90     incaij = 1;
91   } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
92     lenx = m;
93     leny = n;
94     incai = 1;
95     incaij = lda;
96   } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
97     lenx = n;
98     leny = m;
99     incai = 1;
100     incaij = lda;
101   } else {			/* colmajor and blas_trans */
102     lenx = m;
103     leny = n;
104     incai = lda;
105     incaij = 1;
106   }
107 
108   if (lda < leny)
109     BLAS_error(routine_name, -7, lda, NULL);
110 
111 
112 
113 
114   incy *= 2;
115 
116 
117 
118   if (incx > 0)
119     kx = 0;
120   else
121     kx = (1 - lenx) * incx;
122   if (incy > 0)
123     ky = 0;
124   else
125     ky = (1 - leny) * incy;
126 
127   /* No extra-precision needed for alpha = 0 */
128   if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
129     if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
130       iy = ky;
131       for (i = 0; i < leny; i++) {
132 	y_i[iy] = 0.0;
133 	y_i[iy + 1] = 0.0;
134 	iy += incy;
135       }
136     } else if (!(beta_i[0] == 0.0 && beta_i[1] == 0.0)) {
137       iy = ky;
138       for (i = 0; i < leny; i++) {
139 	y_elem[0] = y_i[iy];
140 	y_elem[1] = y_i[iy + 1];
141 	{
142 	  tmp1[0] =
143 	    (double) y_elem[0] * beta_i[0] - (double) y_elem[1] * beta_i[1];
144 	  tmp1[1] =
145 	    (double) y_elem[0] * beta_i[1] + (double) y_elem[1] * beta_i[0];
146 	}
147 	y_i[iy] = tmp1[0];
148 	y_i[iy + 1] = tmp1[1];
149 	iy += incy;
150       }
151     }
152   } else {			/* alpha != 0 */
153 
154     /* if beta = 0, we can save m multiplies:
155        y = alpha*A*head_x + alpha*A*tail_x  */
156     if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
157       if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
158 	/* save m more multiplies if alpha = 1 */
159 	ai = 0;
160 	iy = ky;
161 	for (i = 0; i < leny; i++) {
162 	  sum = 0.0;
163 	  sum2 = 0.0;
164 	  aij = ai;
165 	  jx = kx;
166 	  for (j = 0; j < lenx; j++) {
167 	    a_elem = a_i[aij];
168 
169 	    x_elem = head_x_i[jx];
170 	    prod = a_elem * x_elem;
171 	    sum = sum + prod;
172 	    x_elem = tail_x_i[jx];
173 	    prod = a_elem * x_elem;
174 	    sum2 = sum2 + prod;
175 	    aij += incaij;
176 	    jx += incx;
177 	  }
178 	  sum = sum + sum2;
179 	  tmp1[0] = sum;
180 	  tmp1[1] = 0.0;
181 	  y_i[iy] = tmp1[0];
182 	  y_i[iy + 1] = tmp1[1];
183 	  ai += incai;
184 	  iy += incy;
185 	}			/* end for */
186       } else {			/* alpha != 1 */
187 	ai = 0;
188 	iy = ky;
189 	for (i = 0; i < leny; i++) {
190 	  sum = 0.0;
191 	  sum2 = 0.0;
192 	  aij = ai;
193 	  jx = kx;
194 	  for (j = 0; j < lenx; j++) {
195 	    a_elem = a_i[aij];
196 
197 	    x_elem = head_x_i[jx];
198 	    prod = a_elem * x_elem;
199 	    sum = sum + prod;
200 	    x_elem = tail_x_i[jx];
201 	    prod = a_elem * x_elem;
202 	    sum2 = sum2 + prod;
203 	    aij += incaij;
204 	    jx += incx;
205 	  }
206 	  {
207 	    tmp1[0] = alpha_i[0] * sum;
208 	    tmp1[1] = alpha_i[1] * sum;
209 	  }
210 	  {
211 	    tmp2[0] = alpha_i[0] * sum2;
212 	    tmp2[1] = alpha_i[1] * sum2;
213 	  }
214 	  tmp1[0] = tmp1[0] + tmp2[0];
215 	  tmp1[1] = tmp1[1] + tmp2[1];
216 	  y_i[iy] = tmp1[0];
217 	  y_i[iy + 1] = tmp1[1];
218 	  ai += incai;
219 	  iy += incy;
220 	}
221       }
222     } else {			/* beta != 0 */
223       if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
224 	/* save m multiplies if alpha = 1 */
225 	ai = 0;
226 	iy = ky;
227 	for (i = 0; i < leny; i++) {
228 	  sum = 0.0;;
229 	  sum2 = 0.0;;
230 	  aij = ai;
231 	  jx = kx;
232 	  for (j = 0; j < lenx; j++) {
233 	    a_elem = a_i[aij];
234 
235 	    x_elem = head_x_i[jx];
236 	    prod = a_elem * x_elem;
237 	    sum = sum + prod;
238 	    x_elem = tail_x_i[jx];
239 	    prod = a_elem * x_elem;
240 	    sum2 = sum2 + prod;
241 	    aij += incaij;
242 	    jx += incx;
243 	  }
244 	  sum = sum + sum2;
245 	  y_elem[0] = y_i[iy];
246 	  y_elem[1] = y_i[iy + 1];
247 	  {
248 	    tmp1[0] =
249 	      (double) y_elem[0] * beta_i[0] - (double) y_elem[1] * beta_i[1];
250 	    tmp1[1] =
251 	      (double) y_elem[0] * beta_i[1] + (double) y_elem[1] * beta_i[0];
252 	  }
253 	  tmp2[0] = sum;
254 	  tmp2[1] = 0.0;
255 	  tmp2[0] = tmp2[0] + tmp1[0];
256 	  tmp2[1] = tmp2[1] + tmp1[1];
257 	  y_i[iy] = tmp2[0];
258 	  y_i[iy + 1] = tmp2[1];
259 	  ai += incai;
260 	  iy += incy;
261 	}
262       } else {			/* alpha != 1, the most general form:
263 				   y = alpha*A*head_x + alpha*A*tail_x + beta*y */
264 	ai = 0;
265 	iy = ky;
266 	for (i = 0; i < leny; i++) {
267 	  sum = 0.0;;
268 	  sum2 = 0.0;;
269 	  aij = ai;
270 	  jx = kx;
271 	  for (j = 0; j < lenx; j++) {
272 	    a_elem = a_i[aij];
273 
274 	    x_elem = head_x_i[jx];
275 	    prod = a_elem * x_elem;
276 	    sum = sum + prod;
277 	    x_elem = tail_x_i[jx];
278 	    prod = a_elem * x_elem;
279 	    sum2 = sum2 + prod;
280 	    aij += incaij;
281 	    jx += incx;
282 	  }
283 	  {
284 	    tmp1[0] = alpha_i[0] * sum;
285 	    tmp1[1] = alpha_i[1] * sum;
286 	  }
287 	  {
288 	    tmp2[0] = alpha_i[0] * sum2;
289 	    tmp2[1] = alpha_i[1] * sum2;
290 	  }
291 	  tmp1[0] = tmp1[0] + tmp2[0];
292 	  tmp1[1] = tmp1[1] + tmp2[1];
293 	  y_elem[0] = y_i[iy];
294 	  y_elem[1] = y_i[iy + 1];
295 	  {
296 	    tmp2[0] =
297 	      (double) y_elem[0] * beta_i[0] - (double) y_elem[1] * beta_i[1];
298 	    tmp2[1] =
299 	      (double) y_elem[0] * beta_i[1] + (double) y_elem[1] * beta_i[0];
300 	  }
301 	  tmp1[0] = tmp1[0] + tmp2[0];
302 	  tmp1[1] = tmp1[1] + tmp2[1];
303 	  y_i[iy] = tmp1[0];
304 	  y_i[iy + 1] = tmp1[1];
305 	  ai += incai;
306 	  iy += incy;
307 	}
308       }
309     }
310 
311   }
312 
313 
314 
315 }
316