1 /* Implementation of the MATMUL intrinsic
2    Copyright (C) 2002-2020 Free Software Foundation, Inc.
3    Contributed by Paul Brook <paul@nowt.org>
4 
5 This file is part of the GNU Fortran runtime library (libgfortran).
6 
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11 
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20 
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24 <http://www.gnu.org/licenses/>.  */
25 
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29 
30 
31 #if defined (HAVE_GFC_COMPLEX_4)
32 
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34    passed to us by the front-end, in which case we call it for large
35    matrices.  */
36 
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38                           const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
39                           const int *, const GFC_COMPLEX_4 *, const int *,
40                           const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
41                           int, int);
42 
43 /* The order of loops is different in the case of plain matrix
44    multiplication C=MATMUL(A,B), and in the frequent special case where
45    the argument A is the temporary result of a TRANSPOSE intrinsic:
46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47    looking at their strides.
48 
49    The equivalent Fortran pseudo-code is:
50 
51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52    IF (.NOT.IS_TRANSPOSED(A)) THEN
53      C = 0
54      DO J=1,N
55        DO K=1,COUNT
56          DO I=1,M
57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
58    ELSE
59      DO J=1,N
60        DO I=1,M
61          S = 0
62          DO K=1,COUNT
63            S = S+A(I,K)*B(K,J)
64          C(I,J) = S
65    ENDIF
66 */
67 
68 /* If try_blas is set to a nonzero value, then the matmul function will
69    see if there is a way to perform the matrix multiplication by a call
70    to the BLAS gemm function.  */
71 
72 extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
73 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
74 	int blas_limit, blas_call gemm);
75 export_proto(matmul_c4);
76 
77 /* Put exhaustive list of possible architectures here here, ORed together.  */
78 
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80 
81 #ifdef HAVE_AVX
82 static void
83 matmul_c4_avx (gfc_array_c4 * const restrict retarray,
84 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
85 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
matmul_c4_avx(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87 matmul_c4_avx (gfc_array_c4 * const restrict retarray,
88 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
89 	int blas_limit, blas_call gemm)
90 {
91   const GFC_COMPLEX_4 * restrict abase;
92   const GFC_COMPLEX_4 * restrict bbase;
93   GFC_COMPLEX_4 * restrict dest;
94 
95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96   index_type x, y, n, count, xcount, ycount;
97 
98   assert (GFC_DESCRIPTOR_RANK (a) == 2
99           || GFC_DESCRIPTOR_RANK (b) == 2);
100 
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102 
103    Either A or B (but not both) can be rank 1:
104 
105    o One-dimensional argument A is implicitly treated as a row matrix
106      dimensioned [1,count], so xcount=1.
107 
108    o One-dimensional argument B is implicitly treated as a column matrix
109      dimensioned [count, 1], so ycount=1.
110 */
111 
112   if (retarray->base_addr == NULL)
113     {
114       if (GFC_DESCRIPTOR_RANK (a) == 1)
115         {
116 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118         }
119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
120         {
121 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123         }
124       else
125         {
126 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128 
129           GFC_DIMENSION_SET(retarray->dim[1], 0,
130 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132         }
133 
134       retarray->base_addr
135 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
136       retarray->offset = 0;
137     }
138   else if (unlikely (compile_options.bounds_check))
139     {
140       index_type ret_extent, arg_extent;
141 
142       if (GFC_DESCRIPTOR_RANK (a) == 1)
143 	{
144 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 	  if (arg_extent != ret_extent)
147 	    runtime_error ("Array bound mismatch for dimension 1 of "
148 	    		   "array (%ld/%ld) ",
149 			   (long int) ret_extent, (long int) arg_extent);
150 	}
151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 	{
153 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 	  if (arg_extent != ret_extent)
156 	    runtime_error ("Array bound mismatch for dimension 1 of "
157 	    		   "array (%ld/%ld) ",
158 			   (long int) ret_extent, (long int) arg_extent);
159 	}
160       else
161 	{
162 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 	  if (arg_extent != ret_extent)
165 	    runtime_error ("Array bound mismatch for dimension 1 of "
166 	    		   "array (%ld/%ld) ",
167 			   (long int) ret_extent, (long int) arg_extent);
168 
169 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171 	  if (arg_extent != ret_extent)
172 	    runtime_error ("Array bound mismatch for dimension 2 of "
173 	    		   "array (%ld/%ld) ",
174 			   (long int) ret_extent, (long int) arg_extent);
175 	}
176     }
177 
178 
179   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180     {
181       /* One-dimensional result may be addressed in the code below
182 	 either as a row or a column matrix. We want both cases to
183 	 work. */
184       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185     }
186   else
187     {
188       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190     }
191 
192 
193   if (GFC_DESCRIPTOR_RANK (a) == 1)
194     {
195       /* Treat it as a a row matrix A[1,count]. */
196       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197       aystride = 1;
198 
199       xcount = 1;
200       count = GFC_DESCRIPTOR_EXTENT(a,0);
201     }
202   else
203     {
204       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206 
207       count = GFC_DESCRIPTOR_EXTENT(a,1);
208       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209     }
210 
211   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212     {
213       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215 		       "in dimension 1: is %ld, should be %ld",
216 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217     }
218 
219   if (GFC_DESCRIPTOR_RANK (b) == 1)
220     {
221       /* Treat it as a column matrix B[count,1] */
222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223 
224       /* bystride should never be used for 1-dimensional b.
225          The value is only used for calculation of the
226          memory by the buffer.  */
227       bystride = 256;
228       ycount = 1;
229     }
230   else
231     {
232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235     }
236 
237   abase = a->base_addr;
238   bbase = b->base_addr;
239   dest = retarray->base_addr;
240 
241   /* Now that everything is set up, we perform the multiplication
242      itself.  */
243 
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247 
248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249       && (bxstride == 1 || bystride == 1)
250       && (((float) xcount) * ((float) ycount) * ((float) count)
251           > POW3(blas_limit)))
252     {
253       const int m = xcount, n = ycount, k = count, ldc = rystride;
254       const GFC_COMPLEX_4 one = 1, zero = 0;
255       const int lda = (axstride == 1) ? aystride : axstride,
256 		ldb = (bxstride == 1) ? bystride : bxstride;
257 
258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 	{
260 	  assert (gemm != NULL);
261 	  const char *transa, *transb;
262 	  if (try_blas & 2)
263 	    transa = "C";
264 	  else
265 	    transa = axstride == 1 ? "N" : "T";
266 
267 	  if (try_blas & 4)
268 	    transb = "C";
269 	  else
270 	    transb = bxstride == 1 ? "N" : "T";
271 
272 	  gemm (transa, transb , &m,
273 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274 		&ldc, 1, 1);
275 	  return;
276 	}
277     }
278 
279   if (rxstride == 1 && axstride == 1 && bxstride == 1)
280     {
281       /* This block of code implements a tuned matmul, derived from
282          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
283 
284                Bo Kagstrom and Per Ling
285                Department of Computing Science
286                Umea University
287                S-901 87 Umea, Sweden
288 
289 	 from netlib.org, translated to C, and modified for matmul.m4.  */
290 
291       const GFC_COMPLEX_4 *a, *b;
292       GFC_COMPLEX_4 *c;
293       const index_type m = xcount, n = ycount, k = count;
294 
295       /* System generated locals */
296       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
297 		 i1, i2, i3, i4, i5, i6;
298 
299       /* Local variables */
300       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
301 		 f13, f14, f23, f24, f33, f34, f43, f44;
302       index_type i, j, l, ii, jj, ll;
303       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
304       GFC_COMPLEX_4 *t1;
305 
306       a = abase;
307       b = bbase;
308       c = retarray->base_addr;
309 
310       /* Parameter adjustments */
311       c_dim1 = rystride;
312       c_offset = 1 + c_dim1;
313       c -= c_offset;
314       a_dim1 = aystride;
315       a_offset = 1 + a_dim1;
316       a -= a_offset;
317       b_dim1 = bystride;
318       b_offset = 1 + b_dim1;
319       b -= b_offset;
320 
321       /* Empty c first.  */
322       for (j=1; j<=n; j++)
323 	for (i=1; i<=m; i++)
324 	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
325 
326       /* Early exit if possible */
327       if (m == 0 || n == 0 || k == 0)
328 	return;
329 
330       /* Adjust size of t1 to what is needed.  */
331       index_type t1_dim, a_sz;
332       if (aystride == 1)
333         a_sz = rystride;
334       else
335         a_sz = a_dim1;
336 
337       t1_dim = a_sz * 256 + b_dim1;
338       if (t1_dim > 65536)
339 	t1_dim = 65536;
340 
341       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
342 
343       /* Start turning the crank. */
344       i1 = n;
345       for (jj = 1; jj <= i1; jj += 512)
346 	{
347 	  /* Computing MIN */
348 	  i2 = 512;
349 	  i3 = n - jj + 1;
350 	  jsec = min(i2,i3);
351 	  ujsec = jsec - jsec % 4;
352 	  i2 = k;
353 	  for (ll = 1; ll <= i2; ll += 256)
354 	    {
355 	      /* Computing MIN */
356 	      i3 = 256;
357 	      i4 = k - ll + 1;
358 	      lsec = min(i3,i4);
359 	      ulsec = lsec - lsec % 2;
360 
361 	      i3 = m;
362 	      for (ii = 1; ii <= i3; ii += 256)
363 		{
364 		  /* Computing MIN */
365 		  i4 = 256;
366 		  i5 = m - ii + 1;
367 		  isec = min(i4,i5);
368 		  uisec = isec - isec % 2;
369 		  i4 = ll + ulsec - 1;
370 		  for (l = ll; l <= i4; l += 2)
371 		    {
372 		      i5 = ii + uisec - 1;
373 		      for (i = ii; i <= i5; i += 2)
374 			{
375 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
376 					a[i + l * a_dim1];
377 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
378 					a[i + (l + 1) * a_dim1];
379 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
380 					a[i + 1 + l * a_dim1];
381 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
382 					a[i + 1 + (l + 1) * a_dim1];
383 			}
384 		      if (uisec < isec)
385 			{
386 			  t1[l - ll + 1 + (isec << 8) - 257] =
387 				    a[ii + isec - 1 + l * a_dim1];
388 			  t1[l - ll + 2 + (isec << 8) - 257] =
389 				    a[ii + isec - 1 + (l + 1) * a_dim1];
390 			}
391 		    }
392 		  if (ulsec < lsec)
393 		    {
394 		      i4 = ii + isec - 1;
395 		      for (i = ii; i<= i4; ++i)
396 			{
397 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
398 				    a[i + (ll + lsec - 1) * a_dim1];
399 			}
400 		    }
401 
402 		  uisec = isec - isec % 4;
403 		  i4 = jj + ujsec - 1;
404 		  for (j = jj; j <= i4; j += 4)
405 		    {
406 		      i5 = ii + uisec - 1;
407 		      for (i = ii; i <= i5; i += 4)
408 			{
409 			  f11 = c[i + j * c_dim1];
410 			  f21 = c[i + 1 + j * c_dim1];
411 			  f12 = c[i + (j + 1) * c_dim1];
412 			  f22 = c[i + 1 + (j + 1) * c_dim1];
413 			  f13 = c[i + (j + 2) * c_dim1];
414 			  f23 = c[i + 1 + (j + 2) * c_dim1];
415 			  f14 = c[i + (j + 3) * c_dim1];
416 			  f24 = c[i + 1 + (j + 3) * c_dim1];
417 			  f31 = c[i + 2 + j * c_dim1];
418 			  f41 = c[i + 3 + j * c_dim1];
419 			  f32 = c[i + 2 + (j + 1) * c_dim1];
420 			  f42 = c[i + 3 + (j + 1) * c_dim1];
421 			  f33 = c[i + 2 + (j + 2) * c_dim1];
422 			  f43 = c[i + 3 + (j + 2) * c_dim1];
423 			  f34 = c[i + 2 + (j + 3) * c_dim1];
424 			  f44 = c[i + 3 + (j + 3) * c_dim1];
425 			  i6 = ll + lsec - 1;
426 			  for (l = ll; l <= i6; ++l)
427 			    {
428 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
429 				      * b[l + j * b_dim1];
430 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
431 				      * b[l + j * b_dim1];
432 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
433 				      * b[l + (j + 1) * b_dim1];
434 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
435 				      * b[l + (j + 1) * b_dim1];
436 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
437 				      * b[l + (j + 2) * b_dim1];
438 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
439 				      * b[l + (j + 2) * b_dim1];
440 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
441 				      * b[l + (j + 3) * b_dim1];
442 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
443 				      * b[l + (j + 3) * b_dim1];
444 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
445 				      * b[l + j * b_dim1];
446 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
447 				      * b[l + j * b_dim1];
448 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
449 				      * b[l + (j + 1) * b_dim1];
450 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
451 				      * b[l + (j + 1) * b_dim1];
452 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
453 				      * b[l + (j + 2) * b_dim1];
454 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
455 				      * b[l + (j + 2) * b_dim1];
456 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
457 				      * b[l + (j + 3) * b_dim1];
458 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
459 				      * b[l + (j + 3) * b_dim1];
460 			    }
461 			  c[i + j * c_dim1] = f11;
462 			  c[i + 1 + j * c_dim1] = f21;
463 			  c[i + (j + 1) * c_dim1] = f12;
464 			  c[i + 1 + (j + 1) * c_dim1] = f22;
465 			  c[i + (j + 2) * c_dim1] = f13;
466 			  c[i + 1 + (j + 2) * c_dim1] = f23;
467 			  c[i + (j + 3) * c_dim1] = f14;
468 			  c[i + 1 + (j + 3) * c_dim1] = f24;
469 			  c[i + 2 + j * c_dim1] = f31;
470 			  c[i + 3 + j * c_dim1] = f41;
471 			  c[i + 2 + (j + 1) * c_dim1] = f32;
472 			  c[i + 3 + (j + 1) * c_dim1] = f42;
473 			  c[i + 2 + (j + 2) * c_dim1] = f33;
474 			  c[i + 3 + (j + 2) * c_dim1] = f43;
475 			  c[i + 2 + (j + 3) * c_dim1] = f34;
476 			  c[i + 3 + (j + 3) * c_dim1] = f44;
477 			}
478 		      if (uisec < isec)
479 			{
480 			  i5 = ii + isec - 1;
481 			  for (i = ii + uisec; i <= i5; ++i)
482 			    {
483 			      f11 = c[i + j * c_dim1];
484 			      f12 = c[i + (j + 1) * c_dim1];
485 			      f13 = c[i + (j + 2) * c_dim1];
486 			      f14 = c[i + (j + 3) * c_dim1];
487 			      i6 = ll + lsec - 1;
488 			      for (l = ll; l <= i6; ++l)
489 				{
490 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
491 					  257] * b[l + j * b_dim1];
492 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
493 					  257] * b[l + (j + 1) * b_dim1];
494 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
495 					  257] * b[l + (j + 2) * b_dim1];
496 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
497 					  257] * b[l + (j + 3) * b_dim1];
498 				}
499 			      c[i + j * c_dim1] = f11;
500 			      c[i + (j + 1) * c_dim1] = f12;
501 			      c[i + (j + 2) * c_dim1] = f13;
502 			      c[i + (j + 3) * c_dim1] = f14;
503 			    }
504 			}
505 		    }
506 		  if (ujsec < jsec)
507 		    {
508 		      i4 = jj + jsec - 1;
509 		      for (j = jj + ujsec; j <= i4; ++j)
510 			{
511 			  i5 = ii + uisec - 1;
512 			  for (i = ii; i <= i5; i += 4)
513 			    {
514 			      f11 = c[i + j * c_dim1];
515 			      f21 = c[i + 1 + j * c_dim1];
516 			      f31 = c[i + 2 + j * c_dim1];
517 			      f41 = c[i + 3 + j * c_dim1];
518 			      i6 = ll + lsec - 1;
519 			      for (l = ll; l <= i6; ++l)
520 				{
521 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
522 					  257] * b[l + j * b_dim1];
523 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
524 					  257] * b[l + j * b_dim1];
525 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
526 					  257] * b[l + j * b_dim1];
527 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
528 					  257] * b[l + j * b_dim1];
529 				}
530 			      c[i + j * c_dim1] = f11;
531 			      c[i + 1 + j * c_dim1] = f21;
532 			      c[i + 2 + j * c_dim1] = f31;
533 			      c[i + 3 + j * c_dim1] = f41;
534 			    }
535 			  i5 = ii + isec - 1;
536 			  for (i = ii + uisec; i <= i5; ++i)
537 			    {
538 			      f11 = c[i + j * c_dim1];
539 			      i6 = ll + lsec - 1;
540 			      for (l = ll; l <= i6; ++l)
541 				{
542 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
543 					  257] * b[l + j * b_dim1];
544 				}
545 			      c[i + j * c_dim1] = f11;
546 			    }
547 			}
548 		    }
549 		}
550 	    }
551 	}
552       free(t1);
553       return;
554     }
555   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
556     {
557       if (GFC_DESCRIPTOR_RANK (a) != 1)
558 	{
559 	  const GFC_COMPLEX_4 *restrict abase_x;
560 	  const GFC_COMPLEX_4 *restrict bbase_y;
561 	  GFC_COMPLEX_4 *restrict dest_y;
562 	  GFC_COMPLEX_4 s;
563 
564 	  for (y = 0; y < ycount; y++)
565 	    {
566 	      bbase_y = &bbase[y*bystride];
567 	      dest_y = &dest[y*rystride];
568 	      for (x = 0; x < xcount; x++)
569 		{
570 		  abase_x = &abase[x*axstride];
571 		  s = (GFC_COMPLEX_4) 0;
572 		  for (n = 0; n < count; n++)
573 		    s += abase_x[n] * bbase_y[n];
574 		  dest_y[x] = s;
575 		}
576 	    }
577 	}
578       else
579 	{
580 	  const GFC_COMPLEX_4 *restrict bbase_y;
581 	  GFC_COMPLEX_4 s;
582 
583 	  for (y = 0; y < ycount; y++)
584 	    {
585 	      bbase_y = &bbase[y*bystride];
586 	      s = (GFC_COMPLEX_4) 0;
587 	      for (n = 0; n < count; n++)
588 		s += abase[n*axstride] * bbase_y[n];
589 	      dest[y*rystride] = s;
590 	    }
591 	}
592     }
593   else if (axstride < aystride)
594     {
595       for (y = 0; y < ycount; y++)
596 	for (x = 0; x < xcount; x++)
597 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
598 
599       for (y = 0; y < ycount; y++)
600 	for (n = 0; n < count; n++)
601 	  for (x = 0; x < xcount; x++)
602 	    /* dest[x,y] += a[x,n] * b[n,y] */
603 	    dest[x*rxstride + y*rystride] +=
604 					abase[x*axstride + n*aystride] *
605 					bbase[n*bxstride + y*bystride];
606     }
607   else if (GFC_DESCRIPTOR_RANK (a) == 1)
608     {
609       const GFC_COMPLEX_4 *restrict bbase_y;
610       GFC_COMPLEX_4 s;
611 
612       for (y = 0; y < ycount; y++)
613 	{
614 	  bbase_y = &bbase[y*bystride];
615 	  s = (GFC_COMPLEX_4) 0;
616 	  for (n = 0; n < count; n++)
617 	    s += abase[n*axstride] * bbase_y[n*bxstride];
618 	  dest[y*rxstride] = s;
619 	}
620     }
621   else
622     {
623       const GFC_COMPLEX_4 *restrict abase_x;
624       const GFC_COMPLEX_4 *restrict bbase_y;
625       GFC_COMPLEX_4 *restrict dest_y;
626       GFC_COMPLEX_4 s;
627 
628       for (y = 0; y < ycount; y++)
629 	{
630 	  bbase_y = &bbase[y*bystride];
631 	  dest_y = &dest[y*rystride];
632 	  for (x = 0; x < xcount; x++)
633 	    {
634 	      abase_x = &abase[x*axstride];
635 	      s = (GFC_COMPLEX_4) 0;
636 	      for (n = 0; n < count; n++)
637 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
638 	      dest_y[x*rxstride] = s;
639 	    }
640 	}
641     }
642 }
643 #undef POW3
644 #undef min
645 #undef max
646 
647 #endif /* HAVE_AVX */
648 
649 #ifdef HAVE_AVX2
650 static void
651 matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
652 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
653 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
654 static void
matmul_c4_avx2(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)655 matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
656 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
657 	int blas_limit, blas_call gemm)
658 {
659   const GFC_COMPLEX_4 * restrict abase;
660   const GFC_COMPLEX_4 * restrict bbase;
661   GFC_COMPLEX_4 * restrict dest;
662 
663   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
664   index_type x, y, n, count, xcount, ycount;
665 
666   assert (GFC_DESCRIPTOR_RANK (a) == 2
667           || GFC_DESCRIPTOR_RANK (b) == 2);
668 
669 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
670 
671    Either A or B (but not both) can be rank 1:
672 
673    o One-dimensional argument A is implicitly treated as a row matrix
674      dimensioned [1,count], so xcount=1.
675 
676    o One-dimensional argument B is implicitly treated as a column matrix
677      dimensioned [count, 1], so ycount=1.
678 */
679 
680   if (retarray->base_addr == NULL)
681     {
682       if (GFC_DESCRIPTOR_RANK (a) == 1)
683         {
684 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
685 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
686         }
687       else if (GFC_DESCRIPTOR_RANK (b) == 1)
688         {
689 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
690 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
691         }
692       else
693         {
694 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
695 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
696 
697           GFC_DIMENSION_SET(retarray->dim[1], 0,
698 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
699 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
700         }
701 
702       retarray->base_addr
703 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
704       retarray->offset = 0;
705     }
706   else if (unlikely (compile_options.bounds_check))
707     {
708       index_type ret_extent, arg_extent;
709 
710       if (GFC_DESCRIPTOR_RANK (a) == 1)
711 	{
712 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
713 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
714 	  if (arg_extent != ret_extent)
715 	    runtime_error ("Array bound mismatch for dimension 1 of "
716 	    		   "array (%ld/%ld) ",
717 			   (long int) ret_extent, (long int) arg_extent);
718 	}
719       else if (GFC_DESCRIPTOR_RANK (b) == 1)
720 	{
721 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
722 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
723 	  if (arg_extent != ret_extent)
724 	    runtime_error ("Array bound mismatch for dimension 1 of "
725 	    		   "array (%ld/%ld) ",
726 			   (long int) ret_extent, (long int) arg_extent);
727 	}
728       else
729 	{
730 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
731 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
732 	  if (arg_extent != ret_extent)
733 	    runtime_error ("Array bound mismatch for dimension 1 of "
734 	    		   "array (%ld/%ld) ",
735 			   (long int) ret_extent, (long int) arg_extent);
736 
737 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
738 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
739 	  if (arg_extent != ret_extent)
740 	    runtime_error ("Array bound mismatch for dimension 2 of "
741 	    		   "array (%ld/%ld) ",
742 			   (long int) ret_extent, (long int) arg_extent);
743 	}
744     }
745 
746 
747   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
748     {
749       /* One-dimensional result may be addressed in the code below
750 	 either as a row or a column matrix. We want both cases to
751 	 work. */
752       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
753     }
754   else
755     {
756       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
757       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
758     }
759 
760 
761   if (GFC_DESCRIPTOR_RANK (a) == 1)
762     {
763       /* Treat it as a a row matrix A[1,count]. */
764       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
765       aystride = 1;
766 
767       xcount = 1;
768       count = GFC_DESCRIPTOR_EXTENT(a,0);
769     }
770   else
771     {
772       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
773       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
774 
775       count = GFC_DESCRIPTOR_EXTENT(a,1);
776       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
777     }
778 
779   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
780     {
781       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
782 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
783 		       "in dimension 1: is %ld, should be %ld",
784 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
785     }
786 
787   if (GFC_DESCRIPTOR_RANK (b) == 1)
788     {
789       /* Treat it as a column matrix B[count,1] */
790       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
791 
792       /* bystride should never be used for 1-dimensional b.
793          The value is only used for calculation of the
794          memory by the buffer.  */
795       bystride = 256;
796       ycount = 1;
797     }
798   else
799     {
800       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
801       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
802       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
803     }
804 
805   abase = a->base_addr;
806   bbase = b->base_addr;
807   dest = retarray->base_addr;
808 
809   /* Now that everything is set up, we perform the multiplication
810      itself.  */
811 
812 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
813 #define min(a,b) ((a) <= (b) ? (a) : (b))
814 #define max(a,b) ((a) >= (b) ? (a) : (b))
815 
816   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
817       && (bxstride == 1 || bystride == 1)
818       && (((float) xcount) * ((float) ycount) * ((float) count)
819           > POW3(blas_limit)))
820     {
821       const int m = xcount, n = ycount, k = count, ldc = rystride;
822       const GFC_COMPLEX_4 one = 1, zero = 0;
823       const int lda = (axstride == 1) ? aystride : axstride,
824 		ldb = (bxstride == 1) ? bystride : bxstride;
825 
826       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
827 	{
828 	  assert (gemm != NULL);
829 	  const char *transa, *transb;
830 	  if (try_blas & 2)
831 	    transa = "C";
832 	  else
833 	    transa = axstride == 1 ? "N" : "T";
834 
835 	  if (try_blas & 4)
836 	    transb = "C";
837 	  else
838 	    transb = bxstride == 1 ? "N" : "T";
839 
840 	  gemm (transa, transb , &m,
841 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
842 		&ldc, 1, 1);
843 	  return;
844 	}
845     }
846 
847   if (rxstride == 1 && axstride == 1 && bxstride == 1)
848     {
849       /* This block of code implements a tuned matmul, derived from
850          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
851 
852                Bo Kagstrom and Per Ling
853                Department of Computing Science
854                Umea University
855                S-901 87 Umea, Sweden
856 
857 	 from netlib.org, translated to C, and modified for matmul.m4.  */
858 
859       const GFC_COMPLEX_4 *a, *b;
860       GFC_COMPLEX_4 *c;
861       const index_type m = xcount, n = ycount, k = count;
862 
863       /* System generated locals */
864       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
865 		 i1, i2, i3, i4, i5, i6;
866 
867       /* Local variables */
868       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
869 		 f13, f14, f23, f24, f33, f34, f43, f44;
870       index_type i, j, l, ii, jj, ll;
871       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
872       GFC_COMPLEX_4 *t1;
873 
874       a = abase;
875       b = bbase;
876       c = retarray->base_addr;
877 
878       /* Parameter adjustments */
879       c_dim1 = rystride;
880       c_offset = 1 + c_dim1;
881       c -= c_offset;
882       a_dim1 = aystride;
883       a_offset = 1 + a_dim1;
884       a -= a_offset;
885       b_dim1 = bystride;
886       b_offset = 1 + b_dim1;
887       b -= b_offset;
888 
889       /* Empty c first.  */
890       for (j=1; j<=n; j++)
891 	for (i=1; i<=m; i++)
892 	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
893 
894       /* Early exit if possible */
895       if (m == 0 || n == 0 || k == 0)
896 	return;
897 
898       /* Adjust size of t1 to what is needed.  */
899       index_type t1_dim, a_sz;
900       if (aystride == 1)
901         a_sz = rystride;
902       else
903         a_sz = a_dim1;
904 
905       t1_dim = a_sz * 256 + b_dim1;
906       if (t1_dim > 65536)
907 	t1_dim = 65536;
908 
909       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
910 
911       /* Start turning the crank. */
912       i1 = n;
913       for (jj = 1; jj <= i1; jj += 512)
914 	{
915 	  /* Computing MIN */
916 	  i2 = 512;
917 	  i3 = n - jj + 1;
918 	  jsec = min(i2,i3);
919 	  ujsec = jsec - jsec % 4;
920 	  i2 = k;
921 	  for (ll = 1; ll <= i2; ll += 256)
922 	    {
923 	      /* Computing MIN */
924 	      i3 = 256;
925 	      i4 = k - ll + 1;
926 	      lsec = min(i3,i4);
927 	      ulsec = lsec - lsec % 2;
928 
929 	      i3 = m;
930 	      for (ii = 1; ii <= i3; ii += 256)
931 		{
932 		  /* Computing MIN */
933 		  i4 = 256;
934 		  i5 = m - ii + 1;
935 		  isec = min(i4,i5);
936 		  uisec = isec - isec % 2;
937 		  i4 = ll + ulsec - 1;
938 		  for (l = ll; l <= i4; l += 2)
939 		    {
940 		      i5 = ii + uisec - 1;
941 		      for (i = ii; i <= i5; i += 2)
942 			{
943 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
944 					a[i + l * a_dim1];
945 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
946 					a[i + (l + 1) * a_dim1];
947 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
948 					a[i + 1 + l * a_dim1];
949 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
950 					a[i + 1 + (l + 1) * a_dim1];
951 			}
952 		      if (uisec < isec)
953 			{
954 			  t1[l - ll + 1 + (isec << 8) - 257] =
955 				    a[ii + isec - 1 + l * a_dim1];
956 			  t1[l - ll + 2 + (isec << 8) - 257] =
957 				    a[ii + isec - 1 + (l + 1) * a_dim1];
958 			}
959 		    }
960 		  if (ulsec < lsec)
961 		    {
962 		      i4 = ii + isec - 1;
963 		      for (i = ii; i<= i4; ++i)
964 			{
965 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
966 				    a[i + (ll + lsec - 1) * a_dim1];
967 			}
968 		    }
969 
970 		  uisec = isec - isec % 4;
971 		  i4 = jj + ujsec - 1;
972 		  for (j = jj; j <= i4; j += 4)
973 		    {
974 		      i5 = ii + uisec - 1;
975 		      for (i = ii; i <= i5; i += 4)
976 			{
977 			  f11 = c[i + j * c_dim1];
978 			  f21 = c[i + 1 + j * c_dim1];
979 			  f12 = c[i + (j + 1) * c_dim1];
980 			  f22 = c[i + 1 + (j + 1) * c_dim1];
981 			  f13 = c[i + (j + 2) * c_dim1];
982 			  f23 = c[i + 1 + (j + 2) * c_dim1];
983 			  f14 = c[i + (j + 3) * c_dim1];
984 			  f24 = c[i + 1 + (j + 3) * c_dim1];
985 			  f31 = c[i + 2 + j * c_dim1];
986 			  f41 = c[i + 3 + j * c_dim1];
987 			  f32 = c[i + 2 + (j + 1) * c_dim1];
988 			  f42 = c[i + 3 + (j + 1) * c_dim1];
989 			  f33 = c[i + 2 + (j + 2) * c_dim1];
990 			  f43 = c[i + 3 + (j + 2) * c_dim1];
991 			  f34 = c[i + 2 + (j + 3) * c_dim1];
992 			  f44 = c[i + 3 + (j + 3) * c_dim1];
993 			  i6 = ll + lsec - 1;
994 			  for (l = ll; l <= i6; ++l)
995 			    {
996 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
997 				      * b[l + j * b_dim1];
998 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
999 				      * b[l + j * b_dim1];
1000 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1001 				      * b[l + (j + 1) * b_dim1];
1002 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1003 				      * b[l + (j + 1) * b_dim1];
1004 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1005 				      * b[l + (j + 2) * b_dim1];
1006 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1007 				      * b[l + (j + 2) * b_dim1];
1008 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1009 				      * b[l + (j + 3) * b_dim1];
1010 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1011 				      * b[l + (j + 3) * b_dim1];
1012 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1013 				      * b[l + j * b_dim1];
1014 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1015 				      * b[l + j * b_dim1];
1016 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1017 				      * b[l + (j + 1) * b_dim1];
1018 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1019 				      * b[l + (j + 1) * b_dim1];
1020 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1021 				      * b[l + (j + 2) * b_dim1];
1022 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1023 				      * b[l + (j + 2) * b_dim1];
1024 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1025 				      * b[l + (j + 3) * b_dim1];
1026 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1027 				      * b[l + (j + 3) * b_dim1];
1028 			    }
1029 			  c[i + j * c_dim1] = f11;
1030 			  c[i + 1 + j * c_dim1] = f21;
1031 			  c[i + (j + 1) * c_dim1] = f12;
1032 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1033 			  c[i + (j + 2) * c_dim1] = f13;
1034 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1035 			  c[i + (j + 3) * c_dim1] = f14;
1036 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1037 			  c[i + 2 + j * c_dim1] = f31;
1038 			  c[i + 3 + j * c_dim1] = f41;
1039 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1040 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1041 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1042 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1043 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1044 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1045 			}
1046 		      if (uisec < isec)
1047 			{
1048 			  i5 = ii + isec - 1;
1049 			  for (i = ii + uisec; i <= i5; ++i)
1050 			    {
1051 			      f11 = c[i + j * c_dim1];
1052 			      f12 = c[i + (j + 1) * c_dim1];
1053 			      f13 = c[i + (j + 2) * c_dim1];
1054 			      f14 = c[i + (j + 3) * c_dim1];
1055 			      i6 = ll + lsec - 1;
1056 			      for (l = ll; l <= i6; ++l)
1057 				{
1058 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1059 					  257] * b[l + j * b_dim1];
1060 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061 					  257] * b[l + (j + 1) * b_dim1];
1062 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063 					  257] * b[l + (j + 2) * b_dim1];
1064 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065 					  257] * b[l + (j + 3) * b_dim1];
1066 				}
1067 			      c[i + j * c_dim1] = f11;
1068 			      c[i + (j + 1) * c_dim1] = f12;
1069 			      c[i + (j + 2) * c_dim1] = f13;
1070 			      c[i + (j + 3) * c_dim1] = f14;
1071 			    }
1072 			}
1073 		    }
1074 		  if (ujsec < jsec)
1075 		    {
1076 		      i4 = jj + jsec - 1;
1077 		      for (j = jj + ujsec; j <= i4; ++j)
1078 			{
1079 			  i5 = ii + uisec - 1;
1080 			  for (i = ii; i <= i5; i += 4)
1081 			    {
1082 			      f11 = c[i + j * c_dim1];
1083 			      f21 = c[i + 1 + j * c_dim1];
1084 			      f31 = c[i + 2 + j * c_dim1];
1085 			      f41 = c[i + 3 + j * c_dim1];
1086 			      i6 = ll + lsec - 1;
1087 			      for (l = ll; l <= i6; ++l)
1088 				{
1089 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1090 					  257] * b[l + j * b_dim1];
1091 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1092 					  257] * b[l + j * b_dim1];
1093 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1094 					  257] * b[l + j * b_dim1];
1095 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1096 					  257] * b[l + j * b_dim1];
1097 				}
1098 			      c[i + j * c_dim1] = f11;
1099 			      c[i + 1 + j * c_dim1] = f21;
1100 			      c[i + 2 + j * c_dim1] = f31;
1101 			      c[i + 3 + j * c_dim1] = f41;
1102 			    }
1103 			  i5 = ii + isec - 1;
1104 			  for (i = ii + uisec; i <= i5; ++i)
1105 			    {
1106 			      f11 = c[i + j * c_dim1];
1107 			      i6 = ll + lsec - 1;
1108 			      for (l = ll; l <= i6; ++l)
1109 				{
1110 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1111 					  257] * b[l + j * b_dim1];
1112 				}
1113 			      c[i + j * c_dim1] = f11;
1114 			    }
1115 			}
1116 		    }
1117 		}
1118 	    }
1119 	}
1120       free(t1);
1121       return;
1122     }
1123   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1124     {
1125       if (GFC_DESCRIPTOR_RANK (a) != 1)
1126 	{
1127 	  const GFC_COMPLEX_4 *restrict abase_x;
1128 	  const GFC_COMPLEX_4 *restrict bbase_y;
1129 	  GFC_COMPLEX_4 *restrict dest_y;
1130 	  GFC_COMPLEX_4 s;
1131 
1132 	  for (y = 0; y < ycount; y++)
1133 	    {
1134 	      bbase_y = &bbase[y*bystride];
1135 	      dest_y = &dest[y*rystride];
1136 	      for (x = 0; x < xcount; x++)
1137 		{
1138 		  abase_x = &abase[x*axstride];
1139 		  s = (GFC_COMPLEX_4) 0;
1140 		  for (n = 0; n < count; n++)
1141 		    s += abase_x[n] * bbase_y[n];
1142 		  dest_y[x] = s;
1143 		}
1144 	    }
1145 	}
1146       else
1147 	{
1148 	  const GFC_COMPLEX_4 *restrict bbase_y;
1149 	  GFC_COMPLEX_4 s;
1150 
1151 	  for (y = 0; y < ycount; y++)
1152 	    {
1153 	      bbase_y = &bbase[y*bystride];
1154 	      s = (GFC_COMPLEX_4) 0;
1155 	      for (n = 0; n < count; n++)
1156 		s += abase[n*axstride] * bbase_y[n];
1157 	      dest[y*rystride] = s;
1158 	    }
1159 	}
1160     }
1161   else if (axstride < aystride)
1162     {
1163       for (y = 0; y < ycount; y++)
1164 	for (x = 0; x < xcount; x++)
1165 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
1166 
1167       for (y = 0; y < ycount; y++)
1168 	for (n = 0; n < count; n++)
1169 	  for (x = 0; x < xcount; x++)
1170 	    /* dest[x,y] += a[x,n] * b[n,y] */
1171 	    dest[x*rxstride + y*rystride] +=
1172 					abase[x*axstride + n*aystride] *
1173 					bbase[n*bxstride + y*bystride];
1174     }
1175   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1176     {
1177       const GFC_COMPLEX_4 *restrict bbase_y;
1178       GFC_COMPLEX_4 s;
1179 
1180       for (y = 0; y < ycount; y++)
1181 	{
1182 	  bbase_y = &bbase[y*bystride];
1183 	  s = (GFC_COMPLEX_4) 0;
1184 	  for (n = 0; n < count; n++)
1185 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1186 	  dest[y*rxstride] = s;
1187 	}
1188     }
1189   else
1190     {
1191       const GFC_COMPLEX_4 *restrict abase_x;
1192       const GFC_COMPLEX_4 *restrict bbase_y;
1193       GFC_COMPLEX_4 *restrict dest_y;
1194       GFC_COMPLEX_4 s;
1195 
1196       for (y = 0; y < ycount; y++)
1197 	{
1198 	  bbase_y = &bbase[y*bystride];
1199 	  dest_y = &dest[y*rystride];
1200 	  for (x = 0; x < xcount; x++)
1201 	    {
1202 	      abase_x = &abase[x*axstride];
1203 	      s = (GFC_COMPLEX_4) 0;
1204 	      for (n = 0; n < count; n++)
1205 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1206 	      dest_y[x*rxstride] = s;
1207 	    }
1208 	}
1209     }
1210 }
1211 #undef POW3
1212 #undef min
1213 #undef max
1214 
1215 #endif /* HAVE_AVX2 */
1216 
1217 #ifdef HAVE_AVX512F
1218 static void
1219 matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
1220 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1221 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1222 static void
matmul_c4_avx512f(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1223 matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
1224 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1225 	int blas_limit, blas_call gemm)
1226 {
1227   const GFC_COMPLEX_4 * restrict abase;
1228   const GFC_COMPLEX_4 * restrict bbase;
1229   GFC_COMPLEX_4 * restrict dest;
1230 
1231   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1232   index_type x, y, n, count, xcount, ycount;
1233 
1234   assert (GFC_DESCRIPTOR_RANK (a) == 2
1235           || GFC_DESCRIPTOR_RANK (b) == 2);
1236 
1237 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1238 
1239    Either A or B (but not both) can be rank 1:
1240 
1241    o One-dimensional argument A is implicitly treated as a row matrix
1242      dimensioned [1,count], so xcount=1.
1243 
1244    o One-dimensional argument B is implicitly treated as a column matrix
1245      dimensioned [count, 1], so ycount=1.
1246 */
1247 
1248   if (retarray->base_addr == NULL)
1249     {
1250       if (GFC_DESCRIPTOR_RANK (a) == 1)
1251         {
1252 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1253 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1254         }
1255       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1256         {
1257 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1258 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1259         }
1260       else
1261         {
1262 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1263 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1264 
1265           GFC_DIMENSION_SET(retarray->dim[1], 0,
1266 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1267 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1268         }
1269 
1270       retarray->base_addr
1271 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
1272       retarray->offset = 0;
1273     }
1274   else if (unlikely (compile_options.bounds_check))
1275     {
1276       index_type ret_extent, arg_extent;
1277 
1278       if (GFC_DESCRIPTOR_RANK (a) == 1)
1279 	{
1280 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1281 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1282 	  if (arg_extent != ret_extent)
1283 	    runtime_error ("Array bound mismatch for dimension 1 of "
1284 	    		   "array (%ld/%ld) ",
1285 			   (long int) ret_extent, (long int) arg_extent);
1286 	}
1287       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1288 	{
1289 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1290 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1291 	  if (arg_extent != ret_extent)
1292 	    runtime_error ("Array bound mismatch for dimension 1 of "
1293 	    		   "array (%ld/%ld) ",
1294 			   (long int) ret_extent, (long int) arg_extent);
1295 	}
1296       else
1297 	{
1298 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1299 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1300 	  if (arg_extent != ret_extent)
1301 	    runtime_error ("Array bound mismatch for dimension 1 of "
1302 	    		   "array (%ld/%ld) ",
1303 			   (long int) ret_extent, (long int) arg_extent);
1304 
1305 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1306 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1307 	  if (arg_extent != ret_extent)
1308 	    runtime_error ("Array bound mismatch for dimension 2 of "
1309 	    		   "array (%ld/%ld) ",
1310 			   (long int) ret_extent, (long int) arg_extent);
1311 	}
1312     }
1313 
1314 
1315   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1316     {
1317       /* One-dimensional result may be addressed in the code below
1318 	 either as a row or a column matrix. We want both cases to
1319 	 work. */
1320       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1321     }
1322   else
1323     {
1324       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1325       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1326     }
1327 
1328 
1329   if (GFC_DESCRIPTOR_RANK (a) == 1)
1330     {
1331       /* Treat it as a a row matrix A[1,count]. */
1332       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1333       aystride = 1;
1334 
1335       xcount = 1;
1336       count = GFC_DESCRIPTOR_EXTENT(a,0);
1337     }
1338   else
1339     {
1340       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1341       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1342 
1343       count = GFC_DESCRIPTOR_EXTENT(a,1);
1344       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1345     }
1346 
1347   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1348     {
1349       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1350 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1351 		       "in dimension 1: is %ld, should be %ld",
1352 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1353     }
1354 
1355   if (GFC_DESCRIPTOR_RANK (b) == 1)
1356     {
1357       /* Treat it as a column matrix B[count,1] */
1358       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1359 
1360       /* bystride should never be used for 1-dimensional b.
1361          The value is only used for calculation of the
1362          memory by the buffer.  */
1363       bystride = 256;
1364       ycount = 1;
1365     }
1366   else
1367     {
1368       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1369       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1370       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1371     }
1372 
1373   abase = a->base_addr;
1374   bbase = b->base_addr;
1375   dest = retarray->base_addr;
1376 
1377   /* Now that everything is set up, we perform the multiplication
1378      itself.  */
1379 
1380 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1381 #define min(a,b) ((a) <= (b) ? (a) : (b))
1382 #define max(a,b) ((a) >= (b) ? (a) : (b))
1383 
1384   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1385       && (bxstride == 1 || bystride == 1)
1386       && (((float) xcount) * ((float) ycount) * ((float) count)
1387           > POW3(blas_limit)))
1388     {
1389       const int m = xcount, n = ycount, k = count, ldc = rystride;
1390       const GFC_COMPLEX_4 one = 1, zero = 0;
1391       const int lda = (axstride == 1) ? aystride : axstride,
1392 		ldb = (bxstride == 1) ? bystride : bxstride;
1393 
1394       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1395 	{
1396 	  assert (gemm != NULL);
1397 	  const char *transa, *transb;
1398 	  if (try_blas & 2)
1399 	    transa = "C";
1400 	  else
1401 	    transa = axstride == 1 ? "N" : "T";
1402 
1403 	  if (try_blas & 4)
1404 	    transb = "C";
1405 	  else
1406 	    transb = bxstride == 1 ? "N" : "T";
1407 
1408 	  gemm (transa, transb , &m,
1409 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1410 		&ldc, 1, 1);
1411 	  return;
1412 	}
1413     }
1414 
1415   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1416     {
1417       /* This block of code implements a tuned matmul, derived from
1418          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1419 
1420                Bo Kagstrom and Per Ling
1421                Department of Computing Science
1422                Umea University
1423                S-901 87 Umea, Sweden
1424 
1425 	 from netlib.org, translated to C, and modified for matmul.m4.  */
1426 
1427       const GFC_COMPLEX_4 *a, *b;
1428       GFC_COMPLEX_4 *c;
1429       const index_type m = xcount, n = ycount, k = count;
1430 
1431       /* System generated locals */
1432       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1433 		 i1, i2, i3, i4, i5, i6;
1434 
1435       /* Local variables */
1436       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
1437 		 f13, f14, f23, f24, f33, f34, f43, f44;
1438       index_type i, j, l, ii, jj, ll;
1439       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1440       GFC_COMPLEX_4 *t1;
1441 
1442       a = abase;
1443       b = bbase;
1444       c = retarray->base_addr;
1445 
1446       /* Parameter adjustments */
1447       c_dim1 = rystride;
1448       c_offset = 1 + c_dim1;
1449       c -= c_offset;
1450       a_dim1 = aystride;
1451       a_offset = 1 + a_dim1;
1452       a -= a_offset;
1453       b_dim1 = bystride;
1454       b_offset = 1 + b_dim1;
1455       b -= b_offset;
1456 
1457       /* Empty c first.  */
1458       for (j=1; j<=n; j++)
1459 	for (i=1; i<=m; i++)
1460 	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
1461 
1462       /* Early exit if possible */
1463       if (m == 0 || n == 0 || k == 0)
1464 	return;
1465 
1466       /* Adjust size of t1 to what is needed.  */
1467       index_type t1_dim, a_sz;
1468       if (aystride == 1)
1469         a_sz = rystride;
1470       else
1471         a_sz = a_dim1;
1472 
1473       t1_dim = a_sz * 256 + b_dim1;
1474       if (t1_dim > 65536)
1475 	t1_dim = 65536;
1476 
1477       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
1478 
1479       /* Start turning the crank. */
1480       i1 = n;
1481       for (jj = 1; jj <= i1; jj += 512)
1482 	{
1483 	  /* Computing MIN */
1484 	  i2 = 512;
1485 	  i3 = n - jj + 1;
1486 	  jsec = min(i2,i3);
1487 	  ujsec = jsec - jsec % 4;
1488 	  i2 = k;
1489 	  for (ll = 1; ll <= i2; ll += 256)
1490 	    {
1491 	      /* Computing MIN */
1492 	      i3 = 256;
1493 	      i4 = k - ll + 1;
1494 	      lsec = min(i3,i4);
1495 	      ulsec = lsec - lsec % 2;
1496 
1497 	      i3 = m;
1498 	      for (ii = 1; ii <= i3; ii += 256)
1499 		{
1500 		  /* Computing MIN */
1501 		  i4 = 256;
1502 		  i5 = m - ii + 1;
1503 		  isec = min(i4,i5);
1504 		  uisec = isec - isec % 2;
1505 		  i4 = ll + ulsec - 1;
1506 		  for (l = ll; l <= i4; l += 2)
1507 		    {
1508 		      i5 = ii + uisec - 1;
1509 		      for (i = ii; i <= i5; i += 2)
1510 			{
1511 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1512 					a[i + l * a_dim1];
1513 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1514 					a[i + (l + 1) * a_dim1];
1515 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1516 					a[i + 1 + l * a_dim1];
1517 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1518 					a[i + 1 + (l + 1) * a_dim1];
1519 			}
1520 		      if (uisec < isec)
1521 			{
1522 			  t1[l - ll + 1 + (isec << 8) - 257] =
1523 				    a[ii + isec - 1 + l * a_dim1];
1524 			  t1[l - ll + 2 + (isec << 8) - 257] =
1525 				    a[ii + isec - 1 + (l + 1) * a_dim1];
1526 			}
1527 		    }
1528 		  if (ulsec < lsec)
1529 		    {
1530 		      i4 = ii + isec - 1;
1531 		      for (i = ii; i<= i4; ++i)
1532 			{
1533 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1534 				    a[i + (ll + lsec - 1) * a_dim1];
1535 			}
1536 		    }
1537 
1538 		  uisec = isec - isec % 4;
1539 		  i4 = jj + ujsec - 1;
1540 		  for (j = jj; j <= i4; j += 4)
1541 		    {
1542 		      i5 = ii + uisec - 1;
1543 		      for (i = ii; i <= i5; i += 4)
1544 			{
1545 			  f11 = c[i + j * c_dim1];
1546 			  f21 = c[i + 1 + j * c_dim1];
1547 			  f12 = c[i + (j + 1) * c_dim1];
1548 			  f22 = c[i + 1 + (j + 1) * c_dim1];
1549 			  f13 = c[i + (j + 2) * c_dim1];
1550 			  f23 = c[i + 1 + (j + 2) * c_dim1];
1551 			  f14 = c[i + (j + 3) * c_dim1];
1552 			  f24 = c[i + 1 + (j + 3) * c_dim1];
1553 			  f31 = c[i + 2 + j * c_dim1];
1554 			  f41 = c[i + 3 + j * c_dim1];
1555 			  f32 = c[i + 2 + (j + 1) * c_dim1];
1556 			  f42 = c[i + 3 + (j + 1) * c_dim1];
1557 			  f33 = c[i + 2 + (j + 2) * c_dim1];
1558 			  f43 = c[i + 3 + (j + 2) * c_dim1];
1559 			  f34 = c[i + 2 + (j + 3) * c_dim1];
1560 			  f44 = c[i + 3 + (j + 3) * c_dim1];
1561 			  i6 = ll + lsec - 1;
1562 			  for (l = ll; l <= i6; ++l)
1563 			    {
1564 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1565 				      * b[l + j * b_dim1];
1566 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1567 				      * b[l + j * b_dim1];
1568 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1569 				      * b[l + (j + 1) * b_dim1];
1570 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1571 				      * b[l + (j + 1) * b_dim1];
1572 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1573 				      * b[l + (j + 2) * b_dim1];
1574 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1575 				      * b[l + (j + 2) * b_dim1];
1576 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1577 				      * b[l + (j + 3) * b_dim1];
1578 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1579 				      * b[l + (j + 3) * b_dim1];
1580 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1581 				      * b[l + j * b_dim1];
1582 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1583 				      * b[l + j * b_dim1];
1584 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1585 				      * b[l + (j + 1) * b_dim1];
1586 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1587 				      * b[l + (j + 1) * b_dim1];
1588 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1589 				      * b[l + (j + 2) * b_dim1];
1590 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1591 				      * b[l + (j + 2) * b_dim1];
1592 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1593 				      * b[l + (j + 3) * b_dim1];
1594 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1595 				      * b[l + (j + 3) * b_dim1];
1596 			    }
1597 			  c[i + j * c_dim1] = f11;
1598 			  c[i + 1 + j * c_dim1] = f21;
1599 			  c[i + (j + 1) * c_dim1] = f12;
1600 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1601 			  c[i + (j + 2) * c_dim1] = f13;
1602 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1603 			  c[i + (j + 3) * c_dim1] = f14;
1604 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1605 			  c[i + 2 + j * c_dim1] = f31;
1606 			  c[i + 3 + j * c_dim1] = f41;
1607 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1608 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1609 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1610 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1611 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1612 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1613 			}
1614 		      if (uisec < isec)
1615 			{
1616 			  i5 = ii + isec - 1;
1617 			  for (i = ii + uisec; i <= i5; ++i)
1618 			    {
1619 			      f11 = c[i + j * c_dim1];
1620 			      f12 = c[i + (j + 1) * c_dim1];
1621 			      f13 = c[i + (j + 2) * c_dim1];
1622 			      f14 = c[i + (j + 3) * c_dim1];
1623 			      i6 = ll + lsec - 1;
1624 			      for (l = ll; l <= i6; ++l)
1625 				{
1626 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1627 					  257] * b[l + j * b_dim1];
1628 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1629 					  257] * b[l + (j + 1) * b_dim1];
1630 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631 					  257] * b[l + (j + 2) * b_dim1];
1632 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1633 					  257] * b[l + (j + 3) * b_dim1];
1634 				}
1635 			      c[i + j * c_dim1] = f11;
1636 			      c[i + (j + 1) * c_dim1] = f12;
1637 			      c[i + (j + 2) * c_dim1] = f13;
1638 			      c[i + (j + 3) * c_dim1] = f14;
1639 			    }
1640 			}
1641 		    }
1642 		  if (ujsec < jsec)
1643 		    {
1644 		      i4 = jj + jsec - 1;
1645 		      for (j = jj + ujsec; j <= i4; ++j)
1646 			{
1647 			  i5 = ii + uisec - 1;
1648 			  for (i = ii; i <= i5; i += 4)
1649 			    {
1650 			      f11 = c[i + j * c_dim1];
1651 			      f21 = c[i + 1 + j * c_dim1];
1652 			      f31 = c[i + 2 + j * c_dim1];
1653 			      f41 = c[i + 3 + j * c_dim1];
1654 			      i6 = ll + lsec - 1;
1655 			      for (l = ll; l <= i6; ++l)
1656 				{
1657 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1658 					  257] * b[l + j * b_dim1];
1659 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1660 					  257] * b[l + j * b_dim1];
1661 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1662 					  257] * b[l + j * b_dim1];
1663 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1664 					  257] * b[l + j * b_dim1];
1665 				}
1666 			      c[i + j * c_dim1] = f11;
1667 			      c[i + 1 + j * c_dim1] = f21;
1668 			      c[i + 2 + j * c_dim1] = f31;
1669 			      c[i + 3 + j * c_dim1] = f41;
1670 			    }
1671 			  i5 = ii + isec - 1;
1672 			  for (i = ii + uisec; i <= i5; ++i)
1673 			    {
1674 			      f11 = c[i + j * c_dim1];
1675 			      i6 = ll + lsec - 1;
1676 			      for (l = ll; l <= i6; ++l)
1677 				{
1678 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1679 					  257] * b[l + j * b_dim1];
1680 				}
1681 			      c[i + j * c_dim1] = f11;
1682 			    }
1683 			}
1684 		    }
1685 		}
1686 	    }
1687 	}
1688       free(t1);
1689       return;
1690     }
1691   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1692     {
1693       if (GFC_DESCRIPTOR_RANK (a) != 1)
1694 	{
1695 	  const GFC_COMPLEX_4 *restrict abase_x;
1696 	  const GFC_COMPLEX_4 *restrict bbase_y;
1697 	  GFC_COMPLEX_4 *restrict dest_y;
1698 	  GFC_COMPLEX_4 s;
1699 
1700 	  for (y = 0; y < ycount; y++)
1701 	    {
1702 	      bbase_y = &bbase[y*bystride];
1703 	      dest_y = &dest[y*rystride];
1704 	      for (x = 0; x < xcount; x++)
1705 		{
1706 		  abase_x = &abase[x*axstride];
1707 		  s = (GFC_COMPLEX_4) 0;
1708 		  for (n = 0; n < count; n++)
1709 		    s += abase_x[n] * bbase_y[n];
1710 		  dest_y[x] = s;
1711 		}
1712 	    }
1713 	}
1714       else
1715 	{
1716 	  const GFC_COMPLEX_4 *restrict bbase_y;
1717 	  GFC_COMPLEX_4 s;
1718 
1719 	  for (y = 0; y < ycount; y++)
1720 	    {
1721 	      bbase_y = &bbase[y*bystride];
1722 	      s = (GFC_COMPLEX_4) 0;
1723 	      for (n = 0; n < count; n++)
1724 		s += abase[n*axstride] * bbase_y[n];
1725 	      dest[y*rystride] = s;
1726 	    }
1727 	}
1728     }
1729   else if (axstride < aystride)
1730     {
1731       for (y = 0; y < ycount; y++)
1732 	for (x = 0; x < xcount; x++)
1733 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
1734 
1735       for (y = 0; y < ycount; y++)
1736 	for (n = 0; n < count; n++)
1737 	  for (x = 0; x < xcount; x++)
1738 	    /* dest[x,y] += a[x,n] * b[n,y] */
1739 	    dest[x*rxstride + y*rystride] +=
1740 					abase[x*axstride + n*aystride] *
1741 					bbase[n*bxstride + y*bystride];
1742     }
1743   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1744     {
1745       const GFC_COMPLEX_4 *restrict bbase_y;
1746       GFC_COMPLEX_4 s;
1747 
1748       for (y = 0; y < ycount; y++)
1749 	{
1750 	  bbase_y = &bbase[y*bystride];
1751 	  s = (GFC_COMPLEX_4) 0;
1752 	  for (n = 0; n < count; n++)
1753 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1754 	  dest[y*rxstride] = s;
1755 	}
1756     }
1757   else
1758     {
1759       const GFC_COMPLEX_4 *restrict abase_x;
1760       const GFC_COMPLEX_4 *restrict bbase_y;
1761       GFC_COMPLEX_4 *restrict dest_y;
1762       GFC_COMPLEX_4 s;
1763 
1764       for (y = 0; y < ycount; y++)
1765 	{
1766 	  bbase_y = &bbase[y*bystride];
1767 	  dest_y = &dest[y*rystride];
1768 	  for (x = 0; x < xcount; x++)
1769 	    {
1770 	      abase_x = &abase[x*axstride];
1771 	      s = (GFC_COMPLEX_4) 0;
1772 	      for (n = 0; n < count; n++)
1773 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1774 	      dest_y[x*rxstride] = s;
1775 	    }
1776 	}
1777     }
1778 }
1779 #undef POW3
1780 #undef min
1781 #undef max
1782 
1783 #endif  /* HAVE_AVX512F */
1784 
1785 /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1786 
1787 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1788 void
1789 matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
1790 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1791 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1792 internal_proto(matmul_c4_avx128_fma3);
1793 #endif
1794 
1795 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1796 void
1797 matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
1798 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1799 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1800 internal_proto(matmul_c4_avx128_fma4);
1801 #endif
1802 
1803 /* Function to fall back to if there is no special processor-specific version.  */
1804 static void
matmul_c4_vanilla(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1805 matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
1806 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1807 	int blas_limit, blas_call gemm)
1808 {
1809   const GFC_COMPLEX_4 * restrict abase;
1810   const GFC_COMPLEX_4 * restrict bbase;
1811   GFC_COMPLEX_4 * restrict dest;
1812 
1813   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1814   index_type x, y, n, count, xcount, ycount;
1815 
1816   assert (GFC_DESCRIPTOR_RANK (a) == 2
1817           || GFC_DESCRIPTOR_RANK (b) == 2);
1818 
1819 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1820 
1821    Either A or B (but not both) can be rank 1:
1822 
1823    o One-dimensional argument A is implicitly treated as a row matrix
1824      dimensioned [1,count], so xcount=1.
1825 
1826    o One-dimensional argument B is implicitly treated as a column matrix
1827      dimensioned [count, 1], so ycount=1.
1828 */
1829 
1830   if (retarray->base_addr == NULL)
1831     {
1832       if (GFC_DESCRIPTOR_RANK (a) == 1)
1833         {
1834 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1835 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1836         }
1837       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1838         {
1839 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1840 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1841         }
1842       else
1843         {
1844 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1845 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1846 
1847           GFC_DIMENSION_SET(retarray->dim[1], 0,
1848 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1849 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1850         }
1851 
1852       retarray->base_addr
1853 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
1854       retarray->offset = 0;
1855     }
1856   else if (unlikely (compile_options.bounds_check))
1857     {
1858       index_type ret_extent, arg_extent;
1859 
1860       if (GFC_DESCRIPTOR_RANK (a) == 1)
1861 	{
1862 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1863 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1864 	  if (arg_extent != ret_extent)
1865 	    runtime_error ("Array bound mismatch for dimension 1 of "
1866 	    		   "array (%ld/%ld) ",
1867 			   (long int) ret_extent, (long int) arg_extent);
1868 	}
1869       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1870 	{
1871 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1872 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1873 	  if (arg_extent != ret_extent)
1874 	    runtime_error ("Array bound mismatch for dimension 1 of "
1875 	    		   "array (%ld/%ld) ",
1876 			   (long int) ret_extent, (long int) arg_extent);
1877 	}
1878       else
1879 	{
1880 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1881 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1882 	  if (arg_extent != ret_extent)
1883 	    runtime_error ("Array bound mismatch for dimension 1 of "
1884 	    		   "array (%ld/%ld) ",
1885 			   (long int) ret_extent, (long int) arg_extent);
1886 
1887 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1888 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1889 	  if (arg_extent != ret_extent)
1890 	    runtime_error ("Array bound mismatch for dimension 2 of "
1891 	    		   "array (%ld/%ld) ",
1892 			   (long int) ret_extent, (long int) arg_extent);
1893 	}
1894     }
1895 
1896 
1897   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1898     {
1899       /* One-dimensional result may be addressed in the code below
1900 	 either as a row or a column matrix. We want both cases to
1901 	 work. */
1902       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1903     }
1904   else
1905     {
1906       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1907       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1908     }
1909 
1910 
1911   if (GFC_DESCRIPTOR_RANK (a) == 1)
1912     {
1913       /* Treat it as a a row matrix A[1,count]. */
1914       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1915       aystride = 1;
1916 
1917       xcount = 1;
1918       count = GFC_DESCRIPTOR_EXTENT(a,0);
1919     }
1920   else
1921     {
1922       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1923       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1924 
1925       count = GFC_DESCRIPTOR_EXTENT(a,1);
1926       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1927     }
1928 
1929   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1930     {
1931       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1932 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1933 		       "in dimension 1: is %ld, should be %ld",
1934 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1935     }
1936 
1937   if (GFC_DESCRIPTOR_RANK (b) == 1)
1938     {
1939       /* Treat it as a column matrix B[count,1] */
1940       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1941 
1942       /* bystride should never be used for 1-dimensional b.
1943          The value is only used for calculation of the
1944          memory by the buffer.  */
1945       bystride = 256;
1946       ycount = 1;
1947     }
1948   else
1949     {
1950       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1951       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1952       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1953     }
1954 
1955   abase = a->base_addr;
1956   bbase = b->base_addr;
1957   dest = retarray->base_addr;
1958 
1959   /* Now that everything is set up, we perform the multiplication
1960      itself.  */
1961 
1962 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1963 #define min(a,b) ((a) <= (b) ? (a) : (b))
1964 #define max(a,b) ((a) >= (b) ? (a) : (b))
1965 
1966   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1967       && (bxstride == 1 || bystride == 1)
1968       && (((float) xcount) * ((float) ycount) * ((float) count)
1969           > POW3(blas_limit)))
1970     {
1971       const int m = xcount, n = ycount, k = count, ldc = rystride;
1972       const GFC_COMPLEX_4 one = 1, zero = 0;
1973       const int lda = (axstride == 1) ? aystride : axstride,
1974 		ldb = (bxstride == 1) ? bystride : bxstride;
1975 
1976       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1977 	{
1978 	  assert (gemm != NULL);
1979 	  const char *transa, *transb;
1980 	  if (try_blas & 2)
1981 	    transa = "C";
1982 	  else
1983 	    transa = axstride == 1 ? "N" : "T";
1984 
1985 	  if (try_blas & 4)
1986 	    transb = "C";
1987 	  else
1988 	    transb = bxstride == 1 ? "N" : "T";
1989 
1990 	  gemm (transa, transb , &m,
1991 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1992 		&ldc, 1, 1);
1993 	  return;
1994 	}
1995     }
1996 
1997   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1998     {
1999       /* This block of code implements a tuned matmul, derived from
2000          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2001 
2002                Bo Kagstrom and Per Ling
2003                Department of Computing Science
2004                Umea University
2005                S-901 87 Umea, Sweden
2006 
2007 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2008 
2009       const GFC_COMPLEX_4 *a, *b;
2010       GFC_COMPLEX_4 *c;
2011       const index_type m = xcount, n = ycount, k = count;
2012 
2013       /* System generated locals */
2014       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2015 		 i1, i2, i3, i4, i5, i6;
2016 
2017       /* Local variables */
2018       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
2019 		 f13, f14, f23, f24, f33, f34, f43, f44;
2020       index_type i, j, l, ii, jj, ll;
2021       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2022       GFC_COMPLEX_4 *t1;
2023 
2024       a = abase;
2025       b = bbase;
2026       c = retarray->base_addr;
2027 
2028       /* Parameter adjustments */
2029       c_dim1 = rystride;
2030       c_offset = 1 + c_dim1;
2031       c -= c_offset;
2032       a_dim1 = aystride;
2033       a_offset = 1 + a_dim1;
2034       a -= a_offset;
2035       b_dim1 = bystride;
2036       b_offset = 1 + b_dim1;
2037       b -= b_offset;
2038 
2039       /* Empty c first.  */
2040       for (j=1; j<=n; j++)
2041 	for (i=1; i<=m; i++)
2042 	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
2043 
2044       /* Early exit if possible */
2045       if (m == 0 || n == 0 || k == 0)
2046 	return;
2047 
2048       /* Adjust size of t1 to what is needed.  */
2049       index_type t1_dim, a_sz;
2050       if (aystride == 1)
2051         a_sz = rystride;
2052       else
2053         a_sz = a_dim1;
2054 
2055       t1_dim = a_sz * 256 + b_dim1;
2056       if (t1_dim > 65536)
2057 	t1_dim = 65536;
2058 
2059       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
2060 
2061       /* Start turning the crank. */
2062       i1 = n;
2063       for (jj = 1; jj <= i1; jj += 512)
2064 	{
2065 	  /* Computing MIN */
2066 	  i2 = 512;
2067 	  i3 = n - jj + 1;
2068 	  jsec = min(i2,i3);
2069 	  ujsec = jsec - jsec % 4;
2070 	  i2 = k;
2071 	  for (ll = 1; ll <= i2; ll += 256)
2072 	    {
2073 	      /* Computing MIN */
2074 	      i3 = 256;
2075 	      i4 = k - ll + 1;
2076 	      lsec = min(i3,i4);
2077 	      ulsec = lsec - lsec % 2;
2078 
2079 	      i3 = m;
2080 	      for (ii = 1; ii <= i3; ii += 256)
2081 		{
2082 		  /* Computing MIN */
2083 		  i4 = 256;
2084 		  i5 = m - ii + 1;
2085 		  isec = min(i4,i5);
2086 		  uisec = isec - isec % 2;
2087 		  i4 = ll + ulsec - 1;
2088 		  for (l = ll; l <= i4; l += 2)
2089 		    {
2090 		      i5 = ii + uisec - 1;
2091 		      for (i = ii; i <= i5; i += 2)
2092 			{
2093 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2094 					a[i + l * a_dim1];
2095 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2096 					a[i + (l + 1) * a_dim1];
2097 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2098 					a[i + 1 + l * a_dim1];
2099 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2100 					a[i + 1 + (l + 1) * a_dim1];
2101 			}
2102 		      if (uisec < isec)
2103 			{
2104 			  t1[l - ll + 1 + (isec << 8) - 257] =
2105 				    a[ii + isec - 1 + l * a_dim1];
2106 			  t1[l - ll + 2 + (isec << 8) - 257] =
2107 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2108 			}
2109 		    }
2110 		  if (ulsec < lsec)
2111 		    {
2112 		      i4 = ii + isec - 1;
2113 		      for (i = ii; i<= i4; ++i)
2114 			{
2115 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2116 				    a[i + (ll + lsec - 1) * a_dim1];
2117 			}
2118 		    }
2119 
2120 		  uisec = isec - isec % 4;
2121 		  i4 = jj + ujsec - 1;
2122 		  for (j = jj; j <= i4; j += 4)
2123 		    {
2124 		      i5 = ii + uisec - 1;
2125 		      for (i = ii; i <= i5; i += 4)
2126 			{
2127 			  f11 = c[i + j * c_dim1];
2128 			  f21 = c[i + 1 + j * c_dim1];
2129 			  f12 = c[i + (j + 1) * c_dim1];
2130 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2131 			  f13 = c[i + (j + 2) * c_dim1];
2132 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2133 			  f14 = c[i + (j + 3) * c_dim1];
2134 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2135 			  f31 = c[i + 2 + j * c_dim1];
2136 			  f41 = c[i + 3 + j * c_dim1];
2137 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2138 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2139 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2140 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2141 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2142 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2143 			  i6 = ll + lsec - 1;
2144 			  for (l = ll; l <= i6; ++l)
2145 			    {
2146 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2147 				      * b[l + j * b_dim1];
2148 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2149 				      * b[l + j * b_dim1];
2150 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151 				      * b[l + (j + 1) * b_dim1];
2152 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153 				      * b[l + (j + 1) * b_dim1];
2154 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155 				      * b[l + (j + 2) * b_dim1];
2156 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157 				      * b[l + (j + 2) * b_dim1];
2158 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159 				      * b[l + (j + 3) * b_dim1];
2160 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161 				      * b[l + (j + 3) * b_dim1];
2162 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2163 				      * b[l + j * b_dim1];
2164 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2165 				      * b[l + j * b_dim1];
2166 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167 				      * b[l + (j + 1) * b_dim1];
2168 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169 				      * b[l + (j + 1) * b_dim1];
2170 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171 				      * b[l + (j + 2) * b_dim1];
2172 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173 				      * b[l + (j + 2) * b_dim1];
2174 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175 				      * b[l + (j + 3) * b_dim1];
2176 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177 				      * b[l + (j + 3) * b_dim1];
2178 			    }
2179 			  c[i + j * c_dim1] = f11;
2180 			  c[i + 1 + j * c_dim1] = f21;
2181 			  c[i + (j + 1) * c_dim1] = f12;
2182 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2183 			  c[i + (j + 2) * c_dim1] = f13;
2184 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2185 			  c[i + (j + 3) * c_dim1] = f14;
2186 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2187 			  c[i + 2 + j * c_dim1] = f31;
2188 			  c[i + 3 + j * c_dim1] = f41;
2189 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2190 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2191 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2192 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2193 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2194 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2195 			}
2196 		      if (uisec < isec)
2197 			{
2198 			  i5 = ii + isec - 1;
2199 			  for (i = ii + uisec; i <= i5; ++i)
2200 			    {
2201 			      f11 = c[i + j * c_dim1];
2202 			      f12 = c[i + (j + 1) * c_dim1];
2203 			      f13 = c[i + (j + 2) * c_dim1];
2204 			      f14 = c[i + (j + 3) * c_dim1];
2205 			      i6 = ll + lsec - 1;
2206 			      for (l = ll; l <= i6; ++l)
2207 				{
2208 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2209 					  257] * b[l + j * b_dim1];
2210 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2211 					  257] * b[l + (j + 1) * b_dim1];
2212 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213 					  257] * b[l + (j + 2) * b_dim1];
2214 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215 					  257] * b[l + (j + 3) * b_dim1];
2216 				}
2217 			      c[i + j * c_dim1] = f11;
2218 			      c[i + (j + 1) * c_dim1] = f12;
2219 			      c[i + (j + 2) * c_dim1] = f13;
2220 			      c[i + (j + 3) * c_dim1] = f14;
2221 			    }
2222 			}
2223 		    }
2224 		  if (ujsec < jsec)
2225 		    {
2226 		      i4 = jj + jsec - 1;
2227 		      for (j = jj + ujsec; j <= i4; ++j)
2228 			{
2229 			  i5 = ii + uisec - 1;
2230 			  for (i = ii; i <= i5; i += 4)
2231 			    {
2232 			      f11 = c[i + j * c_dim1];
2233 			      f21 = c[i + 1 + j * c_dim1];
2234 			      f31 = c[i + 2 + j * c_dim1];
2235 			      f41 = c[i + 3 + j * c_dim1];
2236 			      i6 = ll + lsec - 1;
2237 			      for (l = ll; l <= i6; ++l)
2238 				{
2239 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2240 					  257] * b[l + j * b_dim1];
2241 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2242 					  257] * b[l + j * b_dim1];
2243 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2244 					  257] * b[l + j * b_dim1];
2245 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2246 					  257] * b[l + j * b_dim1];
2247 				}
2248 			      c[i + j * c_dim1] = f11;
2249 			      c[i + 1 + j * c_dim1] = f21;
2250 			      c[i + 2 + j * c_dim1] = f31;
2251 			      c[i + 3 + j * c_dim1] = f41;
2252 			    }
2253 			  i5 = ii + isec - 1;
2254 			  for (i = ii + uisec; i <= i5; ++i)
2255 			    {
2256 			      f11 = c[i + j * c_dim1];
2257 			      i6 = ll + lsec - 1;
2258 			      for (l = ll; l <= i6; ++l)
2259 				{
2260 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2261 					  257] * b[l + j * b_dim1];
2262 				}
2263 			      c[i + j * c_dim1] = f11;
2264 			    }
2265 			}
2266 		    }
2267 		}
2268 	    }
2269 	}
2270       free(t1);
2271       return;
2272     }
2273   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2274     {
2275       if (GFC_DESCRIPTOR_RANK (a) != 1)
2276 	{
2277 	  const GFC_COMPLEX_4 *restrict abase_x;
2278 	  const GFC_COMPLEX_4 *restrict bbase_y;
2279 	  GFC_COMPLEX_4 *restrict dest_y;
2280 	  GFC_COMPLEX_4 s;
2281 
2282 	  for (y = 0; y < ycount; y++)
2283 	    {
2284 	      bbase_y = &bbase[y*bystride];
2285 	      dest_y = &dest[y*rystride];
2286 	      for (x = 0; x < xcount; x++)
2287 		{
2288 		  abase_x = &abase[x*axstride];
2289 		  s = (GFC_COMPLEX_4) 0;
2290 		  for (n = 0; n < count; n++)
2291 		    s += abase_x[n] * bbase_y[n];
2292 		  dest_y[x] = s;
2293 		}
2294 	    }
2295 	}
2296       else
2297 	{
2298 	  const GFC_COMPLEX_4 *restrict bbase_y;
2299 	  GFC_COMPLEX_4 s;
2300 
2301 	  for (y = 0; y < ycount; y++)
2302 	    {
2303 	      bbase_y = &bbase[y*bystride];
2304 	      s = (GFC_COMPLEX_4) 0;
2305 	      for (n = 0; n < count; n++)
2306 		s += abase[n*axstride] * bbase_y[n];
2307 	      dest[y*rystride] = s;
2308 	    }
2309 	}
2310     }
2311   else if (axstride < aystride)
2312     {
2313       for (y = 0; y < ycount; y++)
2314 	for (x = 0; x < xcount; x++)
2315 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
2316 
2317       for (y = 0; y < ycount; y++)
2318 	for (n = 0; n < count; n++)
2319 	  for (x = 0; x < xcount; x++)
2320 	    /* dest[x,y] += a[x,n] * b[n,y] */
2321 	    dest[x*rxstride + y*rystride] +=
2322 					abase[x*axstride + n*aystride] *
2323 					bbase[n*bxstride + y*bystride];
2324     }
2325   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2326     {
2327       const GFC_COMPLEX_4 *restrict bbase_y;
2328       GFC_COMPLEX_4 s;
2329 
2330       for (y = 0; y < ycount; y++)
2331 	{
2332 	  bbase_y = &bbase[y*bystride];
2333 	  s = (GFC_COMPLEX_4) 0;
2334 	  for (n = 0; n < count; n++)
2335 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2336 	  dest[y*rxstride] = s;
2337 	}
2338     }
2339   else
2340     {
2341       const GFC_COMPLEX_4 *restrict abase_x;
2342       const GFC_COMPLEX_4 *restrict bbase_y;
2343       GFC_COMPLEX_4 *restrict dest_y;
2344       GFC_COMPLEX_4 s;
2345 
2346       for (y = 0; y < ycount; y++)
2347 	{
2348 	  bbase_y = &bbase[y*bystride];
2349 	  dest_y = &dest[y*rystride];
2350 	  for (x = 0; x < xcount; x++)
2351 	    {
2352 	      abase_x = &abase[x*axstride];
2353 	      s = (GFC_COMPLEX_4) 0;
2354 	      for (n = 0; n < count; n++)
2355 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2356 	      dest_y[x*rxstride] = s;
2357 	    }
2358 	}
2359     }
2360 }
2361 #undef POW3
2362 #undef min
2363 #undef max
2364 
2365 
2366 /* Compiling main function, with selection code for the processor.  */
2367 
2368 /* Currently, this is i386 only.  Adjust for other architectures.  */
2369 
2370 #include <config/i386/cpuinfo.h>
matmul_c4(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2371 void matmul_c4 (gfc_array_c4 * const restrict retarray,
2372 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2373 	int blas_limit, blas_call gemm)
2374 {
2375   static void (*matmul_p) (gfc_array_c4 * const restrict retarray,
2376 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2377 	int blas_limit, blas_call gemm);
2378 
2379   void (*matmul_fn) (gfc_array_c4 * const restrict retarray,
2380 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2381 	int blas_limit, blas_call gemm);
2382 
2383   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2384   if (matmul_fn == NULL)
2385     {
2386       matmul_fn = matmul_c4_vanilla;
2387       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2388 	{
2389           /* Run down the available processors in order of preference.  */
2390 #ifdef HAVE_AVX512F
2391       	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2392 	    {
2393 	      matmul_fn = matmul_c4_avx512f;
2394 	      goto store;
2395 	    }
2396 
2397 #endif  /* HAVE_AVX512F */
2398 
2399 #ifdef HAVE_AVX2
2400       	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2401 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2402 	    {
2403 	      matmul_fn = matmul_c4_avx2;
2404 	      goto store;
2405 	    }
2406 
2407 #endif
2408 
2409 #ifdef HAVE_AVX
2410       	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2411  	    {
2412               matmul_fn = matmul_c4_avx;
2413 	      goto store;
2414 	    }
2415 #endif  /* HAVE_AVX */
2416         }
2417     else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2418       {
2419 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2420         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2421 	    && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2422 	  {
2423             matmul_fn = matmul_c4_avx128_fma3;
2424 	    goto store;
2425 	  }
2426 #endif
2427 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2428         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2429 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2430 	  {
2431             matmul_fn = matmul_c4_avx128_fma4;
2432 	    goto store;
2433 	  }
2434 #endif
2435 
2436       }
2437    store:
2438       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2439    }
2440 
2441    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2442 }
2443 
2444 #else  /* Just the vanilla function.  */
2445 
2446 void
matmul_c4(gfc_array_c4 * const restrict retarray,gfc_array_c4 * const restrict a,gfc_array_c4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2447 matmul_c4 (gfc_array_c4 * const restrict retarray,
2448 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2449 	int blas_limit, blas_call gemm)
2450 {
2451   const GFC_COMPLEX_4 * restrict abase;
2452   const GFC_COMPLEX_4 * restrict bbase;
2453   GFC_COMPLEX_4 * restrict dest;
2454 
2455   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2456   index_type x, y, n, count, xcount, ycount;
2457 
2458   assert (GFC_DESCRIPTOR_RANK (a) == 2
2459           || GFC_DESCRIPTOR_RANK (b) == 2);
2460 
2461 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2462 
2463    Either A or B (but not both) can be rank 1:
2464 
2465    o One-dimensional argument A is implicitly treated as a row matrix
2466      dimensioned [1,count], so xcount=1.
2467 
2468    o One-dimensional argument B is implicitly treated as a column matrix
2469      dimensioned [count, 1], so ycount=1.
2470 */
2471 
2472   if (retarray->base_addr == NULL)
2473     {
2474       if (GFC_DESCRIPTOR_RANK (a) == 1)
2475         {
2476 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2477 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2478         }
2479       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2480         {
2481 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2482 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2483         }
2484       else
2485         {
2486 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2487 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2488 
2489           GFC_DIMENSION_SET(retarray->dim[1], 0,
2490 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2491 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2492         }
2493 
2494       retarray->base_addr
2495 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
2496       retarray->offset = 0;
2497     }
2498   else if (unlikely (compile_options.bounds_check))
2499     {
2500       index_type ret_extent, arg_extent;
2501 
2502       if (GFC_DESCRIPTOR_RANK (a) == 1)
2503 	{
2504 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2505 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2506 	  if (arg_extent != ret_extent)
2507 	    runtime_error ("Array bound mismatch for dimension 1 of "
2508 	    		   "array (%ld/%ld) ",
2509 			   (long int) ret_extent, (long int) arg_extent);
2510 	}
2511       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2512 	{
2513 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2514 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2515 	  if (arg_extent != ret_extent)
2516 	    runtime_error ("Array bound mismatch for dimension 1 of "
2517 	    		   "array (%ld/%ld) ",
2518 			   (long int) ret_extent, (long int) arg_extent);
2519 	}
2520       else
2521 	{
2522 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2523 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2524 	  if (arg_extent != ret_extent)
2525 	    runtime_error ("Array bound mismatch for dimension 1 of "
2526 	    		   "array (%ld/%ld) ",
2527 			   (long int) ret_extent, (long int) arg_extent);
2528 
2529 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2530 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2531 	  if (arg_extent != ret_extent)
2532 	    runtime_error ("Array bound mismatch for dimension 2 of "
2533 	    		   "array (%ld/%ld) ",
2534 			   (long int) ret_extent, (long int) arg_extent);
2535 	}
2536     }
2537 
2538 
2539   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2540     {
2541       /* One-dimensional result may be addressed in the code below
2542 	 either as a row or a column matrix. We want both cases to
2543 	 work. */
2544       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2545     }
2546   else
2547     {
2548       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2549       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2550     }
2551 
2552 
2553   if (GFC_DESCRIPTOR_RANK (a) == 1)
2554     {
2555       /* Treat it as a a row matrix A[1,count]. */
2556       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2557       aystride = 1;
2558 
2559       xcount = 1;
2560       count = GFC_DESCRIPTOR_EXTENT(a,0);
2561     }
2562   else
2563     {
2564       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2565       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2566 
2567       count = GFC_DESCRIPTOR_EXTENT(a,1);
2568       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2569     }
2570 
2571   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2572     {
2573       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2574 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2575 		       "in dimension 1: is %ld, should be %ld",
2576 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2577     }
2578 
2579   if (GFC_DESCRIPTOR_RANK (b) == 1)
2580     {
2581       /* Treat it as a column matrix B[count,1] */
2582       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2583 
2584       /* bystride should never be used for 1-dimensional b.
2585          The value is only used for calculation of the
2586          memory by the buffer.  */
2587       bystride = 256;
2588       ycount = 1;
2589     }
2590   else
2591     {
2592       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2593       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2594       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2595     }
2596 
2597   abase = a->base_addr;
2598   bbase = b->base_addr;
2599   dest = retarray->base_addr;
2600 
2601   /* Now that everything is set up, we perform the multiplication
2602      itself.  */
2603 
2604 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2605 #define min(a,b) ((a) <= (b) ? (a) : (b))
2606 #define max(a,b) ((a) >= (b) ? (a) : (b))
2607 
2608   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2609       && (bxstride == 1 || bystride == 1)
2610       && (((float) xcount) * ((float) ycount) * ((float) count)
2611           > POW3(blas_limit)))
2612     {
2613       const int m = xcount, n = ycount, k = count, ldc = rystride;
2614       const GFC_COMPLEX_4 one = 1, zero = 0;
2615       const int lda = (axstride == 1) ? aystride : axstride,
2616 		ldb = (bxstride == 1) ? bystride : bxstride;
2617 
2618       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2619 	{
2620 	  assert (gemm != NULL);
2621 	  const char *transa, *transb;
2622 	  if (try_blas & 2)
2623 	    transa = "C";
2624 	  else
2625 	    transa = axstride == 1 ? "N" : "T";
2626 
2627 	  if (try_blas & 4)
2628 	    transb = "C";
2629 	  else
2630 	    transb = bxstride == 1 ? "N" : "T";
2631 
2632 	  gemm (transa, transb , &m,
2633 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2634 		&ldc, 1, 1);
2635 	  return;
2636 	}
2637     }
2638 
2639   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2640     {
2641       /* This block of code implements a tuned matmul, derived from
2642          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2643 
2644                Bo Kagstrom and Per Ling
2645                Department of Computing Science
2646                Umea University
2647                S-901 87 Umea, Sweden
2648 
2649 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2650 
2651       const GFC_COMPLEX_4 *a, *b;
2652       GFC_COMPLEX_4 *c;
2653       const index_type m = xcount, n = ycount, k = count;
2654 
2655       /* System generated locals */
2656       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2657 		 i1, i2, i3, i4, i5, i6;
2658 
2659       /* Local variables */
2660       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
2661 		 f13, f14, f23, f24, f33, f34, f43, f44;
2662       index_type i, j, l, ii, jj, ll;
2663       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2664       GFC_COMPLEX_4 *t1;
2665 
2666       a = abase;
2667       b = bbase;
2668       c = retarray->base_addr;
2669 
2670       /* Parameter adjustments */
2671       c_dim1 = rystride;
2672       c_offset = 1 + c_dim1;
2673       c -= c_offset;
2674       a_dim1 = aystride;
2675       a_offset = 1 + a_dim1;
2676       a -= a_offset;
2677       b_dim1 = bystride;
2678       b_offset = 1 + b_dim1;
2679       b -= b_offset;
2680 
2681       /* Empty c first.  */
2682       for (j=1; j<=n; j++)
2683 	for (i=1; i<=m; i++)
2684 	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
2685 
2686       /* Early exit if possible */
2687       if (m == 0 || n == 0 || k == 0)
2688 	return;
2689 
2690       /* Adjust size of t1 to what is needed.  */
2691       index_type t1_dim, a_sz;
2692       if (aystride == 1)
2693         a_sz = rystride;
2694       else
2695         a_sz = a_dim1;
2696 
2697       t1_dim = a_sz * 256 + b_dim1;
2698       if (t1_dim > 65536)
2699 	t1_dim = 65536;
2700 
2701       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
2702 
2703       /* Start turning the crank. */
2704       i1 = n;
2705       for (jj = 1; jj <= i1; jj += 512)
2706 	{
2707 	  /* Computing MIN */
2708 	  i2 = 512;
2709 	  i3 = n - jj + 1;
2710 	  jsec = min(i2,i3);
2711 	  ujsec = jsec - jsec % 4;
2712 	  i2 = k;
2713 	  for (ll = 1; ll <= i2; ll += 256)
2714 	    {
2715 	      /* Computing MIN */
2716 	      i3 = 256;
2717 	      i4 = k - ll + 1;
2718 	      lsec = min(i3,i4);
2719 	      ulsec = lsec - lsec % 2;
2720 
2721 	      i3 = m;
2722 	      for (ii = 1; ii <= i3; ii += 256)
2723 		{
2724 		  /* Computing MIN */
2725 		  i4 = 256;
2726 		  i5 = m - ii + 1;
2727 		  isec = min(i4,i5);
2728 		  uisec = isec - isec % 2;
2729 		  i4 = ll + ulsec - 1;
2730 		  for (l = ll; l <= i4; l += 2)
2731 		    {
2732 		      i5 = ii + uisec - 1;
2733 		      for (i = ii; i <= i5; i += 2)
2734 			{
2735 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2736 					a[i + l * a_dim1];
2737 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2738 					a[i + (l + 1) * a_dim1];
2739 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2740 					a[i + 1 + l * a_dim1];
2741 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2742 					a[i + 1 + (l + 1) * a_dim1];
2743 			}
2744 		      if (uisec < isec)
2745 			{
2746 			  t1[l - ll + 1 + (isec << 8) - 257] =
2747 				    a[ii + isec - 1 + l * a_dim1];
2748 			  t1[l - ll + 2 + (isec << 8) - 257] =
2749 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2750 			}
2751 		    }
2752 		  if (ulsec < lsec)
2753 		    {
2754 		      i4 = ii + isec - 1;
2755 		      for (i = ii; i<= i4; ++i)
2756 			{
2757 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2758 				    a[i + (ll + lsec - 1) * a_dim1];
2759 			}
2760 		    }
2761 
2762 		  uisec = isec - isec % 4;
2763 		  i4 = jj + ujsec - 1;
2764 		  for (j = jj; j <= i4; j += 4)
2765 		    {
2766 		      i5 = ii + uisec - 1;
2767 		      for (i = ii; i <= i5; i += 4)
2768 			{
2769 			  f11 = c[i + j * c_dim1];
2770 			  f21 = c[i + 1 + j * c_dim1];
2771 			  f12 = c[i + (j + 1) * c_dim1];
2772 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2773 			  f13 = c[i + (j + 2) * c_dim1];
2774 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2775 			  f14 = c[i + (j + 3) * c_dim1];
2776 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2777 			  f31 = c[i + 2 + j * c_dim1];
2778 			  f41 = c[i + 3 + j * c_dim1];
2779 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2780 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2781 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2782 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2783 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2784 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2785 			  i6 = ll + lsec - 1;
2786 			  for (l = ll; l <= i6; ++l)
2787 			    {
2788 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2789 				      * b[l + j * b_dim1];
2790 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2791 				      * b[l + j * b_dim1];
2792 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2793 				      * b[l + (j + 1) * b_dim1];
2794 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2795 				      * b[l + (j + 1) * b_dim1];
2796 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2797 				      * b[l + (j + 2) * b_dim1];
2798 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2799 				      * b[l + (j + 2) * b_dim1];
2800 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2801 				      * b[l + (j + 3) * b_dim1];
2802 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2803 				      * b[l + (j + 3) * b_dim1];
2804 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2805 				      * b[l + j * b_dim1];
2806 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2807 				      * b[l + j * b_dim1];
2808 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2809 				      * b[l + (j + 1) * b_dim1];
2810 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2811 				      * b[l + (j + 1) * b_dim1];
2812 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2813 				      * b[l + (j + 2) * b_dim1];
2814 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2815 				      * b[l + (j + 2) * b_dim1];
2816 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2817 				      * b[l + (j + 3) * b_dim1];
2818 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2819 				      * b[l + (j + 3) * b_dim1];
2820 			    }
2821 			  c[i + j * c_dim1] = f11;
2822 			  c[i + 1 + j * c_dim1] = f21;
2823 			  c[i + (j + 1) * c_dim1] = f12;
2824 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2825 			  c[i + (j + 2) * c_dim1] = f13;
2826 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2827 			  c[i + (j + 3) * c_dim1] = f14;
2828 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2829 			  c[i + 2 + j * c_dim1] = f31;
2830 			  c[i + 3 + j * c_dim1] = f41;
2831 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2832 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2833 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2834 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2835 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2836 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2837 			}
2838 		      if (uisec < isec)
2839 			{
2840 			  i5 = ii + isec - 1;
2841 			  for (i = ii + uisec; i <= i5; ++i)
2842 			    {
2843 			      f11 = c[i + j * c_dim1];
2844 			      f12 = c[i + (j + 1) * c_dim1];
2845 			      f13 = c[i + (j + 2) * c_dim1];
2846 			      f14 = c[i + (j + 3) * c_dim1];
2847 			      i6 = ll + lsec - 1;
2848 			      for (l = ll; l <= i6; ++l)
2849 				{
2850 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2851 					  257] * b[l + j * b_dim1];
2852 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2853 					  257] * b[l + (j + 1) * b_dim1];
2854 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2855 					  257] * b[l + (j + 2) * b_dim1];
2856 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2857 					  257] * b[l + (j + 3) * b_dim1];
2858 				}
2859 			      c[i + j * c_dim1] = f11;
2860 			      c[i + (j + 1) * c_dim1] = f12;
2861 			      c[i + (j + 2) * c_dim1] = f13;
2862 			      c[i + (j + 3) * c_dim1] = f14;
2863 			    }
2864 			}
2865 		    }
2866 		  if (ujsec < jsec)
2867 		    {
2868 		      i4 = jj + jsec - 1;
2869 		      for (j = jj + ujsec; j <= i4; ++j)
2870 			{
2871 			  i5 = ii + uisec - 1;
2872 			  for (i = ii; i <= i5; i += 4)
2873 			    {
2874 			      f11 = c[i + j * c_dim1];
2875 			      f21 = c[i + 1 + j * c_dim1];
2876 			      f31 = c[i + 2 + j * c_dim1];
2877 			      f41 = c[i + 3 + j * c_dim1];
2878 			      i6 = ll + lsec - 1;
2879 			      for (l = ll; l <= i6; ++l)
2880 				{
2881 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2882 					  257] * b[l + j * b_dim1];
2883 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2884 					  257] * b[l + j * b_dim1];
2885 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2886 					  257] * b[l + j * b_dim1];
2887 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2888 					  257] * b[l + j * b_dim1];
2889 				}
2890 			      c[i + j * c_dim1] = f11;
2891 			      c[i + 1 + j * c_dim1] = f21;
2892 			      c[i + 2 + j * c_dim1] = f31;
2893 			      c[i + 3 + j * c_dim1] = f41;
2894 			    }
2895 			  i5 = ii + isec - 1;
2896 			  for (i = ii + uisec; i <= i5; ++i)
2897 			    {
2898 			      f11 = c[i + j * c_dim1];
2899 			      i6 = ll + lsec - 1;
2900 			      for (l = ll; l <= i6; ++l)
2901 				{
2902 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2903 					  257] * b[l + j * b_dim1];
2904 				}
2905 			      c[i + j * c_dim1] = f11;
2906 			    }
2907 			}
2908 		    }
2909 		}
2910 	    }
2911 	}
2912       free(t1);
2913       return;
2914     }
2915   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2916     {
2917       if (GFC_DESCRIPTOR_RANK (a) != 1)
2918 	{
2919 	  const GFC_COMPLEX_4 *restrict abase_x;
2920 	  const GFC_COMPLEX_4 *restrict bbase_y;
2921 	  GFC_COMPLEX_4 *restrict dest_y;
2922 	  GFC_COMPLEX_4 s;
2923 
2924 	  for (y = 0; y < ycount; y++)
2925 	    {
2926 	      bbase_y = &bbase[y*bystride];
2927 	      dest_y = &dest[y*rystride];
2928 	      for (x = 0; x < xcount; x++)
2929 		{
2930 		  abase_x = &abase[x*axstride];
2931 		  s = (GFC_COMPLEX_4) 0;
2932 		  for (n = 0; n < count; n++)
2933 		    s += abase_x[n] * bbase_y[n];
2934 		  dest_y[x] = s;
2935 		}
2936 	    }
2937 	}
2938       else
2939 	{
2940 	  const GFC_COMPLEX_4 *restrict bbase_y;
2941 	  GFC_COMPLEX_4 s;
2942 
2943 	  for (y = 0; y < ycount; y++)
2944 	    {
2945 	      bbase_y = &bbase[y*bystride];
2946 	      s = (GFC_COMPLEX_4) 0;
2947 	      for (n = 0; n < count; n++)
2948 		s += abase[n*axstride] * bbase_y[n];
2949 	      dest[y*rystride] = s;
2950 	    }
2951 	}
2952     }
2953   else if (axstride < aystride)
2954     {
2955       for (y = 0; y < ycount; y++)
2956 	for (x = 0; x < xcount; x++)
2957 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
2958 
2959       for (y = 0; y < ycount; y++)
2960 	for (n = 0; n < count; n++)
2961 	  for (x = 0; x < xcount; x++)
2962 	    /* dest[x,y] += a[x,n] * b[n,y] */
2963 	    dest[x*rxstride + y*rystride] +=
2964 					abase[x*axstride + n*aystride] *
2965 					bbase[n*bxstride + y*bystride];
2966     }
2967   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2968     {
2969       const GFC_COMPLEX_4 *restrict bbase_y;
2970       GFC_COMPLEX_4 s;
2971 
2972       for (y = 0; y < ycount; y++)
2973 	{
2974 	  bbase_y = &bbase[y*bystride];
2975 	  s = (GFC_COMPLEX_4) 0;
2976 	  for (n = 0; n < count; n++)
2977 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2978 	  dest[y*rxstride] = s;
2979 	}
2980     }
2981   else
2982     {
2983       const GFC_COMPLEX_4 *restrict abase_x;
2984       const GFC_COMPLEX_4 *restrict bbase_y;
2985       GFC_COMPLEX_4 *restrict dest_y;
2986       GFC_COMPLEX_4 s;
2987 
2988       for (y = 0; y < ycount; y++)
2989 	{
2990 	  bbase_y = &bbase[y*bystride];
2991 	  dest_y = &dest[y*rystride];
2992 	  for (x = 0; x < xcount; x++)
2993 	    {
2994 	      abase_x = &abase[x*axstride];
2995 	      s = (GFC_COMPLEX_4) 0;
2996 	      for (n = 0; n < count; n++)
2997 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2998 	      dest_y[x*rxstride] = s;
2999 	    }
3000 	}
3001     }
3002 }
3003 #undef POW3
3004 #undef min
3005 #undef max
3006 
3007 #endif
3008 #endif
3009 
3010