1 /* Implementation of the MATMUL intrinsic
2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
3    Contributed by Paul Brook <paul@nowt.org>
4 
5 This file is part of the GNU Fortran runtime library (libgfortran).
6 
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11 
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20 
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24 <http://www.gnu.org/licenses/>.  */
25 
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29 
30 
31 #if defined (HAVE_GFC_REAL_4)
32 
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34    passed to us by the front-end, in which case we call it for large
35    matrices.  */
36 
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38                           const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
39                           const int *, const GFC_REAL_4 *, const int *,
40                           const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
41                           int, int);
42 
43 /* The order of loops is different in the case of plain matrix
44    multiplication C=MATMUL(A,B), and in the frequent special case where
45    the argument A is the temporary result of a TRANSPOSE intrinsic:
46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47    looking at their strides.
48 
49    The equivalent Fortran pseudo-code is:
50 
51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52    IF (.NOT.IS_TRANSPOSED(A)) THEN
53      C = 0
54      DO J=1,N
55        DO K=1,COUNT
56          DO I=1,M
57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
58    ELSE
59      DO J=1,N
60        DO I=1,M
61          S = 0
62          DO K=1,COUNT
63            S = S+A(I,K)*B(K,J)
64          C(I,J) = S
65    ENDIF
66 */
67 
68 /* If try_blas is set to a nonzero value, then the matmul function will
69    see if there is a way to perform the matrix multiplication by a call
70    to the BLAS gemm function.  */
71 
72 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
73 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
74 	int blas_limit, blas_call gemm);
75 export_proto(matmul_r4);
76 
77 /* Put exhaustive list of possible architectures here here, ORed together.  */
78 
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80 
81 #ifdef HAVE_AVX
82 static void
83 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
84 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
85 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
matmul_r4_avx(gfc_array_r4 * const restrict retarray,gfc_array_r4 * const restrict a,gfc_array_r4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
88 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
89 	int blas_limit, blas_call gemm)
90 {
91   const GFC_REAL_4 * restrict abase;
92   const GFC_REAL_4 * restrict bbase;
93   GFC_REAL_4 * restrict dest;
94 
95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96   index_type x, y, n, count, xcount, ycount;
97 
98   assert (GFC_DESCRIPTOR_RANK (a) == 2
99           || GFC_DESCRIPTOR_RANK (b) == 2);
100 
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102 
103    Either A or B (but not both) can be rank 1:
104 
105    o One-dimensional argument A is implicitly treated as a row matrix
106      dimensioned [1,count], so xcount=1.
107 
108    o One-dimensional argument B is implicitly treated as a column matrix
109      dimensioned [count, 1], so ycount=1.
110 */
111 
112   if (retarray->base_addr == NULL)
113     {
114       if (GFC_DESCRIPTOR_RANK (a) == 1)
115         {
116 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118         }
119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
120         {
121 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123         }
124       else
125         {
126 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128 
129           GFC_DIMENSION_SET(retarray->dim[1], 0,
130 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132         }
133 
134       retarray->base_addr
135 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
136       retarray->offset = 0;
137     }
138   else if (unlikely (compile_options.bounds_check))
139     {
140       index_type ret_extent, arg_extent;
141 
142       if (GFC_DESCRIPTOR_RANK (a) == 1)
143 	{
144 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 	  if (arg_extent != ret_extent)
147 	    runtime_error ("Array bound mismatch for dimension 1 of "
148 	    		   "array (%ld/%ld) ",
149 			   (long int) ret_extent, (long int) arg_extent);
150 	}
151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 	{
153 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 	  if (arg_extent != ret_extent)
156 	    runtime_error ("Array bound mismatch for dimension 1 of "
157 	    		   "array (%ld/%ld) ",
158 			   (long int) ret_extent, (long int) arg_extent);
159 	}
160       else
161 	{
162 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 	  if (arg_extent != ret_extent)
165 	    runtime_error ("Array bound mismatch for dimension 1 of "
166 	    		   "array (%ld/%ld) ",
167 			   (long int) ret_extent, (long int) arg_extent);
168 
169 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171 	  if (arg_extent != ret_extent)
172 	    runtime_error ("Array bound mismatch for dimension 2 of "
173 	    		   "array (%ld/%ld) ",
174 			   (long int) ret_extent, (long int) arg_extent);
175 	}
176     }
177 
178 
179   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180     {
181       /* One-dimensional result may be addressed in the code below
182 	 either as a row or a column matrix. We want both cases to
183 	 work. */
184       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185     }
186   else
187     {
188       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190     }
191 
192 
193   if (GFC_DESCRIPTOR_RANK (a) == 1)
194     {
195       /* Treat it as a a row matrix A[1,count]. */
196       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197       aystride = 1;
198 
199       xcount = 1;
200       count = GFC_DESCRIPTOR_EXTENT(a,0);
201     }
202   else
203     {
204       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206 
207       count = GFC_DESCRIPTOR_EXTENT(a,1);
208       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209     }
210 
211   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212     {
213       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215 		       "in dimension 1: is %ld, should be %ld",
216 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217     }
218 
219   if (GFC_DESCRIPTOR_RANK (b) == 1)
220     {
221       /* Treat it as a column matrix B[count,1] */
222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223 
224       /* bystride should never be used for 1-dimensional b.
225          The value is only used for calculation of the
226          memory by the buffer.  */
227       bystride = 256;
228       ycount = 1;
229     }
230   else
231     {
232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235     }
236 
237   abase = a->base_addr;
238   bbase = b->base_addr;
239   dest = retarray->base_addr;
240 
241   /* Now that everything is set up, we perform the multiplication
242      itself.  */
243 
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247 
248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249       && (bxstride == 1 || bystride == 1)
250       && (((float) xcount) * ((float) ycount) * ((float) count)
251           > POW3(blas_limit)))
252     {
253       const int m = xcount, n = ycount, k = count, ldc = rystride;
254       const GFC_REAL_4 one = 1, zero = 0;
255       const int lda = (axstride == 1) ? aystride : axstride,
256 		ldb = (bxstride == 1) ? bystride : bxstride;
257 
258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 	{
260 	  assert (gemm != NULL);
261 	  const char *transa, *transb;
262 	  if (try_blas & 2)
263 	    transa = "C";
264 	  else
265 	    transa = axstride == 1 ? "N" : "T";
266 
267 	  if (try_blas & 4)
268 	    transb = "C";
269 	  else
270 	    transb = bxstride == 1 ? "N" : "T";
271 
272 	  gemm (transa, transb , &m,
273 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274 		&ldc, 1, 1);
275 	  return;
276 	}
277     }
278 
279   if (rxstride == 1 && axstride == 1 && bxstride == 1
280       && GFC_DESCRIPTOR_RANK (b) != 1)
281     {
282       /* This block of code implements a tuned matmul, derived from
283          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
284 
285                Bo Kagstrom and Per Ling
286                Department of Computing Science
287                Umea University
288                S-901 87 Umea, Sweden
289 
290 	 from netlib.org, translated to C, and modified for matmul.m4.  */
291 
292       const GFC_REAL_4 *a, *b;
293       GFC_REAL_4 *c;
294       const index_type m = xcount, n = ycount, k = count;
295 
296       /* System generated locals */
297       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298 		 i1, i2, i3, i4, i5, i6;
299 
300       /* Local variables */
301       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
302 		 f13, f14, f23, f24, f33, f34, f43, f44;
303       index_type i, j, l, ii, jj, ll;
304       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305       GFC_REAL_4 *t1;
306 
307       a = abase;
308       b = bbase;
309       c = retarray->base_addr;
310 
311       /* Parameter adjustments */
312       c_dim1 = rystride;
313       c_offset = 1 + c_dim1;
314       c -= c_offset;
315       a_dim1 = aystride;
316       a_offset = 1 + a_dim1;
317       a -= a_offset;
318       b_dim1 = bystride;
319       b_offset = 1 + b_dim1;
320       b -= b_offset;
321 
322       /* Empty c first.  */
323       for (j=1; j<=n; j++)
324 	for (i=1; i<=m; i++)
325 	  c[i + j * c_dim1] = (GFC_REAL_4)0;
326 
327       /* Early exit if possible */
328       if (m == 0 || n == 0 || k == 0)
329 	return;
330 
331       /* Adjust size of t1 to what is needed.  */
332       index_type t1_dim, a_sz;
333       if (aystride == 1)
334         a_sz = rystride;
335       else
336         a_sz = a_dim1;
337 
338       t1_dim = a_sz * 256 + b_dim1;
339       if (t1_dim > 65536)
340 	t1_dim = 65536;
341 
342       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
343 
344       /* Start turning the crank. */
345       i1 = n;
346       for (jj = 1; jj <= i1; jj += 512)
347 	{
348 	  /* Computing MIN */
349 	  i2 = 512;
350 	  i3 = n - jj + 1;
351 	  jsec = min(i2,i3);
352 	  ujsec = jsec - jsec % 4;
353 	  i2 = k;
354 	  for (ll = 1; ll <= i2; ll += 256)
355 	    {
356 	      /* Computing MIN */
357 	      i3 = 256;
358 	      i4 = k - ll + 1;
359 	      lsec = min(i3,i4);
360 	      ulsec = lsec - lsec % 2;
361 
362 	      i3 = m;
363 	      for (ii = 1; ii <= i3; ii += 256)
364 		{
365 		  /* Computing MIN */
366 		  i4 = 256;
367 		  i5 = m - ii + 1;
368 		  isec = min(i4,i5);
369 		  uisec = isec - isec % 2;
370 		  i4 = ll + ulsec - 1;
371 		  for (l = ll; l <= i4; l += 2)
372 		    {
373 		      i5 = ii + uisec - 1;
374 		      for (i = ii; i <= i5; i += 2)
375 			{
376 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377 					a[i + l * a_dim1];
378 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379 					a[i + (l + 1) * a_dim1];
380 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381 					a[i + 1 + l * a_dim1];
382 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383 					a[i + 1 + (l + 1) * a_dim1];
384 			}
385 		      if (uisec < isec)
386 			{
387 			  t1[l - ll + 1 + (isec << 8) - 257] =
388 				    a[ii + isec - 1 + l * a_dim1];
389 			  t1[l - ll + 2 + (isec << 8) - 257] =
390 				    a[ii + isec - 1 + (l + 1) * a_dim1];
391 			}
392 		    }
393 		  if (ulsec < lsec)
394 		    {
395 		      i4 = ii + isec - 1;
396 		      for (i = ii; i<= i4; ++i)
397 			{
398 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
399 				    a[i + (ll + lsec - 1) * a_dim1];
400 			}
401 		    }
402 
403 		  uisec = isec - isec % 4;
404 		  i4 = jj + ujsec - 1;
405 		  for (j = jj; j <= i4; j += 4)
406 		    {
407 		      i5 = ii + uisec - 1;
408 		      for (i = ii; i <= i5; i += 4)
409 			{
410 			  f11 = c[i + j * c_dim1];
411 			  f21 = c[i + 1 + j * c_dim1];
412 			  f12 = c[i + (j + 1) * c_dim1];
413 			  f22 = c[i + 1 + (j + 1) * c_dim1];
414 			  f13 = c[i + (j + 2) * c_dim1];
415 			  f23 = c[i + 1 + (j + 2) * c_dim1];
416 			  f14 = c[i + (j + 3) * c_dim1];
417 			  f24 = c[i + 1 + (j + 3) * c_dim1];
418 			  f31 = c[i + 2 + j * c_dim1];
419 			  f41 = c[i + 3 + j * c_dim1];
420 			  f32 = c[i + 2 + (j + 1) * c_dim1];
421 			  f42 = c[i + 3 + (j + 1) * c_dim1];
422 			  f33 = c[i + 2 + (j + 2) * c_dim1];
423 			  f43 = c[i + 3 + (j + 2) * c_dim1];
424 			  f34 = c[i + 2 + (j + 3) * c_dim1];
425 			  f44 = c[i + 3 + (j + 3) * c_dim1];
426 			  i6 = ll + lsec - 1;
427 			  for (l = ll; l <= i6; ++l)
428 			    {
429 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 				      * b[l + j * b_dim1];
431 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 				      * b[l + j * b_dim1];
433 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434 				      * b[l + (j + 1) * b_dim1];
435 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436 				      * b[l + (j + 1) * b_dim1];
437 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438 				      * b[l + (j + 2) * b_dim1];
439 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440 				      * b[l + (j + 2) * b_dim1];
441 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442 				      * b[l + (j + 3) * b_dim1];
443 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444 				      * b[l + (j + 3) * b_dim1];
445 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 				      * b[l + j * b_dim1];
447 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 				      * b[l + j * b_dim1];
449 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450 				      * b[l + (j + 1) * b_dim1];
451 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452 				      * b[l + (j + 1) * b_dim1];
453 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454 				      * b[l + (j + 2) * b_dim1];
455 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456 				      * b[l + (j + 2) * b_dim1];
457 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458 				      * b[l + (j + 3) * b_dim1];
459 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460 				      * b[l + (j + 3) * b_dim1];
461 			    }
462 			  c[i + j * c_dim1] = f11;
463 			  c[i + 1 + j * c_dim1] = f21;
464 			  c[i + (j + 1) * c_dim1] = f12;
465 			  c[i + 1 + (j + 1) * c_dim1] = f22;
466 			  c[i + (j + 2) * c_dim1] = f13;
467 			  c[i + 1 + (j + 2) * c_dim1] = f23;
468 			  c[i + (j + 3) * c_dim1] = f14;
469 			  c[i + 1 + (j + 3) * c_dim1] = f24;
470 			  c[i + 2 + j * c_dim1] = f31;
471 			  c[i + 3 + j * c_dim1] = f41;
472 			  c[i + 2 + (j + 1) * c_dim1] = f32;
473 			  c[i + 3 + (j + 1) * c_dim1] = f42;
474 			  c[i + 2 + (j + 2) * c_dim1] = f33;
475 			  c[i + 3 + (j + 2) * c_dim1] = f43;
476 			  c[i + 2 + (j + 3) * c_dim1] = f34;
477 			  c[i + 3 + (j + 3) * c_dim1] = f44;
478 			}
479 		      if (uisec < isec)
480 			{
481 			  i5 = ii + isec - 1;
482 			  for (i = ii + uisec; i <= i5; ++i)
483 			    {
484 			      f11 = c[i + j * c_dim1];
485 			      f12 = c[i + (j + 1) * c_dim1];
486 			      f13 = c[i + (j + 2) * c_dim1];
487 			      f14 = c[i + (j + 3) * c_dim1];
488 			      i6 = ll + lsec - 1;
489 			      for (l = ll; l <= i6; ++l)
490 				{
491 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492 					  257] * b[l + j * b_dim1];
493 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494 					  257] * b[l + (j + 1) * b_dim1];
495 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496 					  257] * b[l + (j + 2) * b_dim1];
497 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498 					  257] * b[l + (j + 3) * b_dim1];
499 				}
500 			      c[i + j * c_dim1] = f11;
501 			      c[i + (j + 1) * c_dim1] = f12;
502 			      c[i + (j + 2) * c_dim1] = f13;
503 			      c[i + (j + 3) * c_dim1] = f14;
504 			    }
505 			}
506 		    }
507 		  if (ujsec < jsec)
508 		    {
509 		      i4 = jj + jsec - 1;
510 		      for (j = jj + ujsec; j <= i4; ++j)
511 			{
512 			  i5 = ii + uisec - 1;
513 			  for (i = ii; i <= i5; i += 4)
514 			    {
515 			      f11 = c[i + j * c_dim1];
516 			      f21 = c[i + 1 + j * c_dim1];
517 			      f31 = c[i + 2 + j * c_dim1];
518 			      f41 = c[i + 3 + j * c_dim1];
519 			      i6 = ll + lsec - 1;
520 			      for (l = ll; l <= i6; ++l)
521 				{
522 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523 					  257] * b[l + j * b_dim1];
524 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525 					  257] * b[l + j * b_dim1];
526 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527 					  257] * b[l + j * b_dim1];
528 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529 					  257] * b[l + j * b_dim1];
530 				}
531 			      c[i + j * c_dim1] = f11;
532 			      c[i + 1 + j * c_dim1] = f21;
533 			      c[i + 2 + j * c_dim1] = f31;
534 			      c[i + 3 + j * c_dim1] = f41;
535 			    }
536 			  i5 = ii + isec - 1;
537 			  for (i = ii + uisec; i <= i5; ++i)
538 			    {
539 			      f11 = c[i + j * c_dim1];
540 			      i6 = ll + lsec - 1;
541 			      for (l = ll; l <= i6; ++l)
542 				{
543 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544 					  257] * b[l + j * b_dim1];
545 				}
546 			      c[i + j * c_dim1] = f11;
547 			    }
548 			}
549 		    }
550 		}
551 	    }
552 	}
553       free(t1);
554       return;
555     }
556   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557     {
558       if (GFC_DESCRIPTOR_RANK (a) != 1)
559 	{
560 	  const GFC_REAL_4 *restrict abase_x;
561 	  const GFC_REAL_4 *restrict bbase_y;
562 	  GFC_REAL_4 *restrict dest_y;
563 	  GFC_REAL_4 s;
564 
565 	  for (y = 0; y < ycount; y++)
566 	    {
567 	      bbase_y = &bbase[y*bystride];
568 	      dest_y = &dest[y*rystride];
569 	      for (x = 0; x < xcount; x++)
570 		{
571 		  abase_x = &abase[x*axstride];
572 		  s = (GFC_REAL_4) 0;
573 		  for (n = 0; n < count; n++)
574 		    s += abase_x[n] * bbase_y[n];
575 		  dest_y[x] = s;
576 		}
577 	    }
578 	}
579       else
580 	{
581 	  const GFC_REAL_4 *restrict bbase_y;
582 	  GFC_REAL_4 s;
583 
584 	  for (y = 0; y < ycount; y++)
585 	    {
586 	      bbase_y = &bbase[y*bystride];
587 	      s = (GFC_REAL_4) 0;
588 	      for (n = 0; n < count; n++)
589 		s += abase[n*axstride] * bbase_y[n];
590 	      dest[y*rystride] = s;
591 	    }
592 	}
593     }
594   else if (GFC_DESCRIPTOR_RANK (a) == 1)
595     {
596       const GFC_REAL_4 *restrict bbase_y;
597       GFC_REAL_4 s;
598 
599       for (y = 0; y < ycount; y++)
600 	{
601 	  bbase_y = &bbase[y*bystride];
602 	  s = (GFC_REAL_4) 0;
603 	  for (n = 0; n < count; n++)
604 	    s += abase[n*axstride] * bbase_y[n*bxstride];
605 	  dest[y*rxstride] = s;
606 	}
607     }
608   else if (axstride < aystride)
609     {
610       for (y = 0; y < ycount; y++)
611 	for (x = 0; x < xcount; x++)
612 	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
613 
614       for (y = 0; y < ycount; y++)
615 	for (n = 0; n < count; n++)
616 	  for (x = 0; x < xcount; x++)
617 	    /* dest[x,y] += a[x,n] * b[n,y] */
618 	    dest[x*rxstride + y*rystride] +=
619 					abase[x*axstride + n*aystride] *
620 					bbase[n*bxstride + y*bystride];
621     }
622   else
623     {
624       const GFC_REAL_4 *restrict abase_x;
625       const GFC_REAL_4 *restrict bbase_y;
626       GFC_REAL_4 *restrict dest_y;
627       GFC_REAL_4 s;
628 
629       for (y = 0; y < ycount; y++)
630 	{
631 	  bbase_y = &bbase[y*bystride];
632 	  dest_y = &dest[y*rystride];
633 	  for (x = 0; x < xcount; x++)
634 	    {
635 	      abase_x = &abase[x*axstride];
636 	      s = (GFC_REAL_4) 0;
637 	      for (n = 0; n < count; n++)
638 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
639 	      dest_y[x*rxstride] = s;
640 	    }
641 	}
642     }
643 }
644 #undef POW3
645 #undef min
646 #undef max
647 
648 #endif /* HAVE_AVX */
649 
650 #ifdef HAVE_AVX2
651 static void
652 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
653 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
654 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655 static void
matmul_r4_avx2(gfc_array_r4 * const restrict retarray,gfc_array_r4 * const restrict a,gfc_array_r4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)656 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
657 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
658 	int blas_limit, blas_call gemm)
659 {
660   const GFC_REAL_4 * restrict abase;
661   const GFC_REAL_4 * restrict bbase;
662   GFC_REAL_4 * restrict dest;
663 
664   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665   index_type x, y, n, count, xcount, ycount;
666 
667   assert (GFC_DESCRIPTOR_RANK (a) == 2
668           || GFC_DESCRIPTOR_RANK (b) == 2);
669 
670 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671 
672    Either A or B (but not both) can be rank 1:
673 
674    o One-dimensional argument A is implicitly treated as a row matrix
675      dimensioned [1,count], so xcount=1.
676 
677    o One-dimensional argument B is implicitly treated as a column matrix
678      dimensioned [count, 1], so ycount=1.
679 */
680 
681   if (retarray->base_addr == NULL)
682     {
683       if (GFC_DESCRIPTOR_RANK (a) == 1)
684         {
685 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
686 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687         }
688       else if (GFC_DESCRIPTOR_RANK (b) == 1)
689         {
690 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
691 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692         }
693       else
694         {
695 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
696 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697 
698           GFC_DIMENSION_SET(retarray->dim[1], 0,
699 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
701         }
702 
703       retarray->base_addr
704 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
705       retarray->offset = 0;
706     }
707   else if (unlikely (compile_options.bounds_check))
708     {
709       index_type ret_extent, arg_extent;
710 
711       if (GFC_DESCRIPTOR_RANK (a) == 1)
712 	{
713 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715 	  if (arg_extent != ret_extent)
716 	    runtime_error ("Array bound mismatch for dimension 1 of "
717 	    		   "array (%ld/%ld) ",
718 			   (long int) ret_extent, (long int) arg_extent);
719 	}
720       else if (GFC_DESCRIPTOR_RANK (b) == 1)
721 	{
722 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724 	  if (arg_extent != ret_extent)
725 	    runtime_error ("Array bound mismatch for dimension 1 of "
726 	    		   "array (%ld/%ld) ",
727 			   (long int) ret_extent, (long int) arg_extent);
728 	}
729       else
730 	{
731 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733 	  if (arg_extent != ret_extent)
734 	    runtime_error ("Array bound mismatch for dimension 1 of "
735 	    		   "array (%ld/%ld) ",
736 			   (long int) ret_extent, (long int) arg_extent);
737 
738 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740 	  if (arg_extent != ret_extent)
741 	    runtime_error ("Array bound mismatch for dimension 2 of "
742 	    		   "array (%ld/%ld) ",
743 			   (long int) ret_extent, (long int) arg_extent);
744 	}
745     }
746 
747 
748   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749     {
750       /* One-dimensional result may be addressed in the code below
751 	 either as a row or a column matrix. We want both cases to
752 	 work. */
753       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754     }
755   else
756     {
757       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759     }
760 
761 
762   if (GFC_DESCRIPTOR_RANK (a) == 1)
763     {
764       /* Treat it as a a row matrix A[1,count]. */
765       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766       aystride = 1;
767 
768       xcount = 1;
769       count = GFC_DESCRIPTOR_EXTENT(a,0);
770     }
771   else
772     {
773       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775 
776       count = GFC_DESCRIPTOR_EXTENT(a,1);
777       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778     }
779 
780   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781     {
782       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784 		       "in dimension 1: is %ld, should be %ld",
785 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786     }
787 
788   if (GFC_DESCRIPTOR_RANK (b) == 1)
789     {
790       /* Treat it as a column matrix B[count,1] */
791       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792 
793       /* bystride should never be used for 1-dimensional b.
794          The value is only used for calculation of the
795          memory by the buffer.  */
796       bystride = 256;
797       ycount = 1;
798     }
799   else
800     {
801       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804     }
805 
806   abase = a->base_addr;
807   bbase = b->base_addr;
808   dest = retarray->base_addr;
809 
810   /* Now that everything is set up, we perform the multiplication
811      itself.  */
812 
813 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814 #define min(a,b) ((a) <= (b) ? (a) : (b))
815 #define max(a,b) ((a) >= (b) ? (a) : (b))
816 
817   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818       && (bxstride == 1 || bystride == 1)
819       && (((float) xcount) * ((float) ycount) * ((float) count)
820           > POW3(blas_limit)))
821     {
822       const int m = xcount, n = ycount, k = count, ldc = rystride;
823       const GFC_REAL_4 one = 1, zero = 0;
824       const int lda = (axstride == 1) ? aystride : axstride,
825 		ldb = (bxstride == 1) ? bystride : bxstride;
826 
827       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828 	{
829 	  assert (gemm != NULL);
830 	  const char *transa, *transb;
831 	  if (try_blas & 2)
832 	    transa = "C";
833 	  else
834 	    transa = axstride == 1 ? "N" : "T";
835 
836 	  if (try_blas & 4)
837 	    transb = "C";
838 	  else
839 	    transb = bxstride == 1 ? "N" : "T";
840 
841 	  gemm (transa, transb , &m,
842 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
843 		&ldc, 1, 1);
844 	  return;
845 	}
846     }
847 
848   if (rxstride == 1 && axstride == 1 && bxstride == 1
849       && GFC_DESCRIPTOR_RANK (b) != 1)
850     {
851       /* This block of code implements a tuned matmul, derived from
852          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
853 
854                Bo Kagstrom and Per Ling
855                Department of Computing Science
856                Umea University
857                S-901 87 Umea, Sweden
858 
859 	 from netlib.org, translated to C, and modified for matmul.m4.  */
860 
861       const GFC_REAL_4 *a, *b;
862       GFC_REAL_4 *c;
863       const index_type m = xcount, n = ycount, k = count;
864 
865       /* System generated locals */
866       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867 		 i1, i2, i3, i4, i5, i6;
868 
869       /* Local variables */
870       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
871 		 f13, f14, f23, f24, f33, f34, f43, f44;
872       index_type i, j, l, ii, jj, ll;
873       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874       GFC_REAL_4 *t1;
875 
876       a = abase;
877       b = bbase;
878       c = retarray->base_addr;
879 
880       /* Parameter adjustments */
881       c_dim1 = rystride;
882       c_offset = 1 + c_dim1;
883       c -= c_offset;
884       a_dim1 = aystride;
885       a_offset = 1 + a_dim1;
886       a -= a_offset;
887       b_dim1 = bystride;
888       b_offset = 1 + b_dim1;
889       b -= b_offset;
890 
891       /* Empty c first.  */
892       for (j=1; j<=n; j++)
893 	for (i=1; i<=m; i++)
894 	  c[i + j * c_dim1] = (GFC_REAL_4)0;
895 
896       /* Early exit if possible */
897       if (m == 0 || n == 0 || k == 0)
898 	return;
899 
900       /* Adjust size of t1 to what is needed.  */
901       index_type t1_dim, a_sz;
902       if (aystride == 1)
903         a_sz = rystride;
904       else
905         a_sz = a_dim1;
906 
907       t1_dim = a_sz * 256 + b_dim1;
908       if (t1_dim > 65536)
909 	t1_dim = 65536;
910 
911       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
912 
913       /* Start turning the crank. */
914       i1 = n;
915       for (jj = 1; jj <= i1; jj += 512)
916 	{
917 	  /* Computing MIN */
918 	  i2 = 512;
919 	  i3 = n - jj + 1;
920 	  jsec = min(i2,i3);
921 	  ujsec = jsec - jsec % 4;
922 	  i2 = k;
923 	  for (ll = 1; ll <= i2; ll += 256)
924 	    {
925 	      /* Computing MIN */
926 	      i3 = 256;
927 	      i4 = k - ll + 1;
928 	      lsec = min(i3,i4);
929 	      ulsec = lsec - lsec % 2;
930 
931 	      i3 = m;
932 	      for (ii = 1; ii <= i3; ii += 256)
933 		{
934 		  /* Computing MIN */
935 		  i4 = 256;
936 		  i5 = m - ii + 1;
937 		  isec = min(i4,i5);
938 		  uisec = isec - isec % 2;
939 		  i4 = ll + ulsec - 1;
940 		  for (l = ll; l <= i4; l += 2)
941 		    {
942 		      i5 = ii + uisec - 1;
943 		      for (i = ii; i <= i5; i += 2)
944 			{
945 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946 					a[i + l * a_dim1];
947 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948 					a[i + (l + 1) * a_dim1];
949 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950 					a[i + 1 + l * a_dim1];
951 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952 					a[i + 1 + (l + 1) * a_dim1];
953 			}
954 		      if (uisec < isec)
955 			{
956 			  t1[l - ll + 1 + (isec << 8) - 257] =
957 				    a[ii + isec - 1 + l * a_dim1];
958 			  t1[l - ll + 2 + (isec << 8) - 257] =
959 				    a[ii + isec - 1 + (l + 1) * a_dim1];
960 			}
961 		    }
962 		  if (ulsec < lsec)
963 		    {
964 		      i4 = ii + isec - 1;
965 		      for (i = ii; i<= i4; ++i)
966 			{
967 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
968 				    a[i + (ll + lsec - 1) * a_dim1];
969 			}
970 		    }
971 
972 		  uisec = isec - isec % 4;
973 		  i4 = jj + ujsec - 1;
974 		  for (j = jj; j <= i4; j += 4)
975 		    {
976 		      i5 = ii + uisec - 1;
977 		      for (i = ii; i <= i5; i += 4)
978 			{
979 			  f11 = c[i + j * c_dim1];
980 			  f21 = c[i + 1 + j * c_dim1];
981 			  f12 = c[i + (j + 1) * c_dim1];
982 			  f22 = c[i + 1 + (j + 1) * c_dim1];
983 			  f13 = c[i + (j + 2) * c_dim1];
984 			  f23 = c[i + 1 + (j + 2) * c_dim1];
985 			  f14 = c[i + (j + 3) * c_dim1];
986 			  f24 = c[i + 1 + (j + 3) * c_dim1];
987 			  f31 = c[i + 2 + j * c_dim1];
988 			  f41 = c[i + 3 + j * c_dim1];
989 			  f32 = c[i + 2 + (j + 1) * c_dim1];
990 			  f42 = c[i + 3 + (j + 1) * c_dim1];
991 			  f33 = c[i + 2 + (j + 2) * c_dim1];
992 			  f43 = c[i + 3 + (j + 2) * c_dim1];
993 			  f34 = c[i + 2 + (j + 3) * c_dim1];
994 			  f44 = c[i + 3 + (j + 3) * c_dim1];
995 			  i6 = ll + lsec - 1;
996 			  for (l = ll; l <= i6; ++l)
997 			    {
998 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999 				      * b[l + j * b_dim1];
1000 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001 				      * b[l + j * b_dim1];
1002 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003 				      * b[l + (j + 1) * b_dim1];
1004 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005 				      * b[l + (j + 1) * b_dim1];
1006 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007 				      * b[l + (j + 2) * b_dim1];
1008 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009 				      * b[l + (j + 2) * b_dim1];
1010 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011 				      * b[l + (j + 3) * b_dim1];
1012 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013 				      * b[l + (j + 3) * b_dim1];
1014 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015 				      * b[l + j * b_dim1];
1016 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017 				      * b[l + j * b_dim1];
1018 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019 				      * b[l + (j + 1) * b_dim1];
1020 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021 				      * b[l + (j + 1) * b_dim1];
1022 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023 				      * b[l + (j + 2) * b_dim1];
1024 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025 				      * b[l + (j + 2) * b_dim1];
1026 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027 				      * b[l + (j + 3) * b_dim1];
1028 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029 				      * b[l + (j + 3) * b_dim1];
1030 			    }
1031 			  c[i + j * c_dim1] = f11;
1032 			  c[i + 1 + j * c_dim1] = f21;
1033 			  c[i + (j + 1) * c_dim1] = f12;
1034 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1035 			  c[i + (j + 2) * c_dim1] = f13;
1036 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1037 			  c[i + (j + 3) * c_dim1] = f14;
1038 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1039 			  c[i + 2 + j * c_dim1] = f31;
1040 			  c[i + 3 + j * c_dim1] = f41;
1041 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1042 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1043 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1044 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1045 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1046 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1047 			}
1048 		      if (uisec < isec)
1049 			{
1050 			  i5 = ii + isec - 1;
1051 			  for (i = ii + uisec; i <= i5; ++i)
1052 			    {
1053 			      f11 = c[i + j * c_dim1];
1054 			      f12 = c[i + (j + 1) * c_dim1];
1055 			      f13 = c[i + (j + 2) * c_dim1];
1056 			      f14 = c[i + (j + 3) * c_dim1];
1057 			      i6 = ll + lsec - 1;
1058 			      for (l = ll; l <= i6; ++l)
1059 				{
1060 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061 					  257] * b[l + j * b_dim1];
1062 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063 					  257] * b[l + (j + 1) * b_dim1];
1064 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065 					  257] * b[l + (j + 2) * b_dim1];
1066 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067 					  257] * b[l + (j + 3) * b_dim1];
1068 				}
1069 			      c[i + j * c_dim1] = f11;
1070 			      c[i + (j + 1) * c_dim1] = f12;
1071 			      c[i + (j + 2) * c_dim1] = f13;
1072 			      c[i + (j + 3) * c_dim1] = f14;
1073 			    }
1074 			}
1075 		    }
1076 		  if (ujsec < jsec)
1077 		    {
1078 		      i4 = jj + jsec - 1;
1079 		      for (j = jj + ujsec; j <= i4; ++j)
1080 			{
1081 			  i5 = ii + uisec - 1;
1082 			  for (i = ii; i <= i5; i += 4)
1083 			    {
1084 			      f11 = c[i + j * c_dim1];
1085 			      f21 = c[i + 1 + j * c_dim1];
1086 			      f31 = c[i + 2 + j * c_dim1];
1087 			      f41 = c[i + 3 + j * c_dim1];
1088 			      i6 = ll + lsec - 1;
1089 			      for (l = ll; l <= i6; ++l)
1090 				{
1091 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092 					  257] * b[l + j * b_dim1];
1093 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094 					  257] * b[l + j * b_dim1];
1095 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096 					  257] * b[l + j * b_dim1];
1097 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098 					  257] * b[l + j * b_dim1];
1099 				}
1100 			      c[i + j * c_dim1] = f11;
1101 			      c[i + 1 + j * c_dim1] = f21;
1102 			      c[i + 2 + j * c_dim1] = f31;
1103 			      c[i + 3 + j * c_dim1] = f41;
1104 			    }
1105 			  i5 = ii + isec - 1;
1106 			  for (i = ii + uisec; i <= i5; ++i)
1107 			    {
1108 			      f11 = c[i + j * c_dim1];
1109 			      i6 = ll + lsec - 1;
1110 			      for (l = ll; l <= i6; ++l)
1111 				{
1112 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113 					  257] * b[l + j * b_dim1];
1114 				}
1115 			      c[i + j * c_dim1] = f11;
1116 			    }
1117 			}
1118 		    }
1119 		}
1120 	    }
1121 	}
1122       free(t1);
1123       return;
1124     }
1125   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126     {
1127       if (GFC_DESCRIPTOR_RANK (a) != 1)
1128 	{
1129 	  const GFC_REAL_4 *restrict abase_x;
1130 	  const GFC_REAL_4 *restrict bbase_y;
1131 	  GFC_REAL_4 *restrict dest_y;
1132 	  GFC_REAL_4 s;
1133 
1134 	  for (y = 0; y < ycount; y++)
1135 	    {
1136 	      bbase_y = &bbase[y*bystride];
1137 	      dest_y = &dest[y*rystride];
1138 	      for (x = 0; x < xcount; x++)
1139 		{
1140 		  abase_x = &abase[x*axstride];
1141 		  s = (GFC_REAL_4) 0;
1142 		  for (n = 0; n < count; n++)
1143 		    s += abase_x[n] * bbase_y[n];
1144 		  dest_y[x] = s;
1145 		}
1146 	    }
1147 	}
1148       else
1149 	{
1150 	  const GFC_REAL_4 *restrict bbase_y;
1151 	  GFC_REAL_4 s;
1152 
1153 	  for (y = 0; y < ycount; y++)
1154 	    {
1155 	      bbase_y = &bbase[y*bystride];
1156 	      s = (GFC_REAL_4) 0;
1157 	      for (n = 0; n < count; n++)
1158 		s += abase[n*axstride] * bbase_y[n];
1159 	      dest[y*rystride] = s;
1160 	    }
1161 	}
1162     }
1163   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164     {
1165       const GFC_REAL_4 *restrict bbase_y;
1166       GFC_REAL_4 s;
1167 
1168       for (y = 0; y < ycount; y++)
1169 	{
1170 	  bbase_y = &bbase[y*bystride];
1171 	  s = (GFC_REAL_4) 0;
1172 	  for (n = 0; n < count; n++)
1173 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1174 	  dest[y*rxstride] = s;
1175 	}
1176     }
1177   else if (axstride < aystride)
1178     {
1179       for (y = 0; y < ycount; y++)
1180 	for (x = 0; x < xcount; x++)
1181 	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1182 
1183       for (y = 0; y < ycount; y++)
1184 	for (n = 0; n < count; n++)
1185 	  for (x = 0; x < xcount; x++)
1186 	    /* dest[x,y] += a[x,n] * b[n,y] */
1187 	    dest[x*rxstride + y*rystride] +=
1188 					abase[x*axstride + n*aystride] *
1189 					bbase[n*bxstride + y*bystride];
1190     }
1191   else
1192     {
1193       const GFC_REAL_4 *restrict abase_x;
1194       const GFC_REAL_4 *restrict bbase_y;
1195       GFC_REAL_4 *restrict dest_y;
1196       GFC_REAL_4 s;
1197 
1198       for (y = 0; y < ycount; y++)
1199 	{
1200 	  bbase_y = &bbase[y*bystride];
1201 	  dest_y = &dest[y*rystride];
1202 	  for (x = 0; x < xcount; x++)
1203 	    {
1204 	      abase_x = &abase[x*axstride];
1205 	      s = (GFC_REAL_4) 0;
1206 	      for (n = 0; n < count; n++)
1207 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208 	      dest_y[x*rxstride] = s;
1209 	    }
1210 	}
1211     }
1212 }
1213 #undef POW3
1214 #undef min
1215 #undef max
1216 
1217 #endif /* HAVE_AVX2 */
1218 
1219 #ifdef HAVE_AVX512F
1220 static void
1221 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1222 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1223 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224 static void
matmul_r4_avx512f(gfc_array_r4 * const restrict retarray,gfc_array_r4 * const restrict a,gfc_array_r4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1225 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1226 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1227 	int blas_limit, blas_call gemm)
1228 {
1229   const GFC_REAL_4 * restrict abase;
1230   const GFC_REAL_4 * restrict bbase;
1231   GFC_REAL_4 * restrict dest;
1232 
1233   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234   index_type x, y, n, count, xcount, ycount;
1235 
1236   assert (GFC_DESCRIPTOR_RANK (a) == 2
1237           || GFC_DESCRIPTOR_RANK (b) == 2);
1238 
1239 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240 
1241    Either A or B (but not both) can be rank 1:
1242 
1243    o One-dimensional argument A is implicitly treated as a row matrix
1244      dimensioned [1,count], so xcount=1.
1245 
1246    o One-dimensional argument B is implicitly treated as a column matrix
1247      dimensioned [count, 1], so ycount=1.
1248 */
1249 
1250   if (retarray->base_addr == NULL)
1251     {
1252       if (GFC_DESCRIPTOR_RANK (a) == 1)
1253         {
1254 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1255 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256         }
1257       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258         {
1259 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1260 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261         }
1262       else
1263         {
1264 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1265 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266 
1267           GFC_DIMENSION_SET(retarray->dim[1], 0,
1268 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1270         }
1271 
1272       retarray->base_addr
1273 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1274       retarray->offset = 0;
1275     }
1276   else if (unlikely (compile_options.bounds_check))
1277     {
1278       index_type ret_extent, arg_extent;
1279 
1280       if (GFC_DESCRIPTOR_RANK (a) == 1)
1281 	{
1282 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284 	  if (arg_extent != ret_extent)
1285 	    runtime_error ("Array bound mismatch for dimension 1 of "
1286 	    		   "array (%ld/%ld) ",
1287 			   (long int) ret_extent, (long int) arg_extent);
1288 	}
1289       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290 	{
1291 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293 	  if (arg_extent != ret_extent)
1294 	    runtime_error ("Array bound mismatch for dimension 1 of "
1295 	    		   "array (%ld/%ld) ",
1296 			   (long int) ret_extent, (long int) arg_extent);
1297 	}
1298       else
1299 	{
1300 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302 	  if (arg_extent != ret_extent)
1303 	    runtime_error ("Array bound mismatch for dimension 1 of "
1304 	    		   "array (%ld/%ld) ",
1305 			   (long int) ret_extent, (long int) arg_extent);
1306 
1307 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309 	  if (arg_extent != ret_extent)
1310 	    runtime_error ("Array bound mismatch for dimension 2 of "
1311 	    		   "array (%ld/%ld) ",
1312 			   (long int) ret_extent, (long int) arg_extent);
1313 	}
1314     }
1315 
1316 
1317   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318     {
1319       /* One-dimensional result may be addressed in the code below
1320 	 either as a row or a column matrix. We want both cases to
1321 	 work. */
1322       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323     }
1324   else
1325     {
1326       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328     }
1329 
1330 
1331   if (GFC_DESCRIPTOR_RANK (a) == 1)
1332     {
1333       /* Treat it as a a row matrix A[1,count]. */
1334       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335       aystride = 1;
1336 
1337       xcount = 1;
1338       count = GFC_DESCRIPTOR_EXTENT(a,0);
1339     }
1340   else
1341     {
1342       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344 
1345       count = GFC_DESCRIPTOR_EXTENT(a,1);
1346       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347     }
1348 
1349   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350     {
1351       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353 		       "in dimension 1: is %ld, should be %ld",
1354 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355     }
1356 
1357   if (GFC_DESCRIPTOR_RANK (b) == 1)
1358     {
1359       /* Treat it as a column matrix B[count,1] */
1360       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361 
1362       /* bystride should never be used for 1-dimensional b.
1363          The value is only used for calculation of the
1364          memory by the buffer.  */
1365       bystride = 256;
1366       ycount = 1;
1367     }
1368   else
1369     {
1370       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373     }
1374 
1375   abase = a->base_addr;
1376   bbase = b->base_addr;
1377   dest = retarray->base_addr;
1378 
1379   /* Now that everything is set up, we perform the multiplication
1380      itself.  */
1381 
1382 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383 #define min(a,b) ((a) <= (b) ? (a) : (b))
1384 #define max(a,b) ((a) >= (b) ? (a) : (b))
1385 
1386   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387       && (bxstride == 1 || bystride == 1)
1388       && (((float) xcount) * ((float) ycount) * ((float) count)
1389           > POW3(blas_limit)))
1390     {
1391       const int m = xcount, n = ycount, k = count, ldc = rystride;
1392       const GFC_REAL_4 one = 1, zero = 0;
1393       const int lda = (axstride == 1) ? aystride : axstride,
1394 		ldb = (bxstride == 1) ? bystride : bxstride;
1395 
1396       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397 	{
1398 	  assert (gemm != NULL);
1399 	  const char *transa, *transb;
1400 	  if (try_blas & 2)
1401 	    transa = "C";
1402 	  else
1403 	    transa = axstride == 1 ? "N" : "T";
1404 
1405 	  if (try_blas & 4)
1406 	    transb = "C";
1407 	  else
1408 	    transb = bxstride == 1 ? "N" : "T";
1409 
1410 	  gemm (transa, transb , &m,
1411 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1412 		&ldc, 1, 1);
1413 	  return;
1414 	}
1415     }
1416 
1417   if (rxstride == 1 && axstride == 1 && bxstride == 1
1418       && GFC_DESCRIPTOR_RANK (b) != 1)
1419     {
1420       /* This block of code implements a tuned matmul, derived from
1421          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1422 
1423                Bo Kagstrom and Per Ling
1424                Department of Computing Science
1425                Umea University
1426                S-901 87 Umea, Sweden
1427 
1428 	 from netlib.org, translated to C, and modified for matmul.m4.  */
1429 
1430       const GFC_REAL_4 *a, *b;
1431       GFC_REAL_4 *c;
1432       const index_type m = xcount, n = ycount, k = count;
1433 
1434       /* System generated locals */
1435       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436 		 i1, i2, i3, i4, i5, i6;
1437 
1438       /* Local variables */
1439       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1440 		 f13, f14, f23, f24, f33, f34, f43, f44;
1441       index_type i, j, l, ii, jj, ll;
1442       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443       GFC_REAL_4 *t1;
1444 
1445       a = abase;
1446       b = bbase;
1447       c = retarray->base_addr;
1448 
1449       /* Parameter adjustments */
1450       c_dim1 = rystride;
1451       c_offset = 1 + c_dim1;
1452       c -= c_offset;
1453       a_dim1 = aystride;
1454       a_offset = 1 + a_dim1;
1455       a -= a_offset;
1456       b_dim1 = bystride;
1457       b_offset = 1 + b_dim1;
1458       b -= b_offset;
1459 
1460       /* Empty c first.  */
1461       for (j=1; j<=n; j++)
1462 	for (i=1; i<=m; i++)
1463 	  c[i + j * c_dim1] = (GFC_REAL_4)0;
1464 
1465       /* Early exit if possible */
1466       if (m == 0 || n == 0 || k == 0)
1467 	return;
1468 
1469       /* Adjust size of t1 to what is needed.  */
1470       index_type t1_dim, a_sz;
1471       if (aystride == 1)
1472         a_sz = rystride;
1473       else
1474         a_sz = a_dim1;
1475 
1476       t1_dim = a_sz * 256 + b_dim1;
1477       if (t1_dim > 65536)
1478 	t1_dim = 65536;
1479 
1480       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
1481 
1482       /* Start turning the crank. */
1483       i1 = n;
1484       for (jj = 1; jj <= i1; jj += 512)
1485 	{
1486 	  /* Computing MIN */
1487 	  i2 = 512;
1488 	  i3 = n - jj + 1;
1489 	  jsec = min(i2,i3);
1490 	  ujsec = jsec - jsec % 4;
1491 	  i2 = k;
1492 	  for (ll = 1; ll <= i2; ll += 256)
1493 	    {
1494 	      /* Computing MIN */
1495 	      i3 = 256;
1496 	      i4 = k - ll + 1;
1497 	      lsec = min(i3,i4);
1498 	      ulsec = lsec - lsec % 2;
1499 
1500 	      i3 = m;
1501 	      for (ii = 1; ii <= i3; ii += 256)
1502 		{
1503 		  /* Computing MIN */
1504 		  i4 = 256;
1505 		  i5 = m - ii + 1;
1506 		  isec = min(i4,i5);
1507 		  uisec = isec - isec % 2;
1508 		  i4 = ll + ulsec - 1;
1509 		  for (l = ll; l <= i4; l += 2)
1510 		    {
1511 		      i5 = ii + uisec - 1;
1512 		      for (i = ii; i <= i5; i += 2)
1513 			{
1514 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515 					a[i + l * a_dim1];
1516 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517 					a[i + (l + 1) * a_dim1];
1518 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519 					a[i + 1 + l * a_dim1];
1520 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521 					a[i + 1 + (l + 1) * a_dim1];
1522 			}
1523 		      if (uisec < isec)
1524 			{
1525 			  t1[l - ll + 1 + (isec << 8) - 257] =
1526 				    a[ii + isec - 1 + l * a_dim1];
1527 			  t1[l - ll + 2 + (isec << 8) - 257] =
1528 				    a[ii + isec - 1 + (l + 1) * a_dim1];
1529 			}
1530 		    }
1531 		  if (ulsec < lsec)
1532 		    {
1533 		      i4 = ii + isec - 1;
1534 		      for (i = ii; i<= i4; ++i)
1535 			{
1536 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1537 				    a[i + (ll + lsec - 1) * a_dim1];
1538 			}
1539 		    }
1540 
1541 		  uisec = isec - isec % 4;
1542 		  i4 = jj + ujsec - 1;
1543 		  for (j = jj; j <= i4; j += 4)
1544 		    {
1545 		      i5 = ii + uisec - 1;
1546 		      for (i = ii; i <= i5; i += 4)
1547 			{
1548 			  f11 = c[i + j * c_dim1];
1549 			  f21 = c[i + 1 + j * c_dim1];
1550 			  f12 = c[i + (j + 1) * c_dim1];
1551 			  f22 = c[i + 1 + (j + 1) * c_dim1];
1552 			  f13 = c[i + (j + 2) * c_dim1];
1553 			  f23 = c[i + 1 + (j + 2) * c_dim1];
1554 			  f14 = c[i + (j + 3) * c_dim1];
1555 			  f24 = c[i + 1 + (j + 3) * c_dim1];
1556 			  f31 = c[i + 2 + j * c_dim1];
1557 			  f41 = c[i + 3 + j * c_dim1];
1558 			  f32 = c[i + 2 + (j + 1) * c_dim1];
1559 			  f42 = c[i + 3 + (j + 1) * c_dim1];
1560 			  f33 = c[i + 2 + (j + 2) * c_dim1];
1561 			  f43 = c[i + 3 + (j + 2) * c_dim1];
1562 			  f34 = c[i + 2 + (j + 3) * c_dim1];
1563 			  f44 = c[i + 3 + (j + 3) * c_dim1];
1564 			  i6 = ll + lsec - 1;
1565 			  for (l = ll; l <= i6; ++l)
1566 			    {
1567 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568 				      * b[l + j * b_dim1];
1569 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570 				      * b[l + j * b_dim1];
1571 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572 				      * b[l + (j + 1) * b_dim1];
1573 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574 				      * b[l + (j + 1) * b_dim1];
1575 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576 				      * b[l + (j + 2) * b_dim1];
1577 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578 				      * b[l + (j + 2) * b_dim1];
1579 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580 				      * b[l + (j + 3) * b_dim1];
1581 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582 				      * b[l + (j + 3) * b_dim1];
1583 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584 				      * b[l + j * b_dim1];
1585 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586 				      * b[l + j * b_dim1];
1587 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588 				      * b[l + (j + 1) * b_dim1];
1589 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590 				      * b[l + (j + 1) * b_dim1];
1591 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592 				      * b[l + (j + 2) * b_dim1];
1593 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594 				      * b[l + (j + 2) * b_dim1];
1595 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596 				      * b[l + (j + 3) * b_dim1];
1597 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598 				      * b[l + (j + 3) * b_dim1];
1599 			    }
1600 			  c[i + j * c_dim1] = f11;
1601 			  c[i + 1 + j * c_dim1] = f21;
1602 			  c[i + (j + 1) * c_dim1] = f12;
1603 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1604 			  c[i + (j + 2) * c_dim1] = f13;
1605 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1606 			  c[i + (j + 3) * c_dim1] = f14;
1607 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1608 			  c[i + 2 + j * c_dim1] = f31;
1609 			  c[i + 3 + j * c_dim1] = f41;
1610 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1611 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1612 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1613 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1614 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1615 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1616 			}
1617 		      if (uisec < isec)
1618 			{
1619 			  i5 = ii + isec - 1;
1620 			  for (i = ii + uisec; i <= i5; ++i)
1621 			    {
1622 			      f11 = c[i + j * c_dim1];
1623 			      f12 = c[i + (j + 1) * c_dim1];
1624 			      f13 = c[i + (j + 2) * c_dim1];
1625 			      f14 = c[i + (j + 3) * c_dim1];
1626 			      i6 = ll + lsec - 1;
1627 			      for (l = ll; l <= i6; ++l)
1628 				{
1629 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630 					  257] * b[l + j * b_dim1];
1631 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632 					  257] * b[l + (j + 1) * b_dim1];
1633 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634 					  257] * b[l + (j + 2) * b_dim1];
1635 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636 					  257] * b[l + (j + 3) * b_dim1];
1637 				}
1638 			      c[i + j * c_dim1] = f11;
1639 			      c[i + (j + 1) * c_dim1] = f12;
1640 			      c[i + (j + 2) * c_dim1] = f13;
1641 			      c[i + (j + 3) * c_dim1] = f14;
1642 			    }
1643 			}
1644 		    }
1645 		  if (ujsec < jsec)
1646 		    {
1647 		      i4 = jj + jsec - 1;
1648 		      for (j = jj + ujsec; j <= i4; ++j)
1649 			{
1650 			  i5 = ii + uisec - 1;
1651 			  for (i = ii; i <= i5; i += 4)
1652 			    {
1653 			      f11 = c[i + j * c_dim1];
1654 			      f21 = c[i + 1 + j * c_dim1];
1655 			      f31 = c[i + 2 + j * c_dim1];
1656 			      f41 = c[i + 3 + j * c_dim1];
1657 			      i6 = ll + lsec - 1;
1658 			      for (l = ll; l <= i6; ++l)
1659 				{
1660 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661 					  257] * b[l + j * b_dim1];
1662 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663 					  257] * b[l + j * b_dim1];
1664 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665 					  257] * b[l + j * b_dim1];
1666 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667 					  257] * b[l + j * b_dim1];
1668 				}
1669 			      c[i + j * c_dim1] = f11;
1670 			      c[i + 1 + j * c_dim1] = f21;
1671 			      c[i + 2 + j * c_dim1] = f31;
1672 			      c[i + 3 + j * c_dim1] = f41;
1673 			    }
1674 			  i5 = ii + isec - 1;
1675 			  for (i = ii + uisec; i <= i5; ++i)
1676 			    {
1677 			      f11 = c[i + j * c_dim1];
1678 			      i6 = ll + lsec - 1;
1679 			      for (l = ll; l <= i6; ++l)
1680 				{
1681 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682 					  257] * b[l + j * b_dim1];
1683 				}
1684 			      c[i + j * c_dim1] = f11;
1685 			    }
1686 			}
1687 		    }
1688 		}
1689 	    }
1690 	}
1691       free(t1);
1692       return;
1693     }
1694   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695     {
1696       if (GFC_DESCRIPTOR_RANK (a) != 1)
1697 	{
1698 	  const GFC_REAL_4 *restrict abase_x;
1699 	  const GFC_REAL_4 *restrict bbase_y;
1700 	  GFC_REAL_4 *restrict dest_y;
1701 	  GFC_REAL_4 s;
1702 
1703 	  for (y = 0; y < ycount; y++)
1704 	    {
1705 	      bbase_y = &bbase[y*bystride];
1706 	      dest_y = &dest[y*rystride];
1707 	      for (x = 0; x < xcount; x++)
1708 		{
1709 		  abase_x = &abase[x*axstride];
1710 		  s = (GFC_REAL_4) 0;
1711 		  for (n = 0; n < count; n++)
1712 		    s += abase_x[n] * bbase_y[n];
1713 		  dest_y[x] = s;
1714 		}
1715 	    }
1716 	}
1717       else
1718 	{
1719 	  const GFC_REAL_4 *restrict bbase_y;
1720 	  GFC_REAL_4 s;
1721 
1722 	  for (y = 0; y < ycount; y++)
1723 	    {
1724 	      bbase_y = &bbase[y*bystride];
1725 	      s = (GFC_REAL_4) 0;
1726 	      for (n = 0; n < count; n++)
1727 		s += abase[n*axstride] * bbase_y[n];
1728 	      dest[y*rystride] = s;
1729 	    }
1730 	}
1731     }
1732   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733     {
1734       const GFC_REAL_4 *restrict bbase_y;
1735       GFC_REAL_4 s;
1736 
1737       for (y = 0; y < ycount; y++)
1738 	{
1739 	  bbase_y = &bbase[y*bystride];
1740 	  s = (GFC_REAL_4) 0;
1741 	  for (n = 0; n < count; n++)
1742 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1743 	  dest[y*rxstride] = s;
1744 	}
1745     }
1746   else if (axstride < aystride)
1747     {
1748       for (y = 0; y < ycount; y++)
1749 	for (x = 0; x < xcount; x++)
1750 	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1751 
1752       for (y = 0; y < ycount; y++)
1753 	for (n = 0; n < count; n++)
1754 	  for (x = 0; x < xcount; x++)
1755 	    /* dest[x,y] += a[x,n] * b[n,y] */
1756 	    dest[x*rxstride + y*rystride] +=
1757 					abase[x*axstride + n*aystride] *
1758 					bbase[n*bxstride + y*bystride];
1759     }
1760   else
1761     {
1762       const GFC_REAL_4 *restrict abase_x;
1763       const GFC_REAL_4 *restrict bbase_y;
1764       GFC_REAL_4 *restrict dest_y;
1765       GFC_REAL_4 s;
1766 
1767       for (y = 0; y < ycount; y++)
1768 	{
1769 	  bbase_y = &bbase[y*bystride];
1770 	  dest_y = &dest[y*rystride];
1771 	  for (x = 0; x < xcount; x++)
1772 	    {
1773 	      abase_x = &abase[x*axstride];
1774 	      s = (GFC_REAL_4) 0;
1775 	      for (n = 0; n < count; n++)
1776 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777 	      dest_y[x*rxstride] = s;
1778 	    }
1779 	}
1780     }
1781 }
1782 #undef POW3
1783 #undef min
1784 #undef max
1785 
1786 #endif  /* HAVE_AVX512F */
1787 
1788 /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1789 
1790 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791 void
1792 matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
1793 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1794 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795 internal_proto(matmul_r4_avx128_fma3);
1796 #endif
1797 
1798 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799 void
1800 matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
1801 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1802 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803 internal_proto(matmul_r4_avx128_fma4);
1804 #endif
1805 
1806 /* Function to fall back to if there is no special processor-specific version.  */
1807 static void
matmul_r4_vanilla(gfc_array_r4 * const restrict retarray,gfc_array_r4 * const restrict a,gfc_array_r4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1808 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1809 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1810 	int blas_limit, blas_call gemm)
1811 {
1812   const GFC_REAL_4 * restrict abase;
1813   const GFC_REAL_4 * restrict bbase;
1814   GFC_REAL_4 * restrict dest;
1815 
1816   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817   index_type x, y, n, count, xcount, ycount;
1818 
1819   assert (GFC_DESCRIPTOR_RANK (a) == 2
1820           || GFC_DESCRIPTOR_RANK (b) == 2);
1821 
1822 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823 
1824    Either A or B (but not both) can be rank 1:
1825 
1826    o One-dimensional argument A is implicitly treated as a row matrix
1827      dimensioned [1,count], so xcount=1.
1828 
1829    o One-dimensional argument B is implicitly treated as a column matrix
1830      dimensioned [count, 1], so ycount=1.
1831 */
1832 
1833   if (retarray->base_addr == NULL)
1834     {
1835       if (GFC_DESCRIPTOR_RANK (a) == 1)
1836         {
1837 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1838 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839         }
1840       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841         {
1842 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1843 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844         }
1845       else
1846         {
1847 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1848 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849 
1850           GFC_DIMENSION_SET(retarray->dim[1], 0,
1851 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1853         }
1854 
1855       retarray->base_addr
1856 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1857       retarray->offset = 0;
1858     }
1859   else if (unlikely (compile_options.bounds_check))
1860     {
1861       index_type ret_extent, arg_extent;
1862 
1863       if (GFC_DESCRIPTOR_RANK (a) == 1)
1864 	{
1865 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867 	  if (arg_extent != ret_extent)
1868 	    runtime_error ("Array bound mismatch for dimension 1 of "
1869 	    		   "array (%ld/%ld) ",
1870 			   (long int) ret_extent, (long int) arg_extent);
1871 	}
1872       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873 	{
1874 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876 	  if (arg_extent != ret_extent)
1877 	    runtime_error ("Array bound mismatch for dimension 1 of "
1878 	    		   "array (%ld/%ld) ",
1879 			   (long int) ret_extent, (long int) arg_extent);
1880 	}
1881       else
1882 	{
1883 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885 	  if (arg_extent != ret_extent)
1886 	    runtime_error ("Array bound mismatch for dimension 1 of "
1887 	    		   "array (%ld/%ld) ",
1888 			   (long int) ret_extent, (long int) arg_extent);
1889 
1890 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892 	  if (arg_extent != ret_extent)
1893 	    runtime_error ("Array bound mismatch for dimension 2 of "
1894 	    		   "array (%ld/%ld) ",
1895 			   (long int) ret_extent, (long int) arg_extent);
1896 	}
1897     }
1898 
1899 
1900   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901     {
1902       /* One-dimensional result may be addressed in the code below
1903 	 either as a row or a column matrix. We want both cases to
1904 	 work. */
1905       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906     }
1907   else
1908     {
1909       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911     }
1912 
1913 
1914   if (GFC_DESCRIPTOR_RANK (a) == 1)
1915     {
1916       /* Treat it as a a row matrix A[1,count]. */
1917       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918       aystride = 1;
1919 
1920       xcount = 1;
1921       count = GFC_DESCRIPTOR_EXTENT(a,0);
1922     }
1923   else
1924     {
1925       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927 
1928       count = GFC_DESCRIPTOR_EXTENT(a,1);
1929       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930     }
1931 
1932   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933     {
1934       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936 		       "in dimension 1: is %ld, should be %ld",
1937 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938     }
1939 
1940   if (GFC_DESCRIPTOR_RANK (b) == 1)
1941     {
1942       /* Treat it as a column matrix B[count,1] */
1943       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944 
1945       /* bystride should never be used for 1-dimensional b.
1946          The value is only used for calculation of the
1947          memory by the buffer.  */
1948       bystride = 256;
1949       ycount = 1;
1950     }
1951   else
1952     {
1953       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956     }
1957 
1958   abase = a->base_addr;
1959   bbase = b->base_addr;
1960   dest = retarray->base_addr;
1961 
1962   /* Now that everything is set up, we perform the multiplication
1963      itself.  */
1964 
1965 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966 #define min(a,b) ((a) <= (b) ? (a) : (b))
1967 #define max(a,b) ((a) >= (b) ? (a) : (b))
1968 
1969   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970       && (bxstride == 1 || bystride == 1)
1971       && (((float) xcount) * ((float) ycount) * ((float) count)
1972           > POW3(blas_limit)))
1973     {
1974       const int m = xcount, n = ycount, k = count, ldc = rystride;
1975       const GFC_REAL_4 one = 1, zero = 0;
1976       const int lda = (axstride == 1) ? aystride : axstride,
1977 		ldb = (bxstride == 1) ? bystride : bxstride;
1978 
1979       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980 	{
1981 	  assert (gemm != NULL);
1982 	  const char *transa, *transb;
1983 	  if (try_blas & 2)
1984 	    transa = "C";
1985 	  else
1986 	    transa = axstride == 1 ? "N" : "T";
1987 
1988 	  if (try_blas & 4)
1989 	    transb = "C";
1990 	  else
1991 	    transb = bxstride == 1 ? "N" : "T";
1992 
1993 	  gemm (transa, transb , &m,
1994 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1995 		&ldc, 1, 1);
1996 	  return;
1997 	}
1998     }
1999 
2000   if (rxstride == 1 && axstride == 1 && bxstride == 1
2001       && GFC_DESCRIPTOR_RANK (b) != 1)
2002     {
2003       /* This block of code implements a tuned matmul, derived from
2004          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2005 
2006                Bo Kagstrom and Per Ling
2007                Department of Computing Science
2008                Umea University
2009                S-901 87 Umea, Sweden
2010 
2011 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2012 
2013       const GFC_REAL_4 *a, *b;
2014       GFC_REAL_4 *c;
2015       const index_type m = xcount, n = ycount, k = count;
2016 
2017       /* System generated locals */
2018       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019 		 i1, i2, i3, i4, i5, i6;
2020 
2021       /* Local variables */
2022       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2023 		 f13, f14, f23, f24, f33, f34, f43, f44;
2024       index_type i, j, l, ii, jj, ll;
2025       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026       GFC_REAL_4 *t1;
2027 
2028       a = abase;
2029       b = bbase;
2030       c = retarray->base_addr;
2031 
2032       /* Parameter adjustments */
2033       c_dim1 = rystride;
2034       c_offset = 1 + c_dim1;
2035       c -= c_offset;
2036       a_dim1 = aystride;
2037       a_offset = 1 + a_dim1;
2038       a -= a_offset;
2039       b_dim1 = bystride;
2040       b_offset = 1 + b_dim1;
2041       b -= b_offset;
2042 
2043       /* Empty c first.  */
2044       for (j=1; j<=n; j++)
2045 	for (i=1; i<=m; i++)
2046 	  c[i + j * c_dim1] = (GFC_REAL_4)0;
2047 
2048       /* Early exit if possible */
2049       if (m == 0 || n == 0 || k == 0)
2050 	return;
2051 
2052       /* Adjust size of t1 to what is needed.  */
2053       index_type t1_dim, a_sz;
2054       if (aystride == 1)
2055         a_sz = rystride;
2056       else
2057         a_sz = a_dim1;
2058 
2059       t1_dim = a_sz * 256 + b_dim1;
2060       if (t1_dim > 65536)
2061 	t1_dim = 65536;
2062 
2063       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
2064 
2065       /* Start turning the crank. */
2066       i1 = n;
2067       for (jj = 1; jj <= i1; jj += 512)
2068 	{
2069 	  /* Computing MIN */
2070 	  i2 = 512;
2071 	  i3 = n - jj + 1;
2072 	  jsec = min(i2,i3);
2073 	  ujsec = jsec - jsec % 4;
2074 	  i2 = k;
2075 	  for (ll = 1; ll <= i2; ll += 256)
2076 	    {
2077 	      /* Computing MIN */
2078 	      i3 = 256;
2079 	      i4 = k - ll + 1;
2080 	      lsec = min(i3,i4);
2081 	      ulsec = lsec - lsec % 2;
2082 
2083 	      i3 = m;
2084 	      for (ii = 1; ii <= i3; ii += 256)
2085 		{
2086 		  /* Computing MIN */
2087 		  i4 = 256;
2088 		  i5 = m - ii + 1;
2089 		  isec = min(i4,i5);
2090 		  uisec = isec - isec % 2;
2091 		  i4 = ll + ulsec - 1;
2092 		  for (l = ll; l <= i4; l += 2)
2093 		    {
2094 		      i5 = ii + uisec - 1;
2095 		      for (i = ii; i <= i5; i += 2)
2096 			{
2097 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098 					a[i + l * a_dim1];
2099 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100 					a[i + (l + 1) * a_dim1];
2101 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102 					a[i + 1 + l * a_dim1];
2103 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104 					a[i + 1 + (l + 1) * a_dim1];
2105 			}
2106 		      if (uisec < isec)
2107 			{
2108 			  t1[l - ll + 1 + (isec << 8) - 257] =
2109 				    a[ii + isec - 1 + l * a_dim1];
2110 			  t1[l - ll + 2 + (isec << 8) - 257] =
2111 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2112 			}
2113 		    }
2114 		  if (ulsec < lsec)
2115 		    {
2116 		      i4 = ii + isec - 1;
2117 		      for (i = ii; i<= i4; ++i)
2118 			{
2119 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2120 				    a[i + (ll + lsec - 1) * a_dim1];
2121 			}
2122 		    }
2123 
2124 		  uisec = isec - isec % 4;
2125 		  i4 = jj + ujsec - 1;
2126 		  for (j = jj; j <= i4; j += 4)
2127 		    {
2128 		      i5 = ii + uisec - 1;
2129 		      for (i = ii; i <= i5; i += 4)
2130 			{
2131 			  f11 = c[i + j * c_dim1];
2132 			  f21 = c[i + 1 + j * c_dim1];
2133 			  f12 = c[i + (j + 1) * c_dim1];
2134 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2135 			  f13 = c[i + (j + 2) * c_dim1];
2136 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2137 			  f14 = c[i + (j + 3) * c_dim1];
2138 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2139 			  f31 = c[i + 2 + j * c_dim1];
2140 			  f41 = c[i + 3 + j * c_dim1];
2141 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2142 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2143 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2144 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2145 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2146 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2147 			  i6 = ll + lsec - 1;
2148 			  for (l = ll; l <= i6; ++l)
2149 			    {
2150 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151 				      * b[l + j * b_dim1];
2152 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153 				      * b[l + j * b_dim1];
2154 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155 				      * b[l + (j + 1) * b_dim1];
2156 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157 				      * b[l + (j + 1) * b_dim1];
2158 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159 				      * b[l + (j + 2) * b_dim1];
2160 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161 				      * b[l + (j + 2) * b_dim1];
2162 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163 				      * b[l + (j + 3) * b_dim1];
2164 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165 				      * b[l + (j + 3) * b_dim1];
2166 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167 				      * b[l + j * b_dim1];
2168 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169 				      * b[l + j * b_dim1];
2170 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171 				      * b[l + (j + 1) * b_dim1];
2172 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173 				      * b[l + (j + 1) * b_dim1];
2174 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175 				      * b[l + (j + 2) * b_dim1];
2176 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177 				      * b[l + (j + 2) * b_dim1];
2178 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179 				      * b[l + (j + 3) * b_dim1];
2180 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181 				      * b[l + (j + 3) * b_dim1];
2182 			    }
2183 			  c[i + j * c_dim1] = f11;
2184 			  c[i + 1 + j * c_dim1] = f21;
2185 			  c[i + (j + 1) * c_dim1] = f12;
2186 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2187 			  c[i + (j + 2) * c_dim1] = f13;
2188 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2189 			  c[i + (j + 3) * c_dim1] = f14;
2190 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2191 			  c[i + 2 + j * c_dim1] = f31;
2192 			  c[i + 3 + j * c_dim1] = f41;
2193 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2194 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2195 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2196 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2197 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2198 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2199 			}
2200 		      if (uisec < isec)
2201 			{
2202 			  i5 = ii + isec - 1;
2203 			  for (i = ii + uisec; i <= i5; ++i)
2204 			    {
2205 			      f11 = c[i + j * c_dim1];
2206 			      f12 = c[i + (j + 1) * c_dim1];
2207 			      f13 = c[i + (j + 2) * c_dim1];
2208 			      f14 = c[i + (j + 3) * c_dim1];
2209 			      i6 = ll + lsec - 1;
2210 			      for (l = ll; l <= i6; ++l)
2211 				{
2212 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213 					  257] * b[l + j * b_dim1];
2214 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215 					  257] * b[l + (j + 1) * b_dim1];
2216 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 					  257] * b[l + (j + 2) * b_dim1];
2218 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219 					  257] * b[l + (j + 3) * b_dim1];
2220 				}
2221 			      c[i + j * c_dim1] = f11;
2222 			      c[i + (j + 1) * c_dim1] = f12;
2223 			      c[i + (j + 2) * c_dim1] = f13;
2224 			      c[i + (j + 3) * c_dim1] = f14;
2225 			    }
2226 			}
2227 		    }
2228 		  if (ujsec < jsec)
2229 		    {
2230 		      i4 = jj + jsec - 1;
2231 		      for (j = jj + ujsec; j <= i4; ++j)
2232 			{
2233 			  i5 = ii + uisec - 1;
2234 			  for (i = ii; i <= i5; i += 4)
2235 			    {
2236 			      f11 = c[i + j * c_dim1];
2237 			      f21 = c[i + 1 + j * c_dim1];
2238 			      f31 = c[i + 2 + j * c_dim1];
2239 			      f41 = c[i + 3 + j * c_dim1];
2240 			      i6 = ll + lsec - 1;
2241 			      for (l = ll; l <= i6; ++l)
2242 				{
2243 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244 					  257] * b[l + j * b_dim1];
2245 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246 					  257] * b[l + j * b_dim1];
2247 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248 					  257] * b[l + j * b_dim1];
2249 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250 					  257] * b[l + j * b_dim1];
2251 				}
2252 			      c[i + j * c_dim1] = f11;
2253 			      c[i + 1 + j * c_dim1] = f21;
2254 			      c[i + 2 + j * c_dim1] = f31;
2255 			      c[i + 3 + j * c_dim1] = f41;
2256 			    }
2257 			  i5 = ii + isec - 1;
2258 			  for (i = ii + uisec; i <= i5; ++i)
2259 			    {
2260 			      f11 = c[i + j * c_dim1];
2261 			      i6 = ll + lsec - 1;
2262 			      for (l = ll; l <= i6; ++l)
2263 				{
2264 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265 					  257] * b[l + j * b_dim1];
2266 				}
2267 			      c[i + j * c_dim1] = f11;
2268 			    }
2269 			}
2270 		    }
2271 		}
2272 	    }
2273 	}
2274       free(t1);
2275       return;
2276     }
2277   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278     {
2279       if (GFC_DESCRIPTOR_RANK (a) != 1)
2280 	{
2281 	  const GFC_REAL_4 *restrict abase_x;
2282 	  const GFC_REAL_4 *restrict bbase_y;
2283 	  GFC_REAL_4 *restrict dest_y;
2284 	  GFC_REAL_4 s;
2285 
2286 	  for (y = 0; y < ycount; y++)
2287 	    {
2288 	      bbase_y = &bbase[y*bystride];
2289 	      dest_y = &dest[y*rystride];
2290 	      for (x = 0; x < xcount; x++)
2291 		{
2292 		  abase_x = &abase[x*axstride];
2293 		  s = (GFC_REAL_4) 0;
2294 		  for (n = 0; n < count; n++)
2295 		    s += abase_x[n] * bbase_y[n];
2296 		  dest_y[x] = s;
2297 		}
2298 	    }
2299 	}
2300       else
2301 	{
2302 	  const GFC_REAL_4 *restrict bbase_y;
2303 	  GFC_REAL_4 s;
2304 
2305 	  for (y = 0; y < ycount; y++)
2306 	    {
2307 	      bbase_y = &bbase[y*bystride];
2308 	      s = (GFC_REAL_4) 0;
2309 	      for (n = 0; n < count; n++)
2310 		s += abase[n*axstride] * bbase_y[n];
2311 	      dest[y*rystride] = s;
2312 	    }
2313 	}
2314     }
2315   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316     {
2317       const GFC_REAL_4 *restrict bbase_y;
2318       GFC_REAL_4 s;
2319 
2320       for (y = 0; y < ycount; y++)
2321 	{
2322 	  bbase_y = &bbase[y*bystride];
2323 	  s = (GFC_REAL_4) 0;
2324 	  for (n = 0; n < count; n++)
2325 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2326 	  dest[y*rxstride] = s;
2327 	}
2328     }
2329   else if (axstride < aystride)
2330     {
2331       for (y = 0; y < ycount; y++)
2332 	for (x = 0; x < xcount; x++)
2333 	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2334 
2335       for (y = 0; y < ycount; y++)
2336 	for (n = 0; n < count; n++)
2337 	  for (x = 0; x < xcount; x++)
2338 	    /* dest[x,y] += a[x,n] * b[n,y] */
2339 	    dest[x*rxstride + y*rystride] +=
2340 					abase[x*axstride + n*aystride] *
2341 					bbase[n*bxstride + y*bystride];
2342     }
2343   else
2344     {
2345       const GFC_REAL_4 *restrict abase_x;
2346       const GFC_REAL_4 *restrict bbase_y;
2347       GFC_REAL_4 *restrict dest_y;
2348       GFC_REAL_4 s;
2349 
2350       for (y = 0; y < ycount; y++)
2351 	{
2352 	  bbase_y = &bbase[y*bystride];
2353 	  dest_y = &dest[y*rystride];
2354 	  for (x = 0; x < xcount; x++)
2355 	    {
2356 	      abase_x = &abase[x*axstride];
2357 	      s = (GFC_REAL_4) 0;
2358 	      for (n = 0; n < count; n++)
2359 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360 	      dest_y[x*rxstride] = s;
2361 	    }
2362 	}
2363     }
2364 }
2365 #undef POW3
2366 #undef min
2367 #undef max
2368 
2369 
2370 /* Compiling main function, with selection code for the processor.  */
2371 
2372 /* Currently, this is i386 only.  Adjust for other architectures.  */
2373 
matmul_r4(gfc_array_r4 * const restrict retarray,gfc_array_r4 * const restrict a,gfc_array_r4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2374 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2375 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2376 	int blas_limit, blas_call gemm)
2377 {
2378   static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2379 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2380 	int blas_limit, blas_call gemm);
2381 
2382   void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2383 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2384 	int blas_limit, blas_call gemm);
2385 
2386   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2387   if (matmul_fn == NULL)
2388     {
2389       matmul_fn = matmul_r4_vanilla;
2390       if (__builtin_cpu_is ("intel"))
2391 	{
2392           /* Run down the available processors in order of preference.  */
2393 #ifdef HAVE_AVX512F
2394 	  if (__builtin_cpu_supports ("avx512f"))
2395 	    {
2396 	      matmul_fn = matmul_r4_avx512f;
2397 	      goto store;
2398 	    }
2399 
2400 #endif  /* HAVE_AVX512F */
2401 
2402 #ifdef HAVE_AVX2
2403 	  if (__builtin_cpu_supports ("avx2")
2404 	      && __builtin_cpu_supports ("fma"))
2405 	    {
2406 	      matmul_fn = matmul_r4_avx2;
2407 	      goto store;
2408 	    }
2409 
2410 #endif
2411 
2412 #ifdef HAVE_AVX
2413 	  if (__builtin_cpu_supports ("avx"))
2414  	    {
2415               matmul_fn = matmul_r4_avx;
2416 	      goto store;
2417 	    }
2418 #endif  /* HAVE_AVX */
2419         }
2420     else if (__builtin_cpu_is ("amd"))
2421       {
2422 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2423 	if (__builtin_cpu_supports ("avx")
2424 	    && __builtin_cpu_supports ("fma"))
2425 	  {
2426             matmul_fn = matmul_r4_avx128_fma3;
2427 	    goto store;
2428 	  }
2429 #endif
2430 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2431 	if (__builtin_cpu_supports ("avx")
2432 	    && __builtin_cpu_supports ("fma4"))
2433 	  {
2434             matmul_fn = matmul_r4_avx128_fma4;
2435 	    goto store;
2436 	  }
2437 #endif
2438 
2439       }
2440    store:
2441       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2442    }
2443 
2444    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2445 }
2446 
2447 #else  /* Just the vanilla function.  */
2448 
2449 void
matmul_r4(gfc_array_r4 * const restrict retarray,gfc_array_r4 * const restrict a,gfc_array_r4 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2450 matmul_r4 (gfc_array_r4 * const restrict retarray,
2451 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2452 	int blas_limit, blas_call gemm)
2453 {
2454   const GFC_REAL_4 * restrict abase;
2455   const GFC_REAL_4 * restrict bbase;
2456   GFC_REAL_4 * restrict dest;
2457 
2458   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2459   index_type x, y, n, count, xcount, ycount;
2460 
2461   assert (GFC_DESCRIPTOR_RANK (a) == 2
2462           || GFC_DESCRIPTOR_RANK (b) == 2);
2463 
2464 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2465 
2466    Either A or B (but not both) can be rank 1:
2467 
2468    o One-dimensional argument A is implicitly treated as a row matrix
2469      dimensioned [1,count], so xcount=1.
2470 
2471    o One-dimensional argument B is implicitly treated as a column matrix
2472      dimensioned [count, 1], so ycount=1.
2473 */
2474 
2475   if (retarray->base_addr == NULL)
2476     {
2477       if (GFC_DESCRIPTOR_RANK (a) == 1)
2478         {
2479 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2480 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2481         }
2482       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2483         {
2484 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2485 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2486         }
2487       else
2488         {
2489 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2490 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2491 
2492           GFC_DIMENSION_SET(retarray->dim[1], 0,
2493 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2494 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2495         }
2496 
2497       retarray->base_addr
2498 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2499       retarray->offset = 0;
2500     }
2501   else if (unlikely (compile_options.bounds_check))
2502     {
2503       index_type ret_extent, arg_extent;
2504 
2505       if (GFC_DESCRIPTOR_RANK (a) == 1)
2506 	{
2507 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2508 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2509 	  if (arg_extent != ret_extent)
2510 	    runtime_error ("Array bound mismatch for dimension 1 of "
2511 	    		   "array (%ld/%ld) ",
2512 			   (long int) ret_extent, (long int) arg_extent);
2513 	}
2514       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2515 	{
2516 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2517 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2518 	  if (arg_extent != ret_extent)
2519 	    runtime_error ("Array bound mismatch for dimension 1 of "
2520 	    		   "array (%ld/%ld) ",
2521 			   (long int) ret_extent, (long int) arg_extent);
2522 	}
2523       else
2524 	{
2525 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2526 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2527 	  if (arg_extent != ret_extent)
2528 	    runtime_error ("Array bound mismatch for dimension 1 of "
2529 	    		   "array (%ld/%ld) ",
2530 			   (long int) ret_extent, (long int) arg_extent);
2531 
2532 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2533 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2534 	  if (arg_extent != ret_extent)
2535 	    runtime_error ("Array bound mismatch for dimension 2 of "
2536 	    		   "array (%ld/%ld) ",
2537 			   (long int) ret_extent, (long int) arg_extent);
2538 	}
2539     }
2540 
2541 
2542   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2543     {
2544       /* One-dimensional result may be addressed in the code below
2545 	 either as a row or a column matrix. We want both cases to
2546 	 work. */
2547       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2548     }
2549   else
2550     {
2551       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2552       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2553     }
2554 
2555 
2556   if (GFC_DESCRIPTOR_RANK (a) == 1)
2557     {
2558       /* Treat it as a a row matrix A[1,count]. */
2559       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2560       aystride = 1;
2561 
2562       xcount = 1;
2563       count = GFC_DESCRIPTOR_EXTENT(a,0);
2564     }
2565   else
2566     {
2567       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2568       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2569 
2570       count = GFC_DESCRIPTOR_EXTENT(a,1);
2571       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2572     }
2573 
2574   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2575     {
2576       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2577 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2578 		       "in dimension 1: is %ld, should be %ld",
2579 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2580     }
2581 
2582   if (GFC_DESCRIPTOR_RANK (b) == 1)
2583     {
2584       /* Treat it as a column matrix B[count,1] */
2585       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2586 
2587       /* bystride should never be used for 1-dimensional b.
2588          The value is only used for calculation of the
2589          memory by the buffer.  */
2590       bystride = 256;
2591       ycount = 1;
2592     }
2593   else
2594     {
2595       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2596       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2597       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2598     }
2599 
2600   abase = a->base_addr;
2601   bbase = b->base_addr;
2602   dest = retarray->base_addr;
2603 
2604   /* Now that everything is set up, we perform the multiplication
2605      itself.  */
2606 
2607 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2608 #define min(a,b) ((a) <= (b) ? (a) : (b))
2609 #define max(a,b) ((a) >= (b) ? (a) : (b))
2610 
2611   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2612       && (bxstride == 1 || bystride == 1)
2613       && (((float) xcount) * ((float) ycount) * ((float) count)
2614           > POW3(blas_limit)))
2615     {
2616       const int m = xcount, n = ycount, k = count, ldc = rystride;
2617       const GFC_REAL_4 one = 1, zero = 0;
2618       const int lda = (axstride == 1) ? aystride : axstride,
2619 		ldb = (bxstride == 1) ? bystride : bxstride;
2620 
2621       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2622 	{
2623 	  assert (gemm != NULL);
2624 	  const char *transa, *transb;
2625 	  if (try_blas & 2)
2626 	    transa = "C";
2627 	  else
2628 	    transa = axstride == 1 ? "N" : "T";
2629 
2630 	  if (try_blas & 4)
2631 	    transb = "C";
2632 	  else
2633 	    transb = bxstride == 1 ? "N" : "T";
2634 
2635 	  gemm (transa, transb , &m,
2636 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2637 		&ldc, 1, 1);
2638 	  return;
2639 	}
2640     }
2641 
2642   if (rxstride == 1 && axstride == 1 && bxstride == 1
2643       && GFC_DESCRIPTOR_RANK (b) != 1)
2644     {
2645       /* This block of code implements a tuned matmul, derived from
2646          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2647 
2648                Bo Kagstrom and Per Ling
2649                Department of Computing Science
2650                Umea University
2651                S-901 87 Umea, Sweden
2652 
2653 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2654 
2655       const GFC_REAL_4 *a, *b;
2656       GFC_REAL_4 *c;
2657       const index_type m = xcount, n = ycount, k = count;
2658 
2659       /* System generated locals */
2660       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2661 		 i1, i2, i3, i4, i5, i6;
2662 
2663       /* Local variables */
2664       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2665 		 f13, f14, f23, f24, f33, f34, f43, f44;
2666       index_type i, j, l, ii, jj, ll;
2667       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2668       GFC_REAL_4 *t1;
2669 
2670       a = abase;
2671       b = bbase;
2672       c = retarray->base_addr;
2673 
2674       /* Parameter adjustments */
2675       c_dim1 = rystride;
2676       c_offset = 1 + c_dim1;
2677       c -= c_offset;
2678       a_dim1 = aystride;
2679       a_offset = 1 + a_dim1;
2680       a -= a_offset;
2681       b_dim1 = bystride;
2682       b_offset = 1 + b_dim1;
2683       b -= b_offset;
2684 
2685       /* Empty c first.  */
2686       for (j=1; j<=n; j++)
2687 	for (i=1; i<=m; i++)
2688 	  c[i + j * c_dim1] = (GFC_REAL_4)0;
2689 
2690       /* Early exit if possible */
2691       if (m == 0 || n == 0 || k == 0)
2692 	return;
2693 
2694       /* Adjust size of t1 to what is needed.  */
2695       index_type t1_dim, a_sz;
2696       if (aystride == 1)
2697         a_sz = rystride;
2698       else
2699         a_sz = a_dim1;
2700 
2701       t1_dim = a_sz * 256 + b_dim1;
2702       if (t1_dim > 65536)
2703 	t1_dim = 65536;
2704 
2705       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
2706 
2707       /* Start turning the crank. */
2708       i1 = n;
2709       for (jj = 1; jj <= i1; jj += 512)
2710 	{
2711 	  /* Computing MIN */
2712 	  i2 = 512;
2713 	  i3 = n - jj + 1;
2714 	  jsec = min(i2,i3);
2715 	  ujsec = jsec - jsec % 4;
2716 	  i2 = k;
2717 	  for (ll = 1; ll <= i2; ll += 256)
2718 	    {
2719 	      /* Computing MIN */
2720 	      i3 = 256;
2721 	      i4 = k - ll + 1;
2722 	      lsec = min(i3,i4);
2723 	      ulsec = lsec - lsec % 2;
2724 
2725 	      i3 = m;
2726 	      for (ii = 1; ii <= i3; ii += 256)
2727 		{
2728 		  /* Computing MIN */
2729 		  i4 = 256;
2730 		  i5 = m - ii + 1;
2731 		  isec = min(i4,i5);
2732 		  uisec = isec - isec % 2;
2733 		  i4 = ll + ulsec - 1;
2734 		  for (l = ll; l <= i4; l += 2)
2735 		    {
2736 		      i5 = ii + uisec - 1;
2737 		      for (i = ii; i <= i5; i += 2)
2738 			{
2739 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2740 					a[i + l * a_dim1];
2741 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2742 					a[i + (l + 1) * a_dim1];
2743 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2744 					a[i + 1 + l * a_dim1];
2745 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2746 					a[i + 1 + (l + 1) * a_dim1];
2747 			}
2748 		      if (uisec < isec)
2749 			{
2750 			  t1[l - ll + 1 + (isec << 8) - 257] =
2751 				    a[ii + isec - 1 + l * a_dim1];
2752 			  t1[l - ll + 2 + (isec << 8) - 257] =
2753 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2754 			}
2755 		    }
2756 		  if (ulsec < lsec)
2757 		    {
2758 		      i4 = ii + isec - 1;
2759 		      for (i = ii; i<= i4; ++i)
2760 			{
2761 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2762 				    a[i + (ll + lsec - 1) * a_dim1];
2763 			}
2764 		    }
2765 
2766 		  uisec = isec - isec % 4;
2767 		  i4 = jj + ujsec - 1;
2768 		  for (j = jj; j <= i4; j += 4)
2769 		    {
2770 		      i5 = ii + uisec - 1;
2771 		      for (i = ii; i <= i5; i += 4)
2772 			{
2773 			  f11 = c[i + j * c_dim1];
2774 			  f21 = c[i + 1 + j * c_dim1];
2775 			  f12 = c[i + (j + 1) * c_dim1];
2776 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2777 			  f13 = c[i + (j + 2) * c_dim1];
2778 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2779 			  f14 = c[i + (j + 3) * c_dim1];
2780 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2781 			  f31 = c[i + 2 + j * c_dim1];
2782 			  f41 = c[i + 3 + j * c_dim1];
2783 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2784 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2785 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2786 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2787 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2788 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2789 			  i6 = ll + lsec - 1;
2790 			  for (l = ll; l <= i6; ++l)
2791 			    {
2792 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2793 				      * b[l + j * b_dim1];
2794 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2795 				      * b[l + j * b_dim1];
2796 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2797 				      * b[l + (j + 1) * b_dim1];
2798 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2799 				      * b[l + (j + 1) * b_dim1];
2800 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2801 				      * b[l + (j + 2) * b_dim1];
2802 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2803 				      * b[l + (j + 2) * b_dim1];
2804 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2805 				      * b[l + (j + 3) * b_dim1];
2806 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2807 				      * b[l + (j + 3) * b_dim1];
2808 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2809 				      * b[l + j * b_dim1];
2810 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2811 				      * b[l + j * b_dim1];
2812 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2813 				      * b[l + (j + 1) * b_dim1];
2814 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2815 				      * b[l + (j + 1) * b_dim1];
2816 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2817 				      * b[l + (j + 2) * b_dim1];
2818 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2819 				      * b[l + (j + 2) * b_dim1];
2820 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2821 				      * b[l + (j + 3) * b_dim1];
2822 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2823 				      * b[l + (j + 3) * b_dim1];
2824 			    }
2825 			  c[i + j * c_dim1] = f11;
2826 			  c[i + 1 + j * c_dim1] = f21;
2827 			  c[i + (j + 1) * c_dim1] = f12;
2828 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2829 			  c[i + (j + 2) * c_dim1] = f13;
2830 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2831 			  c[i + (j + 3) * c_dim1] = f14;
2832 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2833 			  c[i + 2 + j * c_dim1] = f31;
2834 			  c[i + 3 + j * c_dim1] = f41;
2835 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2836 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2837 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2838 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2839 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2840 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2841 			}
2842 		      if (uisec < isec)
2843 			{
2844 			  i5 = ii + isec - 1;
2845 			  for (i = ii + uisec; i <= i5; ++i)
2846 			    {
2847 			      f11 = c[i + j * c_dim1];
2848 			      f12 = c[i + (j + 1) * c_dim1];
2849 			      f13 = c[i + (j + 2) * c_dim1];
2850 			      f14 = c[i + (j + 3) * c_dim1];
2851 			      i6 = ll + lsec - 1;
2852 			      for (l = ll; l <= i6; ++l)
2853 				{
2854 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2855 					  257] * b[l + j * b_dim1];
2856 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2857 					  257] * b[l + (j + 1) * b_dim1];
2858 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2859 					  257] * b[l + (j + 2) * b_dim1];
2860 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2861 					  257] * b[l + (j + 3) * b_dim1];
2862 				}
2863 			      c[i + j * c_dim1] = f11;
2864 			      c[i + (j + 1) * c_dim1] = f12;
2865 			      c[i + (j + 2) * c_dim1] = f13;
2866 			      c[i + (j + 3) * c_dim1] = f14;
2867 			    }
2868 			}
2869 		    }
2870 		  if (ujsec < jsec)
2871 		    {
2872 		      i4 = jj + jsec - 1;
2873 		      for (j = jj + ujsec; j <= i4; ++j)
2874 			{
2875 			  i5 = ii + uisec - 1;
2876 			  for (i = ii; i <= i5; i += 4)
2877 			    {
2878 			      f11 = c[i + j * c_dim1];
2879 			      f21 = c[i + 1 + j * c_dim1];
2880 			      f31 = c[i + 2 + j * c_dim1];
2881 			      f41 = c[i + 3 + j * c_dim1];
2882 			      i6 = ll + lsec - 1;
2883 			      for (l = ll; l <= i6; ++l)
2884 				{
2885 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2886 					  257] * b[l + j * b_dim1];
2887 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2888 					  257] * b[l + j * b_dim1];
2889 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2890 					  257] * b[l + j * b_dim1];
2891 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2892 					  257] * b[l + j * b_dim1];
2893 				}
2894 			      c[i + j * c_dim1] = f11;
2895 			      c[i + 1 + j * c_dim1] = f21;
2896 			      c[i + 2 + j * c_dim1] = f31;
2897 			      c[i + 3 + j * c_dim1] = f41;
2898 			    }
2899 			  i5 = ii + isec - 1;
2900 			  for (i = ii + uisec; i <= i5; ++i)
2901 			    {
2902 			      f11 = c[i + j * c_dim1];
2903 			      i6 = ll + lsec - 1;
2904 			      for (l = ll; l <= i6; ++l)
2905 				{
2906 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2907 					  257] * b[l + j * b_dim1];
2908 				}
2909 			      c[i + j * c_dim1] = f11;
2910 			    }
2911 			}
2912 		    }
2913 		}
2914 	    }
2915 	}
2916       free(t1);
2917       return;
2918     }
2919   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2920     {
2921       if (GFC_DESCRIPTOR_RANK (a) != 1)
2922 	{
2923 	  const GFC_REAL_4 *restrict abase_x;
2924 	  const GFC_REAL_4 *restrict bbase_y;
2925 	  GFC_REAL_4 *restrict dest_y;
2926 	  GFC_REAL_4 s;
2927 
2928 	  for (y = 0; y < ycount; y++)
2929 	    {
2930 	      bbase_y = &bbase[y*bystride];
2931 	      dest_y = &dest[y*rystride];
2932 	      for (x = 0; x < xcount; x++)
2933 		{
2934 		  abase_x = &abase[x*axstride];
2935 		  s = (GFC_REAL_4) 0;
2936 		  for (n = 0; n < count; n++)
2937 		    s += abase_x[n] * bbase_y[n];
2938 		  dest_y[x] = s;
2939 		}
2940 	    }
2941 	}
2942       else
2943 	{
2944 	  const GFC_REAL_4 *restrict bbase_y;
2945 	  GFC_REAL_4 s;
2946 
2947 	  for (y = 0; y < ycount; y++)
2948 	    {
2949 	      bbase_y = &bbase[y*bystride];
2950 	      s = (GFC_REAL_4) 0;
2951 	      for (n = 0; n < count; n++)
2952 		s += abase[n*axstride] * bbase_y[n];
2953 	      dest[y*rystride] = s;
2954 	    }
2955 	}
2956     }
2957   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2958     {
2959       const GFC_REAL_4 *restrict bbase_y;
2960       GFC_REAL_4 s;
2961 
2962       for (y = 0; y < ycount; y++)
2963 	{
2964 	  bbase_y = &bbase[y*bystride];
2965 	  s = (GFC_REAL_4) 0;
2966 	  for (n = 0; n < count; n++)
2967 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2968 	  dest[y*rxstride] = s;
2969 	}
2970     }
2971   else if (axstride < aystride)
2972     {
2973       for (y = 0; y < ycount; y++)
2974 	for (x = 0; x < xcount; x++)
2975 	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2976 
2977       for (y = 0; y < ycount; y++)
2978 	for (n = 0; n < count; n++)
2979 	  for (x = 0; x < xcount; x++)
2980 	    /* dest[x,y] += a[x,n] * b[n,y] */
2981 	    dest[x*rxstride + y*rystride] +=
2982 					abase[x*axstride + n*aystride] *
2983 					bbase[n*bxstride + y*bystride];
2984     }
2985   else
2986     {
2987       const GFC_REAL_4 *restrict abase_x;
2988       const GFC_REAL_4 *restrict bbase_y;
2989       GFC_REAL_4 *restrict dest_y;
2990       GFC_REAL_4 s;
2991 
2992       for (y = 0; y < ycount; y++)
2993 	{
2994 	  bbase_y = &bbase[y*bystride];
2995 	  dest_y = &dest[y*rystride];
2996 	  for (x = 0; x < xcount; x++)
2997 	    {
2998 	      abase_x = &abase[x*axstride];
2999 	      s = (GFC_REAL_4) 0;
3000 	      for (n = 0; n < count; n++)
3001 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
3002 	      dest_y[x*rxstride] = s;
3003 	    }
3004 	}
3005     }
3006 }
3007 #undef POW3
3008 #undef min
3009 #undef max
3010 
3011 #endif
3012 #endif
3013 
3014