1 /* Implementation of the MATMUL intrinsic
2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
3    Contributed by Paul Brook <paul@nowt.org>
4 
5 This file is part of the GNU Fortran runtime library (libgfortran).
6 
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11 
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20 
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24 <http://www.gnu.org/licenses/>.  */
25 
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29 
30 
31 #if defined (HAVE_GFC_REAL_16)
32 
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34    passed to us by the front-end, in which case we call it for large
35    matrices.  */
36 
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38                           const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
39                           const int *, const GFC_REAL_16 *, const int *,
40                           const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
41                           int, int);
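/* The arguments correspond to the standard GEMM interface
   (TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC);
   the two trailing ints are the hidden character-length arguments the
   Fortran calling convention adds for TRANSA and TRANSB.  */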
42 
43 /* The order of loops is different in the case of plain matrix
44    multiplication C=MATMUL(A,B), and in the frequent special case where
45    the argument A is the temporary result of a TRANSPOSE intrinsic:
46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47    looking at their strides.
48 
49    The equivalent Fortran pseudo-code is:
50 
51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52    IF (.NOT.IS_TRANSPOSED(A)) THEN
53      C = 0
54      DO J=1,N
55        DO K=1,COUNT
56          DO I=1,M
57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
58    ELSE
59      DO J=1,N
60        DO I=1,M
61          S = 0
62          DO K=1,COUNT
63            S = S+A(I,K)*B(K,J)
64          C(I,J) = S
65    ENDIF
66 */
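
/* For illustration only (kept inside #if 0, so it is never compiled):
   a minimal sketch of the two loop orders described above, assuming
   both operands are contiguous and column-major.  The function name
   and layout assumptions are hypothetical and are not used elsewhere
   in this file.  */
#if 0
static void
sketch_matmul (GFC_REAL_16 *c, const GFC_REAL_16 *a, const GFC_REAL_16 *b,
               int m, int n, int count, int a_is_transposed_temp)
{
  int i, j, k;
  GFC_REAL_16 s;

  if (!a_is_transposed_temp)
    {
      /* Plain case: accumulate into C with the row index innermost,
         so A and C are accessed with stride one.  */
      for (j = 0; j < n; j++)
        for (i = 0; i < m; i++)
          c[i + j * m] = 0;
      for (j = 0; j < n; j++)
        for (k = 0; k < count; k++)
          for (i = 0; i < m; i++)
            c[i + j * m] += a[i + k * m] * b[k + j * count];
    }
  else
    {
      /* Transposed temporary: the k index of A is contiguous in memory,
         so a dot product over k reads A with stride one.  */
      for (j = 0; j < n; j++)
        for (i = 0; i < m; i++)
          {
            s = 0;
            for (k = 0; k < count; k++)
              s += a[k + i * count] * b[k + j * count];
            c[i + j * m] = s;
          }
    }
}
#endif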
67 
68 /* If try_blas is set to a nonzero value, then the matmul function will
69    see if there is a way to perform the matrix multiplication by a call
70    to the BLAS gemm function.  */
71 
72 extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
73 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
74 	int blas_limit, blas_call gemm);
75 export_proto(matmul_r16);
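
/* Note: the front end passes try_blas != 0 together with a non-NULL gemm
   pointer when the program is compiled with -fexternal-blas; blas_limit
   then comes from -fblas-matmul-limit and gives the matrix size above
   which the external *GEMM routine is called instead of the inline code.  */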
76 
77 /* Put the exhaustive list of possible architectures here, ORed together.  */
78 
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80 
81 #ifdef HAVE_AVX
82 static void
83 matmul_r16_avx (gfc_array_r16 * const restrict retarray,
84 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
85 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_r16_avx (gfc_array_r16 * const restrict retarray,
88 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
89 	int blas_limit, blas_call gemm)
90 {
91   const GFC_REAL_16 * restrict abase;
92   const GFC_REAL_16 * restrict bbase;
93   GFC_REAL_16 * restrict dest;
94 
95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96   index_type x, y, n, count, xcount, ycount;
97 
98   assert (GFC_DESCRIPTOR_RANK (a) == 2
99           || GFC_DESCRIPTOR_RANK (b) == 2);
100 
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102 
103    Either A or B (but not both) can be rank 1:
104 
105    o One-dimensional argument A is implicitly treated as a row matrix
106      dimensioned [1,count], so xcount=1.
107 
108    o One-dimensional argument B is implicitly treated as a column matrix
109      dimensioned [count, 1], so ycount=1.
110 */
111 
112   if (retarray->base_addr == NULL)
113     {
114       if (GFC_DESCRIPTOR_RANK (a) == 1)
115         {
116 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118         }
119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
120         {
121 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123         }
124       else
125         {
126 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128 
129           GFC_DIMENSION_SET(retarray->dim[1], 0,
130 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132         }
133 
134       retarray->base_addr
135 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
136       retarray->offset = 0;
137     }
138   else if (unlikely (compile_options.bounds_check))
139     {
140       index_type ret_extent, arg_extent;
141 
142       if (GFC_DESCRIPTOR_RANK (a) == 1)
143 	{
144 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 	  if (arg_extent != ret_extent)
147 	    runtime_error ("Incorrect extent in return array in"
148 			   " MATMUL intrinsic: is %ld, should be %ld",
149 			   (long int) ret_extent, (long int) arg_extent);
150 	}
151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 	{
153 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 	  if (arg_extent != ret_extent)
156 	    runtime_error ("Incorrect extent in return array in"
157 			   " MATMUL intrinsic: is %ld, should be %ld",
158 			   (long int) ret_extent, (long int) arg_extent);
159 	}
160       else
161 	{
162 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 	  if (arg_extent != ret_extent)
165 	    runtime_error ("Incorrect extent in return array in"
166 			   " MATMUL intrinsic for dimension 1:"
167 			   " is %ld, should be %ld",
168 			   (long int) ret_extent, (long int) arg_extent);
169 
170 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 	  if (arg_extent != ret_extent)
173 	    runtime_error ("Incorrect extent in return array in"
174 			   " MATMUL intrinsic for dimension 2:"
175 			   " is %ld, should be %ld",
176 			   (long int) ret_extent, (long int) arg_extent);
177 	}
178     }
179 
180 
181   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182     {
183       /* One-dimensional result may be addressed in the code below
184 	 either as a row or a column matrix. We want both cases to
185 	 work. */
186       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187     }
188   else
189     {
190       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192     }
193 
194 
195   if (GFC_DESCRIPTOR_RANK (a) == 1)
196     {
197       /* Treat it as a row matrix A[1,count]. */
198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199       aystride = 1;
200 
201       xcount = 1;
202       count = GFC_DESCRIPTOR_EXTENT(a,0);
203     }
204   else
205     {
206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208 
209       count = GFC_DESCRIPTOR_EXTENT(a,1);
210       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211     }
212 
213   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214     {
215       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217     }
218 
219   if (GFC_DESCRIPTOR_RANK (b) == 1)
220     {
221       /* Treat it as a column matrix B[count,1] */
222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223 
224       /* bystride should never be used for 1-dimensional b.
225          The value is only used for calculating the memory
226          needed for the buffer.  */
227       bystride = 256;
228       ycount = 1;
229     }
230   else
231     {
232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235     }
236 
237   abase = a->base_addr;
238   bbase = b->base_addr;
239   dest = retarray->base_addr;
240 
241   /* Now that everything is set up, we perform the multiplication
242      itself.  */
243 
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247 
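  /* Illustrative numbers only: with blas_limit == 30, POW3(blas_limit)
     is 27000, so a 40x40 by 40x40 product (40*40*40 == 64000) is handed
     to gemm below, while a 20x20 by 20x20 product (8000) is not.  */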
248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249       && (bxstride == 1 || bystride == 1)
250       && (((float) xcount) * ((float) ycount) * ((float) count)
251           > POW3(blas_limit)))
252     {
253       const int m = xcount, n = ycount, k = count, ldc = rystride;
254       const GFC_REAL_16 one = 1, zero = 0;
255       const int lda = (axstride == 1) ? aystride : axstride,
256 		ldb = (bxstride == 1) ? bystride : bxstride;
257 
258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 	{
260 	  assert (gemm != NULL);
261 	  gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
263 		&ldc, 1, 1);
264 	  return;
265 	}
266     }
267 
268   if (rxstride == 1 && axstride == 1 && bxstride == 1)
269     {
270       /* This block of code implements a tuned matmul, derived from
271          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
272 
273                Bo Kagstrom and Per Ling
274                Department of Computing Science
275                Umea University
276                S-901 87 Umea, Sweden
277 
278 	 from netlib.org, translated to C, and modified for matmul.m4.  */
279 
280       const GFC_REAL_16 *a, *b;
281       GFC_REAL_16 *c;
282       const index_type m = xcount, n = ycount, k = count;
283 
284       /* System generated locals */
285       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 		 i1, i2, i3, i4, i5, i6;
287 
288       /* Local variables */
289       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
290 		 f13, f14, f23, f24, f33, f34, f43, f44;
291       index_type i, j, l, ii, jj, ll;
292       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293       GFC_REAL_16 *t1;
294 
295       a = abase;
296       b = bbase;
297       c = retarray->base_addr;
298 
299       /* Parameter adjustments */
300       c_dim1 = rystride;
301       c_offset = 1 + c_dim1;
302       c -= c_offset;
303       a_dim1 = aystride;
304       a_offset = 1 + a_dim1;
305       a -= a_offset;
306       b_dim1 = bystride;
307       b_offset = 1 + b_dim1;
308       b -= b_offset;
309 
310       /* Empty c first.  */
311       for (j=1; j<=n; j++)
312 	for (i=1; i<=m; i++)
313 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
314 
315       /* Early exit if possible */
316       if (m == 0 || n == 0 || k == 0)
317 	return;
318 
319       /* Adjust size of t1 to what is needed.  */
320       index_type t1_dim, a_sz;
321       if (aystride == 1)
322         a_sz = rystride;
323       else
324         a_sz = a_dim1;
325 
326       t1_dim = a_sz * 256 + b_dim1;
327       if (t1_dim > 65536)
328 	t1_dim = 65536;
329 
330       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
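      /* Assumed reading of the blocking below: t1 caches the current
         (up to) 256 x 256 block of a in packed, transposed form (the k
         index runs fastest within each group of 256 entries), so the
         innermost loops read it with stride one.  */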
331 
332       /* Start turning the crank. */
333       i1 = n;
334       for (jj = 1; jj <= i1; jj += 512)
335 	{
336 	  /* Computing MIN */
337 	  i2 = 512;
338 	  i3 = n - jj + 1;
339 	  jsec = min(i2,i3);
340 	  ujsec = jsec - jsec % 4;
341 	  i2 = k;
342 	  for (ll = 1; ll <= i2; ll += 256)
343 	    {
344 	      /* Computing MIN */
345 	      i3 = 256;
346 	      i4 = k - ll + 1;
347 	      lsec = min(i3,i4);
348 	      ulsec = lsec - lsec % 2;
349 
350 	      i3 = m;
351 	      for (ii = 1; ii <= i3; ii += 256)
352 		{
353 		  /* Computing MIN */
354 		  i4 = 256;
355 		  i5 = m - ii + 1;
356 		  isec = min(i4,i5);
357 		  uisec = isec - isec % 2;
358 		  i4 = ll + ulsec - 1;
359 		  for (l = ll; l <= i4; l += 2)
360 		    {
361 		      i5 = ii + uisec - 1;
362 		      for (i = ii; i <= i5; i += 2)
363 			{
364 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
365 					a[i + l * a_dim1];
366 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
367 					a[i + (l + 1) * a_dim1];
368 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
369 					a[i + 1 + l * a_dim1];
370 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
371 					a[i + 1 + (l + 1) * a_dim1];
372 			}
373 		      if (uisec < isec)
374 			{
375 			  t1[l - ll + 1 + (isec << 8) - 257] =
376 				    a[ii + isec - 1 + l * a_dim1];
377 			  t1[l - ll + 2 + (isec << 8) - 257] =
378 				    a[ii + isec - 1 + (l + 1) * a_dim1];
379 			}
380 		    }
381 		  if (ulsec < lsec)
382 		    {
383 		      i4 = ii + isec - 1;
384 		      for (i = ii; i<= i4; ++i)
385 			{
386 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
387 				    a[i + (ll + lsec - 1) * a_dim1];
388 			}
389 		    }
390 
391 		  uisec = isec - isec % 4;
392 		  i4 = jj + ujsec - 1;
393 		  for (j = jj; j <= i4; j += 4)
394 		    {
395 		      i5 = ii + uisec - 1;
396 		      for (i = ii; i <= i5; i += 4)
397 			{
398 			  f11 = c[i + j * c_dim1];
399 			  f21 = c[i + 1 + j * c_dim1];
400 			  f12 = c[i + (j + 1) * c_dim1];
401 			  f22 = c[i + 1 + (j + 1) * c_dim1];
402 			  f13 = c[i + (j + 2) * c_dim1];
403 			  f23 = c[i + 1 + (j + 2) * c_dim1];
404 			  f14 = c[i + (j + 3) * c_dim1];
405 			  f24 = c[i + 1 + (j + 3) * c_dim1];
406 			  f31 = c[i + 2 + j * c_dim1];
407 			  f41 = c[i + 3 + j * c_dim1];
408 			  f32 = c[i + 2 + (j + 1) * c_dim1];
409 			  f42 = c[i + 3 + (j + 1) * c_dim1];
410 			  f33 = c[i + 2 + (j + 2) * c_dim1];
411 			  f43 = c[i + 3 + (j + 2) * c_dim1];
412 			  f34 = c[i + 2 + (j + 3) * c_dim1];
413 			  f44 = c[i + 3 + (j + 3) * c_dim1];
414 			  i6 = ll + lsec - 1;
415 			  for (l = ll; l <= i6; ++l)
416 			    {
417 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
418 				      * b[l + j * b_dim1];
419 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
420 				      * b[l + j * b_dim1];
421 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
422 				      * b[l + (j + 1) * b_dim1];
423 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
424 				      * b[l + (j + 1) * b_dim1];
425 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
426 				      * b[l + (j + 2) * b_dim1];
427 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
428 				      * b[l + (j + 2) * b_dim1];
429 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 				      * b[l + (j + 3) * b_dim1];
431 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 				      * b[l + (j + 3) * b_dim1];
433 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
434 				      * b[l + j * b_dim1];
435 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
436 				      * b[l + j * b_dim1];
437 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
438 				      * b[l + (j + 1) * b_dim1];
439 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
440 				      * b[l + (j + 1) * b_dim1];
441 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
442 				      * b[l + (j + 2) * b_dim1];
443 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
444 				      * b[l + (j + 2) * b_dim1];
445 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 				      * b[l + (j + 3) * b_dim1];
447 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 				      * b[l + (j + 3) * b_dim1];
449 			    }
450 			  c[i + j * c_dim1] = f11;
451 			  c[i + 1 + j * c_dim1] = f21;
452 			  c[i + (j + 1) * c_dim1] = f12;
453 			  c[i + 1 + (j + 1) * c_dim1] = f22;
454 			  c[i + (j + 2) * c_dim1] = f13;
455 			  c[i + 1 + (j + 2) * c_dim1] = f23;
456 			  c[i + (j + 3) * c_dim1] = f14;
457 			  c[i + 1 + (j + 3) * c_dim1] = f24;
458 			  c[i + 2 + j * c_dim1] = f31;
459 			  c[i + 3 + j * c_dim1] = f41;
460 			  c[i + 2 + (j + 1) * c_dim1] = f32;
461 			  c[i + 3 + (j + 1) * c_dim1] = f42;
462 			  c[i + 2 + (j + 2) * c_dim1] = f33;
463 			  c[i + 3 + (j + 2) * c_dim1] = f43;
464 			  c[i + 2 + (j + 3) * c_dim1] = f34;
465 			  c[i + 3 + (j + 3) * c_dim1] = f44;
466 			}
467 		      if (uisec < isec)
468 			{
469 			  i5 = ii + isec - 1;
470 			  for (i = ii + uisec; i <= i5; ++i)
471 			    {
472 			      f11 = c[i + j * c_dim1];
473 			      f12 = c[i + (j + 1) * c_dim1];
474 			      f13 = c[i + (j + 2) * c_dim1];
475 			      f14 = c[i + (j + 3) * c_dim1];
476 			      i6 = ll + lsec - 1;
477 			      for (l = ll; l <= i6; ++l)
478 				{
479 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 					  257] * b[l + j * b_dim1];
481 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
482 					  257] * b[l + (j + 1) * b_dim1];
483 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
484 					  257] * b[l + (j + 2) * b_dim1];
485 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
486 					  257] * b[l + (j + 3) * b_dim1];
487 				}
488 			      c[i + j * c_dim1] = f11;
489 			      c[i + (j + 1) * c_dim1] = f12;
490 			      c[i + (j + 2) * c_dim1] = f13;
491 			      c[i + (j + 3) * c_dim1] = f14;
492 			    }
493 			}
494 		    }
495 		  if (ujsec < jsec)
496 		    {
497 		      i4 = jj + jsec - 1;
498 		      for (j = jj + ujsec; j <= i4; ++j)
499 			{
500 			  i5 = ii + uisec - 1;
501 			  for (i = ii; i <= i5; i += 4)
502 			    {
503 			      f11 = c[i + j * c_dim1];
504 			      f21 = c[i + 1 + j * c_dim1];
505 			      f31 = c[i + 2 + j * c_dim1];
506 			      f41 = c[i + 3 + j * c_dim1];
507 			      i6 = ll + lsec - 1;
508 			      for (l = ll; l <= i6; ++l)
509 				{
510 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511 					  257] * b[l + j * b_dim1];
512 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
513 					  257] * b[l + j * b_dim1];
514 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
515 					  257] * b[l + j * b_dim1];
516 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
517 					  257] * b[l + j * b_dim1];
518 				}
519 			      c[i + j * c_dim1] = f11;
520 			      c[i + 1 + j * c_dim1] = f21;
521 			      c[i + 2 + j * c_dim1] = f31;
522 			      c[i + 3 + j * c_dim1] = f41;
523 			    }
524 			  i5 = ii + isec - 1;
525 			  for (i = ii + uisec; i <= i5; ++i)
526 			    {
527 			      f11 = c[i + j * c_dim1];
528 			      i6 = ll + lsec - 1;
529 			      for (l = ll; l <= i6; ++l)
530 				{
531 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
532 					  257] * b[l + j * b_dim1];
533 				}
534 			      c[i + j * c_dim1] = f11;
535 			    }
536 			}
537 		    }
538 		}
539 	    }
540 	}
541       free(t1);
542       return;
543     }
544   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
545     {
546       if (GFC_DESCRIPTOR_RANK (a) != 1)
547 	{
548 	  const GFC_REAL_16 *restrict abase_x;
549 	  const GFC_REAL_16 *restrict bbase_y;
550 	  GFC_REAL_16 *restrict dest_y;
551 	  GFC_REAL_16 s;
552 
553 	  for (y = 0; y < ycount; y++)
554 	    {
555 	      bbase_y = &bbase[y*bystride];
556 	      dest_y = &dest[y*rystride];
557 	      for (x = 0; x < xcount; x++)
558 		{
559 		  abase_x = &abase[x*axstride];
560 		  s = (GFC_REAL_16) 0;
561 		  for (n = 0; n < count; n++)
562 		    s += abase_x[n] * bbase_y[n];
563 		  dest_y[x] = s;
564 		}
565 	    }
566 	}
567       else
568 	{
569 	  const GFC_REAL_16 *restrict bbase_y;
570 	  GFC_REAL_16 s;
571 
572 	  for (y = 0; y < ycount; y++)
573 	    {
574 	      bbase_y = &bbase[y*bystride];
575 	      s = (GFC_REAL_16) 0;
576 	      for (n = 0; n < count; n++)
577 		s += abase[n*axstride] * bbase_y[n];
578 	      dest[y*rystride] = s;
579 	    }
580 	}
581     }
582   else if (axstride < aystride)
583     {
584       for (y = 0; y < ycount; y++)
585 	for (x = 0; x < xcount; x++)
586 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
587 
588       for (y = 0; y < ycount; y++)
589 	for (n = 0; n < count; n++)
590 	  for (x = 0; x < xcount; x++)
591 	    /* dest[x,y] += a[x,n] * b[n,y] */
592 	    dest[x*rxstride + y*rystride] +=
593 					abase[x*axstride + n*aystride] *
594 					bbase[n*bxstride + y*bystride];
595     }
596   else if (GFC_DESCRIPTOR_RANK (a) == 1)
597     {
598       const GFC_REAL_16 *restrict bbase_y;
599       GFC_REAL_16 s;
600 
601       for (y = 0; y < ycount; y++)
602 	{
603 	  bbase_y = &bbase[y*bystride];
604 	  s = (GFC_REAL_16) 0;
605 	  for (n = 0; n < count; n++)
606 	    s += abase[n*axstride] * bbase_y[n*bxstride];
607 	  dest[y*rxstride] = s;
608 	}
609     }
610   else
611     {
612       const GFC_REAL_16 *restrict abase_x;
613       const GFC_REAL_16 *restrict bbase_y;
614       GFC_REAL_16 *restrict dest_y;
615       GFC_REAL_16 s;
616 
617       for (y = 0; y < ycount; y++)
618 	{
619 	  bbase_y = &bbase[y*bystride];
620 	  dest_y = &dest[y*rystride];
621 	  for (x = 0; x < xcount; x++)
622 	    {
623 	      abase_x = &abase[x*axstride];
624 	      s = (GFC_REAL_16) 0;
625 	      for (n = 0; n < count; n++)
626 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
627 	      dest_y[x*rxstride] = s;
628 	    }
629 	}
630     }
631 }
632 #undef POW3
633 #undef min
634 #undef max
635 
636 #endif /* HAVE_AVX */
637 
638 #ifdef HAVE_AVX2
639 static void
640 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
641 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
642 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
643 static void
644 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
645 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
646 	int blas_limit, blas_call gemm)
647 {
648   const GFC_REAL_16 * restrict abase;
649   const GFC_REAL_16 * restrict bbase;
650   GFC_REAL_16 * restrict dest;
651 
652   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
653   index_type x, y, n, count, xcount, ycount;
654 
655   assert (GFC_DESCRIPTOR_RANK (a) == 2
656           || GFC_DESCRIPTOR_RANK (b) == 2);
657 
658 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
659 
660    Either A or B (but not both) can be rank 1:
661 
662    o One-dimensional argument A is implicitly treated as a row matrix
663      dimensioned [1,count], so xcount=1.
664 
665    o One-dimensional argument B is implicitly treated as a column matrix
666      dimensioned [count, 1], so ycount=1.
667 */
668 
669   if (retarray->base_addr == NULL)
670     {
671       if (GFC_DESCRIPTOR_RANK (a) == 1)
672         {
673 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
674 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
675         }
676       else if (GFC_DESCRIPTOR_RANK (b) == 1)
677         {
678 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
679 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
680         }
681       else
682         {
683 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
684 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
685 
686           GFC_DIMENSION_SET(retarray->dim[1], 0,
687 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
688 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
689         }
690 
691       retarray->base_addr
692 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
693       retarray->offset = 0;
694     }
695   else if (unlikely (compile_options.bounds_check))
696     {
697       index_type ret_extent, arg_extent;
698 
699       if (GFC_DESCRIPTOR_RANK (a) == 1)
700 	{
701 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
702 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
703 	  if (arg_extent != ret_extent)
704 	    runtime_error ("Incorrect extent in return array in"
705 			   " MATMUL intrinsic: is %ld, should be %ld",
706 			   (long int) ret_extent, (long int) arg_extent);
707 	}
708       else if (GFC_DESCRIPTOR_RANK (b) == 1)
709 	{
710 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
711 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
712 	  if (arg_extent != ret_extent)
713 	    runtime_error ("Incorrect extent in return array in"
714 			   " MATMUL intrinsic: is %ld, should be %ld",
715 			   (long int) ret_extent, (long int) arg_extent);
716 	}
717       else
718 	{
719 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
720 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
721 	  if (arg_extent != ret_extent)
722 	    runtime_error ("Incorrect extent in return array in"
723 			   " MATMUL intrinsic for dimension 1:"
724 			   " is %ld, should be %ld",
725 			   (long int) ret_extent, (long int) arg_extent);
726 
727 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
728 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
729 	  if (arg_extent != ret_extent)
730 	    runtime_error ("Incorrect extent in return array in"
731 			   " MATMUL intrinsic for dimension 2:"
732 			   " is %ld, should be %ld",
733 			   (long int) ret_extent, (long int) arg_extent);
734 	}
735     }
736 
737 
738   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
739     {
740       /* One-dimensional result may be addressed in the code below
741 	 either as a row or a column matrix. We want both cases to
742 	 work. */
743       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
744     }
745   else
746     {
747       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
748       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
749     }
750 
751 
752   if (GFC_DESCRIPTOR_RANK (a) == 1)
753     {
754       /* Treat it as a row matrix A[1,count]. */
755       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
756       aystride = 1;
757 
758       xcount = 1;
759       count = GFC_DESCRIPTOR_EXTENT(a,0);
760     }
761   else
762     {
763       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
764       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
765 
766       count = GFC_DESCRIPTOR_EXTENT(a,1);
767       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
768     }
769 
770   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
771     {
772       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
773 	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
774     }
775 
776   if (GFC_DESCRIPTOR_RANK (b) == 1)
777     {
778       /* Treat it as a column matrix B[count,1] */
779       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
780 
781       /* bystride should never be used for 1-dimensional b.
782          The value is only used for calculating the memory
783          needed for the buffer.  */
784       bystride = 256;
785       ycount = 1;
786     }
787   else
788     {
789       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
790       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
791       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
792     }
793 
794   abase = a->base_addr;
795   bbase = b->base_addr;
796   dest = retarray->base_addr;
797 
798   /* Now that everything is set up, we perform the multiplication
799      itself.  */
800 
801 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
802 #define min(a,b) ((a) <= (b) ? (a) : (b))
803 #define max(a,b) ((a) >= (b) ? (a) : (b))
804 
805   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
806       && (bxstride == 1 || bystride == 1)
807       && (((float) xcount) * ((float) ycount) * ((float) count)
808           > POW3(blas_limit)))
809     {
810       const int m = xcount, n = ycount, k = count, ldc = rystride;
811       const GFC_REAL_16 one = 1, zero = 0;
812       const int lda = (axstride == 1) ? aystride : axstride,
813 		ldb = (bxstride == 1) ? bystride : bxstride;
814 
815       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
816 	{
817 	  assert (gemm != NULL);
818 	  gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
819 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
820 		&ldc, 1, 1);
821 	  return;
822 	}
823     }
824 
825   if (rxstride == 1 && axstride == 1 && bxstride == 1)
826     {
827       /* This block of code implements a tuned matmul, derived from
828          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
829 
830                Bo Kagstrom and Per Ling
831                Department of Computing Science
832                Umea University
833                S-901 87 Umea, Sweden
834 
835 	 from netlib.org, translated to C, and modified for matmul.m4.  */
836 
837       const GFC_REAL_16 *a, *b;
838       GFC_REAL_16 *c;
839       const index_type m = xcount, n = ycount, k = count;
840 
841       /* System generated locals */
842       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
843 		 i1, i2, i3, i4, i5, i6;
844 
845       /* Local variables */
846       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
847 		 f13, f14, f23, f24, f33, f34, f43, f44;
848       index_type i, j, l, ii, jj, ll;
849       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
850       GFC_REAL_16 *t1;
851 
852       a = abase;
853       b = bbase;
854       c = retarray->base_addr;
855 
856       /* Parameter adjustments */
857       c_dim1 = rystride;
858       c_offset = 1 + c_dim1;
859       c -= c_offset;
860       a_dim1 = aystride;
861       a_offset = 1 + a_dim1;
862       a -= a_offset;
863       b_dim1 = bystride;
864       b_offset = 1 + b_dim1;
865       b -= b_offset;
866 
867       /* Empty c first.  */
868       for (j=1; j<=n; j++)
869 	for (i=1; i<=m; i++)
870 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
871 
872       /* Early exit if possible */
873       if (m == 0 || n == 0 || k == 0)
874 	return;
875 
876       /* Adjust size of t1 to what is needed.  */
877       index_type t1_dim, a_sz;
878       if (aystride == 1)
879         a_sz = rystride;
880       else
881         a_sz = a_dim1;
882 
883       t1_dim = a_sz * 256 + b_dim1;
884       if (t1_dim > 65536)
885 	t1_dim = 65536;
886 
887       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
888 
889       /* Start turning the crank. */
890       i1 = n;
891       for (jj = 1; jj <= i1; jj += 512)
892 	{
893 	  /* Computing MIN */
894 	  i2 = 512;
895 	  i3 = n - jj + 1;
896 	  jsec = min(i2,i3);
897 	  ujsec = jsec - jsec % 4;
898 	  i2 = k;
899 	  for (ll = 1; ll <= i2; ll += 256)
900 	    {
901 	      /* Computing MIN */
902 	      i3 = 256;
903 	      i4 = k - ll + 1;
904 	      lsec = min(i3,i4);
905 	      ulsec = lsec - lsec % 2;
906 
907 	      i3 = m;
908 	      for (ii = 1; ii <= i3; ii += 256)
909 		{
910 		  /* Computing MIN */
911 		  i4 = 256;
912 		  i5 = m - ii + 1;
913 		  isec = min(i4,i5);
914 		  uisec = isec - isec % 2;
915 		  i4 = ll + ulsec - 1;
916 		  for (l = ll; l <= i4; l += 2)
917 		    {
918 		      i5 = ii + uisec - 1;
919 		      for (i = ii; i <= i5; i += 2)
920 			{
921 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
922 					a[i + l * a_dim1];
923 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
924 					a[i + (l + 1) * a_dim1];
925 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
926 					a[i + 1 + l * a_dim1];
927 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
928 					a[i + 1 + (l + 1) * a_dim1];
929 			}
930 		      if (uisec < isec)
931 			{
932 			  t1[l - ll + 1 + (isec << 8) - 257] =
933 				    a[ii + isec - 1 + l * a_dim1];
934 			  t1[l - ll + 2 + (isec << 8) - 257] =
935 				    a[ii + isec - 1 + (l + 1) * a_dim1];
936 			}
937 		    }
938 		  if (ulsec < lsec)
939 		    {
940 		      i4 = ii + isec - 1;
941 		      for (i = ii; i<= i4; ++i)
942 			{
943 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
944 				    a[i + (ll + lsec - 1) * a_dim1];
945 			}
946 		    }
947 
948 		  uisec = isec - isec % 4;
949 		  i4 = jj + ujsec - 1;
950 		  for (j = jj; j <= i4; j += 4)
951 		    {
952 		      i5 = ii + uisec - 1;
953 		      for (i = ii; i <= i5; i += 4)
954 			{
955 			  f11 = c[i + j * c_dim1];
956 			  f21 = c[i + 1 + j * c_dim1];
957 			  f12 = c[i + (j + 1) * c_dim1];
958 			  f22 = c[i + 1 + (j + 1) * c_dim1];
959 			  f13 = c[i + (j + 2) * c_dim1];
960 			  f23 = c[i + 1 + (j + 2) * c_dim1];
961 			  f14 = c[i + (j + 3) * c_dim1];
962 			  f24 = c[i + 1 + (j + 3) * c_dim1];
963 			  f31 = c[i + 2 + j * c_dim1];
964 			  f41 = c[i + 3 + j * c_dim1];
965 			  f32 = c[i + 2 + (j + 1) * c_dim1];
966 			  f42 = c[i + 3 + (j + 1) * c_dim1];
967 			  f33 = c[i + 2 + (j + 2) * c_dim1];
968 			  f43 = c[i + 3 + (j + 2) * c_dim1];
969 			  f34 = c[i + 2 + (j + 3) * c_dim1];
970 			  f44 = c[i + 3 + (j + 3) * c_dim1];
971 			  i6 = ll + lsec - 1;
972 			  for (l = ll; l <= i6; ++l)
973 			    {
974 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 				      * b[l + j * b_dim1];
976 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 				      * b[l + j * b_dim1];
978 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
979 				      * b[l + (j + 1) * b_dim1];
980 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
981 				      * b[l + (j + 1) * b_dim1];
982 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
983 				      * b[l + (j + 2) * b_dim1];
984 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
985 				      * b[l + (j + 2) * b_dim1];
986 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
987 				      * b[l + (j + 3) * b_dim1];
988 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
989 				      * b[l + (j + 3) * b_dim1];
990 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 				      * b[l + j * b_dim1];
992 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 				      * b[l + j * b_dim1];
994 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
995 				      * b[l + (j + 1) * b_dim1];
996 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
997 				      * b[l + (j + 1) * b_dim1];
998 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
999 				      * b[l + (j + 2) * b_dim1];
1000 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1001 				      * b[l + (j + 2) * b_dim1];
1002 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1003 				      * b[l + (j + 3) * b_dim1];
1004 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1005 				      * b[l + (j + 3) * b_dim1];
1006 			    }
1007 			  c[i + j * c_dim1] = f11;
1008 			  c[i + 1 + j * c_dim1] = f21;
1009 			  c[i + (j + 1) * c_dim1] = f12;
1010 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1011 			  c[i + (j + 2) * c_dim1] = f13;
1012 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1013 			  c[i + (j + 3) * c_dim1] = f14;
1014 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1015 			  c[i + 2 + j * c_dim1] = f31;
1016 			  c[i + 3 + j * c_dim1] = f41;
1017 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1018 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1019 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1020 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1021 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1022 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1023 			}
1024 		      if (uisec < isec)
1025 			{
1026 			  i5 = ii + isec - 1;
1027 			  for (i = ii + uisec; i <= i5; ++i)
1028 			    {
1029 			      f11 = c[i + j * c_dim1];
1030 			      f12 = c[i + (j + 1) * c_dim1];
1031 			      f13 = c[i + (j + 2) * c_dim1];
1032 			      f14 = c[i + (j + 3) * c_dim1];
1033 			      i6 = ll + lsec - 1;
1034 			      for (l = ll; l <= i6; ++l)
1035 				{
1036 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1037 					  257] * b[l + j * b_dim1];
1038 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1039 					  257] * b[l + (j + 1) * b_dim1];
1040 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 					  257] * b[l + (j + 2) * b_dim1];
1042 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1043 					  257] * b[l + (j + 3) * b_dim1];
1044 				}
1045 			      c[i + j * c_dim1] = f11;
1046 			      c[i + (j + 1) * c_dim1] = f12;
1047 			      c[i + (j + 2) * c_dim1] = f13;
1048 			      c[i + (j + 3) * c_dim1] = f14;
1049 			    }
1050 			}
1051 		    }
1052 		  if (ujsec < jsec)
1053 		    {
1054 		      i4 = jj + jsec - 1;
1055 		      for (j = jj + ujsec; j <= i4; ++j)
1056 			{
1057 			  i5 = ii + uisec - 1;
1058 			  for (i = ii; i <= i5; i += 4)
1059 			    {
1060 			      f11 = c[i + j * c_dim1];
1061 			      f21 = c[i + 1 + j * c_dim1];
1062 			      f31 = c[i + 2 + j * c_dim1];
1063 			      f41 = c[i + 3 + j * c_dim1];
1064 			      i6 = ll + lsec - 1;
1065 			      for (l = ll; l <= i6; ++l)
1066 				{
1067 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1068 					  257] * b[l + j * b_dim1];
1069 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1070 					  257] * b[l + j * b_dim1];
1071 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1072 					  257] * b[l + j * b_dim1];
1073 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1074 					  257] * b[l + j * b_dim1];
1075 				}
1076 			      c[i + j * c_dim1] = f11;
1077 			      c[i + 1 + j * c_dim1] = f21;
1078 			      c[i + 2 + j * c_dim1] = f31;
1079 			      c[i + 3 + j * c_dim1] = f41;
1080 			    }
1081 			  i5 = ii + isec - 1;
1082 			  for (i = ii + uisec; i <= i5; ++i)
1083 			    {
1084 			      f11 = c[i + j * c_dim1];
1085 			      i6 = ll + lsec - 1;
1086 			      for (l = ll; l <= i6; ++l)
1087 				{
1088 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1089 					  257] * b[l + j * b_dim1];
1090 				}
1091 			      c[i + j * c_dim1] = f11;
1092 			    }
1093 			}
1094 		    }
1095 		}
1096 	    }
1097 	}
1098       free(t1);
1099       return;
1100     }
1101   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1102     {
1103       if (GFC_DESCRIPTOR_RANK (a) != 1)
1104 	{
1105 	  const GFC_REAL_16 *restrict abase_x;
1106 	  const GFC_REAL_16 *restrict bbase_y;
1107 	  GFC_REAL_16 *restrict dest_y;
1108 	  GFC_REAL_16 s;
1109 
1110 	  for (y = 0; y < ycount; y++)
1111 	    {
1112 	      bbase_y = &bbase[y*bystride];
1113 	      dest_y = &dest[y*rystride];
1114 	      for (x = 0; x < xcount; x++)
1115 		{
1116 		  abase_x = &abase[x*axstride];
1117 		  s = (GFC_REAL_16) 0;
1118 		  for (n = 0; n < count; n++)
1119 		    s += abase_x[n] * bbase_y[n];
1120 		  dest_y[x] = s;
1121 		}
1122 	    }
1123 	}
1124       else
1125 	{
1126 	  const GFC_REAL_16 *restrict bbase_y;
1127 	  GFC_REAL_16 s;
1128 
1129 	  for (y = 0; y < ycount; y++)
1130 	    {
1131 	      bbase_y = &bbase[y*bystride];
1132 	      s = (GFC_REAL_16) 0;
1133 	      for (n = 0; n < count; n++)
1134 		s += abase[n*axstride] * bbase_y[n];
1135 	      dest[y*rystride] = s;
1136 	    }
1137 	}
1138     }
1139   else if (axstride < aystride)
1140     {
1141       for (y = 0; y < ycount; y++)
1142 	for (x = 0; x < xcount; x++)
1143 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1144 
1145       for (y = 0; y < ycount; y++)
1146 	for (n = 0; n < count; n++)
1147 	  for (x = 0; x < xcount; x++)
1148 	    /* dest[x,y] += a[x,n] * b[n,y] */
1149 	    dest[x*rxstride + y*rystride] +=
1150 					abase[x*axstride + n*aystride] *
1151 					bbase[n*bxstride + y*bystride];
1152     }
1153   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1154     {
1155       const GFC_REAL_16 *restrict bbase_y;
1156       GFC_REAL_16 s;
1157 
1158       for (y = 0; y < ycount; y++)
1159 	{
1160 	  bbase_y = &bbase[y*bystride];
1161 	  s = (GFC_REAL_16) 0;
1162 	  for (n = 0; n < count; n++)
1163 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1164 	  dest[y*rxstride] = s;
1165 	}
1166     }
1167   else
1168     {
1169       const GFC_REAL_16 *restrict abase_x;
1170       const GFC_REAL_16 *restrict bbase_y;
1171       GFC_REAL_16 *restrict dest_y;
1172       GFC_REAL_16 s;
1173 
1174       for (y = 0; y < ycount; y++)
1175 	{
1176 	  bbase_y = &bbase[y*bystride];
1177 	  dest_y = &dest[y*rystride];
1178 	  for (x = 0; x < xcount; x++)
1179 	    {
1180 	      abase_x = &abase[x*axstride];
1181 	      s = (GFC_REAL_16) 0;
1182 	      for (n = 0; n < count; n++)
1183 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1184 	      dest_y[x*rxstride] = s;
1185 	    }
1186 	}
1187     }
1188 }
1189 #undef POW3
1190 #undef min
1191 #undef max
1192 
1193 #endif /* HAVE_AVX2 */
1194 
1195 #ifdef HAVE_AVX512F
1196 static void
1197 matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1198 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1199 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1200 static void
1201 matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1202 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1203 	int blas_limit, blas_call gemm)
1204 {
1205   const GFC_REAL_16 * restrict abase;
1206   const GFC_REAL_16 * restrict bbase;
1207   GFC_REAL_16 * restrict dest;
1208 
1209   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1210   index_type x, y, n, count, xcount, ycount;
1211 
1212   assert (GFC_DESCRIPTOR_RANK (a) == 2
1213           || GFC_DESCRIPTOR_RANK (b) == 2);
1214 
1215 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1216 
1217    Either A or B (but not both) can be rank 1:
1218 
1219    o One-dimensional argument A is implicitly treated as a row matrix
1220      dimensioned [1,count], so xcount=1.
1221 
1222    o One-dimensional argument B is implicitly treated as a column matrix
1223      dimensioned [count, 1], so ycount=1.
1224 */
1225 
1226   if (retarray->base_addr == NULL)
1227     {
1228       if (GFC_DESCRIPTOR_RANK (a) == 1)
1229         {
1230 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1232         }
1233       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1234         {
1235 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1236 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1237         }
1238       else
1239         {
1240 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1241 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1242 
1243           GFC_DIMENSION_SET(retarray->dim[1], 0,
1244 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1245 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1246         }
1247 
1248       retarray->base_addr
1249 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1250       retarray->offset = 0;
1251     }
1252   else if (unlikely (compile_options.bounds_check))
1253     {
1254       index_type ret_extent, arg_extent;
1255 
1256       if (GFC_DESCRIPTOR_RANK (a) == 1)
1257 	{
1258 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1259 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1260 	  if (arg_extent != ret_extent)
1261 	    runtime_error ("Incorrect extent in return array in"
1262 			   " MATMUL intrinsic: is %ld, should be %ld",
1263 			   (long int) ret_extent, (long int) arg_extent);
1264 	}
1265       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1266 	{
1267 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1268 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1269 	  if (arg_extent != ret_extent)
1270 	    runtime_error ("Incorrect extent in return array in"
1271 			   " MATMUL intrinsic: is %ld, should be %ld",
1272 			   (long int) ret_extent, (long int) arg_extent);
1273 	}
1274       else
1275 	{
1276 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1277 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1278 	  if (arg_extent != ret_extent)
1279 	    runtime_error ("Incorrect extent in return array in"
1280 			   " MATMUL intrinsic for dimension 1:"
1281 			   " is %ld, should be %ld",
1282 			   (long int) ret_extent, (long int) arg_extent);
1283 
1284 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1285 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1286 	  if (arg_extent != ret_extent)
1287 	    runtime_error ("Incorrect extent in return array in"
1288 			   " MATMUL intrinsic for dimension 2:"
1289 			   " is %ld, should be %ld",
1290 			   (long int) ret_extent, (long int) arg_extent);
1291 	}
1292     }
1293 
1294 
1295   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1296     {
1297       /* One-dimensional result may be addressed in the code below
1298 	 either as a row or a column matrix. We want both cases to
1299 	 work. */
1300       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1301     }
1302   else
1303     {
1304       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1305       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1306     }
1307 
1308 
1309   if (GFC_DESCRIPTOR_RANK (a) == 1)
1310     {
1311       /* Treat it as a row matrix A[1,count]. */
1312       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313       aystride = 1;
1314 
1315       xcount = 1;
1316       count = GFC_DESCRIPTOR_EXTENT(a,0);
1317     }
1318   else
1319     {
1320       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1321       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1322 
1323       count = GFC_DESCRIPTOR_EXTENT(a,1);
1324       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1325     }
1326 
1327   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1328     {
1329       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1330 	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1331     }
1332 
1333   if (GFC_DESCRIPTOR_RANK (b) == 1)
1334     {
1335       /* Treat it as a column matrix B[count,1] */
1336       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337 
1338       /* bystride should never be used for 1-dimensional b.
1339          The value is only used for calculating the memory
1340          needed for the buffer.  */
1341       bystride = 256;
1342       ycount = 1;
1343     }
1344   else
1345     {
1346       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1347       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1348       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1349     }
1350 
1351   abase = a->base_addr;
1352   bbase = b->base_addr;
1353   dest = retarray->base_addr;
1354 
1355   /* Now that everything is set up, we perform the multiplication
1356      itself.  */
1357 
1358 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1359 #define min(a,b) ((a) <= (b) ? (a) : (b))
1360 #define max(a,b) ((a) >= (b) ? (a) : (b))
1361 
1362   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1363       && (bxstride == 1 || bystride == 1)
1364       && (((float) xcount) * ((float) ycount) * ((float) count)
1365           > POW3(blas_limit)))
1366     {
1367       const int m = xcount, n = ycount, k = count, ldc = rystride;
1368       const GFC_REAL_16 one = 1, zero = 0;
1369       const int lda = (axstride == 1) ? aystride : axstride,
1370 		ldb = (bxstride == 1) ? bystride : bxstride;
1371 
1372       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1373 	{
1374 	  assert (gemm != NULL);
1375 	  gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1376 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1377 		&ldc, 1, 1);
1378 	  return;
1379 	}
1380     }
1381 
1382   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1383     {
1384       /* This block of code implements a tuned matmul, derived from
1385          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1386 
1387                Bo Kagstrom and Per Ling
1388                Department of Computing Science
1389                Umea University
1390                S-901 87 Umea, Sweden
1391 
1392 	 from netlib.org, translated to C, and modified for matmul.m4.  */
1393 
1394       const GFC_REAL_16 *a, *b;
1395       GFC_REAL_16 *c;
1396       const index_type m = xcount, n = ycount, k = count;
1397 
1398       /* System generated locals */
1399       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1400 		 i1, i2, i3, i4, i5, i6;
1401 
1402       /* Local variables */
1403       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
1404 		 f13, f14, f23, f24, f33, f34, f43, f44;
1405       index_type i, j, l, ii, jj, ll;
1406       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1407       GFC_REAL_16 *t1;
1408 
1409       a = abase;
1410       b = bbase;
1411       c = retarray->base_addr;
1412 
1413       /* Parameter adjustments */
1414       c_dim1 = rystride;
1415       c_offset = 1 + c_dim1;
1416       c -= c_offset;
1417       a_dim1 = aystride;
1418       a_offset = 1 + a_dim1;
1419       a -= a_offset;
1420       b_dim1 = bystride;
1421       b_offset = 1 + b_dim1;
1422       b -= b_offset;
1423 
1424       /* Empty c first.  */
1425       for (j=1; j<=n; j++)
1426 	for (i=1; i<=m; i++)
1427 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
1428 
1429       /* Early exit if possible */
1430       if (m == 0 || n == 0 || k == 0)
1431 	return;
1432 
1433       /* Adjust size of t1 to what is needed.  */
1434       index_type t1_dim, a_sz;
1435       if (aystride == 1)
1436         a_sz = rystride;
1437       else
1438         a_sz = a_dim1;
1439 
1440       t1_dim = a_sz * 256 + b_dim1;
1441       if (t1_dim > 65536)
1442 	t1_dim = 65536;
1443 
1444       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
1445 
1446       /* Start turning the crank. */
1447       i1 = n;
1448       for (jj = 1; jj <= i1; jj += 512)
1449 	{
1450 	  /* Computing MIN */
1451 	  i2 = 512;
1452 	  i3 = n - jj + 1;
1453 	  jsec = min(i2,i3);
1454 	  ujsec = jsec - jsec % 4;
1455 	  i2 = k;
1456 	  for (ll = 1; ll <= i2; ll += 256)
1457 	    {
1458 	      /* Computing MIN */
1459 	      i3 = 256;
1460 	      i4 = k - ll + 1;
1461 	      lsec = min(i3,i4);
1462 	      ulsec = lsec - lsec % 2;
1463 
1464 	      i3 = m;
1465 	      for (ii = 1; ii <= i3; ii += 256)
1466 		{
1467 		  /* Computing MIN */
1468 		  i4 = 256;
1469 		  i5 = m - ii + 1;
1470 		  isec = min(i4,i5);
1471 		  uisec = isec - isec % 2;
1472 		  i4 = ll + ulsec - 1;
1473 		  for (l = ll; l <= i4; l += 2)
1474 		    {
1475 		      i5 = ii + uisec - 1;
1476 		      for (i = ii; i <= i5; i += 2)
1477 			{
1478 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1479 					a[i + l * a_dim1];
1480 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1481 					a[i + (l + 1) * a_dim1];
1482 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1483 					a[i + 1 + l * a_dim1];
1484 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1485 					a[i + 1 + (l + 1) * a_dim1];
1486 			}
1487 		      if (uisec < isec)
1488 			{
1489 			  t1[l - ll + 1 + (isec << 8) - 257] =
1490 				    a[ii + isec - 1 + l * a_dim1];
1491 			  t1[l - ll + 2 + (isec << 8) - 257] =
1492 				    a[ii + isec - 1 + (l + 1) * a_dim1];
1493 			}
1494 		    }
1495 		  if (ulsec < lsec)
1496 		    {
1497 		      i4 = ii + isec - 1;
1498 		      for (i = ii; i<= i4; ++i)
1499 			{
1500 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1501 				    a[i + (ll + lsec - 1) * a_dim1];
1502 			}
1503 		    }
1504 
1505 		  uisec = isec - isec % 4;
1506 		  i4 = jj + ujsec - 1;
1507 		  for (j = jj; j <= i4; j += 4)
1508 		    {
1509 		      i5 = ii + uisec - 1;
1510 		      for (i = ii; i <= i5; i += 4)
1511 			{
1512 			  f11 = c[i + j * c_dim1];
1513 			  f21 = c[i + 1 + j * c_dim1];
1514 			  f12 = c[i + (j + 1) * c_dim1];
1515 			  f22 = c[i + 1 + (j + 1) * c_dim1];
1516 			  f13 = c[i + (j + 2) * c_dim1];
1517 			  f23 = c[i + 1 + (j + 2) * c_dim1];
1518 			  f14 = c[i + (j + 3) * c_dim1];
1519 			  f24 = c[i + 1 + (j + 3) * c_dim1];
1520 			  f31 = c[i + 2 + j * c_dim1];
1521 			  f41 = c[i + 3 + j * c_dim1];
1522 			  f32 = c[i + 2 + (j + 1) * c_dim1];
1523 			  f42 = c[i + 3 + (j + 1) * c_dim1];
1524 			  f33 = c[i + 2 + (j + 2) * c_dim1];
1525 			  f43 = c[i + 3 + (j + 2) * c_dim1];
1526 			  f34 = c[i + 2 + (j + 3) * c_dim1];
1527 			  f44 = c[i + 3 + (j + 3) * c_dim1];
1528 			  i6 = ll + lsec - 1;
1529 			  for (l = ll; l <= i6; ++l)
1530 			    {
1531 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1532 				      * b[l + j * b_dim1];
1533 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1534 				      * b[l + j * b_dim1];
1535 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1536 				      * b[l + (j + 1) * b_dim1];
1537 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1538 				      * b[l + (j + 1) * b_dim1];
1539 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1540 				      * b[l + (j + 2) * b_dim1];
1541 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1542 				      * b[l + (j + 2) * b_dim1];
1543 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1544 				      * b[l + (j + 3) * b_dim1];
1545 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1546 				      * b[l + (j + 3) * b_dim1];
1547 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1548 				      * b[l + j * b_dim1];
1549 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1550 				      * b[l + j * b_dim1];
1551 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1552 				      * b[l + (j + 1) * b_dim1];
1553 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1554 				      * b[l + (j + 1) * b_dim1];
1555 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1556 				      * b[l + (j + 2) * b_dim1];
1557 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1558 				      * b[l + (j + 2) * b_dim1];
1559 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1560 				      * b[l + (j + 3) * b_dim1];
1561 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1562 				      * b[l + (j + 3) * b_dim1];
1563 			    }
1564 			  c[i + j * c_dim1] = f11;
1565 			  c[i + 1 + j * c_dim1] = f21;
1566 			  c[i + (j + 1) * c_dim1] = f12;
1567 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1568 			  c[i + (j + 2) * c_dim1] = f13;
1569 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1570 			  c[i + (j + 3) * c_dim1] = f14;
1571 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1572 			  c[i + 2 + j * c_dim1] = f31;
1573 			  c[i + 3 + j * c_dim1] = f41;
1574 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1575 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1576 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1577 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1578 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1579 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1580 			}
1581 		      if (uisec < isec)
1582 			{
1583 			  i5 = ii + isec - 1;
1584 			  for (i = ii + uisec; i <= i5; ++i)
1585 			    {
1586 			      f11 = c[i + j * c_dim1];
1587 			      f12 = c[i + (j + 1) * c_dim1];
1588 			      f13 = c[i + (j + 2) * c_dim1];
1589 			      f14 = c[i + (j + 3) * c_dim1];
1590 			      i6 = ll + lsec - 1;
1591 			      for (l = ll; l <= i6; ++l)
1592 				{
1593 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1594 					  257] * b[l + j * b_dim1];
1595 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1596 					  257] * b[l + (j + 1) * b_dim1];
1597 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1598 					  257] * b[l + (j + 2) * b_dim1];
1599 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1600 					  257] * b[l + (j + 3) * b_dim1];
1601 				}
1602 			      c[i + j * c_dim1] = f11;
1603 			      c[i + (j + 1) * c_dim1] = f12;
1604 			      c[i + (j + 2) * c_dim1] = f13;
1605 			      c[i + (j + 3) * c_dim1] = f14;
1606 			    }
1607 			}
1608 		    }
1609 		  if (ujsec < jsec)
1610 		    {
1611 		      i4 = jj + jsec - 1;
1612 		      for (j = jj + ujsec; j <= i4; ++j)
1613 			{
1614 			  i5 = ii + uisec - 1;
1615 			  for (i = ii; i <= i5; i += 4)
1616 			    {
1617 			      f11 = c[i + j * c_dim1];
1618 			      f21 = c[i + 1 + j * c_dim1];
1619 			      f31 = c[i + 2 + j * c_dim1];
1620 			      f41 = c[i + 3 + j * c_dim1];
1621 			      i6 = ll + lsec - 1;
1622 			      for (l = ll; l <= i6; ++l)
1623 				{
1624 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1625 					  257] * b[l + j * b_dim1];
1626 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1627 					  257] * b[l + j * b_dim1];
1628 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1629 					  257] * b[l + j * b_dim1];
1630 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1631 					  257] * b[l + j * b_dim1];
1632 				}
1633 			      c[i + j * c_dim1] = f11;
1634 			      c[i + 1 + j * c_dim1] = f21;
1635 			      c[i + 2 + j * c_dim1] = f31;
1636 			      c[i + 3 + j * c_dim1] = f41;
1637 			    }
1638 			  i5 = ii + isec - 1;
1639 			  for (i = ii + uisec; i <= i5; ++i)
1640 			    {
1641 			      f11 = c[i + j * c_dim1];
1642 			      i6 = ll + lsec - 1;
1643 			      for (l = ll; l <= i6; ++l)
1644 				{
1645 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1646 					  257] * b[l + j * b_dim1];
1647 				}
1648 			      c[i + j * c_dim1] = f11;
1649 			    }
1650 			}
1651 		    }
1652 		}
1653 	    }
1654 	}
1655       free(t1);
1656       return;
1657     }
1658   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1659     {
1660       if (GFC_DESCRIPTOR_RANK (a) != 1)
1661 	{
1662 	  const GFC_REAL_16 *restrict abase_x;
1663 	  const GFC_REAL_16 *restrict bbase_y;
1664 	  GFC_REAL_16 *restrict dest_y;
1665 	  GFC_REAL_16 s;
1666 
1667 	  for (y = 0; y < ycount; y++)
1668 	    {
1669 	      bbase_y = &bbase[y*bystride];
1670 	      dest_y = &dest[y*rystride];
1671 	      for (x = 0; x < xcount; x++)
1672 		{
1673 		  abase_x = &abase[x*axstride];
1674 		  s = (GFC_REAL_16) 0;
1675 		  for (n = 0; n < count; n++)
1676 		    s += abase_x[n] * bbase_y[n];
1677 		  dest_y[x] = s;
1678 		}
1679 	    }
1680 	}
1681       else
1682 	{
1683 	  const GFC_REAL_16 *restrict bbase_y;
1684 	  GFC_REAL_16 s;
1685 
1686 	  for (y = 0; y < ycount; y++)
1687 	    {
1688 	      bbase_y = &bbase[y*bystride];
1689 	      s = (GFC_REAL_16) 0;
1690 	      for (n = 0; n < count; n++)
1691 		s += abase[n*axstride] * bbase_y[n];
1692 	      dest[y*rystride] = s;
1693 	    }
1694 	}
1695     }
1696   else if (axstride < aystride)
1697     {
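      /* Here a is accessed most contiguously along x (axstride < aystride),
	 so after clearing dest the accumulation loop keeps x innermost.  */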
1698       for (y = 0; y < ycount; y++)
1699 	for (x = 0; x < xcount; x++)
1700 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1701 
1702       for (y = 0; y < ycount; y++)
1703 	for (n = 0; n < count; n++)
1704 	  for (x = 0; x < xcount; x++)
1705 	    /* dest[x,y] += a[x,n] * b[n,y] */
1706 	    dest[x*rxstride + y*rystride] +=
1707 					abase[x*axstride + n*aystride] *
1708 					bbase[n*bxstride + y*bystride];
1709     }
1710   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1711     {
1712       const GFC_REAL_16 *restrict bbase_y;
1713       GFC_REAL_16 s;
1714 
1715       for (y = 0; y < ycount; y++)
1716 	{
1717 	  bbase_y = &bbase[y*bystride];
1718 	  s = (GFC_REAL_16) 0;
1719 	  for (n = 0; n < count; n++)
1720 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1721 	  dest[y*rxstride] = s;
1722 	}
1723     }
1724   else
1725     {
1726       const GFC_REAL_16 *restrict abase_x;
1727       const GFC_REAL_16 *restrict bbase_y;
1728       GFC_REAL_16 *restrict dest_y;
1729       GFC_REAL_16 s;
1730 
1731       for (y = 0; y < ycount; y++)
1732 	{
1733 	  bbase_y = &bbase[y*bystride];
1734 	  dest_y = &dest[y*rystride];
1735 	  for (x = 0; x < xcount; x++)
1736 	    {
1737 	      abase_x = &abase[x*axstride];
1738 	      s = (GFC_REAL_16) 0;
1739 	      for (n = 0; n < count; n++)
1740 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1741 	      dest_y[x*rxstride] = s;
1742 	    }
1743 	}
1744     }
1745 }
1746 #undef POW3
1747 #undef min
1748 #undef max
1749 
1750 #endif  /* HAVE_AVX512F */
1751 
1752 /* AMD-specific functions with AVX128 and FMA3/FMA4.  */
1753 
1754 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1755 void
1756 matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
1757 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1758 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1759 internal_proto(matmul_r16_avx128_fma3);
1760 #endif
1761 
1762 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1763 void
1764 matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
1765 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1766 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1767 internal_proto(matmul_r16_avx128_fma4);
1768 #endif
1769 
1770 /* Function to fall back to if there is no special processor-specific version.  */
1771 static void
1772 matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
1773 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1774 	int blas_limit, blas_call gemm)
1775 {
1776   const GFC_REAL_16 * restrict abase;
1777   const GFC_REAL_16 * restrict bbase;
1778   GFC_REAL_16 * restrict dest;
1779 
1780   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1781   index_type x, y, n, count, xcount, ycount;
1782 
1783   assert (GFC_DESCRIPTOR_RANK (a) == 2
1784           || GFC_DESCRIPTOR_RANK (b) == 2);
1785 
1786 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1787 
1788    Either A or B (but not both) can be rank 1:
1789 
1790    o One-dimensional argument A is implicitly treated as a row matrix
1791      dimensioned [1,count], so xcount=1.
1792 
1793    o One-dimensional argument B is implicitly treated as a column matrix
1794      dimensioned [count, 1], so ycount=1.
1795 */
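/* For example, MATMUL of a rank-1 A of extent 3 and a 3x2 matrix B runs with
   xcount=1, count=3, ycount=2 and yields a rank-1 result of extent 2, while
   MATMUL of a 2x3 matrix A and a rank-1 B of extent 3 runs with xcount=2,
   count=3, ycount=1 and yields a rank-1 result of extent 2.  */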
1796 
1797   if (retarray->base_addr == NULL)
1798     {
1799       if (GFC_DESCRIPTOR_RANK (a) == 1)
1800         {
1801 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1802 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1803         }
1804       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1805         {
1806 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1807 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1808         }
1809       else
1810         {
1811 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1812 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1813 
1814           GFC_DIMENSION_SET(retarray->dim[1], 0,
1815 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1816 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1817         }
1818 
1819       retarray->base_addr
1820 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1821       retarray->offset = 0;
1822     }
1823   else if (unlikely (compile_options.bounds_check))
1824     {
1825       index_type ret_extent, arg_extent;
1826 
1827       if (GFC_DESCRIPTOR_RANK (a) == 1)
1828 	{
1829 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1830 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1831 	  if (arg_extent != ret_extent)
1832 	    runtime_error ("Incorrect extent in return array in"
1833 			   " MATMUL intrinsic: is %ld, should be %ld",
1834 			   (long int) ret_extent, (long int) arg_extent);
1835 	}
1836       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1837 	{
1838 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1839 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1840 	  if (arg_extent != ret_extent)
1841 	    runtime_error ("Incorrect extent in return array in"
1842 			   " MATMUL intrinsic: is %ld, should be %ld",
1843 			   (long int) ret_extent, (long int) arg_extent);
1844 	}
1845       else
1846 	{
1847 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1848 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1849 	  if (arg_extent != ret_extent)
1850 	    runtime_error ("Incorrect extent in return array in"
1851 			   " MATMUL intrinsic for dimension 1:"
1852 			   " is %ld, should be %ld",
1853 			   (long int) ret_extent, (long int) arg_extent);
1854 
1855 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1856 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1857 	  if (arg_extent != ret_extent)
1858 	    runtime_error ("Incorrect extent in return array in"
1859 			   " MATMUL intrinsic for dimension 2:"
1860 			   " is %ld, should be %ld",
1861 			   (long int) ret_extent, (long int) arg_extent);
1862 	}
1863     }
1864 
1865 
1866   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1867     {
1868       /* One-dimensional result may be addressed in the code below
1869 	 either as a row or a column matrix. We want both cases to
1870 	 work. */
1871       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1872     }
1873   else
1874     {
1875       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1876       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1877     }
1878 
1879 
1880   if (GFC_DESCRIPTOR_RANK (a) == 1)
1881     {
1882       /* Treat it as a row matrix A[1,count]. */
1883       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1884       aystride = 1;
1885 
1886       xcount = 1;
1887       count = GFC_DESCRIPTOR_EXTENT(a,0);
1888     }
1889   else
1890     {
1891       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1892       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1893 
1894       count = GFC_DESCRIPTOR_EXTENT(a,1);
1895       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1896     }
1897 
1898   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1899     {
1900       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1901 	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1902     }
1903 
1904   if (GFC_DESCRIPTOR_RANK (b) == 1)
1905     {
1906       /* Treat it as a column matrix B[count,1] */
1907       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1908 
1909       /* bystride should never be used for 1-dimensional b.
1910          The value is only used when computing the size of the
1911          temporary buffer.  */
1912       bystride = 256;
1913       ycount = 1;
1914     }
1915   else
1916     {
1917       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1918       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1919       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1920     }
1921 
1922   abase = a->base_addr;
1923   bbase = b->base_addr;
1924   dest = retarray->base_addr;
1925 
1926   /* Now that everything is set up, we perform the multiplication
1927      itself.  */
1928 
1929 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1930 #define min(a,b) ((a) <= (b) ? (a) : (b))
1931 #define max(a,b) ((a) >= (b) ? (a) : (b))
1932 
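  /* The BLAS path below is taken only when the result has a unit row stride,
     each operand has a unit stride in at least one dimension, and the work
     xcount*ycount*count exceeds POW3(blas_limit); e.g. blas_limit == 30
     requires more than 30*30*30 = 27000 multiply-add operations.  */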
1933   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1934       && (bxstride == 1 || bystride == 1)
1935       && (((float) xcount) * ((float) ycount) * ((float) count)
1936           > POW3(blas_limit)))
1937     {
1938       const int m = xcount, n = ycount, k = count, ldc = rystride;
1939       const GFC_REAL_16 one = 1, zero = 0;
1940       const int lda = (axstride == 1) ? aystride : axstride,
1941 		ldb = (bxstride == 1) ? bystride : bxstride;
1942 
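      /* A is passed untransposed ("N") when axstride == 1, with lda set to
	 aystride; otherwise aystride == 1 holds (the transposed-temporary
	 case) and A is passed as "T" with lda = axstride.  B is handled the
	 same way.  */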
1943       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1944 	{
1945 	  assert (gemm != NULL);
1946 	  gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1947 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1948 		&ldc, 1, 1);
1949 	  return;
1950 	}
1951     }
1952 
1953   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1954     {
1955       /* This block of code implements a tuned matmul, derived from
1956          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1957 
1958                Bo Kagstrom and Per Ling
1959                Department of Computing Science
1960                Umea University
1961                S-901 87 Umea, Sweden
1962 
1963 	 from netlib.org, translated to C, and modified for matmul.m4.  */
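      /* Outline of the blocking below: B and C are processed in column
	 panels of 512 (jj), the inner dimension in slices of 256 (ll), and
	 A in row panels of 256 (ii).  Each A panel is packed into the
	 scratch buffer t1, addressed as a 1-based 256x256 block, so
	 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] is element
	 (l - ll + 1, i - ii + 1) of that block.  The uisec/ujsec/ulsec
	 variables hold the part of each section that is a multiple of the
	 unroll factor; separate cleanup loops handle the remainders.  */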
1964 
1965       const GFC_REAL_16 *a, *b;
1966       GFC_REAL_16 *c;
1967       const index_type m = xcount, n = ycount, k = count;
1968 
1969       /* System generated locals */
1970       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1971 		 i1, i2, i3, i4, i5, i6;
1972 
1973       /* Local variables */
1974       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
1975 		 f13, f14, f23, f24, f33, f34, f43, f44;
1976       index_type i, j, l, ii, jj, ll;
1977       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1978       GFC_REAL_16 *t1;
1979 
1980       a = abase;
1981       b = bbase;
1982       c = retarray->base_addr;
1983 
1984       /* Parameter adjustments */
1985       c_dim1 = rystride;
1986       c_offset = 1 + c_dim1;
1987       c -= c_offset;
1988       a_dim1 = aystride;
1989       a_offset = 1 + a_dim1;
1990       a -= a_offset;
1991       b_dim1 = bystride;
1992       b_offset = 1 + b_dim1;
1993       b -= b_offset;
1994 
1995       /* Empty c first.  */
1996       for (j=1; j<=n; j++)
1997 	for (i=1; i<=m; i++)
1998 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
1999 
2000       /* Early exit if possible */
2001       if (m == 0 || n == 0 || k == 0)
2002 	return;
2003 
2004       /* Adjust size of t1 to what is needed.  */
2005       index_type t1_dim, a_sz;
2006       if (aystride == 1)
2007         a_sz = rystride;
2008       else
2009         a_sz = a_dim1;
2010 
2011       t1_dim = a_sz * 256 + b_dim1;
2012       if (t1_dim > 65536)
2013 	t1_dim = 65536;
2014 
2015       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
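      /* The loops below never address more than a 256x256 panel of t1
	 (isec and lsec are both capped at 256), so 65536 elements always
	 suffice; hence the clamp on t1_dim above.  */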
2016 
2017       /* Start turning the crank. */
2018       i1 = n;
2019       for (jj = 1; jj <= i1; jj += 512)
2020 	{
2021 	  /* Computing MIN */
2022 	  i2 = 512;
2023 	  i3 = n - jj + 1;
2024 	  jsec = min(i2,i3);
2025 	  ujsec = jsec - jsec % 4;
2026 	  i2 = k;
2027 	  for (ll = 1; ll <= i2; ll += 256)
2028 	    {
2029 	      /* Computing MIN */
2030 	      i3 = 256;
2031 	      i4 = k - ll + 1;
2032 	      lsec = min(i3,i4);
2033 	      ulsec = lsec - lsec % 2;
2034 
2035 	      i3 = m;
2036 	      for (ii = 1; ii <= i3; ii += 256)
2037 		{
2038 		  /* Computing MIN */
2039 		  i4 = 256;
2040 		  i5 = m - ii + 1;
2041 		  isec = min(i4,i5);
2042 		  uisec = isec - isec % 2;
2043 		  i4 = ll + ulsec - 1;
2044 		  for (l = ll; l <= i4; l += 2)
2045 		    {
2046 		      i5 = ii + uisec - 1;
2047 		      for (i = ii; i <= i5; i += 2)
2048 			{
2049 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2050 					a[i + l * a_dim1];
2051 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2052 					a[i + (l + 1) * a_dim1];
2053 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2054 					a[i + 1 + l * a_dim1];
2055 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2056 					a[i + 1 + (l + 1) * a_dim1];
2057 			}
2058 		      if (uisec < isec)
2059 			{
2060 			  t1[l - ll + 1 + (isec << 8) - 257] =
2061 				    a[ii + isec - 1 + l * a_dim1];
2062 			  t1[l - ll + 2 + (isec << 8) - 257] =
2063 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2064 			}
2065 		    }
2066 		  if (ulsec < lsec)
2067 		    {
2068 		      i4 = ii + isec - 1;
2069 		      for (i = ii; i<= i4; ++i)
2070 			{
2071 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2072 				    a[i + (ll + lsec - 1) * a_dim1];
2073 			}
2074 		    }
2075 
2076 		  uisec = isec - isec % 4;
2077 		  i4 = jj + ujsec - 1;
2078 		  for (j = jj; j <= i4; j += 4)
2079 		    {
2080 		      i5 = ii + uisec - 1;
2081 		      for (i = ii; i <= i5; i += 4)
2082 			{
2083 			  f11 = c[i + j * c_dim1];
2084 			  f21 = c[i + 1 + j * c_dim1];
2085 			  f12 = c[i + (j + 1) * c_dim1];
2086 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2087 			  f13 = c[i + (j + 2) * c_dim1];
2088 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2089 			  f14 = c[i + (j + 3) * c_dim1];
2090 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2091 			  f31 = c[i + 2 + j * c_dim1];
2092 			  f41 = c[i + 3 + j * c_dim1];
2093 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2094 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2095 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2096 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2097 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2098 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2099 			  i6 = ll + lsec - 1;
2100 			  for (l = ll; l <= i6; ++l)
2101 			    {
2102 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2103 				      * b[l + j * b_dim1];
2104 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2105 				      * b[l + j * b_dim1];
2106 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2107 				      * b[l + (j + 1) * b_dim1];
2108 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2109 				      * b[l + (j + 1) * b_dim1];
2110 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2111 				      * b[l + (j + 2) * b_dim1];
2112 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2113 				      * b[l + (j + 2) * b_dim1];
2114 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2115 				      * b[l + (j + 3) * b_dim1];
2116 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2117 				      * b[l + (j + 3) * b_dim1];
2118 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2119 				      * b[l + j * b_dim1];
2120 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2121 				      * b[l + j * b_dim1];
2122 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2123 				      * b[l + (j + 1) * b_dim1];
2124 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2125 				      * b[l + (j + 1) * b_dim1];
2126 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2127 				      * b[l + (j + 2) * b_dim1];
2128 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2129 				      * b[l + (j + 2) * b_dim1];
2130 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2131 				      * b[l + (j + 3) * b_dim1];
2132 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2133 				      * b[l + (j + 3) * b_dim1];
2134 			    }
2135 			  c[i + j * c_dim1] = f11;
2136 			  c[i + 1 + j * c_dim1] = f21;
2137 			  c[i + (j + 1) * c_dim1] = f12;
2138 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2139 			  c[i + (j + 2) * c_dim1] = f13;
2140 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2141 			  c[i + (j + 3) * c_dim1] = f14;
2142 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2143 			  c[i + 2 + j * c_dim1] = f31;
2144 			  c[i + 3 + j * c_dim1] = f41;
2145 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2146 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2147 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2148 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2149 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2150 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2151 			}
2152 		      if (uisec < isec)
2153 			{
2154 			  i5 = ii + isec - 1;
2155 			  for (i = ii + uisec; i <= i5; ++i)
2156 			    {
2157 			      f11 = c[i + j * c_dim1];
2158 			      f12 = c[i + (j + 1) * c_dim1];
2159 			      f13 = c[i + (j + 2) * c_dim1];
2160 			      f14 = c[i + (j + 3) * c_dim1];
2161 			      i6 = ll + lsec - 1;
2162 			      for (l = ll; l <= i6; ++l)
2163 				{
2164 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2165 					  257] * b[l + j * b_dim1];
2166 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2167 					  257] * b[l + (j + 1) * b_dim1];
2168 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2169 					  257] * b[l + (j + 2) * b_dim1];
2170 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2171 					  257] * b[l + (j + 3) * b_dim1];
2172 				}
2173 			      c[i + j * c_dim1] = f11;
2174 			      c[i + (j + 1) * c_dim1] = f12;
2175 			      c[i + (j + 2) * c_dim1] = f13;
2176 			      c[i + (j + 3) * c_dim1] = f14;
2177 			    }
2178 			}
2179 		    }
2180 		  if (ujsec < jsec)
2181 		    {
2182 		      i4 = jj + jsec - 1;
2183 		      for (j = jj + ujsec; j <= i4; ++j)
2184 			{
2185 			  i5 = ii + uisec - 1;
2186 			  for (i = ii; i <= i5; i += 4)
2187 			    {
2188 			      f11 = c[i + j * c_dim1];
2189 			      f21 = c[i + 1 + j * c_dim1];
2190 			      f31 = c[i + 2 + j * c_dim1];
2191 			      f41 = c[i + 3 + j * c_dim1];
2192 			      i6 = ll + lsec - 1;
2193 			      for (l = ll; l <= i6; ++l)
2194 				{
2195 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2196 					  257] * b[l + j * b_dim1];
2197 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2198 					  257] * b[l + j * b_dim1];
2199 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2200 					  257] * b[l + j * b_dim1];
2201 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2202 					  257] * b[l + j * b_dim1];
2203 				}
2204 			      c[i + j * c_dim1] = f11;
2205 			      c[i + 1 + j * c_dim1] = f21;
2206 			      c[i + 2 + j * c_dim1] = f31;
2207 			      c[i + 3 + j * c_dim1] = f41;
2208 			    }
2209 			  i5 = ii + isec - 1;
2210 			  for (i = ii + uisec; i <= i5; ++i)
2211 			    {
2212 			      f11 = c[i + j * c_dim1];
2213 			      i6 = ll + lsec - 1;
2214 			      for (l = ll; l <= i6; ++l)
2215 				{
2216 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 					  257] * b[l + j * b_dim1];
2218 				}
2219 			      c[i + j * c_dim1] = f11;
2220 			    }
2221 			}
2222 		    }
2223 		}
2224 	    }
2225 	}
2226       free(t1);
2227       return;
2228     }
2229   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2230     {
2231       if (GFC_DESCRIPTOR_RANK (a) != 1)
2232 	{
2233 	  const GFC_REAL_16 *restrict abase_x;
2234 	  const GFC_REAL_16 *restrict bbase_y;
2235 	  GFC_REAL_16 *restrict dest_y;
2236 	  GFC_REAL_16 s;
2237 
2238 	  for (y = 0; y < ycount; y++)
2239 	    {
2240 	      bbase_y = &bbase[y*bystride];
2241 	      dest_y = &dest[y*rystride];
2242 	      for (x = 0; x < xcount; x++)
2243 		{
2244 		  abase_x = &abase[x*axstride];
2245 		  s = (GFC_REAL_16) 0;
2246 		  for (n = 0; n < count; n++)
2247 		    s += abase_x[n] * bbase_y[n];
2248 		  dest_y[x] = s;
2249 		}
2250 	    }
2251 	}
2252       else
2253 	{
2254 	  const GFC_REAL_16 *restrict bbase_y;
2255 	  GFC_REAL_16 s;
2256 
2257 	  for (y = 0; y < ycount; y++)
2258 	    {
2259 	      bbase_y = &bbase[y*bystride];
2260 	      s = (GFC_REAL_16) 0;
2261 	      for (n = 0; n < count; n++)
2262 		s += abase[n*axstride] * bbase_y[n];
2263 	      dest[y*rystride] = s;
2264 	    }
2265 	}
2266     }
2267   else if (axstride < aystride)
2268     {
2269       for (y = 0; y < ycount; y++)
2270 	for (x = 0; x < xcount; x++)
2271 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2272 
2273       for (y = 0; y < ycount; y++)
2274 	for (n = 0; n < count; n++)
2275 	  for (x = 0; x < xcount; x++)
2276 	    /* dest[x,y] += a[x,n] * b[n,y] */
2277 	    dest[x*rxstride + y*rystride] +=
2278 					abase[x*axstride + n*aystride] *
2279 					bbase[n*bxstride + y*bystride];
2280     }
2281   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2282     {
2283       const GFC_REAL_16 *restrict bbase_y;
2284       GFC_REAL_16 s;
2285 
2286       for (y = 0; y < ycount; y++)
2287 	{
2288 	  bbase_y = &bbase[y*bystride];
2289 	  s = (GFC_REAL_16) 0;
2290 	  for (n = 0; n < count; n++)
2291 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2292 	  dest[y*rxstride] = s;
2293 	}
2294     }
2295   else
2296     {
2297       const GFC_REAL_16 *restrict abase_x;
2298       const GFC_REAL_16 *restrict bbase_y;
2299       GFC_REAL_16 *restrict dest_y;
2300       GFC_REAL_16 s;
2301 
2302       for (y = 0; y < ycount; y++)
2303 	{
2304 	  bbase_y = &bbase[y*bystride];
2305 	  dest_y = &dest[y*rystride];
2306 	  for (x = 0; x < xcount; x++)
2307 	    {
2308 	      abase_x = &abase[x*axstride];
2309 	      s = (GFC_REAL_16) 0;
2310 	      for (n = 0; n < count; n++)
2311 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2312 	      dest_y[x*rxstride] = s;
2313 	    }
2314 	}
2315     }
2316 }
2317 #undef POW3
2318 #undef min
2319 #undef max
2320 
2321 
2322 /* Main function, with runtime selection code for the processor.  */
2323 
2324 /* Currently, this is i386 only.  Adjust for other architectures.  */
2325 
2326 #include <config/i386/cpuinfo.h>
2327 void matmul_r16 (gfc_array_r16 * const restrict retarray,
2328 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2329 	int blas_limit, blas_call gemm)
2330 {
2331   static void (*matmul_p) (gfc_array_r16 * const restrict retarray,
2332 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2333 	int blas_limit, blas_call gemm);
2334 
2335   void (*matmul_fn) (gfc_array_r16 * const restrict retarray,
2336 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2337 	int blas_limit, blas_call gemm);
2338 
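  /* matmul_p caches the kernel chosen for the host CPU.  It is NULL on the
     first call, so the best available variant is selected from __cpu_model
     once and published with a relaxed atomic store; subsequent calls load
     the cached pointer and dispatch through it directly.  */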
2339   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2340   if (matmul_fn == NULL)
2341     {
2342       matmul_fn = matmul_r16_vanilla;
2343       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2344 	{
2345           /* Run down the available processors in order of preference.  */
2346 #ifdef HAVE_AVX512F
2347       	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2348 	    {
2349 	      matmul_fn = matmul_r16_avx512f;
2350 	      goto store;
2351 	    }
2352 
2353 #endif  /* HAVE_AVX512F */
2354 
2355 #ifdef HAVE_AVX2
2356       	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2357 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2358 	    {
2359 	      matmul_fn = matmul_r16_avx2;
2360 	      goto store;
2361 	    }
2362 
2363 #endif
2364 
2365 #ifdef HAVE_AVX
2366       	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2367  	    {
2368               matmul_fn = matmul_r16_avx;
2369 	      goto store;
2370 	    }
2371 #endif  /* HAVE_AVX */
2372         }
2373     else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2374       {
2375 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2376         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2377 	    && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2378 	  {
2379             matmul_fn = matmul_r16_avx128_fma3;
2380 	    goto store;
2381 	  }
2382 #endif
2383 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2384         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2385 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2386 	  {
2387             matmul_fn = matmul_r16_avx128_fma4;
2388 	    goto store;
2389 	  }
2390 #endif
2391 
2392       }
2393    store:
2394       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2395    }
2396 
2397    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2398 }
2399 
2400 #else  /* Just the vanilla function.  */
2401 
2402 void
2403 matmul_r16 (gfc_array_r16 * const restrict retarray,
2404 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2405 	int blas_limit, blas_call gemm)
2406 {
2407   const GFC_REAL_16 * restrict abase;
2408   const GFC_REAL_16 * restrict bbase;
2409   GFC_REAL_16 * restrict dest;
2410 
2411   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2412   index_type x, y, n, count, xcount, ycount;
2413 
2414   assert (GFC_DESCRIPTOR_RANK (a) == 2
2415           || GFC_DESCRIPTOR_RANK (b) == 2);
2416 
2417 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2418 
2419    Either A or B (but not both) can be rank 1:
2420 
2421    o One-dimensional argument A is implicitly treated as a row matrix
2422      dimensioned [1,count], so xcount=1.
2423 
2424    o One-dimensional argument B is implicitly treated as a column matrix
2425      dimensioned [count, 1], so ycount=1.
2426 */
2427 
2428   if (retarray->base_addr == NULL)
2429     {
2430       if (GFC_DESCRIPTOR_RANK (a) == 1)
2431         {
2432 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2433 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2434         }
2435       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2436         {
2437 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2438 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2439         }
2440       else
2441         {
2442 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2443 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2444 
2445           GFC_DIMENSION_SET(retarray->dim[1], 0,
2446 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2447 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2448         }
2449 
2450       retarray->base_addr
2451 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
2452       retarray->offset = 0;
2453     }
2454   else if (unlikely (compile_options.bounds_check))
2455     {
2456       index_type ret_extent, arg_extent;
2457 
2458       if (GFC_DESCRIPTOR_RANK (a) == 1)
2459 	{
2460 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2461 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2462 	  if (arg_extent != ret_extent)
2463 	    runtime_error ("Incorrect extent in return array in"
2464 			   " MATMUL intrinsic: is %ld, should be %ld",
2465 			   (long int) ret_extent, (long int) arg_extent);
2466 	}
2467       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2468 	{
2469 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2470 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2471 	  if (arg_extent != ret_extent)
2472 	    runtime_error ("Incorrect extent in return array in"
2473 			   " MATMUL intrinsic: is %ld, should be %ld",
2474 			   (long int) ret_extent, (long int) arg_extent);
2475 	}
2476       else
2477 	{
2478 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2479 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2480 	  if (arg_extent != ret_extent)
2481 	    runtime_error ("Incorrect extent in return array in"
2482 			   " MATMUL intrinsic for dimension 1:"
2483 			   " is %ld, should be %ld",
2484 			   (long int) ret_extent, (long int) arg_extent);
2485 
2486 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2487 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2488 	  if (arg_extent != ret_extent)
2489 	    runtime_error ("Incorrect extent in return array in"
2490 			   " MATMUL intrinsic for dimension 2:"
2491 			   " is %ld, should be %ld",
2492 			   (long int) ret_extent, (long int) arg_extent);
2493 	}
2494     }
2495 
2496 
2497   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2498     {
2499       /* One-dimensional result may be addressed in the code below
2500 	 either as a row or a column matrix. We want both cases to
2501 	 work. */
2502       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2503     }
2504   else
2505     {
2506       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2507       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2508     }
2509 
2510 
2511   if (GFC_DESCRIPTOR_RANK (a) == 1)
2512     {
2513       /* Treat it as a row matrix A[1,count]. */
2514       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2515       aystride = 1;
2516 
2517       xcount = 1;
2518       count = GFC_DESCRIPTOR_EXTENT(a,0);
2519     }
2520   else
2521     {
2522       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2523       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2524 
2525       count = GFC_DESCRIPTOR_EXTENT(a,1);
2526       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2527     }
2528 
2529   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2530     {
2531       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2532 	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2533     }
2534 
2535   if (GFC_DESCRIPTOR_RANK (b) == 1)
2536     {
2537       /* Treat it as a column matrix B[count,1] */
2538       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2539 
2540       /* bystride should never be used for 1-dimensional b.
2541          The value is only used when computing the size of the
2542          temporary buffer.  */
2543       bystride = 256;
2544       ycount = 1;
2545     }
2546   else
2547     {
2548       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2549       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2550       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2551     }
2552 
2553   abase = a->base_addr;
2554   bbase = b->base_addr;
2555   dest = retarray->base_addr;
2556 
2557   /* Now that everything is set up, we perform the multiplication
2558      itself.  */
2559 
2560 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2561 #define min(a,b) ((a) <= (b) ? (a) : (b))
2562 #define max(a,b) ((a) >= (b) ? (a) : (b))
2563 
2564   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2565       && (bxstride == 1 || bystride == 1)
2566       && (((float) xcount) * ((float) ycount) * ((float) count)
2567           > POW3(blas_limit)))
2568     {
2569       const int m = xcount, n = ycount, k = count, ldc = rystride;
2570       const GFC_REAL_16 one = 1, zero = 0;
2571       const int lda = (axstride == 1) ? aystride : axstride,
2572 		ldb = (bxstride == 1) ? bystride : bxstride;
2573 
2574       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2575 	{
2576 	  assert (gemm != NULL);
2577 	  gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2578 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2579 		&ldc, 1, 1);
2580 	  return;
2581 	}
2582     }
2583 
2584   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2585     {
2586       /* This block of code implements a tuned matmul, derived from
2587          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2588 
2589                Bo Kagstrom and Per Ling
2590                Department of Computing Science
2591                Umea University
2592                S-901 87 Umea, Sweden
2593 
2594 	 from netlib.org, translated to C, and modified for matmul.m4.  */
2595 
2596       const GFC_REAL_16 *a, *b;
2597       GFC_REAL_16 *c;
2598       const index_type m = xcount, n = ycount, k = count;
2599 
2600       /* System generated locals */
2601       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2602 		 i1, i2, i3, i4, i5, i6;
2603 
2604       /* Local variables */
2605       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
2606 		 f13, f14, f23, f24, f33, f34, f43, f44;
2607       index_type i, j, l, ii, jj, ll;
2608       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2609       GFC_REAL_16 *t1;
2610 
2611       a = abase;
2612       b = bbase;
2613       c = retarray->base_addr;
2614 
2615       /* Parameter adjustments */
2616       c_dim1 = rystride;
2617       c_offset = 1 + c_dim1;
2618       c -= c_offset;
2619       a_dim1 = aystride;
2620       a_offset = 1 + a_dim1;
2621       a -= a_offset;
2622       b_dim1 = bystride;
2623       b_offset = 1 + b_dim1;
2624       b -= b_offset;
2625 
2626       /* Empty c first.  */
2627       for (j=1; j<=n; j++)
2628 	for (i=1; i<=m; i++)
2629 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
2630 
2631       /* Early exit if possible */
2632       if (m == 0 || n == 0 || k == 0)
2633 	return;
2634 
2635       /* Adjust size of t1 to what is needed.  */
2636       index_type t1_dim, a_sz;
2637       if (aystride == 1)
2638         a_sz = rystride;
2639       else
2640         a_sz = a_dim1;
2641 
2642       t1_dim = a_sz * 256 + b_dim1;
2643       if (t1_dim > 65536)
2644 	t1_dim = 65536;
2645 
2646       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
2647 
2648       /* Start turning the crank. */
2649       i1 = n;
2650       for (jj = 1; jj <= i1; jj += 512)
2651 	{
2652 	  /* Computing MIN */
2653 	  i2 = 512;
2654 	  i3 = n - jj + 1;
2655 	  jsec = min(i2,i3);
2656 	  ujsec = jsec - jsec % 4;
2657 	  i2 = k;
2658 	  for (ll = 1; ll <= i2; ll += 256)
2659 	    {
2660 	      /* Computing MIN */
2661 	      i3 = 256;
2662 	      i4 = k - ll + 1;
2663 	      lsec = min(i3,i4);
2664 	      ulsec = lsec - lsec % 2;
2665 
2666 	      i3 = m;
2667 	      for (ii = 1; ii <= i3; ii += 256)
2668 		{
2669 		  /* Computing MIN */
2670 		  i4 = 256;
2671 		  i5 = m - ii + 1;
2672 		  isec = min(i4,i5);
2673 		  uisec = isec - isec % 2;
2674 		  i4 = ll + ulsec - 1;
2675 		  for (l = ll; l <= i4; l += 2)
2676 		    {
2677 		      i5 = ii + uisec - 1;
2678 		      for (i = ii; i <= i5; i += 2)
2679 			{
2680 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2681 					a[i + l * a_dim1];
2682 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2683 					a[i + (l + 1) * a_dim1];
2684 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2685 					a[i + 1 + l * a_dim1];
2686 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2687 					a[i + 1 + (l + 1) * a_dim1];
2688 			}
2689 		      if (uisec < isec)
2690 			{
2691 			  t1[l - ll + 1 + (isec << 8) - 257] =
2692 				    a[ii + isec - 1 + l * a_dim1];
2693 			  t1[l - ll + 2 + (isec << 8) - 257] =
2694 				    a[ii + isec - 1 + (l + 1) * a_dim1];
2695 			}
2696 		    }
2697 		  if (ulsec < lsec)
2698 		    {
2699 		      i4 = ii + isec - 1;
2700 		      for (i = ii; i<= i4; ++i)
2701 			{
2702 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2703 				    a[i + (ll + lsec - 1) * a_dim1];
2704 			}
2705 		    }
2706 
2707 		  uisec = isec - isec % 4;
2708 		  i4 = jj + ujsec - 1;
2709 		  for (j = jj; j <= i4; j += 4)
2710 		    {
2711 		      i5 = ii + uisec - 1;
2712 		      for (i = ii; i <= i5; i += 4)
2713 			{
2714 			  f11 = c[i + j * c_dim1];
2715 			  f21 = c[i + 1 + j * c_dim1];
2716 			  f12 = c[i + (j + 1) * c_dim1];
2717 			  f22 = c[i + 1 + (j + 1) * c_dim1];
2718 			  f13 = c[i + (j + 2) * c_dim1];
2719 			  f23 = c[i + 1 + (j + 2) * c_dim1];
2720 			  f14 = c[i + (j + 3) * c_dim1];
2721 			  f24 = c[i + 1 + (j + 3) * c_dim1];
2722 			  f31 = c[i + 2 + j * c_dim1];
2723 			  f41 = c[i + 3 + j * c_dim1];
2724 			  f32 = c[i + 2 + (j + 1) * c_dim1];
2725 			  f42 = c[i + 3 + (j + 1) * c_dim1];
2726 			  f33 = c[i + 2 + (j + 2) * c_dim1];
2727 			  f43 = c[i + 3 + (j + 2) * c_dim1];
2728 			  f34 = c[i + 2 + (j + 3) * c_dim1];
2729 			  f44 = c[i + 3 + (j + 3) * c_dim1];
2730 			  i6 = ll + lsec - 1;
2731 			  for (l = ll; l <= i6; ++l)
2732 			    {
2733 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2734 				      * b[l + j * b_dim1];
2735 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2736 				      * b[l + j * b_dim1];
2737 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2738 				      * b[l + (j + 1) * b_dim1];
2739 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2740 				      * b[l + (j + 1) * b_dim1];
2741 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2742 				      * b[l + (j + 2) * b_dim1];
2743 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2744 				      * b[l + (j + 2) * b_dim1];
2745 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2746 				      * b[l + (j + 3) * b_dim1];
2747 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2748 				      * b[l + (j + 3) * b_dim1];
2749 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2750 				      * b[l + j * b_dim1];
2751 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2752 				      * b[l + j * b_dim1];
2753 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2754 				      * b[l + (j + 1) * b_dim1];
2755 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2756 				      * b[l + (j + 1) * b_dim1];
2757 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2758 				      * b[l + (j + 2) * b_dim1];
2759 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2760 				      * b[l + (j + 2) * b_dim1];
2761 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2762 				      * b[l + (j + 3) * b_dim1];
2763 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2764 				      * b[l + (j + 3) * b_dim1];
2765 			    }
2766 			  c[i + j * c_dim1] = f11;
2767 			  c[i + 1 + j * c_dim1] = f21;
2768 			  c[i + (j + 1) * c_dim1] = f12;
2769 			  c[i + 1 + (j + 1) * c_dim1] = f22;
2770 			  c[i + (j + 2) * c_dim1] = f13;
2771 			  c[i + 1 + (j + 2) * c_dim1] = f23;
2772 			  c[i + (j + 3) * c_dim1] = f14;
2773 			  c[i + 1 + (j + 3) * c_dim1] = f24;
2774 			  c[i + 2 + j * c_dim1] = f31;
2775 			  c[i + 3 + j * c_dim1] = f41;
2776 			  c[i + 2 + (j + 1) * c_dim1] = f32;
2777 			  c[i + 3 + (j + 1) * c_dim1] = f42;
2778 			  c[i + 2 + (j + 2) * c_dim1] = f33;
2779 			  c[i + 3 + (j + 2) * c_dim1] = f43;
2780 			  c[i + 2 + (j + 3) * c_dim1] = f34;
2781 			  c[i + 3 + (j + 3) * c_dim1] = f44;
2782 			}
2783 		      if (uisec < isec)
2784 			{
2785 			  i5 = ii + isec - 1;
2786 			  for (i = ii + uisec; i <= i5; ++i)
2787 			    {
2788 			      f11 = c[i + j * c_dim1];
2789 			      f12 = c[i + (j + 1) * c_dim1];
2790 			      f13 = c[i + (j + 2) * c_dim1];
2791 			      f14 = c[i + (j + 3) * c_dim1];
2792 			      i6 = ll + lsec - 1;
2793 			      for (l = ll; l <= i6; ++l)
2794 				{
2795 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2796 					  257] * b[l + j * b_dim1];
2797 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2798 					  257] * b[l + (j + 1) * b_dim1];
2799 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2800 					  257] * b[l + (j + 2) * b_dim1];
2801 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802 					  257] * b[l + (j + 3) * b_dim1];
2803 				}
2804 			      c[i + j * c_dim1] = f11;
2805 			      c[i + (j + 1) * c_dim1] = f12;
2806 			      c[i + (j + 2) * c_dim1] = f13;
2807 			      c[i + (j + 3) * c_dim1] = f14;
2808 			    }
2809 			}
2810 		    }
2811 		  if (ujsec < jsec)
2812 		    {
2813 		      i4 = jj + jsec - 1;
2814 		      for (j = jj + ujsec; j <= i4; ++j)
2815 			{
2816 			  i5 = ii + uisec - 1;
2817 			  for (i = ii; i <= i5; i += 4)
2818 			    {
2819 			      f11 = c[i + j * c_dim1];
2820 			      f21 = c[i + 1 + j * c_dim1];
2821 			      f31 = c[i + 2 + j * c_dim1];
2822 			      f41 = c[i + 3 + j * c_dim1];
2823 			      i6 = ll + lsec - 1;
2824 			      for (l = ll; l <= i6; ++l)
2825 				{
2826 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2827 					  257] * b[l + j * b_dim1];
2828 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2829 					  257] * b[l + j * b_dim1];
2830 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2831 					  257] * b[l + j * b_dim1];
2832 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2833 					  257] * b[l + j * b_dim1];
2834 				}
2835 			      c[i + j * c_dim1] = f11;
2836 			      c[i + 1 + j * c_dim1] = f21;
2837 			      c[i + 2 + j * c_dim1] = f31;
2838 			      c[i + 3 + j * c_dim1] = f41;
2839 			    }
2840 			  i5 = ii + isec - 1;
2841 			  for (i = ii + uisec; i <= i5; ++i)
2842 			    {
2843 			      f11 = c[i + j * c_dim1];
2844 			      i6 = ll + lsec - 1;
2845 			      for (l = ll; l <= i6; ++l)
2846 				{
2847 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2848 					  257] * b[l + j * b_dim1];
2849 				}
2850 			      c[i + j * c_dim1] = f11;
2851 			    }
2852 			}
2853 		    }
2854 		}
2855 	    }
2856 	}
2857       free(t1);
2858       return;
2859     }
2860   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2861     {
2862       if (GFC_DESCRIPTOR_RANK (a) != 1)
2863 	{
2864 	  const GFC_REAL_16 *restrict abase_x;
2865 	  const GFC_REAL_16 *restrict bbase_y;
2866 	  GFC_REAL_16 *restrict dest_y;
2867 	  GFC_REAL_16 s;
2868 
2869 	  for (y = 0; y < ycount; y++)
2870 	    {
2871 	      bbase_y = &bbase[y*bystride];
2872 	      dest_y = &dest[y*rystride];
2873 	      for (x = 0; x < xcount; x++)
2874 		{
2875 		  abase_x = &abase[x*axstride];
2876 		  s = (GFC_REAL_16) 0;
2877 		  for (n = 0; n < count; n++)
2878 		    s += abase_x[n] * bbase_y[n];
2879 		  dest_y[x] = s;
2880 		}
2881 	    }
2882 	}
2883       else
2884 	{
2885 	  const GFC_REAL_16 *restrict bbase_y;
2886 	  GFC_REAL_16 s;
2887 
2888 	  for (y = 0; y < ycount; y++)
2889 	    {
2890 	      bbase_y = &bbase[y*bystride];
2891 	      s = (GFC_REAL_16) 0;
2892 	      for (n = 0; n < count; n++)
2893 		s += abase[n*axstride] * bbase_y[n];
2894 	      dest[y*rystride] = s;
2895 	    }
2896 	}
2897     }
2898   else if (axstride < aystride)
2899     {
2900       for (y = 0; y < ycount; y++)
2901 	for (x = 0; x < xcount; x++)
2902 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2903 
2904       for (y = 0; y < ycount; y++)
2905 	for (n = 0; n < count; n++)
2906 	  for (x = 0; x < xcount; x++)
2907 	    /* dest[x,y] += a[x,n] * b[n,y] */
2908 	    dest[x*rxstride + y*rystride] +=
2909 					abase[x*axstride + n*aystride] *
2910 					bbase[n*bxstride + y*bystride];
2911     }
2912   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2913     {
2914       const GFC_REAL_16 *restrict bbase_y;
2915       GFC_REAL_16 s;
2916 
2917       for (y = 0; y < ycount; y++)
2918 	{
2919 	  bbase_y = &bbase[y*bystride];
2920 	  s = (GFC_REAL_16) 0;
2921 	  for (n = 0; n < count; n++)
2922 	    s += abase[n*axstride] * bbase_y[n*bxstride];
2923 	  dest[y*rxstride] = s;
2924 	}
2925     }
2926   else
2927     {
2928       const GFC_REAL_16 *restrict abase_x;
2929       const GFC_REAL_16 *restrict bbase_y;
2930       GFC_REAL_16 *restrict dest_y;
2931       GFC_REAL_16 s;
2932 
2933       for (y = 0; y < ycount; y++)
2934 	{
2935 	  bbase_y = &bbase[y*bystride];
2936 	  dest_y = &dest[y*rystride];
2937 	  for (x = 0; x < xcount; x++)
2938 	    {
2939 	      abase_x = &abase[x*axstride];
2940 	      s = (GFC_REAL_16) 0;
2941 	      for (n = 0; n < count; n++)
2942 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2943 	      dest_y[x*rxstride] = s;
2944 	    }
2945 	}
2946     }
2947 }
2948 #undef POW3
2949 #undef min
2950 #undef max
2951 
2952 #endif  /* HAVE_AVX || HAVE_AVX2 || HAVE_AVX512F */
2953 #endif  /* HAVE_GFC_REAL_16 */
2954 