1 /* Implementation of the MATMUL intrinsic
2    Copyright (C) 2002-2019 Free Software Foundation, Inc.
3    Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4 
5 This file is part of the GNU Fortran runtime library (libgfortran).
6 
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11 
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20 
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24 <http://www.gnu.org/licenses/>.  */
25 
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29 
30 
31 /* These are the specific versions of matmul with -mprefer-avx128.  */
32 
33 #if defined (HAVE_GFC_INTEGER_1)
34 
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36    passed to us by the front-end, in which case we call it for large
37    matrices.  */
38 
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40                           const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
41                           const int *, const GFC_INTEGER_1 *, const int *,
42                           const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
43                           int, int);
44 
45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46 void
47 matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
48 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
49 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50 internal_proto(matmul_i1_avx128_fma3);
51 void
matmul_i1_avx128_fma3(gfc_array_i1 * const restrict retarray,gfc_array_i1 * const restrict a,gfc_array_i1 * const restrict b,int try_blas,int blas_limit,blas_call gemm)52 matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
53 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
54 	int blas_limit, blas_call gemm)
55 {
56   const GFC_INTEGER_1 * restrict abase;
57   const GFC_INTEGER_1 * restrict bbase;
58   GFC_INTEGER_1 * restrict dest;
59 
60   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61   index_type x, y, n, count, xcount, ycount;
62 
63   assert (GFC_DESCRIPTOR_RANK (a) == 2
64           || GFC_DESCRIPTOR_RANK (b) == 2);
65 
66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67 
68    Either A or B (but not both) can be rank 1:
69 
70    o One-dimensional argument A is implicitly treated as a row matrix
71      dimensioned [1,count], so xcount=1.
72 
73    o One-dimensional argument B is implicitly treated as a column matrix
74      dimensioned [count, 1], so ycount=1.
75 */
76 
77   if (retarray->base_addr == NULL)
78     {
79       if (GFC_DESCRIPTOR_RANK (a) == 1)
80         {
81 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
82 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83         }
84       else if (GFC_DESCRIPTOR_RANK (b) == 1)
85         {
86 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
87 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88         }
89       else
90         {
91 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
92 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93 
94           GFC_DIMENSION_SET(retarray->dim[1], 0,
95 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
97         }
98 
99       retarray->base_addr
100 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
101       retarray->offset = 0;
102     }
103   else if (unlikely (compile_options.bounds_check))
104     {
105       index_type ret_extent, arg_extent;
106 
107       if (GFC_DESCRIPTOR_RANK (a) == 1)
108 	{
109 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111 	  if (arg_extent != ret_extent)
112 	    runtime_error ("Array bound mismatch for dimension 1 of "
113 	    		   "array (%ld/%ld) ",
114 			   (long int) ret_extent, (long int) arg_extent);
115 	}
116       else if (GFC_DESCRIPTOR_RANK (b) == 1)
117 	{
118 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120 	  if (arg_extent != ret_extent)
121 	    runtime_error ("Array bound mismatch for dimension 1 of "
122 	    		   "array (%ld/%ld) ",
123 			   (long int) ret_extent, (long int) arg_extent);
124 	}
125       else
126 	{
127 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129 	  if (arg_extent != ret_extent)
130 	    runtime_error ("Array bound mismatch for dimension 1 of "
131 	    		   "array (%ld/%ld) ",
132 			   (long int) ret_extent, (long int) arg_extent);
133 
134 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136 	  if (arg_extent != ret_extent)
137 	    runtime_error ("Array bound mismatch for dimension 2 of "
138 	    		   "array (%ld/%ld) ",
139 			   (long int) ret_extent, (long int) arg_extent);
140 	}
141     }
142 
143 
144   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145     {
146       /* One-dimensional result may be addressed in the code below
147 	 either as a row or a column matrix. We want both cases to
148 	 work. */
149       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150     }
151   else
152     {
153       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155     }
156 
157 
158   if (GFC_DESCRIPTOR_RANK (a) == 1)
159     {
160       /* Treat it as a a row matrix A[1,count]. */
161       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162       aystride = 1;
163 
164       xcount = 1;
165       count = GFC_DESCRIPTOR_EXTENT(a,0);
166     }
167   else
168     {
169       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171 
172       count = GFC_DESCRIPTOR_EXTENT(a,1);
173       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174     }
175 
176   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177     {
178       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180 		       "in dimension 1: is %ld, should be %ld",
181 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182     }
183 
184   if (GFC_DESCRIPTOR_RANK (b) == 1)
185     {
186       /* Treat it as a column matrix B[count,1] */
187       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188 
189       /* bystride should never be used for 1-dimensional b.
190          The value is only used for calculation of the
191          memory by the buffer.  */
192       bystride = 256;
193       ycount = 1;
194     }
195   else
196     {
197       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200     }
201 
202   abase = a->base_addr;
203   bbase = b->base_addr;
204   dest = retarray->base_addr;
205 
206   /* Now that everything is set up, we perform the multiplication
207      itself.  */
208 
209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210 #define min(a,b) ((a) <= (b) ? (a) : (b))
211 #define max(a,b) ((a) >= (b) ? (a) : (b))
212 
213   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214       && (bxstride == 1 || bystride == 1)
215       && (((float) xcount) * ((float) ycount) * ((float) count)
216           > POW3(blas_limit)))
217     {
218       const int m = xcount, n = ycount, k = count, ldc = rystride;
219       const GFC_INTEGER_1 one = 1, zero = 0;
220       const int lda = (axstride == 1) ? aystride : axstride,
221 		ldb = (bxstride == 1) ? bystride : bxstride;
222 
223       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224 	{
225 	  assert (gemm != NULL);
226 	  const char *transa, *transb;
227 	  if (try_blas & 2)
228 	    transa = "C";
229 	  else
230 	    transa = axstride == 1 ? "N" : "T";
231 
232 	  if (try_blas & 4)
233 	    transb = "C";
234 	  else
235 	    transb = bxstride == 1 ? "N" : "T";
236 
237 	  gemm (transa, transb , &m,
238 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
239 		&ldc, 1, 1);
240 	  return;
241 	}
242     }
243 
244   if (rxstride == 1 && axstride == 1 && bxstride == 1
245       && GFC_DESCRIPTOR_RANK (b) != 1)
246     {
247       /* This block of code implements a tuned matmul, derived from
248          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
249 
250                Bo Kagstrom and Per Ling
251                Department of Computing Science
252                Umea University
253                S-901 87 Umea, Sweden
254 
255 	 from netlib.org, translated to C, and modified for matmul.m4.  */
256 
257       const GFC_INTEGER_1 *a, *b;
258       GFC_INTEGER_1 *c;
259       const index_type m = xcount, n = ycount, k = count;
260 
261       /* System generated locals */
262       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
263 		 i1, i2, i3, i4, i5, i6;
264 
265       /* Local variables */
266       GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
267 		 f13, f14, f23, f24, f33, f34, f43, f44;
268       index_type i, j, l, ii, jj, ll;
269       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
270       GFC_INTEGER_1 *t1;
271 
272       a = abase;
273       b = bbase;
274       c = retarray->base_addr;
275 
276       /* Parameter adjustments */
277       c_dim1 = rystride;
278       c_offset = 1 + c_dim1;
279       c -= c_offset;
280       a_dim1 = aystride;
281       a_offset = 1 + a_dim1;
282       a -= a_offset;
283       b_dim1 = bystride;
284       b_offset = 1 + b_dim1;
285       b -= b_offset;
286 
287       /* Empty c first.  */
288       for (j=1; j<=n; j++)
289 	for (i=1; i<=m; i++)
290 	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
291 
292       /* Early exit if possible */
293       if (m == 0 || n == 0 || k == 0)
294 	return;
295 
296       /* Adjust size of t1 to what is needed.  */
297       index_type t1_dim, a_sz;
298       if (aystride == 1)
299         a_sz = rystride;
300       else
301         a_sz = a_dim1;
302 
303       t1_dim = a_sz * 256 + b_dim1;
304       if (t1_dim > 65536)
305 	t1_dim = 65536;
306 
307       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
308 
309       /* Start turning the crank. */
310       i1 = n;
311       for (jj = 1; jj <= i1; jj += 512)
312 	{
313 	  /* Computing MIN */
314 	  i2 = 512;
315 	  i3 = n - jj + 1;
316 	  jsec = min(i2,i3);
317 	  ujsec = jsec - jsec % 4;
318 	  i2 = k;
319 	  for (ll = 1; ll <= i2; ll += 256)
320 	    {
321 	      /* Computing MIN */
322 	      i3 = 256;
323 	      i4 = k - ll + 1;
324 	      lsec = min(i3,i4);
325 	      ulsec = lsec - lsec % 2;
326 
327 	      i3 = m;
328 	      for (ii = 1; ii <= i3; ii += 256)
329 		{
330 		  /* Computing MIN */
331 		  i4 = 256;
332 		  i5 = m - ii + 1;
333 		  isec = min(i4,i5);
334 		  uisec = isec - isec % 2;
335 		  i4 = ll + ulsec - 1;
336 		  for (l = ll; l <= i4; l += 2)
337 		    {
338 		      i5 = ii + uisec - 1;
339 		      for (i = ii; i <= i5; i += 2)
340 			{
341 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
342 					a[i + l * a_dim1];
343 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
344 					a[i + (l + 1) * a_dim1];
345 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
346 					a[i + 1 + l * a_dim1];
347 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
348 					a[i + 1 + (l + 1) * a_dim1];
349 			}
350 		      if (uisec < isec)
351 			{
352 			  t1[l - ll + 1 + (isec << 8) - 257] =
353 				    a[ii + isec - 1 + l * a_dim1];
354 			  t1[l - ll + 2 + (isec << 8) - 257] =
355 				    a[ii + isec - 1 + (l + 1) * a_dim1];
356 			}
357 		    }
358 		  if (ulsec < lsec)
359 		    {
360 		      i4 = ii + isec - 1;
361 		      for (i = ii; i<= i4; ++i)
362 			{
363 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
364 				    a[i + (ll + lsec - 1) * a_dim1];
365 			}
366 		    }
367 
368 		  uisec = isec - isec % 4;
369 		  i4 = jj + ujsec - 1;
370 		  for (j = jj; j <= i4; j += 4)
371 		    {
372 		      i5 = ii + uisec - 1;
373 		      for (i = ii; i <= i5; i += 4)
374 			{
375 			  f11 = c[i + j * c_dim1];
376 			  f21 = c[i + 1 + j * c_dim1];
377 			  f12 = c[i + (j + 1) * c_dim1];
378 			  f22 = c[i + 1 + (j + 1) * c_dim1];
379 			  f13 = c[i + (j + 2) * c_dim1];
380 			  f23 = c[i + 1 + (j + 2) * c_dim1];
381 			  f14 = c[i + (j + 3) * c_dim1];
382 			  f24 = c[i + 1 + (j + 3) * c_dim1];
383 			  f31 = c[i + 2 + j * c_dim1];
384 			  f41 = c[i + 3 + j * c_dim1];
385 			  f32 = c[i + 2 + (j + 1) * c_dim1];
386 			  f42 = c[i + 3 + (j + 1) * c_dim1];
387 			  f33 = c[i + 2 + (j + 2) * c_dim1];
388 			  f43 = c[i + 3 + (j + 2) * c_dim1];
389 			  f34 = c[i + 2 + (j + 3) * c_dim1];
390 			  f44 = c[i + 3 + (j + 3) * c_dim1];
391 			  i6 = ll + lsec - 1;
392 			  for (l = ll; l <= i6; ++l)
393 			    {
394 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395 				      * b[l + j * b_dim1];
396 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397 				      * b[l + j * b_dim1];
398 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399 				      * b[l + (j + 1) * b_dim1];
400 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401 				      * b[l + (j + 1) * b_dim1];
402 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403 				      * b[l + (j + 2) * b_dim1];
404 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405 				      * b[l + (j + 2) * b_dim1];
406 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407 				      * b[l + (j + 3) * b_dim1];
408 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409 				      * b[l + (j + 3) * b_dim1];
410 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411 				      * b[l + j * b_dim1];
412 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413 				      * b[l + j * b_dim1];
414 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415 				      * b[l + (j + 1) * b_dim1];
416 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417 				      * b[l + (j + 1) * b_dim1];
418 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419 				      * b[l + (j + 2) * b_dim1];
420 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421 				      * b[l + (j + 2) * b_dim1];
422 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423 				      * b[l + (j + 3) * b_dim1];
424 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425 				      * b[l + (j + 3) * b_dim1];
426 			    }
427 			  c[i + j * c_dim1] = f11;
428 			  c[i + 1 + j * c_dim1] = f21;
429 			  c[i + (j + 1) * c_dim1] = f12;
430 			  c[i + 1 + (j + 1) * c_dim1] = f22;
431 			  c[i + (j + 2) * c_dim1] = f13;
432 			  c[i + 1 + (j + 2) * c_dim1] = f23;
433 			  c[i + (j + 3) * c_dim1] = f14;
434 			  c[i + 1 + (j + 3) * c_dim1] = f24;
435 			  c[i + 2 + j * c_dim1] = f31;
436 			  c[i + 3 + j * c_dim1] = f41;
437 			  c[i + 2 + (j + 1) * c_dim1] = f32;
438 			  c[i + 3 + (j + 1) * c_dim1] = f42;
439 			  c[i + 2 + (j + 2) * c_dim1] = f33;
440 			  c[i + 3 + (j + 2) * c_dim1] = f43;
441 			  c[i + 2 + (j + 3) * c_dim1] = f34;
442 			  c[i + 3 + (j + 3) * c_dim1] = f44;
443 			}
444 		      if (uisec < isec)
445 			{
446 			  i5 = ii + isec - 1;
447 			  for (i = ii + uisec; i <= i5; ++i)
448 			    {
449 			      f11 = c[i + j * c_dim1];
450 			      f12 = c[i + (j + 1) * c_dim1];
451 			      f13 = c[i + (j + 2) * c_dim1];
452 			      f14 = c[i + (j + 3) * c_dim1];
453 			      i6 = ll + lsec - 1;
454 			      for (l = ll; l <= i6; ++l)
455 				{
456 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
457 					  257] * b[l + j * b_dim1];
458 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459 					  257] * b[l + (j + 1) * b_dim1];
460 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461 					  257] * b[l + (j + 2) * b_dim1];
462 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463 					  257] * b[l + (j + 3) * b_dim1];
464 				}
465 			      c[i + j * c_dim1] = f11;
466 			      c[i + (j + 1) * c_dim1] = f12;
467 			      c[i + (j + 2) * c_dim1] = f13;
468 			      c[i + (j + 3) * c_dim1] = f14;
469 			    }
470 			}
471 		    }
472 		  if (ujsec < jsec)
473 		    {
474 		      i4 = jj + jsec - 1;
475 		      for (j = jj + ujsec; j <= i4; ++j)
476 			{
477 			  i5 = ii + uisec - 1;
478 			  for (i = ii; i <= i5; i += 4)
479 			    {
480 			      f11 = c[i + j * c_dim1];
481 			      f21 = c[i + 1 + j * c_dim1];
482 			      f31 = c[i + 2 + j * c_dim1];
483 			      f41 = c[i + 3 + j * c_dim1];
484 			      i6 = ll + lsec - 1;
485 			      for (l = ll; l <= i6; ++l)
486 				{
487 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
488 					  257] * b[l + j * b_dim1];
489 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
490 					  257] * b[l + j * b_dim1];
491 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
492 					  257] * b[l + j * b_dim1];
493 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
494 					  257] * b[l + j * b_dim1];
495 				}
496 			      c[i + j * c_dim1] = f11;
497 			      c[i + 1 + j * c_dim1] = f21;
498 			      c[i + 2 + j * c_dim1] = f31;
499 			      c[i + 3 + j * c_dim1] = f41;
500 			    }
501 			  i5 = ii + isec - 1;
502 			  for (i = ii + uisec; i <= i5; ++i)
503 			    {
504 			      f11 = c[i + j * c_dim1];
505 			      i6 = ll + lsec - 1;
506 			      for (l = ll; l <= i6; ++l)
507 				{
508 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
509 					  257] * b[l + j * b_dim1];
510 				}
511 			      c[i + j * c_dim1] = f11;
512 			    }
513 			}
514 		    }
515 		}
516 	    }
517 	}
518       free(t1);
519       return;
520     }
521   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
522     {
523       if (GFC_DESCRIPTOR_RANK (a) != 1)
524 	{
525 	  const GFC_INTEGER_1 *restrict abase_x;
526 	  const GFC_INTEGER_1 *restrict bbase_y;
527 	  GFC_INTEGER_1 *restrict dest_y;
528 	  GFC_INTEGER_1 s;
529 
530 	  for (y = 0; y < ycount; y++)
531 	    {
532 	      bbase_y = &bbase[y*bystride];
533 	      dest_y = &dest[y*rystride];
534 	      for (x = 0; x < xcount; x++)
535 		{
536 		  abase_x = &abase[x*axstride];
537 		  s = (GFC_INTEGER_1) 0;
538 		  for (n = 0; n < count; n++)
539 		    s += abase_x[n] * bbase_y[n];
540 		  dest_y[x] = s;
541 		}
542 	    }
543 	}
544       else
545 	{
546 	  const GFC_INTEGER_1 *restrict bbase_y;
547 	  GFC_INTEGER_1 s;
548 
549 	  for (y = 0; y < ycount; y++)
550 	    {
551 	      bbase_y = &bbase[y*bystride];
552 	      s = (GFC_INTEGER_1) 0;
553 	      for (n = 0; n < count; n++)
554 		s += abase[n*axstride] * bbase_y[n];
555 	      dest[y*rystride] = s;
556 	    }
557 	}
558     }
559   else if (GFC_DESCRIPTOR_RANK (a) == 1)
560     {
561       const GFC_INTEGER_1 *restrict bbase_y;
562       GFC_INTEGER_1 s;
563 
564       for (y = 0; y < ycount; y++)
565 	{
566 	  bbase_y = &bbase[y*bystride];
567 	  s = (GFC_INTEGER_1) 0;
568 	  for (n = 0; n < count; n++)
569 	    s += abase[n*axstride] * bbase_y[n*bxstride];
570 	  dest[y*rxstride] = s;
571 	}
572     }
573   else if (axstride < aystride)
574     {
575       for (y = 0; y < ycount; y++)
576 	for (x = 0; x < xcount; x++)
577 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
578 
579       for (y = 0; y < ycount; y++)
580 	for (n = 0; n < count; n++)
581 	  for (x = 0; x < xcount; x++)
582 	    /* dest[x,y] += a[x,n] * b[n,y] */
583 	    dest[x*rxstride + y*rystride] +=
584 					abase[x*axstride + n*aystride] *
585 					bbase[n*bxstride + y*bystride];
586     }
587   else
588     {
589       const GFC_INTEGER_1 *restrict abase_x;
590       const GFC_INTEGER_1 *restrict bbase_y;
591       GFC_INTEGER_1 *restrict dest_y;
592       GFC_INTEGER_1 s;
593 
594       for (y = 0; y < ycount; y++)
595 	{
596 	  bbase_y = &bbase[y*bystride];
597 	  dest_y = &dest[y*rystride];
598 	  for (x = 0; x < xcount; x++)
599 	    {
600 	      abase_x = &abase[x*axstride];
601 	      s = (GFC_INTEGER_1) 0;
602 	      for (n = 0; n < count; n++)
603 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
604 	      dest_y[x*rxstride] = s;
605 	    }
606 	}
607     }
608 }
609 #undef POW3
610 #undef min
611 #undef max
612 
613 #endif
614 
615 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
616 void
617 matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
618 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
619 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
620 internal_proto(matmul_i1_avx128_fma4);
621 void
matmul_i1_avx128_fma4(gfc_array_i1 * const restrict retarray,gfc_array_i1 * const restrict a,gfc_array_i1 * const restrict b,int try_blas,int blas_limit,blas_call gemm)622 matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
623 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
624 	int blas_limit, blas_call gemm)
625 {
626   const GFC_INTEGER_1 * restrict abase;
627   const GFC_INTEGER_1 * restrict bbase;
628   GFC_INTEGER_1 * restrict dest;
629 
630   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
631   index_type x, y, n, count, xcount, ycount;
632 
633   assert (GFC_DESCRIPTOR_RANK (a) == 2
634           || GFC_DESCRIPTOR_RANK (b) == 2);
635 
636 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
637 
638    Either A or B (but not both) can be rank 1:
639 
640    o One-dimensional argument A is implicitly treated as a row matrix
641      dimensioned [1,count], so xcount=1.
642 
643    o One-dimensional argument B is implicitly treated as a column matrix
644      dimensioned [count, 1], so ycount=1.
645 */
646 
647   if (retarray->base_addr == NULL)
648     {
649       if (GFC_DESCRIPTOR_RANK (a) == 1)
650         {
651 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
652 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
653         }
654       else if (GFC_DESCRIPTOR_RANK (b) == 1)
655         {
656 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
657 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
658         }
659       else
660         {
661 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
662 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
663 
664           GFC_DIMENSION_SET(retarray->dim[1], 0,
665 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
666 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
667         }
668 
669       retarray->base_addr
670 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
671       retarray->offset = 0;
672     }
673   else if (unlikely (compile_options.bounds_check))
674     {
675       index_type ret_extent, arg_extent;
676 
677       if (GFC_DESCRIPTOR_RANK (a) == 1)
678 	{
679 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
680 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
681 	  if (arg_extent != ret_extent)
682 	    runtime_error ("Array bound mismatch for dimension 1 of "
683 	    		   "array (%ld/%ld) ",
684 			   (long int) ret_extent, (long int) arg_extent);
685 	}
686       else if (GFC_DESCRIPTOR_RANK (b) == 1)
687 	{
688 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
689 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
690 	  if (arg_extent != ret_extent)
691 	    runtime_error ("Array bound mismatch for dimension 1 of "
692 	    		   "array (%ld/%ld) ",
693 			   (long int) ret_extent, (long int) arg_extent);
694 	}
695       else
696 	{
697 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
698 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699 	  if (arg_extent != ret_extent)
700 	    runtime_error ("Array bound mismatch for dimension 1 of "
701 	    		   "array (%ld/%ld) ",
702 			   (long int) ret_extent, (long int) arg_extent);
703 
704 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
705 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
706 	  if (arg_extent != ret_extent)
707 	    runtime_error ("Array bound mismatch for dimension 2 of "
708 	    		   "array (%ld/%ld) ",
709 			   (long int) ret_extent, (long int) arg_extent);
710 	}
711     }
712 
713 
714   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
715     {
716       /* One-dimensional result may be addressed in the code below
717 	 either as a row or a column matrix. We want both cases to
718 	 work. */
719       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
720     }
721   else
722     {
723       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
724       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
725     }
726 
727 
728   if (GFC_DESCRIPTOR_RANK (a) == 1)
729     {
730       /* Treat it as a a row matrix A[1,count]. */
731       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
732       aystride = 1;
733 
734       xcount = 1;
735       count = GFC_DESCRIPTOR_EXTENT(a,0);
736     }
737   else
738     {
739       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
740       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
741 
742       count = GFC_DESCRIPTOR_EXTENT(a,1);
743       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
744     }
745 
746   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
747     {
748       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
749 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
750 		       "in dimension 1: is %ld, should be %ld",
751 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
752     }
753 
754   if (GFC_DESCRIPTOR_RANK (b) == 1)
755     {
756       /* Treat it as a column matrix B[count,1] */
757       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
758 
759       /* bystride should never be used for 1-dimensional b.
760          The value is only used for calculation of the
761          memory by the buffer.  */
762       bystride = 256;
763       ycount = 1;
764     }
765   else
766     {
767       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
768       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
769       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
770     }
771 
772   abase = a->base_addr;
773   bbase = b->base_addr;
774   dest = retarray->base_addr;
775 
776   /* Now that everything is set up, we perform the multiplication
777      itself.  */
778 
779 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
780 #define min(a,b) ((a) <= (b) ? (a) : (b))
781 #define max(a,b) ((a) >= (b) ? (a) : (b))
782 
783   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
784       && (bxstride == 1 || bystride == 1)
785       && (((float) xcount) * ((float) ycount) * ((float) count)
786           > POW3(blas_limit)))
787     {
788       const int m = xcount, n = ycount, k = count, ldc = rystride;
789       const GFC_INTEGER_1 one = 1, zero = 0;
790       const int lda = (axstride == 1) ? aystride : axstride,
791 		ldb = (bxstride == 1) ? bystride : bxstride;
792 
793       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
794 	{
795 	  assert (gemm != NULL);
796 	  const char *transa, *transb;
797 	  if (try_blas & 2)
798 	    transa = "C";
799 	  else
800 	    transa = axstride == 1 ? "N" : "T";
801 
802 	  if (try_blas & 4)
803 	    transb = "C";
804 	  else
805 	    transb = bxstride == 1 ? "N" : "T";
806 
807 	  gemm (transa, transb , &m,
808 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
809 		&ldc, 1, 1);
810 	  return;
811 	}
812     }
813 
814   if (rxstride == 1 && axstride == 1 && bxstride == 1
815       && GFC_DESCRIPTOR_RANK (b) != 1)
816     {
817       /* This block of code implements a tuned matmul, derived from
818          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
819 
820                Bo Kagstrom and Per Ling
821                Department of Computing Science
822                Umea University
823                S-901 87 Umea, Sweden
824 
825 	 from netlib.org, translated to C, and modified for matmul.m4.  */
826 
827       const GFC_INTEGER_1 *a, *b;
828       GFC_INTEGER_1 *c;
829       const index_type m = xcount, n = ycount, k = count;
830 
831       /* System generated locals */
832       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
833 		 i1, i2, i3, i4, i5, i6;
834 
835       /* Local variables */
836       GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
837 		 f13, f14, f23, f24, f33, f34, f43, f44;
838       index_type i, j, l, ii, jj, ll;
839       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840       GFC_INTEGER_1 *t1;
841 
842       a = abase;
843       b = bbase;
844       c = retarray->base_addr;
845 
846       /* Parameter adjustments */
847       c_dim1 = rystride;
848       c_offset = 1 + c_dim1;
849       c -= c_offset;
850       a_dim1 = aystride;
851       a_offset = 1 + a_dim1;
852       a -= a_offset;
853       b_dim1 = bystride;
854       b_offset = 1 + b_dim1;
855       b -= b_offset;
856 
857       /* Empty c first.  */
858       for (j=1; j<=n; j++)
859 	for (i=1; i<=m; i++)
860 	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
861 
862       /* Early exit if possible */
863       if (m == 0 || n == 0 || k == 0)
864 	return;
865 
866       /* Adjust size of t1 to what is needed.  */
867       index_type t1_dim, a_sz;
868       if (aystride == 1)
869         a_sz = rystride;
870       else
871         a_sz = a_dim1;
872 
873       t1_dim = a_sz * 256 + b_dim1;
874       if (t1_dim > 65536)
875 	t1_dim = 65536;
876 
877       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
878 
879       /* Start turning the crank. */
880       i1 = n;
881       for (jj = 1; jj <= i1; jj += 512)
882 	{
883 	  /* Computing MIN */
884 	  i2 = 512;
885 	  i3 = n - jj + 1;
886 	  jsec = min(i2,i3);
887 	  ujsec = jsec - jsec % 4;
888 	  i2 = k;
889 	  for (ll = 1; ll <= i2; ll += 256)
890 	    {
891 	      /* Computing MIN */
892 	      i3 = 256;
893 	      i4 = k - ll + 1;
894 	      lsec = min(i3,i4);
895 	      ulsec = lsec - lsec % 2;
896 
897 	      i3 = m;
898 	      for (ii = 1; ii <= i3; ii += 256)
899 		{
900 		  /* Computing MIN */
901 		  i4 = 256;
902 		  i5 = m - ii + 1;
903 		  isec = min(i4,i5);
904 		  uisec = isec - isec % 2;
905 		  i4 = ll + ulsec - 1;
906 		  for (l = ll; l <= i4; l += 2)
907 		    {
908 		      i5 = ii + uisec - 1;
909 		      for (i = ii; i <= i5; i += 2)
910 			{
911 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912 					a[i + l * a_dim1];
913 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914 					a[i + (l + 1) * a_dim1];
915 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916 					a[i + 1 + l * a_dim1];
917 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918 					a[i + 1 + (l + 1) * a_dim1];
919 			}
920 		      if (uisec < isec)
921 			{
922 			  t1[l - ll + 1 + (isec << 8) - 257] =
923 				    a[ii + isec - 1 + l * a_dim1];
924 			  t1[l - ll + 2 + (isec << 8) - 257] =
925 				    a[ii + isec - 1 + (l + 1) * a_dim1];
926 			}
927 		    }
928 		  if (ulsec < lsec)
929 		    {
930 		      i4 = ii + isec - 1;
931 		      for (i = ii; i<= i4; ++i)
932 			{
933 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
934 				    a[i + (ll + lsec - 1) * a_dim1];
935 			}
936 		    }
937 
938 		  uisec = isec - isec % 4;
939 		  i4 = jj + ujsec - 1;
940 		  for (j = jj; j <= i4; j += 4)
941 		    {
942 		      i5 = ii + uisec - 1;
943 		      for (i = ii; i <= i5; i += 4)
944 			{
945 			  f11 = c[i + j * c_dim1];
946 			  f21 = c[i + 1 + j * c_dim1];
947 			  f12 = c[i + (j + 1) * c_dim1];
948 			  f22 = c[i + 1 + (j + 1) * c_dim1];
949 			  f13 = c[i + (j + 2) * c_dim1];
950 			  f23 = c[i + 1 + (j + 2) * c_dim1];
951 			  f14 = c[i + (j + 3) * c_dim1];
952 			  f24 = c[i + 1 + (j + 3) * c_dim1];
953 			  f31 = c[i + 2 + j * c_dim1];
954 			  f41 = c[i + 3 + j * c_dim1];
955 			  f32 = c[i + 2 + (j + 1) * c_dim1];
956 			  f42 = c[i + 3 + (j + 1) * c_dim1];
957 			  f33 = c[i + 2 + (j + 2) * c_dim1];
958 			  f43 = c[i + 3 + (j + 2) * c_dim1];
959 			  f34 = c[i + 2 + (j + 3) * c_dim1];
960 			  f44 = c[i + 3 + (j + 3) * c_dim1];
961 			  i6 = ll + lsec - 1;
962 			  for (l = ll; l <= i6; ++l)
963 			    {
964 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965 				      * b[l + j * b_dim1];
966 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967 				      * b[l + j * b_dim1];
968 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969 				      * b[l + (j + 1) * b_dim1];
970 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971 				      * b[l + (j + 1) * b_dim1];
972 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973 				      * b[l + (j + 2) * b_dim1];
974 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975 				      * b[l + (j + 2) * b_dim1];
976 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977 				      * b[l + (j + 3) * b_dim1];
978 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979 				      * b[l + (j + 3) * b_dim1];
980 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981 				      * b[l + j * b_dim1];
982 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983 				      * b[l + j * b_dim1];
984 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985 				      * b[l + (j + 1) * b_dim1];
986 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987 				      * b[l + (j + 1) * b_dim1];
988 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989 				      * b[l + (j + 2) * b_dim1];
990 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991 				      * b[l + (j + 2) * b_dim1];
992 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993 				      * b[l + (j + 3) * b_dim1];
994 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995 				      * b[l + (j + 3) * b_dim1];
996 			    }
997 			  c[i + j * c_dim1] = f11;
998 			  c[i + 1 + j * c_dim1] = f21;
999 			  c[i + (j + 1) * c_dim1] = f12;
1000 			  c[i + 1 + (j + 1) * c_dim1] = f22;
1001 			  c[i + (j + 2) * c_dim1] = f13;
1002 			  c[i + 1 + (j + 2) * c_dim1] = f23;
1003 			  c[i + (j + 3) * c_dim1] = f14;
1004 			  c[i + 1 + (j + 3) * c_dim1] = f24;
1005 			  c[i + 2 + j * c_dim1] = f31;
1006 			  c[i + 3 + j * c_dim1] = f41;
1007 			  c[i + 2 + (j + 1) * c_dim1] = f32;
1008 			  c[i + 3 + (j + 1) * c_dim1] = f42;
1009 			  c[i + 2 + (j + 2) * c_dim1] = f33;
1010 			  c[i + 3 + (j + 2) * c_dim1] = f43;
1011 			  c[i + 2 + (j + 3) * c_dim1] = f34;
1012 			  c[i + 3 + (j + 3) * c_dim1] = f44;
1013 			}
1014 		      if (uisec < isec)
1015 			{
1016 			  i5 = ii + isec - 1;
1017 			  for (i = ii + uisec; i <= i5; ++i)
1018 			    {
1019 			      f11 = c[i + j * c_dim1];
1020 			      f12 = c[i + (j + 1) * c_dim1];
1021 			      f13 = c[i + (j + 2) * c_dim1];
1022 			      f14 = c[i + (j + 3) * c_dim1];
1023 			      i6 = ll + lsec - 1;
1024 			      for (l = ll; l <= i6; ++l)
1025 				{
1026 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027 					  257] * b[l + j * b_dim1];
1028 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029 					  257] * b[l + (j + 1) * b_dim1];
1030 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031 					  257] * b[l + (j + 2) * b_dim1];
1032 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033 					  257] * b[l + (j + 3) * b_dim1];
1034 				}
1035 			      c[i + j * c_dim1] = f11;
1036 			      c[i + (j + 1) * c_dim1] = f12;
1037 			      c[i + (j + 2) * c_dim1] = f13;
1038 			      c[i + (j + 3) * c_dim1] = f14;
1039 			    }
1040 			}
1041 		    }
1042 		  if (ujsec < jsec)
1043 		    {
1044 		      i4 = jj + jsec - 1;
1045 		      for (j = jj + ujsec; j <= i4; ++j)
1046 			{
1047 			  i5 = ii + uisec - 1;
1048 			  for (i = ii; i <= i5; i += 4)
1049 			    {
1050 			      f11 = c[i + j * c_dim1];
1051 			      f21 = c[i + 1 + j * c_dim1];
1052 			      f31 = c[i + 2 + j * c_dim1];
1053 			      f41 = c[i + 3 + j * c_dim1];
1054 			      i6 = ll + lsec - 1;
1055 			      for (l = ll; l <= i6; ++l)
1056 				{
1057 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058 					  257] * b[l + j * b_dim1];
1059 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060 					  257] * b[l + j * b_dim1];
1061 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062 					  257] * b[l + j * b_dim1];
1063 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064 					  257] * b[l + j * b_dim1];
1065 				}
1066 			      c[i + j * c_dim1] = f11;
1067 			      c[i + 1 + j * c_dim1] = f21;
1068 			      c[i + 2 + j * c_dim1] = f31;
1069 			      c[i + 3 + j * c_dim1] = f41;
1070 			    }
1071 			  i5 = ii + isec - 1;
1072 			  for (i = ii + uisec; i <= i5; ++i)
1073 			    {
1074 			      f11 = c[i + j * c_dim1];
1075 			      i6 = ll + lsec - 1;
1076 			      for (l = ll; l <= i6; ++l)
1077 				{
1078 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079 					  257] * b[l + j * b_dim1];
1080 				}
1081 			      c[i + j * c_dim1] = f11;
1082 			    }
1083 			}
1084 		    }
1085 		}
1086 	    }
1087 	}
1088       free(t1);
1089       return;
1090     }
1091   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092     {
1093       if (GFC_DESCRIPTOR_RANK (a) != 1)
1094 	{
1095 	  const GFC_INTEGER_1 *restrict abase_x;
1096 	  const GFC_INTEGER_1 *restrict bbase_y;
1097 	  GFC_INTEGER_1 *restrict dest_y;
1098 	  GFC_INTEGER_1 s;
1099 
1100 	  for (y = 0; y < ycount; y++)
1101 	    {
1102 	      bbase_y = &bbase[y*bystride];
1103 	      dest_y = &dest[y*rystride];
1104 	      for (x = 0; x < xcount; x++)
1105 		{
1106 		  abase_x = &abase[x*axstride];
1107 		  s = (GFC_INTEGER_1) 0;
1108 		  for (n = 0; n < count; n++)
1109 		    s += abase_x[n] * bbase_y[n];
1110 		  dest_y[x] = s;
1111 		}
1112 	    }
1113 	}
1114       else
1115 	{
1116 	  const GFC_INTEGER_1 *restrict bbase_y;
1117 	  GFC_INTEGER_1 s;
1118 
1119 	  for (y = 0; y < ycount; y++)
1120 	    {
1121 	      bbase_y = &bbase[y*bystride];
1122 	      s = (GFC_INTEGER_1) 0;
1123 	      for (n = 0; n < count; n++)
1124 		s += abase[n*axstride] * bbase_y[n];
1125 	      dest[y*rystride] = s;
1126 	    }
1127 	}
1128     }
1129   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130     {
1131       const GFC_INTEGER_1 *restrict bbase_y;
1132       GFC_INTEGER_1 s;
1133 
1134       for (y = 0; y < ycount; y++)
1135 	{
1136 	  bbase_y = &bbase[y*bystride];
1137 	  s = (GFC_INTEGER_1) 0;
1138 	  for (n = 0; n < count; n++)
1139 	    s += abase[n*axstride] * bbase_y[n*bxstride];
1140 	  dest[y*rxstride] = s;
1141 	}
1142     }
1143   else if (axstride < aystride)
1144     {
1145       for (y = 0; y < ycount; y++)
1146 	for (x = 0; x < xcount; x++)
1147 	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
1148 
1149       for (y = 0; y < ycount; y++)
1150 	for (n = 0; n < count; n++)
1151 	  for (x = 0; x < xcount; x++)
1152 	    /* dest[x,y] += a[x,n] * b[n,y] */
1153 	    dest[x*rxstride + y*rystride] +=
1154 					abase[x*axstride + n*aystride] *
1155 					bbase[n*bxstride + y*bystride];
1156     }
1157   else
1158     {
1159       const GFC_INTEGER_1 *restrict abase_x;
1160       const GFC_INTEGER_1 *restrict bbase_y;
1161       GFC_INTEGER_1 *restrict dest_y;
1162       GFC_INTEGER_1 s;
1163 
1164       for (y = 0; y < ycount; y++)
1165 	{
1166 	  bbase_y = &bbase[y*bystride];
1167 	  dest_y = &dest[y*rystride];
1168 	  for (x = 0; x < xcount; x++)
1169 	    {
1170 	      abase_x = &abase[x*axstride];
1171 	      s = (GFC_INTEGER_1) 0;
1172 	      for (n = 0; n < count; n++)
1173 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174 	      dest_y[x*rxstride] = s;
1175 	    }
1176 	}
1177     }
1178 }
1179 #undef POW3
1180 #undef min
1181 #undef max
1182 
1183 #endif
1184 
1185 #endif
1186 
1187