1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_REAL_16)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
39 const int *, const GFC_REAL_16 *, const int *,
40 const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
73 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_r16);
76
77 /* Put exhaustive list of possible architectures here here, ORed together. */
78
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81 #ifdef HAVE_AVX
82 static void
83 matmul_r16_avx (gfc_array_r16 * const restrict retarray,
84 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
matmul_r16_avx(gfc_array_r16 * const restrict retarray,gfc_array_r16 * const restrict a,gfc_array_r16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)87 matmul_r16_avx (gfc_array_r16 * const restrict retarray,
88 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90 {
91 const GFC_REAL_16 * restrict abase;
92 const GFC_REAL_16 * restrict bbase;
93 GFC_REAL_16 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110 */
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Array bound mismatch for dimension 1 of "
148 "array (%ld/%ld) ",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Array bound mismatch for dimension 1 of "
157 "array (%ld/%ld) ",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Array bound mismatch for dimension 1 of "
166 "array (%ld/%ld) ",
167 (long int) ret_extent, (long int) arg_extent);
168
169 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171 if (arg_extent != ret_extent)
172 runtime_error ("Array bound mismatch for dimension 2 of "
173 "array (%ld/%ld) ",
174 (long int) ret_extent, (long int) arg_extent);
175 }
176 }
177
178
179 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180 {
181 /* One-dimensional result may be addressed in the code below
182 either as a row or a column matrix. We want both cases to
183 work. */
184 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185 }
186 else
187 {
188 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190 }
191
192
193 if (GFC_DESCRIPTOR_RANK (a) == 1)
194 {
195 /* Treat it as a a row matrix A[1,count]. */
196 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197 aystride = 1;
198
199 xcount = 1;
200 count = GFC_DESCRIPTOR_EXTENT(a,0);
201 }
202 else
203 {
204 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206
207 count = GFC_DESCRIPTOR_EXTENT(a,1);
208 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209 }
210
211 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212 {
213 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215 "in dimension 1: is %ld, should be %ld",
216 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 The value is only used for calculation of the
226 memory by the buffer. */
227 bystride = 256;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_REAL_16 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 const char *transa, *transb;
262 if (try_blas & 2)
263 transa = "C";
264 else
265 transa = axstride == 1 ? "N" : "T";
266
267 if (try_blas & 4)
268 transb = "C";
269 else
270 transb = bxstride == 1 ? "N" : "T";
271
272 gemm (transa, transb , &m,
273 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
274 &ldc, 1, 1);
275 return;
276 }
277 }
278
279 if (rxstride == 1 && axstride == 1 && bxstride == 1
280 && GFC_DESCRIPTOR_RANK (b) != 1)
281 {
282 /* This block of code implements a tuned matmul, derived from
283 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
284
285 Bo Kagstrom and Per Ling
286 Department of Computing Science
287 Umea University
288 S-901 87 Umea, Sweden
289
290 from netlib.org, translated to C, and modified for matmul.m4. */
291
292 const GFC_REAL_16 *a, *b;
293 GFC_REAL_16 *c;
294 const index_type m = xcount, n = ycount, k = count;
295
296 /* System generated locals */
297 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298 i1, i2, i3, i4, i5, i6;
299
300 /* Local variables */
301 GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
302 f13, f14, f23, f24, f33, f34, f43, f44;
303 index_type i, j, l, ii, jj, ll;
304 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305 GFC_REAL_16 *t1;
306
307 a = abase;
308 b = bbase;
309 c = retarray->base_addr;
310
311 /* Parameter adjustments */
312 c_dim1 = rystride;
313 c_offset = 1 + c_dim1;
314 c -= c_offset;
315 a_dim1 = aystride;
316 a_offset = 1 + a_dim1;
317 a -= a_offset;
318 b_dim1 = bystride;
319 b_offset = 1 + b_dim1;
320 b -= b_offset;
321
322 /* Empty c first. */
323 for (j=1; j<=n; j++)
324 for (i=1; i<=m; i++)
325 c[i + j * c_dim1] = (GFC_REAL_16)0;
326
327 /* Early exit if possible */
328 if (m == 0 || n == 0 || k == 0)
329 return;
330
331 /* Adjust size of t1 to what is needed. */
332 index_type t1_dim, a_sz;
333 if (aystride == 1)
334 a_sz = rystride;
335 else
336 a_sz = a_dim1;
337
338 t1_dim = a_sz * 256 + b_dim1;
339 if (t1_dim > 65536)
340 t1_dim = 65536;
341
342 t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
343
344 /* Start turning the crank. */
345 i1 = n;
346 for (jj = 1; jj <= i1; jj += 512)
347 {
348 /* Computing MIN */
349 i2 = 512;
350 i3 = n - jj + 1;
351 jsec = min(i2,i3);
352 ujsec = jsec - jsec % 4;
353 i2 = k;
354 for (ll = 1; ll <= i2; ll += 256)
355 {
356 /* Computing MIN */
357 i3 = 256;
358 i4 = k - ll + 1;
359 lsec = min(i3,i4);
360 ulsec = lsec - lsec % 2;
361
362 i3 = m;
363 for (ii = 1; ii <= i3; ii += 256)
364 {
365 /* Computing MIN */
366 i4 = 256;
367 i5 = m - ii + 1;
368 isec = min(i4,i5);
369 uisec = isec - isec % 2;
370 i4 = ll + ulsec - 1;
371 for (l = ll; l <= i4; l += 2)
372 {
373 i5 = ii + uisec - 1;
374 for (i = ii; i <= i5; i += 2)
375 {
376 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377 a[i + l * a_dim1];
378 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379 a[i + (l + 1) * a_dim1];
380 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381 a[i + 1 + l * a_dim1];
382 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383 a[i + 1 + (l + 1) * a_dim1];
384 }
385 if (uisec < isec)
386 {
387 t1[l - ll + 1 + (isec << 8) - 257] =
388 a[ii + isec - 1 + l * a_dim1];
389 t1[l - ll + 2 + (isec << 8) - 257] =
390 a[ii + isec - 1 + (l + 1) * a_dim1];
391 }
392 }
393 if (ulsec < lsec)
394 {
395 i4 = ii + isec - 1;
396 for (i = ii; i<= i4; ++i)
397 {
398 t1[lsec + ((i - ii + 1) << 8) - 257] =
399 a[i + (ll + lsec - 1) * a_dim1];
400 }
401 }
402
403 uisec = isec - isec % 4;
404 i4 = jj + ujsec - 1;
405 for (j = jj; j <= i4; j += 4)
406 {
407 i5 = ii + uisec - 1;
408 for (i = ii; i <= i5; i += 4)
409 {
410 f11 = c[i + j * c_dim1];
411 f21 = c[i + 1 + j * c_dim1];
412 f12 = c[i + (j + 1) * c_dim1];
413 f22 = c[i + 1 + (j + 1) * c_dim1];
414 f13 = c[i + (j + 2) * c_dim1];
415 f23 = c[i + 1 + (j + 2) * c_dim1];
416 f14 = c[i + (j + 3) * c_dim1];
417 f24 = c[i + 1 + (j + 3) * c_dim1];
418 f31 = c[i + 2 + j * c_dim1];
419 f41 = c[i + 3 + j * c_dim1];
420 f32 = c[i + 2 + (j + 1) * c_dim1];
421 f42 = c[i + 3 + (j + 1) * c_dim1];
422 f33 = c[i + 2 + (j + 2) * c_dim1];
423 f43 = c[i + 3 + (j + 2) * c_dim1];
424 f34 = c[i + 2 + (j + 3) * c_dim1];
425 f44 = c[i + 3 + (j + 3) * c_dim1];
426 i6 = ll + lsec - 1;
427 for (l = ll; l <= i6; ++l)
428 {
429 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 * b[l + j * b_dim1];
431 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 * b[l + j * b_dim1];
433 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434 * b[l + (j + 1) * b_dim1];
435 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436 * b[l + (j + 1) * b_dim1];
437 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438 * b[l + (j + 2) * b_dim1];
439 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440 * b[l + (j + 2) * b_dim1];
441 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442 * b[l + (j + 3) * b_dim1];
443 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444 * b[l + (j + 3) * b_dim1];
445 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 * b[l + j * b_dim1];
447 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 * b[l + j * b_dim1];
449 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450 * b[l + (j + 1) * b_dim1];
451 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452 * b[l + (j + 1) * b_dim1];
453 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454 * b[l + (j + 2) * b_dim1];
455 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456 * b[l + (j + 2) * b_dim1];
457 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458 * b[l + (j + 3) * b_dim1];
459 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460 * b[l + (j + 3) * b_dim1];
461 }
462 c[i + j * c_dim1] = f11;
463 c[i + 1 + j * c_dim1] = f21;
464 c[i + (j + 1) * c_dim1] = f12;
465 c[i + 1 + (j + 1) * c_dim1] = f22;
466 c[i + (j + 2) * c_dim1] = f13;
467 c[i + 1 + (j + 2) * c_dim1] = f23;
468 c[i + (j + 3) * c_dim1] = f14;
469 c[i + 1 + (j + 3) * c_dim1] = f24;
470 c[i + 2 + j * c_dim1] = f31;
471 c[i + 3 + j * c_dim1] = f41;
472 c[i + 2 + (j + 1) * c_dim1] = f32;
473 c[i + 3 + (j + 1) * c_dim1] = f42;
474 c[i + 2 + (j + 2) * c_dim1] = f33;
475 c[i + 3 + (j + 2) * c_dim1] = f43;
476 c[i + 2 + (j + 3) * c_dim1] = f34;
477 c[i + 3 + (j + 3) * c_dim1] = f44;
478 }
479 if (uisec < isec)
480 {
481 i5 = ii + isec - 1;
482 for (i = ii + uisec; i <= i5; ++i)
483 {
484 f11 = c[i + j * c_dim1];
485 f12 = c[i + (j + 1) * c_dim1];
486 f13 = c[i + (j + 2) * c_dim1];
487 f14 = c[i + (j + 3) * c_dim1];
488 i6 = ll + lsec - 1;
489 for (l = ll; l <= i6; ++l)
490 {
491 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492 257] * b[l + j * b_dim1];
493 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494 257] * b[l + (j + 1) * b_dim1];
495 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496 257] * b[l + (j + 2) * b_dim1];
497 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498 257] * b[l + (j + 3) * b_dim1];
499 }
500 c[i + j * c_dim1] = f11;
501 c[i + (j + 1) * c_dim1] = f12;
502 c[i + (j + 2) * c_dim1] = f13;
503 c[i + (j + 3) * c_dim1] = f14;
504 }
505 }
506 }
507 if (ujsec < jsec)
508 {
509 i4 = jj + jsec - 1;
510 for (j = jj + ujsec; j <= i4; ++j)
511 {
512 i5 = ii + uisec - 1;
513 for (i = ii; i <= i5; i += 4)
514 {
515 f11 = c[i + j * c_dim1];
516 f21 = c[i + 1 + j * c_dim1];
517 f31 = c[i + 2 + j * c_dim1];
518 f41 = c[i + 3 + j * c_dim1];
519 i6 = ll + lsec - 1;
520 for (l = ll; l <= i6; ++l)
521 {
522 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523 257] * b[l + j * b_dim1];
524 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525 257] * b[l + j * b_dim1];
526 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527 257] * b[l + j * b_dim1];
528 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529 257] * b[l + j * b_dim1];
530 }
531 c[i + j * c_dim1] = f11;
532 c[i + 1 + j * c_dim1] = f21;
533 c[i + 2 + j * c_dim1] = f31;
534 c[i + 3 + j * c_dim1] = f41;
535 }
536 i5 = ii + isec - 1;
537 for (i = ii + uisec; i <= i5; ++i)
538 {
539 f11 = c[i + j * c_dim1];
540 i6 = ll + lsec - 1;
541 for (l = ll; l <= i6; ++l)
542 {
543 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544 257] * b[l + j * b_dim1];
545 }
546 c[i + j * c_dim1] = f11;
547 }
548 }
549 }
550 }
551 }
552 }
553 free(t1);
554 return;
555 }
556 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557 {
558 if (GFC_DESCRIPTOR_RANK (a) != 1)
559 {
560 const GFC_REAL_16 *restrict abase_x;
561 const GFC_REAL_16 *restrict bbase_y;
562 GFC_REAL_16 *restrict dest_y;
563 GFC_REAL_16 s;
564
565 for (y = 0; y < ycount; y++)
566 {
567 bbase_y = &bbase[y*bystride];
568 dest_y = &dest[y*rystride];
569 for (x = 0; x < xcount; x++)
570 {
571 abase_x = &abase[x*axstride];
572 s = (GFC_REAL_16) 0;
573 for (n = 0; n < count; n++)
574 s += abase_x[n] * bbase_y[n];
575 dest_y[x] = s;
576 }
577 }
578 }
579 else
580 {
581 const GFC_REAL_16 *restrict bbase_y;
582 GFC_REAL_16 s;
583
584 for (y = 0; y < ycount; y++)
585 {
586 bbase_y = &bbase[y*bystride];
587 s = (GFC_REAL_16) 0;
588 for (n = 0; n < count; n++)
589 s += abase[n*axstride] * bbase_y[n];
590 dest[y*rystride] = s;
591 }
592 }
593 }
594 else if (GFC_DESCRIPTOR_RANK (a) == 1)
595 {
596 const GFC_REAL_16 *restrict bbase_y;
597 GFC_REAL_16 s;
598
599 for (y = 0; y < ycount; y++)
600 {
601 bbase_y = &bbase[y*bystride];
602 s = (GFC_REAL_16) 0;
603 for (n = 0; n < count; n++)
604 s += abase[n*axstride] * bbase_y[n*bxstride];
605 dest[y*rxstride] = s;
606 }
607 }
608 else if (axstride < aystride)
609 {
610 for (y = 0; y < ycount; y++)
611 for (x = 0; x < xcount; x++)
612 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
613
614 for (y = 0; y < ycount; y++)
615 for (n = 0; n < count; n++)
616 for (x = 0; x < xcount; x++)
617 /* dest[x,y] += a[x,n] * b[n,y] */
618 dest[x*rxstride + y*rystride] +=
619 abase[x*axstride + n*aystride] *
620 bbase[n*bxstride + y*bystride];
621 }
622 else
623 {
624 const GFC_REAL_16 *restrict abase_x;
625 const GFC_REAL_16 *restrict bbase_y;
626 GFC_REAL_16 *restrict dest_y;
627 GFC_REAL_16 s;
628
629 for (y = 0; y < ycount; y++)
630 {
631 bbase_y = &bbase[y*bystride];
632 dest_y = &dest[y*rystride];
633 for (x = 0; x < xcount; x++)
634 {
635 abase_x = &abase[x*axstride];
636 s = (GFC_REAL_16) 0;
637 for (n = 0; n < count; n++)
638 s += abase_x[n*aystride] * bbase_y[n*bxstride];
639 dest_y[x*rxstride] = s;
640 }
641 }
642 }
643 }
644 #undef POW3
645 #undef min
646 #undef max
647
648 #endif /* HAVE_AVX */
649
650 #ifdef HAVE_AVX2
651 static void
652 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
653 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
654 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655 static void
matmul_r16_avx2(gfc_array_r16 * const restrict retarray,gfc_array_r16 * const restrict a,gfc_array_r16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)656 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
657 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
658 int blas_limit, blas_call gemm)
659 {
660 const GFC_REAL_16 * restrict abase;
661 const GFC_REAL_16 * restrict bbase;
662 GFC_REAL_16 * restrict dest;
663
664 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665 index_type x, y, n, count, xcount, ycount;
666
667 assert (GFC_DESCRIPTOR_RANK (a) == 2
668 || GFC_DESCRIPTOR_RANK (b) == 2);
669
670 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671
672 Either A or B (but not both) can be rank 1:
673
674 o One-dimensional argument A is implicitly treated as a row matrix
675 dimensioned [1,count], so xcount=1.
676
677 o One-dimensional argument B is implicitly treated as a column matrix
678 dimensioned [count, 1], so ycount=1.
679 */
680
681 if (retarray->base_addr == NULL)
682 {
683 if (GFC_DESCRIPTOR_RANK (a) == 1)
684 {
685 GFC_DIMENSION_SET(retarray->dim[0], 0,
686 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687 }
688 else if (GFC_DESCRIPTOR_RANK (b) == 1)
689 {
690 GFC_DIMENSION_SET(retarray->dim[0], 0,
691 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692 }
693 else
694 {
695 GFC_DIMENSION_SET(retarray->dim[0], 0,
696 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697
698 GFC_DIMENSION_SET(retarray->dim[1], 0,
699 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700 GFC_DESCRIPTOR_EXTENT(retarray,0));
701 }
702
703 retarray->base_addr
704 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
705 retarray->offset = 0;
706 }
707 else if (unlikely (compile_options.bounds_check))
708 {
709 index_type ret_extent, arg_extent;
710
711 if (GFC_DESCRIPTOR_RANK (a) == 1)
712 {
713 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715 if (arg_extent != ret_extent)
716 runtime_error ("Array bound mismatch for dimension 1 of "
717 "array (%ld/%ld) ",
718 (long int) ret_extent, (long int) arg_extent);
719 }
720 else if (GFC_DESCRIPTOR_RANK (b) == 1)
721 {
722 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724 if (arg_extent != ret_extent)
725 runtime_error ("Array bound mismatch for dimension 1 of "
726 "array (%ld/%ld) ",
727 (long int) ret_extent, (long int) arg_extent);
728 }
729 else
730 {
731 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733 if (arg_extent != ret_extent)
734 runtime_error ("Array bound mismatch for dimension 1 of "
735 "array (%ld/%ld) ",
736 (long int) ret_extent, (long int) arg_extent);
737
738 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740 if (arg_extent != ret_extent)
741 runtime_error ("Array bound mismatch for dimension 2 of "
742 "array (%ld/%ld) ",
743 (long int) ret_extent, (long int) arg_extent);
744 }
745 }
746
747
748 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749 {
750 /* One-dimensional result may be addressed in the code below
751 either as a row or a column matrix. We want both cases to
752 work. */
753 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754 }
755 else
756 {
757 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759 }
760
761
762 if (GFC_DESCRIPTOR_RANK (a) == 1)
763 {
764 /* Treat it as a a row matrix A[1,count]. */
765 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766 aystride = 1;
767
768 xcount = 1;
769 count = GFC_DESCRIPTOR_EXTENT(a,0);
770 }
771 else
772 {
773 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775
776 count = GFC_DESCRIPTOR_EXTENT(a,1);
777 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778 }
779
780 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781 {
782 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784 "in dimension 1: is %ld, should be %ld",
785 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786 }
787
788 if (GFC_DESCRIPTOR_RANK (b) == 1)
789 {
790 /* Treat it as a column matrix B[count,1] */
791 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792
793 /* bystride should never be used for 1-dimensional b.
794 The value is only used for calculation of the
795 memory by the buffer. */
796 bystride = 256;
797 ycount = 1;
798 }
799 else
800 {
801 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804 }
805
806 abase = a->base_addr;
807 bbase = b->base_addr;
808 dest = retarray->base_addr;
809
810 /* Now that everything is set up, we perform the multiplication
811 itself. */
812
813 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814 #define min(a,b) ((a) <= (b) ? (a) : (b))
815 #define max(a,b) ((a) >= (b) ? (a) : (b))
816
817 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818 && (bxstride == 1 || bystride == 1)
819 && (((float) xcount) * ((float) ycount) * ((float) count)
820 > POW3(blas_limit)))
821 {
822 const int m = xcount, n = ycount, k = count, ldc = rystride;
823 const GFC_REAL_16 one = 1, zero = 0;
824 const int lda = (axstride == 1) ? aystride : axstride,
825 ldb = (bxstride == 1) ? bystride : bxstride;
826
827 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828 {
829 assert (gemm != NULL);
830 const char *transa, *transb;
831 if (try_blas & 2)
832 transa = "C";
833 else
834 transa = axstride == 1 ? "N" : "T";
835
836 if (try_blas & 4)
837 transb = "C";
838 else
839 transb = bxstride == 1 ? "N" : "T";
840
841 gemm (transa, transb , &m,
842 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
843 &ldc, 1, 1);
844 return;
845 }
846 }
847
848 if (rxstride == 1 && axstride == 1 && bxstride == 1
849 && GFC_DESCRIPTOR_RANK (b) != 1)
850 {
851 /* This block of code implements a tuned matmul, derived from
852 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
853
854 Bo Kagstrom and Per Ling
855 Department of Computing Science
856 Umea University
857 S-901 87 Umea, Sweden
858
859 from netlib.org, translated to C, and modified for matmul.m4. */
860
861 const GFC_REAL_16 *a, *b;
862 GFC_REAL_16 *c;
863 const index_type m = xcount, n = ycount, k = count;
864
865 /* System generated locals */
866 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867 i1, i2, i3, i4, i5, i6;
868
869 /* Local variables */
870 GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
871 f13, f14, f23, f24, f33, f34, f43, f44;
872 index_type i, j, l, ii, jj, ll;
873 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874 GFC_REAL_16 *t1;
875
876 a = abase;
877 b = bbase;
878 c = retarray->base_addr;
879
880 /* Parameter adjustments */
881 c_dim1 = rystride;
882 c_offset = 1 + c_dim1;
883 c -= c_offset;
884 a_dim1 = aystride;
885 a_offset = 1 + a_dim1;
886 a -= a_offset;
887 b_dim1 = bystride;
888 b_offset = 1 + b_dim1;
889 b -= b_offset;
890
891 /* Empty c first. */
892 for (j=1; j<=n; j++)
893 for (i=1; i<=m; i++)
894 c[i + j * c_dim1] = (GFC_REAL_16)0;
895
896 /* Early exit if possible */
897 if (m == 0 || n == 0 || k == 0)
898 return;
899
900 /* Adjust size of t1 to what is needed. */
901 index_type t1_dim, a_sz;
902 if (aystride == 1)
903 a_sz = rystride;
904 else
905 a_sz = a_dim1;
906
907 t1_dim = a_sz * 256 + b_dim1;
908 if (t1_dim > 65536)
909 t1_dim = 65536;
910
911 t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
912
913 /* Start turning the crank. */
914 i1 = n;
915 for (jj = 1; jj <= i1; jj += 512)
916 {
917 /* Computing MIN */
918 i2 = 512;
919 i3 = n - jj + 1;
920 jsec = min(i2,i3);
921 ujsec = jsec - jsec % 4;
922 i2 = k;
923 for (ll = 1; ll <= i2; ll += 256)
924 {
925 /* Computing MIN */
926 i3 = 256;
927 i4 = k - ll + 1;
928 lsec = min(i3,i4);
929 ulsec = lsec - lsec % 2;
930
931 i3 = m;
932 for (ii = 1; ii <= i3; ii += 256)
933 {
934 /* Computing MIN */
935 i4 = 256;
936 i5 = m - ii + 1;
937 isec = min(i4,i5);
938 uisec = isec - isec % 2;
939 i4 = ll + ulsec - 1;
940 for (l = ll; l <= i4; l += 2)
941 {
942 i5 = ii + uisec - 1;
943 for (i = ii; i <= i5; i += 2)
944 {
945 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946 a[i + l * a_dim1];
947 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948 a[i + (l + 1) * a_dim1];
949 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950 a[i + 1 + l * a_dim1];
951 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952 a[i + 1 + (l + 1) * a_dim1];
953 }
954 if (uisec < isec)
955 {
956 t1[l - ll + 1 + (isec << 8) - 257] =
957 a[ii + isec - 1 + l * a_dim1];
958 t1[l - ll + 2 + (isec << 8) - 257] =
959 a[ii + isec - 1 + (l + 1) * a_dim1];
960 }
961 }
962 if (ulsec < lsec)
963 {
964 i4 = ii + isec - 1;
965 for (i = ii; i<= i4; ++i)
966 {
967 t1[lsec + ((i - ii + 1) << 8) - 257] =
968 a[i + (ll + lsec - 1) * a_dim1];
969 }
970 }
971
972 uisec = isec - isec % 4;
973 i4 = jj + ujsec - 1;
974 for (j = jj; j <= i4; j += 4)
975 {
976 i5 = ii + uisec - 1;
977 for (i = ii; i <= i5; i += 4)
978 {
979 f11 = c[i + j * c_dim1];
980 f21 = c[i + 1 + j * c_dim1];
981 f12 = c[i + (j + 1) * c_dim1];
982 f22 = c[i + 1 + (j + 1) * c_dim1];
983 f13 = c[i + (j + 2) * c_dim1];
984 f23 = c[i + 1 + (j + 2) * c_dim1];
985 f14 = c[i + (j + 3) * c_dim1];
986 f24 = c[i + 1 + (j + 3) * c_dim1];
987 f31 = c[i + 2 + j * c_dim1];
988 f41 = c[i + 3 + j * c_dim1];
989 f32 = c[i + 2 + (j + 1) * c_dim1];
990 f42 = c[i + 3 + (j + 1) * c_dim1];
991 f33 = c[i + 2 + (j + 2) * c_dim1];
992 f43 = c[i + 3 + (j + 2) * c_dim1];
993 f34 = c[i + 2 + (j + 3) * c_dim1];
994 f44 = c[i + 3 + (j + 3) * c_dim1];
995 i6 = ll + lsec - 1;
996 for (l = ll; l <= i6; ++l)
997 {
998 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999 * b[l + j * b_dim1];
1000 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001 * b[l + j * b_dim1];
1002 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003 * b[l + (j + 1) * b_dim1];
1004 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005 * b[l + (j + 1) * b_dim1];
1006 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007 * b[l + (j + 2) * b_dim1];
1008 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009 * b[l + (j + 2) * b_dim1];
1010 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011 * b[l + (j + 3) * b_dim1];
1012 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013 * b[l + (j + 3) * b_dim1];
1014 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015 * b[l + j * b_dim1];
1016 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017 * b[l + j * b_dim1];
1018 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019 * b[l + (j + 1) * b_dim1];
1020 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021 * b[l + (j + 1) * b_dim1];
1022 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023 * b[l + (j + 2) * b_dim1];
1024 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025 * b[l + (j + 2) * b_dim1];
1026 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027 * b[l + (j + 3) * b_dim1];
1028 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029 * b[l + (j + 3) * b_dim1];
1030 }
1031 c[i + j * c_dim1] = f11;
1032 c[i + 1 + j * c_dim1] = f21;
1033 c[i + (j + 1) * c_dim1] = f12;
1034 c[i + 1 + (j + 1) * c_dim1] = f22;
1035 c[i + (j + 2) * c_dim1] = f13;
1036 c[i + 1 + (j + 2) * c_dim1] = f23;
1037 c[i + (j + 3) * c_dim1] = f14;
1038 c[i + 1 + (j + 3) * c_dim1] = f24;
1039 c[i + 2 + j * c_dim1] = f31;
1040 c[i + 3 + j * c_dim1] = f41;
1041 c[i + 2 + (j + 1) * c_dim1] = f32;
1042 c[i + 3 + (j + 1) * c_dim1] = f42;
1043 c[i + 2 + (j + 2) * c_dim1] = f33;
1044 c[i + 3 + (j + 2) * c_dim1] = f43;
1045 c[i + 2 + (j + 3) * c_dim1] = f34;
1046 c[i + 3 + (j + 3) * c_dim1] = f44;
1047 }
1048 if (uisec < isec)
1049 {
1050 i5 = ii + isec - 1;
1051 for (i = ii + uisec; i <= i5; ++i)
1052 {
1053 f11 = c[i + j * c_dim1];
1054 f12 = c[i + (j + 1) * c_dim1];
1055 f13 = c[i + (j + 2) * c_dim1];
1056 f14 = c[i + (j + 3) * c_dim1];
1057 i6 = ll + lsec - 1;
1058 for (l = ll; l <= i6; ++l)
1059 {
1060 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061 257] * b[l + j * b_dim1];
1062 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063 257] * b[l + (j + 1) * b_dim1];
1064 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065 257] * b[l + (j + 2) * b_dim1];
1066 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067 257] * b[l + (j + 3) * b_dim1];
1068 }
1069 c[i + j * c_dim1] = f11;
1070 c[i + (j + 1) * c_dim1] = f12;
1071 c[i + (j + 2) * c_dim1] = f13;
1072 c[i + (j + 3) * c_dim1] = f14;
1073 }
1074 }
1075 }
1076 if (ujsec < jsec)
1077 {
1078 i4 = jj + jsec - 1;
1079 for (j = jj + ujsec; j <= i4; ++j)
1080 {
1081 i5 = ii + uisec - 1;
1082 for (i = ii; i <= i5; i += 4)
1083 {
1084 f11 = c[i + j * c_dim1];
1085 f21 = c[i + 1 + j * c_dim1];
1086 f31 = c[i + 2 + j * c_dim1];
1087 f41 = c[i + 3 + j * c_dim1];
1088 i6 = ll + lsec - 1;
1089 for (l = ll; l <= i6; ++l)
1090 {
1091 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092 257] * b[l + j * b_dim1];
1093 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094 257] * b[l + j * b_dim1];
1095 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096 257] * b[l + j * b_dim1];
1097 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098 257] * b[l + j * b_dim1];
1099 }
1100 c[i + j * c_dim1] = f11;
1101 c[i + 1 + j * c_dim1] = f21;
1102 c[i + 2 + j * c_dim1] = f31;
1103 c[i + 3 + j * c_dim1] = f41;
1104 }
1105 i5 = ii + isec - 1;
1106 for (i = ii + uisec; i <= i5; ++i)
1107 {
1108 f11 = c[i + j * c_dim1];
1109 i6 = ll + lsec - 1;
1110 for (l = ll; l <= i6; ++l)
1111 {
1112 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113 257] * b[l + j * b_dim1];
1114 }
1115 c[i + j * c_dim1] = f11;
1116 }
1117 }
1118 }
1119 }
1120 }
1121 }
1122 free(t1);
1123 return;
1124 }
1125 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126 {
1127 if (GFC_DESCRIPTOR_RANK (a) != 1)
1128 {
1129 const GFC_REAL_16 *restrict abase_x;
1130 const GFC_REAL_16 *restrict bbase_y;
1131 GFC_REAL_16 *restrict dest_y;
1132 GFC_REAL_16 s;
1133
1134 for (y = 0; y < ycount; y++)
1135 {
1136 bbase_y = &bbase[y*bystride];
1137 dest_y = &dest[y*rystride];
1138 for (x = 0; x < xcount; x++)
1139 {
1140 abase_x = &abase[x*axstride];
1141 s = (GFC_REAL_16) 0;
1142 for (n = 0; n < count; n++)
1143 s += abase_x[n] * bbase_y[n];
1144 dest_y[x] = s;
1145 }
1146 }
1147 }
1148 else
1149 {
1150 const GFC_REAL_16 *restrict bbase_y;
1151 GFC_REAL_16 s;
1152
1153 for (y = 0; y < ycount; y++)
1154 {
1155 bbase_y = &bbase[y*bystride];
1156 s = (GFC_REAL_16) 0;
1157 for (n = 0; n < count; n++)
1158 s += abase[n*axstride] * bbase_y[n];
1159 dest[y*rystride] = s;
1160 }
1161 }
1162 }
1163 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164 {
1165 const GFC_REAL_16 *restrict bbase_y;
1166 GFC_REAL_16 s;
1167
1168 for (y = 0; y < ycount; y++)
1169 {
1170 bbase_y = &bbase[y*bystride];
1171 s = (GFC_REAL_16) 0;
1172 for (n = 0; n < count; n++)
1173 s += abase[n*axstride] * bbase_y[n*bxstride];
1174 dest[y*rxstride] = s;
1175 }
1176 }
1177 else if (axstride < aystride)
1178 {
1179 for (y = 0; y < ycount; y++)
1180 for (x = 0; x < xcount; x++)
1181 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1182
1183 for (y = 0; y < ycount; y++)
1184 for (n = 0; n < count; n++)
1185 for (x = 0; x < xcount; x++)
1186 /* dest[x,y] += a[x,n] * b[n,y] */
1187 dest[x*rxstride + y*rystride] +=
1188 abase[x*axstride + n*aystride] *
1189 bbase[n*bxstride + y*bystride];
1190 }
1191 else
1192 {
1193 const GFC_REAL_16 *restrict abase_x;
1194 const GFC_REAL_16 *restrict bbase_y;
1195 GFC_REAL_16 *restrict dest_y;
1196 GFC_REAL_16 s;
1197
1198 for (y = 0; y < ycount; y++)
1199 {
1200 bbase_y = &bbase[y*bystride];
1201 dest_y = &dest[y*rystride];
1202 for (x = 0; x < xcount; x++)
1203 {
1204 abase_x = &abase[x*axstride];
1205 s = (GFC_REAL_16) 0;
1206 for (n = 0; n < count; n++)
1207 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208 dest_y[x*rxstride] = s;
1209 }
1210 }
1211 }
1212 }
1213 #undef POW3
1214 #undef min
1215 #undef max
1216
1217 #endif /* HAVE_AVX2 */
1218
1219 #ifdef HAVE_AVX512F
1220 static void
1221 matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1222 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1223 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224 static void
matmul_r16_avx512f(gfc_array_r16 * const restrict retarray,gfc_array_r16 * const restrict a,gfc_array_r16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1225 matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1226 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1227 int blas_limit, blas_call gemm)
1228 {
1229 const GFC_REAL_16 * restrict abase;
1230 const GFC_REAL_16 * restrict bbase;
1231 GFC_REAL_16 * restrict dest;
1232
1233 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234 index_type x, y, n, count, xcount, ycount;
1235
1236 assert (GFC_DESCRIPTOR_RANK (a) == 2
1237 || GFC_DESCRIPTOR_RANK (b) == 2);
1238
1239 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240
1241 Either A or B (but not both) can be rank 1:
1242
1243 o One-dimensional argument A is implicitly treated as a row matrix
1244 dimensioned [1,count], so xcount=1.
1245
1246 o One-dimensional argument B is implicitly treated as a column matrix
1247 dimensioned [count, 1], so ycount=1.
1248 */
1249
1250 if (retarray->base_addr == NULL)
1251 {
1252 if (GFC_DESCRIPTOR_RANK (a) == 1)
1253 {
1254 GFC_DIMENSION_SET(retarray->dim[0], 0,
1255 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256 }
1257 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258 {
1259 GFC_DIMENSION_SET(retarray->dim[0], 0,
1260 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261 }
1262 else
1263 {
1264 GFC_DIMENSION_SET(retarray->dim[0], 0,
1265 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266
1267 GFC_DIMENSION_SET(retarray->dim[1], 0,
1268 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269 GFC_DESCRIPTOR_EXTENT(retarray,0));
1270 }
1271
1272 retarray->base_addr
1273 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1274 retarray->offset = 0;
1275 }
1276 else if (unlikely (compile_options.bounds_check))
1277 {
1278 index_type ret_extent, arg_extent;
1279
1280 if (GFC_DESCRIPTOR_RANK (a) == 1)
1281 {
1282 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284 if (arg_extent != ret_extent)
1285 runtime_error ("Array bound mismatch for dimension 1 of "
1286 "array (%ld/%ld) ",
1287 (long int) ret_extent, (long int) arg_extent);
1288 }
1289 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290 {
1291 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293 if (arg_extent != ret_extent)
1294 runtime_error ("Array bound mismatch for dimension 1 of "
1295 "array (%ld/%ld) ",
1296 (long int) ret_extent, (long int) arg_extent);
1297 }
1298 else
1299 {
1300 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302 if (arg_extent != ret_extent)
1303 runtime_error ("Array bound mismatch for dimension 1 of "
1304 "array (%ld/%ld) ",
1305 (long int) ret_extent, (long int) arg_extent);
1306
1307 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309 if (arg_extent != ret_extent)
1310 runtime_error ("Array bound mismatch for dimension 2 of "
1311 "array (%ld/%ld) ",
1312 (long int) ret_extent, (long int) arg_extent);
1313 }
1314 }
1315
1316
1317 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318 {
1319 /* One-dimensional result may be addressed in the code below
1320 either as a row or a column matrix. We want both cases to
1321 work. */
1322 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323 }
1324 else
1325 {
1326 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328 }
1329
1330
1331 if (GFC_DESCRIPTOR_RANK (a) == 1)
1332 {
1333 /* Treat it as a a row matrix A[1,count]. */
1334 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335 aystride = 1;
1336
1337 xcount = 1;
1338 count = GFC_DESCRIPTOR_EXTENT(a,0);
1339 }
1340 else
1341 {
1342 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344
1345 count = GFC_DESCRIPTOR_EXTENT(a,1);
1346 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347 }
1348
1349 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350 {
1351 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353 "in dimension 1: is %ld, should be %ld",
1354 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355 }
1356
1357 if (GFC_DESCRIPTOR_RANK (b) == 1)
1358 {
1359 /* Treat it as a column matrix B[count,1] */
1360 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361
1362 /* bystride should never be used for 1-dimensional b.
1363 The value is only used for calculation of the
1364 memory by the buffer. */
1365 bystride = 256;
1366 ycount = 1;
1367 }
1368 else
1369 {
1370 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373 }
1374
1375 abase = a->base_addr;
1376 bbase = b->base_addr;
1377 dest = retarray->base_addr;
1378
1379 /* Now that everything is set up, we perform the multiplication
1380 itself. */
1381
1382 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383 #define min(a,b) ((a) <= (b) ? (a) : (b))
1384 #define max(a,b) ((a) >= (b) ? (a) : (b))
1385
1386 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387 && (bxstride == 1 || bystride == 1)
1388 && (((float) xcount) * ((float) ycount) * ((float) count)
1389 > POW3(blas_limit)))
1390 {
1391 const int m = xcount, n = ycount, k = count, ldc = rystride;
1392 const GFC_REAL_16 one = 1, zero = 0;
1393 const int lda = (axstride == 1) ? aystride : axstride,
1394 ldb = (bxstride == 1) ? bystride : bxstride;
1395
1396 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397 {
1398 assert (gemm != NULL);
1399 const char *transa, *transb;
1400 if (try_blas & 2)
1401 transa = "C";
1402 else
1403 transa = axstride == 1 ? "N" : "T";
1404
1405 if (try_blas & 4)
1406 transb = "C";
1407 else
1408 transb = bxstride == 1 ? "N" : "T";
1409
1410 gemm (transa, transb , &m,
1411 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1412 &ldc, 1, 1);
1413 return;
1414 }
1415 }
1416
1417 if (rxstride == 1 && axstride == 1 && bxstride == 1
1418 && GFC_DESCRIPTOR_RANK (b) != 1)
1419 {
1420 /* This block of code implements a tuned matmul, derived from
1421 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1422
1423 Bo Kagstrom and Per Ling
1424 Department of Computing Science
1425 Umea University
1426 S-901 87 Umea, Sweden
1427
1428 from netlib.org, translated to C, and modified for matmul.m4. */
1429
1430 const GFC_REAL_16 *a, *b;
1431 GFC_REAL_16 *c;
1432 const index_type m = xcount, n = ycount, k = count;
1433
1434 /* System generated locals */
1435 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436 i1, i2, i3, i4, i5, i6;
1437
1438 /* Local variables */
1439 GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
1440 f13, f14, f23, f24, f33, f34, f43, f44;
1441 index_type i, j, l, ii, jj, ll;
1442 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443 GFC_REAL_16 *t1;
1444
1445 a = abase;
1446 b = bbase;
1447 c = retarray->base_addr;
1448
1449 /* Parameter adjustments */
1450 c_dim1 = rystride;
1451 c_offset = 1 + c_dim1;
1452 c -= c_offset;
1453 a_dim1 = aystride;
1454 a_offset = 1 + a_dim1;
1455 a -= a_offset;
1456 b_dim1 = bystride;
1457 b_offset = 1 + b_dim1;
1458 b -= b_offset;
1459
1460 /* Empty c first. */
1461 for (j=1; j<=n; j++)
1462 for (i=1; i<=m; i++)
1463 c[i + j * c_dim1] = (GFC_REAL_16)0;
1464
1465 /* Early exit if possible */
1466 if (m == 0 || n == 0 || k == 0)
1467 return;
1468
1469 /* Adjust size of t1 to what is needed. */
1470 index_type t1_dim, a_sz;
1471 if (aystride == 1)
1472 a_sz = rystride;
1473 else
1474 a_sz = a_dim1;
1475
1476 t1_dim = a_sz * 256 + b_dim1;
1477 if (t1_dim > 65536)
1478 t1_dim = 65536;
1479
1480 t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
1481
1482 /* Start turning the crank. */
1483 i1 = n;
1484 for (jj = 1; jj <= i1; jj += 512)
1485 {
1486 /* Computing MIN */
1487 i2 = 512;
1488 i3 = n - jj + 1;
1489 jsec = min(i2,i3);
1490 ujsec = jsec - jsec % 4;
1491 i2 = k;
1492 for (ll = 1; ll <= i2; ll += 256)
1493 {
1494 /* Computing MIN */
1495 i3 = 256;
1496 i4 = k - ll + 1;
1497 lsec = min(i3,i4);
1498 ulsec = lsec - lsec % 2;
1499
1500 i3 = m;
1501 for (ii = 1; ii <= i3; ii += 256)
1502 {
1503 /* Computing MIN */
1504 i4 = 256;
1505 i5 = m - ii + 1;
1506 isec = min(i4,i5);
1507 uisec = isec - isec % 2;
1508 i4 = ll + ulsec - 1;
1509 for (l = ll; l <= i4; l += 2)
1510 {
1511 i5 = ii + uisec - 1;
1512 for (i = ii; i <= i5; i += 2)
1513 {
1514 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515 a[i + l * a_dim1];
1516 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517 a[i + (l + 1) * a_dim1];
1518 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519 a[i + 1 + l * a_dim1];
1520 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521 a[i + 1 + (l + 1) * a_dim1];
1522 }
1523 if (uisec < isec)
1524 {
1525 t1[l - ll + 1 + (isec << 8) - 257] =
1526 a[ii + isec - 1 + l * a_dim1];
1527 t1[l - ll + 2 + (isec << 8) - 257] =
1528 a[ii + isec - 1 + (l + 1) * a_dim1];
1529 }
1530 }
1531 if (ulsec < lsec)
1532 {
1533 i4 = ii + isec - 1;
1534 for (i = ii; i<= i4; ++i)
1535 {
1536 t1[lsec + ((i - ii + 1) << 8) - 257] =
1537 a[i + (ll + lsec - 1) * a_dim1];
1538 }
1539 }
1540
1541 uisec = isec - isec % 4;
1542 i4 = jj + ujsec - 1;
1543 for (j = jj; j <= i4; j += 4)
1544 {
1545 i5 = ii + uisec - 1;
1546 for (i = ii; i <= i5; i += 4)
1547 {
1548 f11 = c[i + j * c_dim1];
1549 f21 = c[i + 1 + j * c_dim1];
1550 f12 = c[i + (j + 1) * c_dim1];
1551 f22 = c[i + 1 + (j + 1) * c_dim1];
1552 f13 = c[i + (j + 2) * c_dim1];
1553 f23 = c[i + 1 + (j + 2) * c_dim1];
1554 f14 = c[i + (j + 3) * c_dim1];
1555 f24 = c[i + 1 + (j + 3) * c_dim1];
1556 f31 = c[i + 2 + j * c_dim1];
1557 f41 = c[i + 3 + j * c_dim1];
1558 f32 = c[i + 2 + (j + 1) * c_dim1];
1559 f42 = c[i + 3 + (j + 1) * c_dim1];
1560 f33 = c[i + 2 + (j + 2) * c_dim1];
1561 f43 = c[i + 3 + (j + 2) * c_dim1];
1562 f34 = c[i + 2 + (j + 3) * c_dim1];
1563 f44 = c[i + 3 + (j + 3) * c_dim1];
1564 i6 = ll + lsec - 1;
1565 for (l = ll; l <= i6; ++l)
1566 {
1567 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568 * b[l + j * b_dim1];
1569 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570 * b[l + j * b_dim1];
1571 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572 * b[l + (j + 1) * b_dim1];
1573 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574 * b[l + (j + 1) * b_dim1];
1575 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576 * b[l + (j + 2) * b_dim1];
1577 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578 * b[l + (j + 2) * b_dim1];
1579 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580 * b[l + (j + 3) * b_dim1];
1581 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582 * b[l + (j + 3) * b_dim1];
1583 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584 * b[l + j * b_dim1];
1585 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586 * b[l + j * b_dim1];
1587 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588 * b[l + (j + 1) * b_dim1];
1589 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590 * b[l + (j + 1) * b_dim1];
1591 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592 * b[l + (j + 2) * b_dim1];
1593 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594 * b[l + (j + 2) * b_dim1];
1595 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596 * b[l + (j + 3) * b_dim1];
1597 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598 * b[l + (j + 3) * b_dim1];
1599 }
1600 c[i + j * c_dim1] = f11;
1601 c[i + 1 + j * c_dim1] = f21;
1602 c[i + (j + 1) * c_dim1] = f12;
1603 c[i + 1 + (j + 1) * c_dim1] = f22;
1604 c[i + (j + 2) * c_dim1] = f13;
1605 c[i + 1 + (j + 2) * c_dim1] = f23;
1606 c[i + (j + 3) * c_dim1] = f14;
1607 c[i + 1 + (j + 3) * c_dim1] = f24;
1608 c[i + 2 + j * c_dim1] = f31;
1609 c[i + 3 + j * c_dim1] = f41;
1610 c[i + 2 + (j + 1) * c_dim1] = f32;
1611 c[i + 3 + (j + 1) * c_dim1] = f42;
1612 c[i + 2 + (j + 2) * c_dim1] = f33;
1613 c[i + 3 + (j + 2) * c_dim1] = f43;
1614 c[i + 2 + (j + 3) * c_dim1] = f34;
1615 c[i + 3 + (j + 3) * c_dim1] = f44;
1616 }
1617 if (uisec < isec)
1618 {
1619 i5 = ii + isec - 1;
1620 for (i = ii + uisec; i <= i5; ++i)
1621 {
1622 f11 = c[i + j * c_dim1];
1623 f12 = c[i + (j + 1) * c_dim1];
1624 f13 = c[i + (j + 2) * c_dim1];
1625 f14 = c[i + (j + 3) * c_dim1];
1626 i6 = ll + lsec - 1;
1627 for (l = ll; l <= i6; ++l)
1628 {
1629 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630 257] * b[l + j * b_dim1];
1631 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632 257] * b[l + (j + 1) * b_dim1];
1633 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634 257] * b[l + (j + 2) * b_dim1];
1635 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636 257] * b[l + (j + 3) * b_dim1];
1637 }
1638 c[i + j * c_dim1] = f11;
1639 c[i + (j + 1) * c_dim1] = f12;
1640 c[i + (j + 2) * c_dim1] = f13;
1641 c[i + (j + 3) * c_dim1] = f14;
1642 }
1643 }
1644 }
1645 if (ujsec < jsec)
1646 {
1647 i4 = jj + jsec - 1;
1648 for (j = jj + ujsec; j <= i4; ++j)
1649 {
1650 i5 = ii + uisec - 1;
1651 for (i = ii; i <= i5; i += 4)
1652 {
1653 f11 = c[i + j * c_dim1];
1654 f21 = c[i + 1 + j * c_dim1];
1655 f31 = c[i + 2 + j * c_dim1];
1656 f41 = c[i + 3 + j * c_dim1];
1657 i6 = ll + lsec - 1;
1658 for (l = ll; l <= i6; ++l)
1659 {
1660 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661 257] * b[l + j * b_dim1];
1662 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663 257] * b[l + j * b_dim1];
1664 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665 257] * b[l + j * b_dim1];
1666 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667 257] * b[l + j * b_dim1];
1668 }
1669 c[i + j * c_dim1] = f11;
1670 c[i + 1 + j * c_dim1] = f21;
1671 c[i + 2 + j * c_dim1] = f31;
1672 c[i + 3 + j * c_dim1] = f41;
1673 }
1674 i5 = ii + isec - 1;
1675 for (i = ii + uisec; i <= i5; ++i)
1676 {
1677 f11 = c[i + j * c_dim1];
1678 i6 = ll + lsec - 1;
1679 for (l = ll; l <= i6; ++l)
1680 {
1681 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682 257] * b[l + j * b_dim1];
1683 }
1684 c[i + j * c_dim1] = f11;
1685 }
1686 }
1687 }
1688 }
1689 }
1690 }
1691 free(t1);
1692 return;
1693 }
1694 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695 {
1696 if (GFC_DESCRIPTOR_RANK (a) != 1)
1697 {
1698 const GFC_REAL_16 *restrict abase_x;
1699 const GFC_REAL_16 *restrict bbase_y;
1700 GFC_REAL_16 *restrict dest_y;
1701 GFC_REAL_16 s;
1702
1703 for (y = 0; y < ycount; y++)
1704 {
1705 bbase_y = &bbase[y*bystride];
1706 dest_y = &dest[y*rystride];
1707 for (x = 0; x < xcount; x++)
1708 {
1709 abase_x = &abase[x*axstride];
1710 s = (GFC_REAL_16) 0;
1711 for (n = 0; n < count; n++)
1712 s += abase_x[n] * bbase_y[n];
1713 dest_y[x] = s;
1714 }
1715 }
1716 }
1717 else
1718 {
1719 const GFC_REAL_16 *restrict bbase_y;
1720 GFC_REAL_16 s;
1721
1722 for (y = 0; y < ycount; y++)
1723 {
1724 bbase_y = &bbase[y*bystride];
1725 s = (GFC_REAL_16) 0;
1726 for (n = 0; n < count; n++)
1727 s += abase[n*axstride] * bbase_y[n];
1728 dest[y*rystride] = s;
1729 }
1730 }
1731 }
1732 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733 {
1734 const GFC_REAL_16 *restrict bbase_y;
1735 GFC_REAL_16 s;
1736
1737 for (y = 0; y < ycount; y++)
1738 {
1739 bbase_y = &bbase[y*bystride];
1740 s = (GFC_REAL_16) 0;
1741 for (n = 0; n < count; n++)
1742 s += abase[n*axstride] * bbase_y[n*bxstride];
1743 dest[y*rxstride] = s;
1744 }
1745 }
1746 else if (axstride < aystride)
1747 {
1748 for (y = 0; y < ycount; y++)
1749 for (x = 0; x < xcount; x++)
1750 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1751
1752 for (y = 0; y < ycount; y++)
1753 for (n = 0; n < count; n++)
1754 for (x = 0; x < xcount; x++)
1755 /* dest[x,y] += a[x,n] * b[n,y] */
1756 dest[x*rxstride + y*rystride] +=
1757 abase[x*axstride + n*aystride] *
1758 bbase[n*bxstride + y*bystride];
1759 }
1760 else
1761 {
1762 const GFC_REAL_16 *restrict abase_x;
1763 const GFC_REAL_16 *restrict bbase_y;
1764 GFC_REAL_16 *restrict dest_y;
1765 GFC_REAL_16 s;
1766
1767 for (y = 0; y < ycount; y++)
1768 {
1769 bbase_y = &bbase[y*bystride];
1770 dest_y = &dest[y*rystride];
1771 for (x = 0; x < xcount; x++)
1772 {
1773 abase_x = &abase[x*axstride];
1774 s = (GFC_REAL_16) 0;
1775 for (n = 0; n < count; n++)
1776 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777 dest_y[x*rxstride] = s;
1778 }
1779 }
1780 }
1781 }
1782 #undef POW3
1783 #undef min
1784 #undef max
1785
1786 #endif /* HAVE_AVX512F */
1787
1788 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
1789
1790 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791 void
1792 matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
1793 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1794 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795 internal_proto(matmul_r16_avx128_fma3);
1796 #endif
1797
1798 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799 void
1800 matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
1801 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1802 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803 internal_proto(matmul_r16_avx128_fma4);
1804 #endif
1805
1806 /* Function to fall back to if there is no special processor-specific version. */
1807 static void
matmul_r16_vanilla(gfc_array_r16 * const restrict retarray,gfc_array_r16 * const restrict a,gfc_array_r16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)1808 matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
1809 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1810 int blas_limit, blas_call gemm)
1811 {
1812 const GFC_REAL_16 * restrict abase;
1813 const GFC_REAL_16 * restrict bbase;
1814 GFC_REAL_16 * restrict dest;
1815
1816 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817 index_type x, y, n, count, xcount, ycount;
1818
1819 assert (GFC_DESCRIPTOR_RANK (a) == 2
1820 || GFC_DESCRIPTOR_RANK (b) == 2);
1821
1822 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823
1824 Either A or B (but not both) can be rank 1:
1825
1826 o One-dimensional argument A is implicitly treated as a row matrix
1827 dimensioned [1,count], so xcount=1.
1828
1829 o One-dimensional argument B is implicitly treated as a column matrix
1830 dimensioned [count, 1], so ycount=1.
1831 */
1832
1833 if (retarray->base_addr == NULL)
1834 {
1835 if (GFC_DESCRIPTOR_RANK (a) == 1)
1836 {
1837 GFC_DIMENSION_SET(retarray->dim[0], 0,
1838 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839 }
1840 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841 {
1842 GFC_DIMENSION_SET(retarray->dim[0], 0,
1843 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844 }
1845 else
1846 {
1847 GFC_DIMENSION_SET(retarray->dim[0], 0,
1848 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849
1850 GFC_DIMENSION_SET(retarray->dim[1], 0,
1851 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852 GFC_DESCRIPTOR_EXTENT(retarray,0));
1853 }
1854
1855 retarray->base_addr
1856 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1857 retarray->offset = 0;
1858 }
1859 else if (unlikely (compile_options.bounds_check))
1860 {
1861 index_type ret_extent, arg_extent;
1862
1863 if (GFC_DESCRIPTOR_RANK (a) == 1)
1864 {
1865 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867 if (arg_extent != ret_extent)
1868 runtime_error ("Array bound mismatch for dimension 1 of "
1869 "array (%ld/%ld) ",
1870 (long int) ret_extent, (long int) arg_extent);
1871 }
1872 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873 {
1874 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876 if (arg_extent != ret_extent)
1877 runtime_error ("Array bound mismatch for dimension 1 of "
1878 "array (%ld/%ld) ",
1879 (long int) ret_extent, (long int) arg_extent);
1880 }
1881 else
1882 {
1883 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885 if (arg_extent != ret_extent)
1886 runtime_error ("Array bound mismatch for dimension 1 of "
1887 "array (%ld/%ld) ",
1888 (long int) ret_extent, (long int) arg_extent);
1889
1890 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892 if (arg_extent != ret_extent)
1893 runtime_error ("Array bound mismatch for dimension 2 of "
1894 "array (%ld/%ld) ",
1895 (long int) ret_extent, (long int) arg_extent);
1896 }
1897 }
1898
1899
1900 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901 {
1902 /* One-dimensional result may be addressed in the code below
1903 either as a row or a column matrix. We want both cases to
1904 work. */
1905 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906 }
1907 else
1908 {
1909 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911 }
1912
1913
1914 if (GFC_DESCRIPTOR_RANK (a) == 1)
1915 {
1916 /* Treat it as a a row matrix A[1,count]. */
1917 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918 aystride = 1;
1919
1920 xcount = 1;
1921 count = GFC_DESCRIPTOR_EXTENT(a,0);
1922 }
1923 else
1924 {
1925 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927
1928 count = GFC_DESCRIPTOR_EXTENT(a,1);
1929 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930 }
1931
1932 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933 {
1934 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936 "in dimension 1: is %ld, should be %ld",
1937 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938 }
1939
1940 if (GFC_DESCRIPTOR_RANK (b) == 1)
1941 {
1942 /* Treat it as a column matrix B[count,1] */
1943 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944
1945 /* bystride should never be used for 1-dimensional b.
1946 The value is only used for calculation of the
1947 memory by the buffer. */
1948 bystride = 256;
1949 ycount = 1;
1950 }
1951 else
1952 {
1953 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956 }
1957
1958 abase = a->base_addr;
1959 bbase = b->base_addr;
1960 dest = retarray->base_addr;
1961
1962 /* Now that everything is set up, we perform the multiplication
1963 itself. */
1964
1965 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966 #define min(a,b) ((a) <= (b) ? (a) : (b))
1967 #define max(a,b) ((a) >= (b) ? (a) : (b))
1968
1969 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970 && (bxstride == 1 || bystride == 1)
1971 && (((float) xcount) * ((float) ycount) * ((float) count)
1972 > POW3(blas_limit)))
1973 {
1974 const int m = xcount, n = ycount, k = count, ldc = rystride;
1975 const GFC_REAL_16 one = 1, zero = 0;
1976 const int lda = (axstride == 1) ? aystride : axstride,
1977 ldb = (bxstride == 1) ? bystride : bxstride;
1978
1979 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980 {
1981 assert (gemm != NULL);
1982 const char *transa, *transb;
1983 if (try_blas & 2)
1984 transa = "C";
1985 else
1986 transa = axstride == 1 ? "N" : "T";
1987
1988 if (try_blas & 4)
1989 transb = "C";
1990 else
1991 transb = bxstride == 1 ? "N" : "T";
1992
1993 gemm (transa, transb , &m,
1994 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1995 &ldc, 1, 1);
1996 return;
1997 }
1998 }
1999
2000 if (rxstride == 1 && axstride == 1 && bxstride == 1
2001 && GFC_DESCRIPTOR_RANK (b) != 1)
2002 {
2003 /* This block of code implements a tuned matmul, derived from
2004 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2005
2006 Bo Kagstrom and Per Ling
2007 Department of Computing Science
2008 Umea University
2009 S-901 87 Umea, Sweden
2010
2011 from netlib.org, translated to C, and modified for matmul.m4. */
2012
2013 const GFC_REAL_16 *a, *b;
2014 GFC_REAL_16 *c;
2015 const index_type m = xcount, n = ycount, k = count;
2016
2017 /* System generated locals */
2018 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019 i1, i2, i3, i4, i5, i6;
2020
2021 /* Local variables */
2022 GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
2023 f13, f14, f23, f24, f33, f34, f43, f44;
2024 index_type i, j, l, ii, jj, ll;
2025 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026 GFC_REAL_16 *t1;
2027
2028 a = abase;
2029 b = bbase;
2030 c = retarray->base_addr;
2031
2032 /* Parameter adjustments */
2033 c_dim1 = rystride;
2034 c_offset = 1 + c_dim1;
2035 c -= c_offset;
2036 a_dim1 = aystride;
2037 a_offset = 1 + a_dim1;
2038 a -= a_offset;
2039 b_dim1 = bystride;
2040 b_offset = 1 + b_dim1;
2041 b -= b_offset;
2042
2043 /* Empty c first. */
2044 for (j=1; j<=n; j++)
2045 for (i=1; i<=m; i++)
2046 c[i + j * c_dim1] = (GFC_REAL_16)0;
2047
2048 /* Early exit if possible */
2049 if (m == 0 || n == 0 || k == 0)
2050 return;
2051
2052 /* Adjust size of t1 to what is needed. */
2053 index_type t1_dim, a_sz;
2054 if (aystride == 1)
2055 a_sz = rystride;
2056 else
2057 a_sz = a_dim1;
2058
2059 t1_dim = a_sz * 256 + b_dim1;
2060 if (t1_dim > 65536)
2061 t1_dim = 65536;
2062
2063 t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
2064
2065 /* Start turning the crank. */
2066 i1 = n;
2067 for (jj = 1; jj <= i1; jj += 512)
2068 {
2069 /* Computing MIN */
2070 i2 = 512;
2071 i3 = n - jj + 1;
2072 jsec = min(i2,i3);
2073 ujsec = jsec - jsec % 4;
2074 i2 = k;
2075 for (ll = 1; ll <= i2; ll += 256)
2076 {
2077 /* Computing MIN */
2078 i3 = 256;
2079 i4 = k - ll + 1;
2080 lsec = min(i3,i4);
2081 ulsec = lsec - lsec % 2;
2082
2083 i3 = m;
2084 for (ii = 1; ii <= i3; ii += 256)
2085 {
2086 /* Computing MIN */
2087 i4 = 256;
2088 i5 = m - ii + 1;
2089 isec = min(i4,i5);
2090 uisec = isec - isec % 2;
2091 i4 = ll + ulsec - 1;
2092 for (l = ll; l <= i4; l += 2)
2093 {
2094 i5 = ii + uisec - 1;
2095 for (i = ii; i <= i5; i += 2)
2096 {
2097 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098 a[i + l * a_dim1];
2099 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100 a[i + (l + 1) * a_dim1];
2101 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102 a[i + 1 + l * a_dim1];
2103 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104 a[i + 1 + (l + 1) * a_dim1];
2105 }
2106 if (uisec < isec)
2107 {
2108 t1[l - ll + 1 + (isec << 8) - 257] =
2109 a[ii + isec - 1 + l * a_dim1];
2110 t1[l - ll + 2 + (isec << 8) - 257] =
2111 a[ii + isec - 1 + (l + 1) * a_dim1];
2112 }
2113 }
2114 if (ulsec < lsec)
2115 {
2116 i4 = ii + isec - 1;
2117 for (i = ii; i<= i4; ++i)
2118 {
2119 t1[lsec + ((i - ii + 1) << 8) - 257] =
2120 a[i + (ll + lsec - 1) * a_dim1];
2121 }
2122 }
2123
2124 uisec = isec - isec % 4;
2125 i4 = jj + ujsec - 1;
2126 for (j = jj; j <= i4; j += 4)
2127 {
2128 i5 = ii + uisec - 1;
2129 for (i = ii; i <= i5; i += 4)
2130 {
2131 f11 = c[i + j * c_dim1];
2132 f21 = c[i + 1 + j * c_dim1];
2133 f12 = c[i + (j + 1) * c_dim1];
2134 f22 = c[i + 1 + (j + 1) * c_dim1];
2135 f13 = c[i + (j + 2) * c_dim1];
2136 f23 = c[i + 1 + (j + 2) * c_dim1];
2137 f14 = c[i + (j + 3) * c_dim1];
2138 f24 = c[i + 1 + (j + 3) * c_dim1];
2139 f31 = c[i + 2 + j * c_dim1];
2140 f41 = c[i + 3 + j * c_dim1];
2141 f32 = c[i + 2 + (j + 1) * c_dim1];
2142 f42 = c[i + 3 + (j + 1) * c_dim1];
2143 f33 = c[i + 2 + (j + 2) * c_dim1];
2144 f43 = c[i + 3 + (j + 2) * c_dim1];
2145 f34 = c[i + 2 + (j + 3) * c_dim1];
2146 f44 = c[i + 3 + (j + 3) * c_dim1];
2147 i6 = ll + lsec - 1;
2148 for (l = ll; l <= i6; ++l)
2149 {
2150 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151 * b[l + j * b_dim1];
2152 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153 * b[l + j * b_dim1];
2154 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155 * b[l + (j + 1) * b_dim1];
2156 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157 * b[l + (j + 1) * b_dim1];
2158 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159 * b[l + (j + 2) * b_dim1];
2160 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161 * b[l + (j + 2) * b_dim1];
2162 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163 * b[l + (j + 3) * b_dim1];
2164 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165 * b[l + (j + 3) * b_dim1];
2166 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167 * b[l + j * b_dim1];
2168 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169 * b[l + j * b_dim1];
2170 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171 * b[l + (j + 1) * b_dim1];
2172 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173 * b[l + (j + 1) * b_dim1];
2174 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175 * b[l + (j + 2) * b_dim1];
2176 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177 * b[l + (j + 2) * b_dim1];
2178 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179 * b[l + (j + 3) * b_dim1];
2180 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181 * b[l + (j + 3) * b_dim1];
2182 }
2183 c[i + j * c_dim1] = f11;
2184 c[i + 1 + j * c_dim1] = f21;
2185 c[i + (j + 1) * c_dim1] = f12;
2186 c[i + 1 + (j + 1) * c_dim1] = f22;
2187 c[i + (j + 2) * c_dim1] = f13;
2188 c[i + 1 + (j + 2) * c_dim1] = f23;
2189 c[i + (j + 3) * c_dim1] = f14;
2190 c[i + 1 + (j + 3) * c_dim1] = f24;
2191 c[i + 2 + j * c_dim1] = f31;
2192 c[i + 3 + j * c_dim1] = f41;
2193 c[i + 2 + (j + 1) * c_dim1] = f32;
2194 c[i + 3 + (j + 1) * c_dim1] = f42;
2195 c[i + 2 + (j + 2) * c_dim1] = f33;
2196 c[i + 3 + (j + 2) * c_dim1] = f43;
2197 c[i + 2 + (j + 3) * c_dim1] = f34;
2198 c[i + 3 + (j + 3) * c_dim1] = f44;
2199 }
2200 if (uisec < isec)
2201 {
2202 i5 = ii + isec - 1;
2203 for (i = ii + uisec; i <= i5; ++i)
2204 {
2205 f11 = c[i + j * c_dim1];
2206 f12 = c[i + (j + 1) * c_dim1];
2207 f13 = c[i + (j + 2) * c_dim1];
2208 f14 = c[i + (j + 3) * c_dim1];
2209 i6 = ll + lsec - 1;
2210 for (l = ll; l <= i6; ++l)
2211 {
2212 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213 257] * b[l + j * b_dim1];
2214 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215 257] * b[l + (j + 1) * b_dim1];
2216 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 257] * b[l + (j + 2) * b_dim1];
2218 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219 257] * b[l + (j + 3) * b_dim1];
2220 }
2221 c[i + j * c_dim1] = f11;
2222 c[i + (j + 1) * c_dim1] = f12;
2223 c[i + (j + 2) * c_dim1] = f13;
2224 c[i + (j + 3) * c_dim1] = f14;
2225 }
2226 }
2227 }
2228 if (ujsec < jsec)
2229 {
2230 i4 = jj + jsec - 1;
2231 for (j = jj + ujsec; j <= i4; ++j)
2232 {
2233 i5 = ii + uisec - 1;
2234 for (i = ii; i <= i5; i += 4)
2235 {
2236 f11 = c[i + j * c_dim1];
2237 f21 = c[i + 1 + j * c_dim1];
2238 f31 = c[i + 2 + j * c_dim1];
2239 f41 = c[i + 3 + j * c_dim1];
2240 i6 = ll + lsec - 1;
2241 for (l = ll; l <= i6; ++l)
2242 {
2243 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244 257] * b[l + j * b_dim1];
2245 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246 257] * b[l + j * b_dim1];
2247 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248 257] * b[l + j * b_dim1];
2249 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250 257] * b[l + j * b_dim1];
2251 }
2252 c[i + j * c_dim1] = f11;
2253 c[i + 1 + j * c_dim1] = f21;
2254 c[i + 2 + j * c_dim1] = f31;
2255 c[i + 3 + j * c_dim1] = f41;
2256 }
2257 i5 = ii + isec - 1;
2258 for (i = ii + uisec; i <= i5; ++i)
2259 {
2260 f11 = c[i + j * c_dim1];
2261 i6 = ll + lsec - 1;
2262 for (l = ll; l <= i6; ++l)
2263 {
2264 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265 257] * b[l + j * b_dim1];
2266 }
2267 c[i + j * c_dim1] = f11;
2268 }
2269 }
2270 }
2271 }
2272 }
2273 }
2274 free(t1);
2275 return;
2276 }
2277 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278 {
2279 if (GFC_DESCRIPTOR_RANK (a) != 1)
2280 {
2281 const GFC_REAL_16 *restrict abase_x;
2282 const GFC_REAL_16 *restrict bbase_y;
2283 GFC_REAL_16 *restrict dest_y;
2284 GFC_REAL_16 s;
2285
2286 for (y = 0; y < ycount; y++)
2287 {
2288 bbase_y = &bbase[y*bystride];
2289 dest_y = &dest[y*rystride];
2290 for (x = 0; x < xcount; x++)
2291 {
2292 abase_x = &abase[x*axstride];
2293 s = (GFC_REAL_16) 0;
2294 for (n = 0; n < count; n++)
2295 s += abase_x[n] * bbase_y[n];
2296 dest_y[x] = s;
2297 }
2298 }
2299 }
2300 else
2301 {
2302 const GFC_REAL_16 *restrict bbase_y;
2303 GFC_REAL_16 s;
2304
2305 for (y = 0; y < ycount; y++)
2306 {
2307 bbase_y = &bbase[y*bystride];
2308 s = (GFC_REAL_16) 0;
2309 for (n = 0; n < count; n++)
2310 s += abase[n*axstride] * bbase_y[n];
2311 dest[y*rystride] = s;
2312 }
2313 }
2314 }
2315 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316 {
2317 const GFC_REAL_16 *restrict bbase_y;
2318 GFC_REAL_16 s;
2319
2320 for (y = 0; y < ycount; y++)
2321 {
2322 bbase_y = &bbase[y*bystride];
2323 s = (GFC_REAL_16) 0;
2324 for (n = 0; n < count; n++)
2325 s += abase[n*axstride] * bbase_y[n*bxstride];
2326 dest[y*rxstride] = s;
2327 }
2328 }
2329 else if (axstride < aystride)
2330 {
2331 for (y = 0; y < ycount; y++)
2332 for (x = 0; x < xcount; x++)
2333 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2334
2335 for (y = 0; y < ycount; y++)
2336 for (n = 0; n < count; n++)
2337 for (x = 0; x < xcount; x++)
2338 /* dest[x,y] += a[x,n] * b[n,y] */
2339 dest[x*rxstride + y*rystride] +=
2340 abase[x*axstride + n*aystride] *
2341 bbase[n*bxstride + y*bystride];
2342 }
2343 else
2344 {
2345 const GFC_REAL_16 *restrict abase_x;
2346 const GFC_REAL_16 *restrict bbase_y;
2347 GFC_REAL_16 *restrict dest_y;
2348 GFC_REAL_16 s;
2349
2350 for (y = 0; y < ycount; y++)
2351 {
2352 bbase_y = &bbase[y*bystride];
2353 dest_y = &dest[y*rystride];
2354 for (x = 0; x < xcount; x++)
2355 {
2356 abase_x = &abase[x*axstride];
2357 s = (GFC_REAL_16) 0;
2358 for (n = 0; n < count; n++)
2359 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360 dest_y[x*rxstride] = s;
2361 }
2362 }
2363 }
2364 }
2365 #undef POW3
2366 #undef min
2367 #undef max
2368
2369
2370 /* Compiling main function, with selection code for the processor. */
2371
2372 /* Currently, this is i386 only. Adjust for other architectures. */
2373
2374 #include <config/i386/cpuinfo.h>
matmul_r16(gfc_array_r16 * const restrict retarray,gfc_array_r16 * const restrict a,gfc_array_r16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2375 void matmul_r16 (gfc_array_r16 * const restrict retarray,
2376 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2377 int blas_limit, blas_call gemm)
2378 {
2379 static void (*matmul_p) (gfc_array_r16 * const restrict retarray,
2380 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2381 int blas_limit, blas_call gemm);
2382
2383 void (*matmul_fn) (gfc_array_r16 * const restrict retarray,
2384 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2385 int blas_limit, blas_call gemm);
2386
2387 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2388 if (matmul_fn == NULL)
2389 {
2390 matmul_fn = matmul_r16_vanilla;
2391 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2392 {
2393 /* Run down the available processors in order of preference. */
2394 #ifdef HAVE_AVX512F
2395 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2396 {
2397 matmul_fn = matmul_r16_avx512f;
2398 goto store;
2399 }
2400
2401 #endif /* HAVE_AVX512F */
2402
2403 #ifdef HAVE_AVX2
2404 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2405 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2406 {
2407 matmul_fn = matmul_r16_avx2;
2408 goto store;
2409 }
2410
2411 #endif
2412
2413 #ifdef HAVE_AVX
2414 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2415 {
2416 matmul_fn = matmul_r16_avx;
2417 goto store;
2418 }
2419 #endif /* HAVE_AVX */
2420 }
2421 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2422 {
2423 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2424 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2425 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2426 {
2427 matmul_fn = matmul_r16_avx128_fma3;
2428 goto store;
2429 }
2430 #endif
2431 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2432 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2433 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2434 {
2435 matmul_fn = matmul_r16_avx128_fma4;
2436 goto store;
2437 }
2438 #endif
2439
2440 }
2441 store:
2442 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2443 }
2444
2445 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2446 }
2447
2448 #else /* Just the vanilla function. */
2449
2450 void
matmul_r16(gfc_array_r16 * const restrict retarray,gfc_array_r16 * const restrict a,gfc_array_r16 * const restrict b,int try_blas,int blas_limit,blas_call gemm)2451 matmul_r16 (gfc_array_r16 * const restrict retarray,
2452 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2453 int blas_limit, blas_call gemm)
2454 {
2455 const GFC_REAL_16 * restrict abase;
2456 const GFC_REAL_16 * restrict bbase;
2457 GFC_REAL_16 * restrict dest;
2458
2459 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2460 index_type x, y, n, count, xcount, ycount;
2461
2462 assert (GFC_DESCRIPTOR_RANK (a) == 2
2463 || GFC_DESCRIPTOR_RANK (b) == 2);
2464
2465 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2466
2467 Either A or B (but not both) can be rank 1:
2468
2469 o One-dimensional argument A is implicitly treated as a row matrix
2470 dimensioned [1,count], so xcount=1.
2471
2472 o One-dimensional argument B is implicitly treated as a column matrix
2473 dimensioned [count, 1], so ycount=1.
2474 */
2475
2476 if (retarray->base_addr == NULL)
2477 {
2478 if (GFC_DESCRIPTOR_RANK (a) == 1)
2479 {
2480 GFC_DIMENSION_SET(retarray->dim[0], 0,
2481 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2482 }
2483 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2484 {
2485 GFC_DIMENSION_SET(retarray->dim[0], 0,
2486 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2487 }
2488 else
2489 {
2490 GFC_DIMENSION_SET(retarray->dim[0], 0,
2491 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2492
2493 GFC_DIMENSION_SET(retarray->dim[1], 0,
2494 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2495 GFC_DESCRIPTOR_EXTENT(retarray,0));
2496 }
2497
2498 retarray->base_addr
2499 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
2500 retarray->offset = 0;
2501 }
2502 else if (unlikely (compile_options.bounds_check))
2503 {
2504 index_type ret_extent, arg_extent;
2505
2506 if (GFC_DESCRIPTOR_RANK (a) == 1)
2507 {
2508 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2509 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2510 if (arg_extent != ret_extent)
2511 runtime_error ("Array bound mismatch for dimension 1 of "
2512 "array (%ld/%ld) ",
2513 (long int) ret_extent, (long int) arg_extent);
2514 }
2515 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2516 {
2517 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2518 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2519 if (arg_extent != ret_extent)
2520 runtime_error ("Array bound mismatch for dimension 1 of "
2521 "array (%ld/%ld) ",
2522 (long int) ret_extent, (long int) arg_extent);
2523 }
2524 else
2525 {
2526 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2527 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2528 if (arg_extent != ret_extent)
2529 runtime_error ("Array bound mismatch for dimension 1 of "
2530 "array (%ld/%ld) ",
2531 (long int) ret_extent, (long int) arg_extent);
2532
2533 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2534 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2535 if (arg_extent != ret_extent)
2536 runtime_error ("Array bound mismatch for dimension 2 of "
2537 "array (%ld/%ld) ",
2538 (long int) ret_extent, (long int) arg_extent);
2539 }
2540 }
2541
2542
2543 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2544 {
2545 /* One-dimensional result may be addressed in the code below
2546 either as a row or a column matrix. We want both cases to
2547 work. */
2548 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2549 }
2550 else
2551 {
2552 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2553 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2554 }
2555
2556
2557 if (GFC_DESCRIPTOR_RANK (a) == 1)
2558 {
2559 /* Treat it as a a row matrix A[1,count]. */
2560 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2561 aystride = 1;
2562
2563 xcount = 1;
2564 count = GFC_DESCRIPTOR_EXTENT(a,0);
2565 }
2566 else
2567 {
2568 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2569 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2570
2571 count = GFC_DESCRIPTOR_EXTENT(a,1);
2572 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2573 }
2574
2575 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2576 {
2577 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2578 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2579 "in dimension 1: is %ld, should be %ld",
2580 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2581 }
2582
2583 if (GFC_DESCRIPTOR_RANK (b) == 1)
2584 {
2585 /* Treat it as a column matrix B[count,1] */
2586 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2587
2588 /* bystride should never be used for 1-dimensional b.
2589 The value is only used for calculation of the
2590 memory by the buffer. */
2591 bystride = 256;
2592 ycount = 1;
2593 }
2594 else
2595 {
2596 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2597 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2598 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2599 }
2600
2601 abase = a->base_addr;
2602 bbase = b->base_addr;
2603 dest = retarray->base_addr;
2604
2605 /* Now that everything is set up, we perform the multiplication
2606 itself. */
2607
2608 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2609 #define min(a,b) ((a) <= (b) ? (a) : (b))
2610 #define max(a,b) ((a) >= (b) ? (a) : (b))
2611
2612 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2613 && (bxstride == 1 || bystride == 1)
2614 && (((float) xcount) * ((float) ycount) * ((float) count)
2615 > POW3(blas_limit)))
2616 {
2617 const int m = xcount, n = ycount, k = count, ldc = rystride;
2618 const GFC_REAL_16 one = 1, zero = 0;
2619 const int lda = (axstride == 1) ? aystride : axstride,
2620 ldb = (bxstride == 1) ? bystride : bxstride;
2621
2622 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2623 {
2624 assert (gemm != NULL);
2625 const char *transa, *transb;
2626 if (try_blas & 2)
2627 transa = "C";
2628 else
2629 transa = axstride == 1 ? "N" : "T";
2630
2631 if (try_blas & 4)
2632 transb = "C";
2633 else
2634 transb = bxstride == 1 ? "N" : "T";
2635
2636 gemm (transa, transb , &m,
2637 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2638 &ldc, 1, 1);
2639 return;
2640 }
2641 }
2642
2643 if (rxstride == 1 && axstride == 1 && bxstride == 1
2644 && GFC_DESCRIPTOR_RANK (b) != 1)
2645 {
2646 /* This block of code implements a tuned matmul, derived from
2647 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2648
2649 Bo Kagstrom and Per Ling
2650 Department of Computing Science
2651 Umea University
2652 S-901 87 Umea, Sweden
2653
2654 from netlib.org, translated to C, and modified for matmul.m4. */
2655
2656 const GFC_REAL_16 *a, *b;
2657 GFC_REAL_16 *c;
2658 const index_type m = xcount, n = ycount, k = count;
2659
2660 /* System generated locals */
2661 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2662 i1, i2, i3, i4, i5, i6;
2663
2664 /* Local variables */
2665 GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
2666 f13, f14, f23, f24, f33, f34, f43, f44;
2667 index_type i, j, l, ii, jj, ll;
2668 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2669 GFC_REAL_16 *t1;
2670
2671 a = abase;
2672 b = bbase;
2673 c = retarray->base_addr;
2674
2675 /* Parameter adjustments */
2676 c_dim1 = rystride;
2677 c_offset = 1 + c_dim1;
2678 c -= c_offset;
2679 a_dim1 = aystride;
2680 a_offset = 1 + a_dim1;
2681 a -= a_offset;
2682 b_dim1 = bystride;
2683 b_offset = 1 + b_dim1;
2684 b -= b_offset;
2685
2686 /* Empty c first. */
2687 for (j=1; j<=n; j++)
2688 for (i=1; i<=m; i++)
2689 c[i + j * c_dim1] = (GFC_REAL_16)0;
2690
2691 /* Early exit if possible */
2692 if (m == 0 || n == 0 || k == 0)
2693 return;
2694
2695 /* Adjust size of t1 to what is needed. */
2696 index_type t1_dim, a_sz;
2697 if (aystride == 1)
2698 a_sz = rystride;
2699 else
2700 a_sz = a_dim1;
2701
2702 t1_dim = a_sz * 256 + b_dim1;
2703 if (t1_dim > 65536)
2704 t1_dim = 65536;
2705
2706 t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
2707
2708 /* Start turning the crank. */
2709 i1 = n;
2710 for (jj = 1; jj <= i1; jj += 512)
2711 {
2712 /* Computing MIN */
2713 i2 = 512;
2714 i3 = n - jj + 1;
2715 jsec = min(i2,i3);
2716 ujsec = jsec - jsec % 4;
2717 i2 = k;
2718 for (ll = 1; ll <= i2; ll += 256)
2719 {
2720 /* Computing MIN */
2721 i3 = 256;
2722 i4 = k - ll + 1;
2723 lsec = min(i3,i4);
2724 ulsec = lsec - lsec % 2;
2725
2726 i3 = m;
2727 for (ii = 1; ii <= i3; ii += 256)
2728 {
2729 /* Computing MIN */
2730 i4 = 256;
2731 i5 = m - ii + 1;
2732 isec = min(i4,i5);
2733 uisec = isec - isec % 2;
2734 i4 = ll + ulsec - 1;
2735 for (l = ll; l <= i4; l += 2)
2736 {
2737 i5 = ii + uisec - 1;
2738 for (i = ii; i <= i5; i += 2)
2739 {
2740 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2741 a[i + l * a_dim1];
2742 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2743 a[i + (l + 1) * a_dim1];
2744 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2745 a[i + 1 + l * a_dim1];
2746 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2747 a[i + 1 + (l + 1) * a_dim1];
2748 }
2749 if (uisec < isec)
2750 {
2751 t1[l - ll + 1 + (isec << 8) - 257] =
2752 a[ii + isec - 1 + l * a_dim1];
2753 t1[l - ll + 2 + (isec << 8) - 257] =
2754 a[ii + isec - 1 + (l + 1) * a_dim1];
2755 }
2756 }
2757 if (ulsec < lsec)
2758 {
2759 i4 = ii + isec - 1;
2760 for (i = ii; i<= i4; ++i)
2761 {
2762 t1[lsec + ((i - ii + 1) << 8) - 257] =
2763 a[i + (ll + lsec - 1) * a_dim1];
2764 }
2765 }
2766
2767 uisec = isec - isec % 4;
2768 i4 = jj + ujsec - 1;
2769 for (j = jj; j <= i4; j += 4)
2770 {
2771 i5 = ii + uisec - 1;
2772 for (i = ii; i <= i5; i += 4)
2773 {
2774 f11 = c[i + j * c_dim1];
2775 f21 = c[i + 1 + j * c_dim1];
2776 f12 = c[i + (j + 1) * c_dim1];
2777 f22 = c[i + 1 + (j + 1) * c_dim1];
2778 f13 = c[i + (j + 2) * c_dim1];
2779 f23 = c[i + 1 + (j + 2) * c_dim1];
2780 f14 = c[i + (j + 3) * c_dim1];
2781 f24 = c[i + 1 + (j + 3) * c_dim1];
2782 f31 = c[i + 2 + j * c_dim1];
2783 f41 = c[i + 3 + j * c_dim1];
2784 f32 = c[i + 2 + (j + 1) * c_dim1];
2785 f42 = c[i + 3 + (j + 1) * c_dim1];
2786 f33 = c[i + 2 + (j + 2) * c_dim1];
2787 f43 = c[i + 3 + (j + 2) * c_dim1];
2788 f34 = c[i + 2 + (j + 3) * c_dim1];
2789 f44 = c[i + 3 + (j + 3) * c_dim1];
2790 i6 = ll + lsec - 1;
2791 for (l = ll; l <= i6; ++l)
2792 {
2793 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2794 * b[l + j * b_dim1];
2795 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2796 * b[l + j * b_dim1];
2797 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2798 * b[l + (j + 1) * b_dim1];
2799 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2800 * b[l + (j + 1) * b_dim1];
2801 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2802 * b[l + (j + 2) * b_dim1];
2803 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2804 * b[l + (j + 2) * b_dim1];
2805 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2806 * b[l + (j + 3) * b_dim1];
2807 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2808 * b[l + (j + 3) * b_dim1];
2809 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2810 * b[l + j * b_dim1];
2811 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2812 * b[l + j * b_dim1];
2813 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2814 * b[l + (j + 1) * b_dim1];
2815 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2816 * b[l + (j + 1) * b_dim1];
2817 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2818 * b[l + (j + 2) * b_dim1];
2819 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2820 * b[l + (j + 2) * b_dim1];
2821 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2822 * b[l + (j + 3) * b_dim1];
2823 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2824 * b[l + (j + 3) * b_dim1];
2825 }
2826 c[i + j * c_dim1] = f11;
2827 c[i + 1 + j * c_dim1] = f21;
2828 c[i + (j + 1) * c_dim1] = f12;
2829 c[i + 1 + (j + 1) * c_dim1] = f22;
2830 c[i + (j + 2) * c_dim1] = f13;
2831 c[i + 1 + (j + 2) * c_dim1] = f23;
2832 c[i + (j + 3) * c_dim1] = f14;
2833 c[i + 1 + (j + 3) * c_dim1] = f24;
2834 c[i + 2 + j * c_dim1] = f31;
2835 c[i + 3 + j * c_dim1] = f41;
2836 c[i + 2 + (j + 1) * c_dim1] = f32;
2837 c[i + 3 + (j + 1) * c_dim1] = f42;
2838 c[i + 2 + (j + 2) * c_dim1] = f33;
2839 c[i + 3 + (j + 2) * c_dim1] = f43;
2840 c[i + 2 + (j + 3) * c_dim1] = f34;
2841 c[i + 3 + (j + 3) * c_dim1] = f44;
2842 }
2843 if (uisec < isec)
2844 {
2845 i5 = ii + isec - 1;
2846 for (i = ii + uisec; i <= i5; ++i)
2847 {
2848 f11 = c[i + j * c_dim1];
2849 f12 = c[i + (j + 1) * c_dim1];
2850 f13 = c[i + (j + 2) * c_dim1];
2851 f14 = c[i + (j + 3) * c_dim1];
2852 i6 = ll + lsec - 1;
2853 for (l = ll; l <= i6; ++l)
2854 {
2855 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2856 257] * b[l + j * b_dim1];
2857 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2858 257] * b[l + (j + 1) * b_dim1];
2859 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2860 257] * b[l + (j + 2) * b_dim1];
2861 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2862 257] * b[l + (j + 3) * b_dim1];
2863 }
2864 c[i + j * c_dim1] = f11;
2865 c[i + (j + 1) * c_dim1] = f12;
2866 c[i + (j + 2) * c_dim1] = f13;
2867 c[i + (j + 3) * c_dim1] = f14;
2868 }
2869 }
2870 }
2871 if (ujsec < jsec)
2872 {
2873 i4 = jj + jsec - 1;
2874 for (j = jj + ujsec; j <= i4; ++j)
2875 {
2876 i5 = ii + uisec - 1;
2877 for (i = ii; i <= i5; i += 4)
2878 {
2879 f11 = c[i + j * c_dim1];
2880 f21 = c[i + 1 + j * c_dim1];
2881 f31 = c[i + 2 + j * c_dim1];
2882 f41 = c[i + 3 + j * c_dim1];
2883 i6 = ll + lsec - 1;
2884 for (l = ll; l <= i6; ++l)
2885 {
2886 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2887 257] * b[l + j * b_dim1];
2888 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2889 257] * b[l + j * b_dim1];
2890 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2891 257] * b[l + j * b_dim1];
2892 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2893 257] * b[l + j * b_dim1];
2894 }
2895 c[i + j * c_dim1] = f11;
2896 c[i + 1 + j * c_dim1] = f21;
2897 c[i + 2 + j * c_dim1] = f31;
2898 c[i + 3 + j * c_dim1] = f41;
2899 }
2900 i5 = ii + isec - 1;
2901 for (i = ii + uisec; i <= i5; ++i)
2902 {
2903 f11 = c[i + j * c_dim1];
2904 i6 = ll + lsec - 1;
2905 for (l = ll; l <= i6; ++l)
2906 {
2907 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2908 257] * b[l + j * b_dim1];
2909 }
2910 c[i + j * c_dim1] = f11;
2911 }
2912 }
2913 }
2914 }
2915 }
2916 }
2917 free(t1);
2918 return;
2919 }
2920 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2921 {
2922 if (GFC_DESCRIPTOR_RANK (a) != 1)
2923 {
2924 const GFC_REAL_16 *restrict abase_x;
2925 const GFC_REAL_16 *restrict bbase_y;
2926 GFC_REAL_16 *restrict dest_y;
2927 GFC_REAL_16 s;
2928
2929 for (y = 0; y < ycount; y++)
2930 {
2931 bbase_y = &bbase[y*bystride];
2932 dest_y = &dest[y*rystride];
2933 for (x = 0; x < xcount; x++)
2934 {
2935 abase_x = &abase[x*axstride];
2936 s = (GFC_REAL_16) 0;
2937 for (n = 0; n < count; n++)
2938 s += abase_x[n] * bbase_y[n];
2939 dest_y[x] = s;
2940 }
2941 }
2942 }
2943 else
2944 {
2945 const GFC_REAL_16 *restrict bbase_y;
2946 GFC_REAL_16 s;
2947
2948 for (y = 0; y < ycount; y++)
2949 {
2950 bbase_y = &bbase[y*bystride];
2951 s = (GFC_REAL_16) 0;
2952 for (n = 0; n < count; n++)
2953 s += abase[n*axstride] * bbase_y[n];
2954 dest[y*rystride] = s;
2955 }
2956 }
2957 }
2958 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2959 {
2960 const GFC_REAL_16 *restrict bbase_y;
2961 GFC_REAL_16 s;
2962
2963 for (y = 0; y < ycount; y++)
2964 {
2965 bbase_y = &bbase[y*bystride];
2966 s = (GFC_REAL_16) 0;
2967 for (n = 0; n < count; n++)
2968 s += abase[n*axstride] * bbase_y[n*bxstride];
2969 dest[y*rxstride] = s;
2970 }
2971 }
2972 else if (axstride < aystride)
2973 {
2974 for (y = 0; y < ycount; y++)
2975 for (x = 0; x < xcount; x++)
2976 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2977
2978 for (y = 0; y < ycount; y++)
2979 for (n = 0; n < count; n++)
2980 for (x = 0; x < xcount; x++)
2981 /* dest[x,y] += a[x,n] * b[n,y] */
2982 dest[x*rxstride + y*rystride] +=
2983 abase[x*axstride + n*aystride] *
2984 bbase[n*bxstride + y*bystride];
2985 }
2986 else
2987 {
2988 const GFC_REAL_16 *restrict abase_x;
2989 const GFC_REAL_16 *restrict bbase_y;
2990 GFC_REAL_16 *restrict dest_y;
2991 GFC_REAL_16 s;
2992
2993 for (y = 0; y < ycount; y++)
2994 {
2995 bbase_y = &bbase[y*bystride];
2996 dest_y = &dest[y*rystride];
2997 for (x = 0; x < xcount; x++)
2998 {
2999 abase_x = &abase[x*axstride];
3000 s = (GFC_REAL_16) 0;
3001 for (n = 0; n < count; n++)
3002 s += abase_x[n*aystride] * bbase_y[n*bxstride];
3003 dest_y[x*rxstride] = s;
3004 }
3005 }
3006 }
3007 }
3008 #undef POW3
3009 #undef min
3010 #undef max
3011
3012 #endif
3013 #endif
3014
3015