1      SUBROUTINE DGEMM_OMP(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,
2     >                     BETA,C,LDC)
3*     .. Scalar Arguments ..
4      DOUBLE PRECISION ALPHA,BETA
5      INTEGER K,LDA,LDB,LDC,M,N
6      CHARACTER TRANSA,TRANSB
7*     ..
8*     .. Array Arguments ..
9      DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*)
10*     ..
11*
12*  Purpose
13*  =======
14*
15*  DGEMM  performs one of the matrix-matrix operations
16*
17*     C := alpha*op( A )*op( B ) + beta*C,
18*
19*  where  op( X ) is one of
20*
21*     op( X ) = X   or   op( X ) = X',
22*
23*  alpha and beta are scalars, and A, B and C are matrices, with op( A )
24*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
25*
26*  Arguments
27*  ==========
28*
29*  TRANSA - CHARACTER*1.
30*           On entry, TRANSA specifies the form of op( A ) to be used in
31*           the matrix multiplication as follows:
32*
33*              TRANSA = 'N' or 'n',  op( A ) = A.
34*
35*              TRANSA = 'T' or 't',  op( A ) = A'.
36*
37*              TRANSA = 'C' or 'c',  op( A ) = A'.
38*
39*           Unchanged on exit.
40*
41*  TRANSB - CHARACTER*1.
42*           On entry, TRANSB specifies the form of op( B ) to be used in
43*           the matrix multiplication as follows:
44*
45*              TRANSB = 'N' or 'n',  op( B ) = B.
46*
47*              TRANSB = 'T' or 't',  op( B ) = B'.
48*
49*              TRANSB = 'C' or 'c',  op( B ) = B'.
50*
51*           Unchanged on exit.
52*
53*  M      - INTEGER.
54*           On entry,  M  specifies  the number  of rows  of the  matrix
55*           op( A )  and of the  matrix  C.  M  must  be at least  zero.
56*           Unchanged on exit.
57*
58*  N      - INTEGER.
59*           On entry,  N  specifies the number  of columns of the matrix
60*           op( B ) and the number of columns of the matrix C. N must be
61*           at least zero.
62*           Unchanged on exit.
63*
64*  K      - INTEGER.
65*           On entry,  K  specifies  the number of columns of the matrix
66*           op( A ) and the number of rows of the matrix op( B ). K must
67*           be at least  zero.
68*           Unchanged on exit.
69*
70*  ALPHA  - DOUBLE PRECISION.
71*           On entry, ALPHA specifies the scalar alpha.
72*           Unchanged on exit.
73*
74*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
75*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
76*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
77*           part of the array  A  must contain the matrix  A,  otherwise
78*           the leading  k by m  part of the array  A  must contain  the
79*           matrix A.
80*           Unchanged on exit.
81*
82*  LDA    - INTEGER.
83*           On entry, LDA specifies the first dimension of A as declared
84*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
85*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
86*           least  max( 1, k ).
87*           Unchanged on exit.
88*
89*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
90*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
91*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
92*           part of the array  B  must contain the matrix  B,  otherwise
93*           the leading  n by k  part of the array  B  must contain  the
94*           matrix B.
95*           Unchanged on exit.
96*
97*  LDB    - INTEGER.
98*           On entry, LDB specifies the first dimension of B as declared
99*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
100*           LDB must be at least  max( 1, k ), otherwise  LDB must be at
101*           least  max( 1, n ).
102*           Unchanged on exit.
103*
104*  BETA   - DOUBLE PRECISION.
105*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
106*           supplied as zero then C need not be set on input.
107*           Unchanged on exit.
108*
109*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
110*           Before entry, the leading  m by n  part of the array  C must
111*           contain the matrix  C,  except when  beta  is zero, in which
112*           case C need not be set on entry.
113*           On exit, the array  C  is overwritten by the  m by n  matrix
114*           ( alpha*op( A )*op( B ) + beta*C ).
115*
116*  LDC    - INTEGER.
117*           On entry, LDC specifies the first dimension of C as declared
118*           in  the  calling  (sub)  program.   LDC  must  be  at  least
119*           max( 1, m ).
120*           Unchanged on exit.
121*
122*
123*  Level 3 Blas routine.
124*
125*  -- Written on 8-February-1989.
126*     Jack Dongarra, Argonne National Laboratory.
127*     Iain Duff, AERE Harwell.
128*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
129*     Sven Hammarling, Numerical Algorithms Group Ltd.
130*
131*
132*     .. External Functions ..
133      LOGICAL LSAME
134      EXTERNAL LSAME
135      INTEGER  Parallel_threadid,Parallel_nthreads
136      EXTERNAL Parallel_threadid,Parallel_nthreads
137*     ..
138*     .. External Subroutines ..
139      EXTERNAL XERBLA
140*     ..
141*     .. Intrinsic Functions ..
142      INTRINSIC MAX
143*     ..
144      INTEGER NB
145      PARAMETER (NB=32)
146
147*     .. Local Scalars ..
148      DOUBLE PRECISION TEMP1(NB),TEMP
149      INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
150      LOGICAL NOTA,NOTB
151      integer cidj,tid,nthr,Mtid,Itid,r
152*     ..
153*     .. Parameters ..
154      DOUBLE PRECISION ONE,ZERO
155      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
156*     ..
157*
158      nthr = Parallel_nthreads()
159      if (nthr.le.1) then
160         call DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,
161     >              BETA,C,LDC)
162         return
163      end if
164      tid  = Parallel_threadid()
165
166*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
167*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
168*     and  columns of  A  and the  number of  rows  of  B  respectively.
169*
170      NOTA = LSAME(TRANSA,'N')
171      NOTB = LSAME(TRANSB,'N')
172      IF (NOTA) THEN
173          NROWA = M
174          NCOLA = K
175      ELSE
176          NROWA = K
177          NCOLA = M
178      END IF
179      IF (NOTB) THEN
180          NROWB = K
181      ELSE
182          NROWB = N
183      END IF
184*
185*     Test the input parameters.
186*
187      INFO = 0
188      IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND.
189     +    (.NOT.LSAME(TRANSA,'T'))) THEN
190          INFO = 1
191      ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND.
192     +         (.NOT.LSAME(TRANSB,'T'))) THEN
193          INFO = 2
194      ELSE IF (M.LT.0) THEN
195          INFO = 3
196      ELSE IF (N.LT.0) THEN
197          INFO = 4
198      ELSE IF (K.LT.0) THEN
199          INFO = 5
200      ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
201          INFO = 8
202      ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
203          INFO = 10
204      ELSE IF (LDC.LT.MAX(1,M)) THEN
205          INFO = 13
206      END IF
207      IF (INFO.NE.0) THEN
208          CALL XERBLA('DGEMM_OMP',INFO)
209          RETURN
210      END IF
211*
212*     Quick return if possible.
213*
214      IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
215     +    (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
216
217
218*
219*     And if  alpha.eq.zero.
220
221*
222      IF (ALPHA.EQ.ZERO) THEN
223          IF (BETA.EQ.ZERO) THEN
224              cidj = tid
225              DO 20 J = 1,N
226                  DO 10 I = cidj+1,M
227                      C(I,J) = ZERO
228   10             CONTINUE
229                  cidj = mod(cidj+M,nthr)
230   20         CONTINUE
231          ELSE
232              cidj = tid
233              DO 40 J = 1,N
234                  DO 30 I = cidj+1,M,nthr
235                      C(I,J) = BETA*C(I,J)
236   30             CONTINUE
237                  cidj = mod(cidj+M,nthr)
238   40         CONTINUE
239          END IF
240!$OMP BARRIER
241          RETURN
242      END IF
243*
244*     Start the operations.
245*
246      IF (NOTB) THEN
247          IF (NOTA) THEN
248*
249*           Form  C := alpha*A*B + beta*C.
250*
251            r    = mod(M,nthr)
252            if (tid.lt.r) then
253                Itid = tid*(M/nthr+1) + 1
254                Mtid = M/nthr + 1
255            else
256                Itid = r + tid*(M/nthr) + 1
257                Mtid = M/nthr
258            end if
259            call DGEMM(TRANSA,TRANSB,Mtid,N,K,ALPHA,A(Itid,1),LDA,B,LDB,
260     >                 BETA,C(Itid,1),LDC)
261
262
263          ELSE
264*
265*           Form  C := alpha*A'*B + beta*C
266*
267              cidj = tid
268              DO 120 J = 1,N
269                  DO 110 I = cidj+1,M,nthr
270                         TEMP = ZERO
271                         DO 100 L = 1,K
272                             TEMP = TEMP + A(L,I)*B(L,J)
273  100                    CONTINUE
274                         IF (BETA.EQ.ZERO) THEN
275                             C(I,J) = ALPHA*TEMP
276                         ELSE
277                             C(I,J) = ALPHA*TEMP + BETA*C(I,J)
278                         END IF
279  110             CONTINUE
280                  cidj = mod(cidj+M,nthr)
281  120         CONTINUE
282          END IF
283      ELSE
284          IF (NOTA) THEN
285*
286*           Form  C := alpha*A*B' + beta*C
287            r    = mod(M,nthr)
288            if (tid.lt.r) then
289                Itid = tid*(M/nthr+1) + 1
290                Mtid = M/nthr + 1
291            else
292                Itid = r + tid*(M/nthr) + 1
293                Mtid = M/nthr
294            end if
295            call DGEMM(TRANSA,TRANSB,Mtid,N,K,ALPHA,A(Itid,1),LDA,B,LDB,
296     >                 BETA,C(Itid,1),LDC)
297
298          ELSE
299*
300*           Form  C := alpha*A'*B' + beta*C
301*
302              cidj = tid
303              DO 200 J = 1,N
304                  DO 190 I = cidj+1,M,nthr
305                      TEMP = ZERO
306                      DO 180 L = 1,K
307                          TEMP = TEMP + A(L,I)*B(J,L)
308  180                 CONTINUE
309                      IF (BETA.EQ.ZERO) THEN
310                          C(I,J) = ALPHA*TEMP
311                      ELSE
312                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
313                      END IF
314  190             CONTINUE
315                  cidj = mod(cidj+M,nthr)
316  200         CONTINUE
317          END IF
318      END IF
319*
320!$OMP BARRIER
321      RETURN
322*
323*     End of DGEMM_OMP .
324*
325      END
326
327
328
329
330****************************************************
331***** subroutine with no OMP barrier at the end ****
332****************************************************
333
334      SUBROUTINE DGEMM_OMP0(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,
335     >                     BETA,C,LDC)
336*     .. Scalar Arguments ..
337      DOUBLE PRECISION ALPHA,BETA
338      INTEGER K,LDA,LDB,LDC,M,N
339      CHARACTER TRANSA,TRANSB
340*     ..
341*     .. Array Arguments ..
342      DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*)
343*     ..
344
345*
346*  Purpose
347*  =======
348*
349*  DGEMM  performs one of the matrix-matrix operations
350*
351*     C := alpha*op( A )*op( B ) + beta*C,
352*
353*  where  op( X ) is one of
354*
355*     op( X ) = X   or   op( X ) = X',
356*
357*  alpha and beta are scalars, and A, B and C are matrices, with op( A )
358*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
359*
360*  Arguments
361*  ==========
362*
363*  TRANSA - CHARACTER*1.
364*           On entry, TRANSA specifies the form of op( A ) to be used in
365*           the matrix multiplication as follows:
366*
367*              TRANSA = 'N' or 'n',  op( A ) = A.
368*
369*              TRANSA = 'T' or 't',  op( A ) = A'.
370*
371*              TRANSA = 'C' or 'c',  op( A ) = A'.
372*
373*           Unchanged on exit.
374*
375*  TRANSB - CHARACTER*1.
376*           On entry, TRANSB specifies the form of op( B ) to be used in
377*           the matrix multiplication as follows:
378*
379*              TRANSB = 'N' or 'n',  op( B ) = B.
380*
381*              TRANSB = 'T' or 't',  op( B ) = B'.
382*
383*              TRANSB = 'C' or 'c',  op( B ) = B'.
384*
385*           Unchanged on exit.
386*
387*  M      - INTEGER.
388*           On entry,  M  specifies  the number  of rows  of the  matrix
389*           op( A )  and of the  matrix  C.  M  must  be at least  zero.
390*           Unchanged on exit.
391*
392*  N      - INTEGER.
393*           On entry,  N  specifies the number  of columns of the matrix
394*           op( B ) and the number of columns of the matrix C. N must be
395*           at least zero.
396*           Unchanged on exit.
397*
398*  K      - INTEGER.
399*           On entry,  K  specifies  the number of columns of the matrix
400*           op( A ) and the number of rows of the matrix op( B ). K must
401*           be at least  zero.
402*           Unchanged on exit.
403*
404*  ALPHA  - DOUBLE PRECISION.
405*           On entry, ALPHA specifies the scalar alpha.
406*           Unchanged on exit.
407*
408*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
409*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
410*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
411*           part of the array  A  must contain the matrix  A,  otherwise
412*           the leading  k by m  part of the array  A  must contain  the
413*           matrix A.
414*           Unchanged on exit.
415*
416*  LDA    - INTEGER.
417*           On entry, LDA specifies the first dimension of A as declared
418*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
419*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
420*           least  max( 1, k ).
421*           Unchanged on exit.
422*
423*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
424*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
425*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
426*           part of the array  B  must contain the matrix  B,  otherwise
427*           the leading  n by k  part of the array  B  must contain  the
428*           matrix B.
429*           Unchanged on exit.
430*
431*  LDB    - INTEGER.
432*           On entry, LDB specifies the first dimension of B as declared
433*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
434*           LDB must be at least  max( 1, k ), otherwise  LDB must be at
435*           least  max( 1, n ).
436*           Unchanged on exit.
437*
438*  BETA   - DOUBLE PRECISION.
439*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
440*           supplied as zero then C need not be set on input.
441*           Unchanged on exit.
442*
443*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
444*           Before entry, the leading  m by n  part of the array  C must
445*           contain the matrix  C,  except when  beta  is zero, in which
446*           case C need not be set on entry.
447*           On exit, the array  C  is overwritten by the  m by n  matrix
448*           ( alpha*op( A )*op( B ) + beta*C ).
449*
450*  LDC    - INTEGER.
451*           On entry, LDC specifies the first dimension of C as declared
452*           in  the  calling  (sub)  program.   LDC  must  be  at  least
453*           max( 1, m ).
454*           Unchanged on exit.
455*
456*
457*  Level 3 Blas routine.
458*
459*  -- Written on 8-February-1989.
460*     Jack Dongarra, Argonne National Laboratory.
461*     Iain Duff, AERE Harwell.
462*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
463*     Sven Hammarling, Numerical Algorithms Group Ltd.
464*
465*
466*     .. External Functions ..
467      LOGICAL LSAME
468      EXTERNAL LSAME
469      INTEGER  Parallel_threadid,Parallel_nthreads
470      EXTERNAL Parallel_threadid,Parallel_nthreads
471*     ..
472*     .. External Subroutines ..
473      EXTERNAL XERBLA
474*     ..
475*     .. Intrinsic Functions ..
476      INTRINSIC MAX
477*     ..
478      INTEGER NB
479      PARAMETER (NB=32)
480
481*     .. Local Scalars ..
482      DOUBLE PRECISION TEMP1(NB),TEMP
483      INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
484      LOGICAL NOTA,NOTB
485      integer cidj,tid,nthr,Mtid,Itid,r
486*     ..
487*     .. Parameters ..
488      DOUBLE PRECISION ONE,ZERO
489      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
490*     ..
491*
492      nthr = Parallel_nthreads()
493      if (nthr.le.1) then
494         call DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,
495     >              BETA,C,LDC)
496         return
497      end if
498      tid  = Parallel_threadid()
499
500*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
501*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
502*     and  columns of  A  and the  number of  rows  of  B  respectively.
503*
504      NOTA = LSAME(TRANSA,'N')
505      NOTB = LSAME(TRANSB,'N')
506      IF (NOTA) THEN
507          NROWA = M
508          NCOLA = K
509      ELSE
510          NROWA = K
511          NCOLA = M
512      END IF
513      IF (NOTB) THEN
514          NROWB = K
515      ELSE
516          NROWB = N
517      END IF
518*
519*     Test the input parameters.
520*
521      INFO = 0
522      IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND.
523     +    (.NOT.LSAME(TRANSA,'T'))) THEN
524          INFO = 1
525      ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND.
526     +         (.NOT.LSAME(TRANSB,'T'))) THEN
527          INFO = 2
528      ELSE IF (M.LT.0) THEN
529          INFO = 3
530      ELSE IF (N.LT.0) THEN
531          INFO = 4
532      ELSE IF (K.LT.0) THEN
533          INFO = 5
534      ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
535          INFO = 8
536      ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
537          INFO = 10
538      ELSE IF (LDC.LT.MAX(1,M)) THEN
539          INFO = 13
540      END IF
541      IF (INFO.NE.0) THEN
542          CALL XERBLA('DGEMM_OMP',INFO)
543          RETURN
544      END IF
545*
546*     Quick return if possible.
547*
548      IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
549     +    (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
550
551
552*
553*     And if  alpha.eq.zero.
554
555*
556      IF (ALPHA.EQ.ZERO) THEN
557          IF (BETA.EQ.ZERO) THEN
558              cidj = tid
559              DO 20 J = 1,N
560                  DO 10 I = cidj+1,M
561                      C(I,J) = ZERO
562   10             CONTINUE
563                  cidj = mod(cidj+M,nthr)
564   20         CONTINUE
565          ELSE
566              cidj = tid
567              DO 40 J = 1,N
568                  DO 30 I = cidj+1,M,nthr
569                      C(I,J) = BETA*C(I,J)
570   30             CONTINUE
571                  cidj = mod(cidj+M,nthr)
572   40         CONTINUE
573          END IF
574          RETURN
575      END IF
576*
577*     Start the operations.
578*
579      IF (NOTB) THEN
580          IF (NOTA) THEN
581*
582*           Form  C := alpha*A*B + beta*C.
583*
584            r    = mod(M,nthr)
585            if (tid.lt.r) then
586                Itid = tid*(M/nthr+1) + 1
587                Mtid = M/nthr + 1
588            else
589                Itid = r + tid*(M/nthr) + 1
590                Mtid = M/nthr
591            end if
592            call DGEMM(TRANSA,TRANSB,Mtid,N,K,ALPHA,A(Itid,1),LDA,B,LDB,
593     >                 BETA,C(Itid,1),LDC)
594
595
596          ELSE
597*
598*           Form  C := alpha*A'*B + beta*C
599*
600              cidj = tid
601              DO 120 J = 1,N
602                  DO 110 I = cidj+1,M,nthr
603                         TEMP = ZERO
604                         DO 100 L = 1,K
605                             TEMP = TEMP + A(L,I)*B(L,J)
606  100                    CONTINUE
607                         IF (BETA.EQ.ZERO) THEN
608                             C(I,J) = ALPHA*TEMP
609                         ELSE
610                             C(I,J) = ALPHA*TEMP + BETA*C(I,J)
611                         END IF
612  110             CONTINUE
613                  cidj = mod(cidj+M,nthr)
614  120         CONTINUE
615          END IF
616      ELSE
617          IF (NOTA) THEN
618*
619*           Form  C := alpha*A*B' + beta*C
620            r    = mod(M,nthr)
621            if (tid.lt.r) then
622                Itid = tid*(M/nthr+1) + 1
623                Mtid = M/nthr + 1
624            else
625                Itid = r + tid*(M/nthr) + 1
626                Mtid = M/nthr
627            end if
628            call DGEMM(TRANSA,TRANSB,Mtid,N,K,ALPHA,A(Itid,1),LDA,B,LDB,
629     >                 BETA,C(Itid,1),LDC)
630
631          ELSE
632*
633*           Form  C := alpha*A'*B' + beta*C
634*
635              cidj = tid
636              DO 200 J = 1,N
637                  DO 190 I = cidj+1,M,nthr
638                      TEMP = ZERO
639                      DO 180 L = 1,K
640                          TEMP = TEMP + A(L,I)*B(J,L)
641  180                 CONTINUE
642                      IF (BETA.EQ.ZERO) THEN
643                          C(I,J) = ALPHA*TEMP
644                      ELSE
645                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
646                      END IF
647  190             CONTINUE
648                  cidj = mod(cidj+M,nthr)
649  200         CONTINUE
650          END IF
651      END IF
652*
653      RETURN
654*
655*     End of DGEMM_OMP0
656*
657      END
658
659