1!
2! Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
3!
4! Licensed under the Apache License, Version 2.0 (the "License");
5! you may not use this file except in compliance with the License.
6! You may obtain a copy of the License at
7!
8!     http://www.apache.org/licenses/LICENSE-2.0
9!
10! Unless required by applicable law or agreed to in writing, software
11! distributed under the License is distributed on an "AS IS" BASIS,
12! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13! See the License for the specific language governing permissions and
14! limitations under the License.
15!
16
17
18#include "mmul_dir.h"
19
20
21subroutine ftn_mtaxtb_real8( mra, ncb, kab, alpha, a, lda, b, ldb, beta, &
22     & c, ldc )
23  implicit none
24#include "pgf90_mmul_real8.h"
25
26  ! Everything herein is focused on how the transposition buffer maps
27  ! to the matrix a. The size of the buffer is bufrows * bufcols
28  ! Since once transposed data will be read from the buffer down the rows,
29  ! bufrows corresponds to the columns of a while bufcols corresponds to
30  ! the rows of a. A bit confusing, but correct, I think
31  ! There are 4 cases to consider:
32  ! 1. rowsa <= bufcols AND colsa <= bufrows
33  ! 2. rowsa <= bufcols ( corresponds to a wide matrix )
34  ! 3. colsa <= bufrows ( corresponds to a high matrix )
35  ! 4. Both dimensions of a exceed both dimensions of the buffer
36  !
37  ! The main idea here is that the bufrows will define the usage of the
38  ! L1 cache. We reference the same column or columns multiply while
39  ! accessing multiple partial rows of a transposed in the buffer.
40
41  !
42  !                 rowsa                           <-bufcb->
43  !                                                   colsb
44  !              i = 1, m  -ac->                   j = 1, n --bc->
45  !      |    +-----------------+   ^          +----------+----+  ^
46  !      |    |                 |   |          |          x    |  |
47  !      |    |                 |   |          |          x    |  |
48  !    ak     |  A              | rowchunks=2  |    B     x    |  |
49  !      |    |                 |   |          |          x    |  |
50  !      |    |                 |   |    |     |          x    | ka = 1, k
51  !      |    |                 |   |    |     |          x    |  |
52  !      |    |                 |   |   br     |    a     x    |  |
53  !      v    +xxxxxxxxxxxxxxxxx+   |    |     +xxxxxxxxxxxxxxx+  |
54  !      |    |                 |   |    v     |          x    |  |
55  !      |    |                 |   |          |          x    |  |
56  !           |                 |   |          |          x    |  |
57  !      |    |                 |   |          |    b     x    |  |
58  !      V    +-----------------+   V          +----------+----+  V
59  !            <--colachunks=2-->
60  !     x's mark buffer boudaries on the transposed matrices
61  !     For this case, bufca(1) = bufcols, bufr(1) = bufrows
62
63  colsa = kab
64  rowsb = kab
65  rowsa = mra
66  colsb = ncb
67  if (colsa * rowsa * colsb < min_blocked_mult) then
68    if( beta .eq. 0.0 ) then
69      do j = 1, colsb
70         do i = 1, rowsa
71            temp0 = 0.0
72            do k = 1, colsa
73               temp0 = temp0 + alpha * a (k, i) * b( j, k )
74            enddo
75            c( i, j ) = temp0
76         enddo
77      enddo
78    else
79      do j = 1, colsb
80         do i = 1, rowsa
81            temp0 = 0.0
82            do k = 1, colsa
83               temp0 = temp0 + alpha * a (k, i) * b( j, k )
84            enddo
85            c( i, j ) = beta * c( i, j ) + temp0
86         enddo
87      enddo
88    endif
89  else
90    allocate( bufferb( bufrows * bufcols ) )
91
92       ! set the number of buffer row chunks we will work on
93    bufr = min( bufrows, rowsb )
94    bufr_sav = bufr
95    rowchunks = ( rowsb + bufr - 1 )/bufr
96
97    bufcb = min( bufcols, colsb )
98    bufcb_sav = bufcb
99    colbchunks = ( colsb + bufcb - 1)/bufcb
100    ! Note that the starting column index into matrix a (ac) is the same as
101    ! starting index into matrix b. But we need 1 less than that so we can
102    ! add an index to it
103    br = 1
104    ac = 1
105    bc = 1
106    ak = 0
107    colsa_chunk = 4
108    colsa_chunks = mra/colsa_chunk
109    colsa_end = colsa_chunks * colsa_chunk
110    colsa_strt = colsa_end + 1
111
112
113    do rowchunk = 1, rowchunks
114        bc = 1
115        do colbchunk = 1, colbchunks
116           ak = br - 1
117           if( br .eq. 1 ) then
118             bufcb = min( bufcb_sav, colsb - bc + 1 )
119             bufr = min( bufr_sav, rowsb - br + 1 )
120             call ftn_transpose_real8( b( bc, br ), ldb, alpha, bufferb, &
121                  & bufr, bufcb )
122             if( beta .eq. 0.0 ) then
123                do i = 1, colsa_end, colsa_chunk
124                   ndxb = 0
125                   do j = bc, bc + bufcb - 1
126                      temp0 = 0.0
127                      temp1 = 0.0
128                      temp2 = 0.0
129                      temp3 = 0.0
130                      do k = 1, bufr
131                         bufbtemp = bufferb( ndxb + k )
132                         temp0 = temp0 + bufbtemp * a( ak + k, i )
133                         temp1 = temp1 + bufbtemp * a( ak + k, i + 1 )
134                         temp2 = temp2 + bufbtemp * a( ak + k, i + 2 )
135                         temp3 = temp3 + bufbtemp * a( ak + k, i + 3 )
136                      enddo
137                      c( i, j )     = temp0
138                      c( i + 1, j ) = temp1
139                      c( i + 2, j ) = temp2
140                      c( i + 3, j ) = temp3
141                      ndxb = ndxb + bufr
142                   enddo
143                enddo
144                ! Now clean up whatever is left from the loop unrolling
145                do i = colsa_strt, mra
146                   ndxb = 0
147                   do j = bc, bc + bufcb - 1
148                      temp = 0.0
149                      do k = 1, bufr
150                         temp = temp + bufferb( ndxb + k ) * a( ak + k, i )
151                      enddo
152                      c( i, j ) = temp
153                      ndxb = ndxb + bufr
154                   enddo
155                enddo
156             else
157                do i = 1, colsa_end, colsa_chunk
158                   ndxb = 0
159                   do j = bc, bc + bufcb - 1
160                      temp0 = 0.0
161                      temp1 = 0.0
162                      temp2 = 0.0
163                      temp3 = 0.0
164                      do k = 1, bufr
165                         bufbtemp = bufferb( ndxb + k )
166                         temp0 = temp0 + bufbtemp * a( ak + k, i )
167                         temp1 = temp1 + bufbtemp * a( ak + k, i + 1 )
168                         temp2 = temp2 + bufbtemp * a( ak + k, i + 2 )
169                         temp3 = temp3 + bufbtemp * a( ak + k, i + 3 )
170                      enddo
171                      c( i, j )     = beta * c( i, j )     + temp0
172                      c( i + 1, j ) = beta * c( i + 1, j ) + temp1
173                      c( i + 2, j ) = beta * c( i + 2, j ) + temp2
174                      c( i + 3, j ) = beta * c( i + 3, j ) + temp3
175                      ndxb = ndxb + bufr
176                   enddo
177                enddo
178                ! Now clean up whatever is left from the loop unrolling
179                do i = colsa_strt, mra
180                   ndxb = 0
181                   do j = bc, bc + bufcb - 1
182                      temp = 0.0
183                      do k = 1, bufr
184                         temp = temp + bufferb( ndxb + k ) * a( ak + k, i )
185                      enddo
186                      c( i, j ) = beta * c( i, j ) + temp
187                      ndxb = ndxb + bufr
188                   enddo
189                enddo
190             endif
191          else
192             bufcb = min( bufcb_sav, colsb - bc + 1 )
193             bufr = min( bufr_sav, rowsb - br + 1 )
194             call ftn_transpose_real8( b( bc, br ), ldb, alpha, bufferb, &
195                  & bufr, bufcb )
196             do i = 1, colsa_end, colsa_chunk
197                ndxb = 0
198                do j = bc, bc + bufcb - 1
199                   temp0 = 0.0
200                   temp1 = 0.0
201                   temp2 = 0.0
202                   temp3 = 0.0
203                   do k = 1, bufr
204                      bufbtemp = bufferb( ndxb + k )
205                      temp0 = temp0 + bufbtemp * a( ak + k, i )
206                      temp1 = temp1 + bufbtemp * a( ak + k, i + 1 )
207                      temp2 = temp2 + bufbtemp * a( ak + k, i + 2 )
208                      temp3 = temp3 + bufbtemp * a( ak + k, i + 3 )
209                   enddo
210                   c( i, j )     = c( i, j )     + temp0
211                   c( i + 1, j ) = c( i + 1, j ) + temp1
212                   c( i + 2, j ) = c( i + 2, j ) + temp2
213                   c( i + 3, j ) = c( i + 3, j ) + temp3
214                   ndxb = ndxb + bufr
215                enddo
216             enddo
217             ! Now clean up whatever is left from the loop unrolling
218             do i = colsa_strt, mra
219                ndxb = 0
220                do j = bc, bc + bufcb - 1
221                   temp = 0.0
222                   do k = 1, bufr
223                      temp = temp + bufferb( ndxb + k ) * a( ak + k, i )
224                   enddo
225                   c( i, j ) = c( i, j ) + temp
226                   ndxb = ndxb + bufr
227                enddo
228             enddo
229          endif
230          ! adjust the boundaries in the direction of the columns of b
231          ! adjust the row values
232          bc = bc + bufcb
233       enddo
234       br = br + bufr
235       ! controlled but tcbe numbebrcbof bufferb chunks we use.
236
237    enddo
238    deallocate( bufferb )
239  endif
240  return
241end subroutine ftn_mtaxtb_real8
242