1! 2! Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3! 4! Licensed under the Apache License, Version 2.0 (the "License"); 5! you may not use this file except in compliance with the License. 6! You may obtain a copy of the License at 7! 8! http://www.apache.org/licenses/LICENSE-2.0 9! 10! Unless required by applicable law or agreed to in writing, software 11! distributed under the License is distributed on an "AS IS" BASIS, 12! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13! See the License for the specific language governing permissions and 14! limitations under the License. 15! 16 17 18#include "mmul_dir.h" 19 20 21subroutine ftn_mtaxtb_real8( mra, ncb, kab, alpha, a, lda, b, ldb, beta, & 22 & c, ldc ) 23 implicit none 24#include "pgf90_mmul_real8.h" 25 26 ! Everything herein is focused on how the transposition buffer maps 27 ! to the matrix a. The size of the buffer is bufrows * bufcols 28 ! Since once transposed data will be read from the buffer down the rows, 29 ! bufrows corresponds to the columns of a while bufcols corresponds to 30 ! the rows of a. A bit confusing, but correct, I think 31 ! There are 4 cases to consider: 32 ! 1. rowsa <= bufcols AND colsa <= bufrows 33 ! 2. rowsa <= bufcols ( corresponds to a wide matrix ) 34 ! 3. colsa <= bufrows ( corresponds to a high matrix ) 35 ! 4. Both dimensions of a exceed both dimensions of the buffer 36 ! 37 ! The main idea here is that the bufrows will define the usage of the 38 ! L1 cache. We reference the same column or columns multiply while 39 ! accessing multiple partial rows of a transposed in the buffer. 40 41 ! 42 ! rowsa <-bufcb-> 43 ! colsb 44 ! i = 1, m -ac-> j = 1, n --bc-> 45 ! | +-----------------+ ^ +----------+----+ ^ 46 ! | | | | | x | | 47 ! | | | | | x | | 48 ! ak | A | rowchunks=2 | B x | | 49 ! | | | | | x | | 50 ! | | | | | | x | ka = 1, k 51 ! | | | | | | x | | 52 ! | | | | br | a x | | 53 ! v +xxxxxxxxxxxxxxxxx+ | | +xxxxxxxxxxxxxxx+ | 54 ! | | | | v | x | | 55 ! | | | | | x | | 56 ! | | | | x | | 57 ! | | | | | b x | | 58 ! V +-----------------+ V +----------+----+ V 59 ! <--colachunks=2--> 60 ! x's mark buffer boudaries on the transposed matrices 61 ! For this case, bufca(1) = bufcols, bufr(1) = bufrows 62 63 colsa = kab 64 rowsb = kab 65 rowsa = mra 66 colsb = ncb 67 if (colsa * rowsa * colsb < min_blocked_mult) then 68 if( beta .eq. 0.0 ) then 69 do j = 1, colsb 70 do i = 1, rowsa 71 temp0 = 0.0 72 do k = 1, colsa 73 temp0 = temp0 + alpha * a (k, i) * b( j, k ) 74 enddo 75 c( i, j ) = temp0 76 enddo 77 enddo 78 else 79 do j = 1, colsb 80 do i = 1, rowsa 81 temp0 = 0.0 82 do k = 1, colsa 83 temp0 = temp0 + alpha * a (k, i) * b( j, k ) 84 enddo 85 c( i, j ) = beta * c( i, j ) + temp0 86 enddo 87 enddo 88 endif 89 else 90 allocate( bufferb( bufrows * bufcols ) ) 91 92 ! set the number of buffer row chunks we will work on 93 bufr = min( bufrows, rowsb ) 94 bufr_sav = bufr 95 rowchunks = ( rowsb + bufr - 1 )/bufr 96 97 bufcb = min( bufcols, colsb ) 98 bufcb_sav = bufcb 99 colbchunks = ( colsb + bufcb - 1)/bufcb 100 ! Note that the starting column index into matrix a (ac) is the same as 101 ! starting index into matrix b. But we need 1 less than that so we can 102 ! add an index to it 103 br = 1 104 ac = 1 105 bc = 1 106 ak = 0 107 colsa_chunk = 4 108 colsa_chunks = mra/colsa_chunk 109 colsa_end = colsa_chunks * colsa_chunk 110 colsa_strt = colsa_end + 1 111 112 113 do rowchunk = 1, rowchunks 114 bc = 1 115 do colbchunk = 1, colbchunks 116 ak = br - 1 117 if( br .eq. 1 ) then 118 bufcb = min( bufcb_sav, colsb - bc + 1 ) 119 bufr = min( bufr_sav, rowsb - br + 1 ) 120 call ftn_transpose_real8( b( bc, br ), ldb, alpha, bufferb, & 121 & bufr, bufcb ) 122 if( beta .eq. 0.0 ) then 123 do i = 1, colsa_end, colsa_chunk 124 ndxb = 0 125 do j = bc, bc + bufcb - 1 126 temp0 = 0.0 127 temp1 = 0.0 128 temp2 = 0.0 129 temp3 = 0.0 130 do k = 1, bufr 131 bufbtemp = bufferb( ndxb + k ) 132 temp0 = temp0 + bufbtemp * a( ak + k, i ) 133 temp1 = temp1 + bufbtemp * a( ak + k, i + 1 ) 134 temp2 = temp2 + bufbtemp * a( ak + k, i + 2 ) 135 temp3 = temp3 + bufbtemp * a( ak + k, i + 3 ) 136 enddo 137 c( i, j ) = temp0 138 c( i + 1, j ) = temp1 139 c( i + 2, j ) = temp2 140 c( i + 3, j ) = temp3 141 ndxb = ndxb + bufr 142 enddo 143 enddo 144 ! Now clean up whatever is left from the loop unrolling 145 do i = colsa_strt, mra 146 ndxb = 0 147 do j = bc, bc + bufcb - 1 148 temp = 0.0 149 do k = 1, bufr 150 temp = temp + bufferb( ndxb + k ) * a( ak + k, i ) 151 enddo 152 c( i, j ) = temp 153 ndxb = ndxb + bufr 154 enddo 155 enddo 156 else 157 do i = 1, colsa_end, colsa_chunk 158 ndxb = 0 159 do j = bc, bc + bufcb - 1 160 temp0 = 0.0 161 temp1 = 0.0 162 temp2 = 0.0 163 temp3 = 0.0 164 do k = 1, bufr 165 bufbtemp = bufferb( ndxb + k ) 166 temp0 = temp0 + bufbtemp * a( ak + k, i ) 167 temp1 = temp1 + bufbtemp * a( ak + k, i + 1 ) 168 temp2 = temp2 + bufbtemp * a( ak + k, i + 2 ) 169 temp3 = temp3 + bufbtemp * a( ak + k, i + 3 ) 170 enddo 171 c( i, j ) = beta * c( i, j ) + temp0 172 c( i + 1, j ) = beta * c( i + 1, j ) + temp1 173 c( i + 2, j ) = beta * c( i + 2, j ) + temp2 174 c( i + 3, j ) = beta * c( i + 3, j ) + temp3 175 ndxb = ndxb + bufr 176 enddo 177 enddo 178 ! Now clean up whatever is left from the loop unrolling 179 do i = colsa_strt, mra 180 ndxb = 0 181 do j = bc, bc + bufcb - 1 182 temp = 0.0 183 do k = 1, bufr 184 temp = temp + bufferb( ndxb + k ) * a( ak + k, i ) 185 enddo 186 c( i, j ) = beta * c( i, j ) + temp 187 ndxb = ndxb + bufr 188 enddo 189 enddo 190 endif 191 else 192 bufcb = min( bufcb_sav, colsb - bc + 1 ) 193 bufr = min( bufr_sav, rowsb - br + 1 ) 194 call ftn_transpose_real8( b( bc, br ), ldb, alpha, bufferb, & 195 & bufr, bufcb ) 196 do i = 1, colsa_end, colsa_chunk 197 ndxb = 0 198 do j = bc, bc + bufcb - 1 199 temp0 = 0.0 200 temp1 = 0.0 201 temp2 = 0.0 202 temp3 = 0.0 203 do k = 1, bufr 204 bufbtemp = bufferb( ndxb + k ) 205 temp0 = temp0 + bufbtemp * a( ak + k, i ) 206 temp1 = temp1 + bufbtemp * a( ak + k, i + 1 ) 207 temp2 = temp2 + bufbtemp * a( ak + k, i + 2 ) 208 temp3 = temp3 + bufbtemp * a( ak + k, i + 3 ) 209 enddo 210 c( i, j ) = c( i, j ) + temp0 211 c( i + 1, j ) = c( i + 1, j ) + temp1 212 c( i + 2, j ) = c( i + 2, j ) + temp2 213 c( i + 3, j ) = c( i + 3, j ) + temp3 214 ndxb = ndxb + bufr 215 enddo 216 enddo 217 ! Now clean up whatever is left from the loop unrolling 218 do i = colsa_strt, mra 219 ndxb = 0 220 do j = bc, bc + bufcb - 1 221 temp = 0.0 222 do k = 1, bufr 223 temp = temp + bufferb( ndxb + k ) * a( ak + k, i ) 224 enddo 225 c( i, j ) = c( i, j ) + temp 226 ndxb = ndxb + bufr 227 enddo 228 enddo 229 endif 230 ! adjust the boundaries in the direction of the columns of b 231 ! adjust the row values 232 bc = bc + bufcb 233 enddo 234 br = br + bufr 235 ! controlled but tcbe numbebrcbof bufferb chunks we use. 236 237 enddo 238 deallocate( bufferb ) 239 endif 240 return 241end subroutine ftn_mtaxtb_real8 242