!
! Copyright (c) 2012-2018, NVIDIA CORPORATION.  All rights reserved.
!
! Licensed under the Apache License, Version 2.0 (the "License");
! you may not use this file except in compliance with the License.
! You may obtain a copy of the License at
!
!     http://www.apache.org/licenses/LICENSE-2.0
!
! Unless required by applicable law or agreed to in writing, software
! distributed under the License is distributed on an "AS IS" BASIS,
! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
! See the License for the specific language governing permissions and
! limitations under the License.
!

! directives.h -- contains preprocessor directives for F90 rte files
#include "mmul_dir.h"

subroutine ftn_mtaxtb_cmplx16( mra, ncb, kab, alpha, a, lda, b, ldb, beta, &
     & c, ldc )

  implicit none
#include "pgf90_mmul_cmplx16.h"

  ! Everything herein is focused on how the transposition buffer maps
  ! to the matrix a. The size of the buffer is bufrows * bufcols.
  ! Since, once transposed, data will be read from the buffer down the rows,
  ! bufrows corresponds to the columns of a while bufcols corresponds to
  ! the rows of a. A bit confusing, but correct, I think.
  ! There are 4 cases to consider:
  ! 1. rowsa <= bufcols AND colsa <= bufrows
  ! 2. rowsa <= bufcols ( corresponds to a wide matrix )
  ! 3. colsa <= bufrows ( corresponds to a high matrix )
  ! 4. Both dimensions of a exceed both dimensions of the buffer
  !
  ! The main idea here is that bufrows will define the usage of the
  ! L1 cache. We reference the same column or columns multiple times while
  ! accessing multiple partial rows of a transposed in the buffer.
  !
  !         rowsa                               <-bufcb->
  !                                               colsb
  !      i = 1, m    -ac->                   j = 1, n  --bc->
  !   |  +-----------------+  ^            +----------+----+  ^
  !   |  |                 |  |            |          x    |  |
  !   |  |                 |  |            |          x    |  |
  !  ak  |        A        |  rowchunks=2  |        B x    |  |
  !   |  |                 |  |            |          x    |  |
  !   |  |                 |  |            |          x    |  ka = 1, k
  !   |  |                 |  |            |          x    |  |
  !   |  |                 |  |      br    |        a x    |  |
  !   v  +xxxxxxxxxxxxxxxxx+  |      |     +xxxxxxxxxxxxxxx+  |
  !   |  |                 |  |      v     |          x    |  |
  !   |  |                 |  |            |          x    |  |
  !   |  |                 |  |            |            x  |  |
  !   |  |                 |  |            |        b   x  |  |
  !   V  +-----------------+  V            +----------+----+  V
  !                                          <--colachunks=2-->
  !
  ! x's mark buffer boundaries on the transposed matrices
  ! For this case, bufca(1) = bufcols, bufr(1) = bufrows

  colsa = kab
  rowsb = kab
  rowsa = mra
  colsb = ncb
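
  ! The small-problem fallback below computes
  !     c(i,j) = [beta * c(i,j)] + sum over k of alpha * a(k,i) * b(j,k)
  ! directly.  The product of (alpha*a(k,i)) and b(j,k) is expanded into four
  ! real accumulators using (xr + i*xi)*(yr + i*yi) = (xr*yr - xi*yi) +
  ! i*(xr*yi + xi*yr), so the real part of each element is temprr0 - tempii0
  ! and the imaginary part is tempri0 + tempir0.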
  if (colsa * rowsa * colsb < min_blocked_mult) then
    if( beta .eq. 0.0 ) then
      do j = 1, colsb
        do i = 1, rowsa
          temprr0 = 0.0
          tempri0 = 0.0
          tempir0 = 0.0
          tempii0 = 0.0
          do k = 1, colsa
            temprr0 = temprr0 + real(alpha) * real(a(k, i)) * real(b(j, k)) &
                & - aimag(alpha) * aimag(a(k, i)) * real(b(j, k))
          enddo
          do k = 1, colsa
            tempii0 = tempii0 + real(alpha) * aimag(a(k, i)) * aimag(b(j, k)) &
                & + aimag(alpha) * real(a(k, i)) * aimag(b(j, k))
          enddo
          do k = 1, colsa
            tempir0 = tempir0 + real(alpha) * real(a(k, i)) * aimag(b(j, k)) &
                & - aimag(alpha) * aimag(a(k, i)) * aimag(b(j, k))
          enddo
          do k = 1, colsa
            tempri0 = tempri0 + real(alpha) * aimag(a(k, i)) * real(b(j, k)) &
                & + aimag(alpha) * real(a(k, i)) * real(b(j, k))
          enddo
          c(i, j) = dcmplx((temprr0 - tempii0), (tempri0 + tempir0))
        enddo
      enddo
    else
      do j = 1, colsb
        do i = 1, rowsa
          temprr0 = 0.0
          tempri0 = 0.0
          tempir0 = 0.0
          tempii0 = 0.0
          do k = 1, colsa
            temprr0 = temprr0 + real(alpha) * real(a(k, i)) * real(b(j, k)) &
                & - aimag(alpha) * aimag(a(k, i)) * real(b(j, k))
          enddo
          do k = 1, colsa
            tempii0 = tempii0 + real(alpha) * aimag(a(k, i)) * aimag(b(j, k)) &
                & + aimag(alpha) * real(a(k, i)) * aimag(b(j, k))
          enddo
          do k = 1, colsa
            tempir0 = tempir0 + real(alpha) * real(a(k, i)) * aimag(b(j, k)) &
                & - aimag(alpha) * aimag(a(k, i)) * aimag(b(j, k))
          enddo
          do k = 1, colsa
            tempri0 = tempri0 + real(alpha) * aimag(a(k, i)) * real(b(j, k)) &
                & + aimag(alpha) * real(a(k, i)) * real(b(j, k))
          enddo
          c(i, j) = beta * c(i, j) + dcmplx((temprr0 - tempii0), (tempri0 + tempir0))
        enddo
      enddo
    endif
  else
    allocate( bufferb( bufrows * bufcols ) )

    ! set the number of buffer row chunks we will work on
    bufr = min( bufrows, rowsb )
    bufr_sav = bufr
    rowchunks = ( rowsb + bufr - 1 )/bufr

    bufcb = min( bufcols, colsb )
    bufcb_sav = bufcb
    colbchunks = ( colsb + bufcb - 1 )/bufcb

    ! Note that the starting column index into matrix a (ac) is the same as
    ! starting index into matrix b. But we need 1 less than that so we can
    ! add an index to it
    br = 1
    ac = 1
    bc = 1
    ak = 0
    colsa_chunk = 4
    colsa_chunks = mra/colsa_chunk
    colsa_end = colsa_chunks * colsa_chunk
    colsa_strt = colsa_end + 1
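
    ! For example, if bufrows were 512 and rowsb (= kab) were 2000, then
    ! bufr = 512 and rowchunks = (2000 + 511)/512 = 4, so the k-dimension
    ! sum is accumulated over four passes through bufferb; the first pass
    ! (br .eq. 1) initializes c (honoring beta) and later passes add into it.
    ! Similarly, colsa_end = (mra/4)*4 is the last row of c covered by the
    ! 4-way unrolled loops; rows colsa_strt through mra are handled by the
    ! cleanup loops.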
    do rowchunk = 1, rowchunks
      bc = 1
      do colbchunk = 1, colbchunks
        ak = br - 1
        if( ta .eq. 2 ) then   ! conjugate matrix a; b conjugated in transpose
          if( br .eq. 1 ) then
            bufcb = min( bufcb_sav, colsb - bc + 1 )
            bufr = min( bufr_sav, rowsb - br + 1 )
            call ftn_transpose_cmplx16( tb, b( bc, br ), ldb, alpha, bufferb, &
                & bufr, bufcb )
            if( beta .eq. 0.0 ) then
              do i = 1, colsa_end, colsa_chunk
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp0 = 0.0
                  temp1 = 0.0
                  temp2 = 0.0
                  temp3 = 0.0
                  do k = 1, bufr
                    bufbtemp = bufferb( ndxb + k )
                    temp0 = temp0 + bufbtemp * conjg( a( ak + k, i ) )
                    temp1 = temp1 + bufbtemp * conjg( a( ak + k, i + 1 ) )
                    temp2 = temp2 + bufbtemp * conjg( a( ak + k, i + 2 ) )
                    temp3 = temp3 + bufbtemp * conjg( a( ak + k, i + 3 ) )
                  enddo
                  c( i, j )     = temp0
                  c( i + 1, j ) = temp1
                  c( i + 2, j ) = temp2
                  c( i + 3, j ) = temp3
                  ndxb = ndxb + bufr
                enddo
              enddo
              ! Now clean up whatever is left from the loop unrolling
              do i = colsa_strt, mra
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp = 0.0
                  do k = 1, bufr
                    temp = temp + bufferb( ndxb + k ) * &
                        & conjg( a( ak + k, i ) )
                  enddo
                  c( i, j ) = temp
                  ndxb = ndxb + bufr
                enddo
              enddo
            else
              do i = 1, colsa_end, colsa_chunk
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp0 = 0.0
                  temp1 = 0.0
                  temp2 = 0.0
                  temp3 = 0.0
                  do k = 1, bufr
                    bufbtemp = bufferb( ndxb + k )
                    temp0 = temp0 + bufbtemp * conjg( a( ak + k, i ) )
                    temp1 = temp1 + bufbtemp * conjg( a( ak + k, i + 1 ) )
                    temp2 = temp2 + bufbtemp * conjg( a( ak + k, i + 2 ) )
                    temp3 = temp3 + bufbtemp * conjg( a( ak + k, i + 3 ) )
                  enddo
                  c( i, j )     = beta * c( i, j ) + temp0
                  c( i + 1, j ) = beta * c( i + 1, j ) + temp1
                  c( i + 2, j ) = beta * c( i + 2, j ) + temp2
                  c( i + 3, j ) = beta * c( i + 3, j ) + temp3
                  ndxb = ndxb + bufr
                enddo
              enddo
              ! Now clean up whatever is left from the loop unrolling
              do i = colsa_strt, mra
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp = 0.0
                  do k = 1, bufr
                    temp = temp + bufferb( ndxb + k ) * &
                        & conjg( a( ak + k, i ) )
                  enddo
                  c( i, j ) = beta * c( i, j ) + temp
                  ndxb = ndxb + bufr
                enddo
              enddo
            endif
          else
            bufcb = min( bufcb_sav, colsb - bc + 1 )
            bufr = min( bufr_sav, rowsb - br + 1 )
            call ftn_transpose_cmplx16( tb, b( bc, br ), ldb, alpha, bufferb, &
                & bufr, bufcb )
            do i = 1, colsa_end, colsa_chunk
              ndxb = 0
              do j = bc, bc + bufcb - 1
                temp0 = 0.0
                temp1 = 0.0
                temp2 = 0.0
                temp3 = 0.0
                do k = 1, bufr
                  bufbtemp = bufferb( ndxb + k )
                  temp0 = temp0 + bufbtemp * conjg( a( ak + k, i ) )
                  temp1 = temp1 + bufbtemp * conjg( a( ak + k, i + 1 ) )
                  temp2 = temp2 + bufbtemp * conjg( a( ak + k, i + 2 ) )
                  temp3 = temp3 + bufbtemp * conjg( a( ak + k, i + 3 ) )
                enddo
                c( i, j )     = c( i, j ) + temp0
                c( i + 1, j ) = c( i + 1, j ) + temp1
                c( i + 2, j ) = c( i + 2, j ) + temp2
                c( i + 3, j ) = c( i + 3, j ) + temp3
                ndxb = ndxb + bufr
              enddo
            enddo
            ! Now clean up whatever is left from the loop unrolling
            do i = colsa_strt, mra
              ndxb = 0
              do j = bc, bc + bufcb - 1
                temp = 0.0
                do k = 1, bufr
                  temp = temp + bufferb( ndxb + k ) * conjg( a( ak + k, i ) )
                enddo
                c( i, j ) = c( i, j ) + temp
                ndxb = ndxb + bufr
              enddo
            enddo
          endif
        else
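          ! ta .ne. 2: matrix a is used without conjugation.  Otherwise the
          ! structure mirrors the branch above: b is transposed into bufferb
          ! (alpha, and any requested conjugation of b, are applied by
          ! ftn_transpose_cmplx16), the first row chunk initializes c, and
          ! later row chunks accumulate into it.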
          if( br .eq. 1 ) then
            bufcb = min( bufcb_sav, colsb - bc + 1 )
            bufr = min( bufr_sav, rowsb - br + 1 )
            call ftn_transpose_cmplx16( tb, b( bc, br ), ldb, alpha, bufferb, &
                & bufr, bufcb )
            if( beta .eq. 0.0 ) then
              do i = 1, colsa_end, colsa_chunk
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp0 = 0.0
                  temp1 = 0.0
                  temp2 = 0.0
                  temp3 = 0.0
                  do k = 1, bufr
                    bufbtemp = bufferb( ndxb + k )
                    temp0 = temp0 + bufbtemp * a( ak + k, i )
                    temp1 = temp1 + bufbtemp * a( ak + k, i + 1 )
                    temp2 = temp2 + bufbtemp * a( ak + k, i + 2 )
                    temp3 = temp3 + bufbtemp * a( ak + k, i + 3 )
                  enddo
                  c( i, j )     = temp0
                  c( i + 1, j ) = temp1
                  c( i + 2, j ) = temp2
                  c( i + 3, j ) = temp3
                  ndxb = ndxb + bufr
                enddo
              enddo
              ! Now clean up whatever is left from the loop unrolling
              do i = colsa_strt, mra
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp = 0.0
                  do k = 1, bufr
                    temp = temp + bufferb( ndxb + k ) * a( ak + k, i )
                  enddo
                  c( i, j ) = temp
                  ndxb = ndxb + bufr
                enddo
              enddo
            else
              do i = 1, colsa_end, colsa_chunk
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp0 = 0.0
                  temp1 = 0.0
                  temp2 = 0.0
                  temp3 = 0.0
                  do k = 1, bufr
                    bufbtemp = bufferb( ndxb + k )
                    temp0 = temp0 + bufbtemp * a( ak + k, i )
                    temp1 = temp1 + bufbtemp * a( ak + k, i + 1 )
                    temp2 = temp2 + bufbtemp * a( ak + k, i + 2 )
                    temp3 = temp3 + bufbtemp * a( ak + k, i + 3 )
                  enddo
                  c( i, j )     = beta * c( i, j ) + temp0
                  c( i + 1, j ) = beta * c( i + 1, j ) + temp1
                  c( i + 2, j ) = beta * c( i + 2, j ) + temp2
                  c( i + 3, j ) = beta * c( i + 3, j ) + temp3
                  ndxb = ndxb + bufr
                enddo
              enddo
              ! Now clean up whatever is left from the loop unrolling
              do i = colsa_strt, mra
                ndxb = 0
                do j = bc, bc + bufcb - 1
                  temp = 0.0
                  do k = 1, bufr
                    temp = temp + bufferb( ndxb + k ) * a( ak + k, i )
                  enddo
                  c( i, j ) = beta * c( i, j ) + temp
                  ndxb = ndxb + bufr
                enddo
              enddo
            endif
          else
            bufcb = min( bufcb_sav, colsb - bc + 1 )
            bufr = min( bufr_sav, rowsb - br + 1 )
            call ftn_transpose_cmplx16( tb, b( bc, br ), ldb, alpha, bufferb, &
                & bufr, bufcb )
            do i = 1, colsa_end, colsa_chunk
              ndxb = 0
              do j = bc, bc + bufcb - 1
                temp0 = 0.0
                temp1 = 0.0
                temp2 = 0.0
                temp3 = 0.0
                do k = 1, bufr
                  bufbtemp = bufferb( ndxb + k )
                  temp0 = temp0 + bufbtemp * a( ak + k, i )
                  temp1 = temp1 + bufbtemp * a( ak + k, i + 1 )
                  temp2 = temp2 + bufbtemp * a( ak + k, i + 2 )
                  temp3 = temp3 + bufbtemp * a( ak + k, i + 3 )
                enddo
                c( i, j )     = c( i, j ) + temp0
                c( i + 1, j ) = c( i + 1, j ) + temp1
                c( i + 2, j ) = c( i + 2, j ) + temp2
                c( i + 3, j ) = c( i + 3, j ) + temp3
                ndxb = ndxb + bufr
              enddo
            enddo
            ! Now clean up whatever is left from the loop unrolling
            do i = colsa_strt, mra
              ndxb = 0
              do j = bc, bc + bufcb - 1
                temp = 0.0
                do k = 1, bufr
                  temp = temp + bufferb( ndxb + k ) * a( ak + k, i )
                enddo
                c( i, j ) = c( i, j ) + temp
                ndxb = ndxb + bufr
              enddo
            enddo
          endif
        endif
        ! adjust the boundaries in the direction of the columns of b
        ! adjust the row values
        bc = bc + bufcb
      enddo
      br = br + bufr   ! controlled by the number of bufferb chunks we use.
    enddo
    deallocate( bufferb )
  endif

  return
end subroutine ftn_mtaxtb_cmplx16