1! 2! Copyright (c) 2012-2018, NVIDIA CORPORATION. All rights reserved. 3! 4! Licensed under the Apache License, Version 2.0 (the "License"); 5! you may not use this file except in compliance with the License. 6! You may obtain a copy of the License at 7! 8! http://www.apache.org/licenses/LICENSE-2.0 9! 10! Unless required by applicable law or agreed to in writing, software 11! distributed under the License is distributed on an "AS IS" BASIS, 12! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13! See the License for the specific language governing permissions and 14! limitations under the License. 15! 16 17 18! directives.h -- contains preprocessor directives for F90 rte files 19 20#include "mmul_dir.h" 21 22subroutine ftn_transpose_cmplx16( ta, a, lda, alpha, buffer, bufrows, bufcols ) 23 implicit none 24 integer*8 lda 25 integer :: bufrows, bufcols 26 integer i, j, ndx, ndxsave 27 complex*16 :: a( lda, * ), alpha 28 complex*16 :: buffer(bufrows * bufcols) 29 integer :: ta 30 31 ! 32 ! The plan here is to copy the matrix a to the buffer, or at least a 33 ! portion of it, such that the matrix (really a buffer) is in proper 34 ! order for successive access. Some number of columns of a will be 35 ! dispersed to buffer to minimize page faults. 36 ! The calling function can manage the buffer for both L1 and L2 cache 37 ! utilization. bufcols defines the number of values taken from L1 cache 38 ! for each dot product. bufrows * bufcols defines how much L2 cache is 39 ! used. 40 ! 41 ! We may want to change this to be able to handle multiple sections of L1 42 ! cache usage such as giving an additional parameter, say, nbufrows 43 ! which would essentially copy more of the matrix a to the buffer using 44 ! an additional loop 45 46 ! 47 ! What do the parameters mean? 48 ! buffer: buffer array 49 ! a: matrix to be transposed 50 ! bufcols: number of rows in matrix a to transpose 51 ! bufrowss: number of cols in matrix a to transpose 52 ! lda: number of rows in matrix a 53 ! Note that we don't care what the dimensions of a are. We assume that the 54 ! calling function has done this correctly 55 ! 56 ndxsave = 1 57 if( alpha .eq. 1.0 )then 58 if( ta .eq. 2 )then ! conjugate the data on transfer to buffer 59 do j = 1, bufrows 60 ndx = ndxsave 61 do i = 1, bufcols 62 buffer( ndx ) = conjg( a( i, j ) ) 63 ndx = ndx + bufrows 64 enddo 65 ndxsave = ndxsave + 1 66 enddo 67 else 68 do j = 1, bufrows 69 ndx = ndxsave 70 do i = 1, bufcols 71 buffer( ndx ) = a( i, j ) 72 ndx = ndx + bufrows 73 enddo 74 ndxsave = ndxsave + 1 75 enddo 76 endif 77 else 78 if( ta .eq. 2 )then ! conjugate the data on transfer to buffer 79 do j = 1, bufrows 80 ndx = ndxsave 81 do i = 1, bufcols 82 buffer( ndx ) = alpha * conjg( a( i, j ) ) 83 ndx = ndx + bufrows 84 enddo 85 ndxsave = ndxsave + 1 86 enddo 87 else 88 do j = 1, bufrows 89 ndx = ndxsave 90 do i = 1, bufcols 91 buffer( ndx ) = alpha * a( i, j ) 92 ndx = ndx + bufrows 93 enddo 94 ndxsave = ndxsave + 1 95 enddo 96 endif 97 endif 98 ! write( *, * ) ( a(1, j ), j = 1, bufcols ) 99 ! write( *, * )( buffer( i ), i = 1, bufrows * bufcols ) 100 return 101end subroutine ftn_transpose_cmplx16 102