1!
2! Copyright (c) 2012-2018, NVIDIA CORPORATION.  All rights reserved.
3!
4! Licensed under the Apache License, Version 2.0 (the "License");
5! you may not use this file except in compliance with the License.
6! You may obtain a copy of the License at
7!
8!     http://www.apache.org/licenses/LICENSE-2.0
9!
10! Unless required by applicable law or agreed to in writing, software
11! distributed under the License is distributed on an "AS IS" BASIS,
12! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13! See the License for the specific language governing permissions and
14! limitations under the License.
15!
16
17
18! directives.h -- contains preprocessor directives for F90 rte files
19
20#include "mmul_dir.h"
21
22subroutine ftn_transpose_cmplx16( ta, a, lda, alpha, buffer, bufrows, bufcols )
23  implicit none
24  integer*8 lda
25  integer :: bufrows, bufcols
26  integer i, j, ndx, ndxsave
27  complex*16 :: a( lda, * ), alpha
28  complex*16 :: buffer(bufrows * bufcols)
29  integer :: ta
30
31  !
32  !   The plan here is to copy the matrix a to the buffer, or at least a
33  !   portion of it, such that the matrix  (really a buffer) is in proper
34  !   order for successive access. Some number of columns of a will be
35  !   dispersed to buffer to minimize page faults.
36  !   The calling function can manage the buffer for both L1 and L2 cache
37  !   utilization. bufcols defines the number of values taken from L1 cache
38  !   for each dot product. bufrows * bufcols defines how much L2 cache is
39  !   used.
40  !
41  !   We may want to change this to be able to handle multiple sections of L1
42  !   cache usage such as giving an additional parameter, say, nbufrows
43  !   which would essentially copy more of the matrix a to the buffer using
44  !   an additional loop
45
46  !
47  !   What do the parameters mean?
48  !   buffer: buffer array
49  !   a: matrix to be transposed
50  !   bufcols: number of rows in matrix a to transpose
51  !   bufrowss: number of cols in matrix a to transpose
52  !   lda: number of rows in matrix a
53  !   Note that we don't care what the dimensions of a are. We assume that the
54  !   calling function has done this correctly
55  !
56  ndxsave = 1
57  if( alpha .eq. 1.0 )then
58     if( ta .eq. 2 )then ! conjugate the data on transfer to buffer
59        do j = 1, bufrows
60           ndx = ndxsave
61           do i = 1, bufcols
62              buffer( ndx ) = conjg( a( i, j ) )
63              ndx = ndx + bufrows
64           enddo
65           ndxsave = ndxsave + 1
66        enddo
67     else
68        do j = 1, bufrows
69           ndx = ndxsave
70           do i = 1, bufcols
71              buffer( ndx ) = a( i, j )
72              ndx = ndx + bufrows
73           enddo
74           ndxsave = ndxsave + 1
75        enddo
76     endif
77  else
78     if( ta .eq. 2 )then ! conjugate the data on transfer to buffer
79        do j = 1, bufrows
80           ndx = ndxsave
81           do i = 1, bufcols
82              buffer( ndx ) = alpha * conjg( a( i, j ) )
83              ndx = ndx + bufrows
84           enddo
85           ndxsave = ndxsave + 1
86        enddo
87     else
88        do j = 1, bufrows
89           ndx = ndxsave
90           do i = 1, bufcols
91              buffer( ndx ) = alpha * a( i, j )
92              ndx = ndx + bufrows
93           enddo
94           ndxsave = ndxsave + 1
95        enddo
96     endif
97  endif
98  !      write( *, * ) ( a(1, j ), j = 1, bufcols )
99  !      write( *, * )( buffer( i ), i = 1, bufrows * bufcols )
100  return
101end subroutine ftn_transpose_cmplx16
102