!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.

#include "../general/sanity.F90"
     use elpa1_compute
     use elpa_utilities
     use elpa_mpi
     use precision
     use elpa_abstract_impl
     use elpa_omp

     implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik)              :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck)       :: a(obj%local_nrows,*)
#else
      MATH_DATATYPE(kind=rck)       :: a(obj%local_nrows,obj%local_ncols)
#endif
      integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
      integer(kind=ik)              :: n, nc, i, info
      integer(kind=ik)              :: lcs, lce, lrs, lre
      integer(kind=ik)              :: tile_size, l_rows_tile, l_cols_tile

      MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
      logical                       :: wantDebug
      logical                       :: success
      integer(kind=ik)              :: istat, debug, error
      character(200)                :: errorMessage
      integer(kind=ik)              :: nrThreads

      call obj%timer%start("elpa_cholesky_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      &")

#ifdef WITH_OPENMP
      ! store the number of OpenMP threads used in the calling function
      ! and restore it at the end of this routine
      omp_threads_caller = omp_get_max_threads()

      ! check the number of threads that ELPA should use internally
      call obj%get("omp_threads",nrThreads,error)
      call omp_set_num_threads(nrThreads)
#else
      nrThreads = 1
#endif

      na         = obj%na
      lda        = obj%local_nrows
      nblk       = obj%nblk
      matrixCols = obj%local_ncols

      call obj%get("mpi_comm_rows",mpi_comm_rows,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      call obj%get("mpi_comm_cols",mpi_comm_cols,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif

      call obj%get("debug",debug,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      if (debug == 1) then
        wantDebug = .true.
      else
        wantDebug = .false.
      endif

      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call obj%timer%stop("mpi_communication")
      success = .true.

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile
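
      ! Worked example (hypothetical sizes): with np_rows=4, np_cols=6 and
      ! nblk=32, least_common_multiple(4,6)=12 gives a minimum tile_size of
      ! 32*12 = 384; rounding 128*max(4,6) = 768 up to a multiple of 384
      ! leaves tile_size = 768, i.e. l_rows_tile = 768/4 = 192 local rows
      ! and l_cols_tile = 768/6 = 128 local cols per tile.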

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
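      ! (local_index maps a global index to the local one on this process;
      ! a last argument of -1 rounds down and +1 rounds up when the global
      ! index is not stored locally, so the calls above yield the local
      ! dimensions of a and the calls inside the loop below yield the first
      ! local row/column of the remaining submatrix.)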

      allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmp1 "//errorMessage
        stop 1
      endif

      allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmp2 "//errorMessage
        stop 1
      endif

      tmp1 = 0
      tmp2 = 0

      allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmatr "//errorMessage
        stop 1
      endif

      allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmatc "//errorMessage
        stop 1
      endif

      tmatr = 0
      tmatc = 0

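      ! Blocked Cholesky factorization of the upper triangle: in each pass
      ! the current nblk x nblk diagonal block is factored (POTRF) and
      ! broadcast, the remaining block row is computed by a triangular
      ! solve (TRSM), and the trailing submatrix is updated tile by tile
      ! with a rank-nblk GEMM using the transposed block row.
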
      do n = 1, na, nblk
        ! Calculate first local row and column of the still remaining matrix
        ! on the local processor

        l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
        l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

        l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
        l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)

        if (n+nblk > na) then

          ! This is the last step, just do a Cholesky-Factorization
          ! of the remaining block

          if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
            call obj%timer%start("blas")
            call PRECISION_POTRF('U', na-n+1, a(l_row1,l_col1), lda, info)
            call obj%timer%stop("blas")

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_&
              &MATH_DATATYPE&
#if REALCASE == 1
              &: Error in dpotrf: ",info
#endif
#if COMPLEXCASE == 1
              &: Error in zpotrf: ",info
#endif
              success = .false.
              return
            endif

          endif

          exit ! Loop

        endif

        if (my_prow==prow(n, nblk, np_rows)) then

          if (my_pcol==pcol(n, nblk, np_cols)) then

            ! The process owning the upper left remaining block does the
            ! Cholesky-Factorization of this block
            call obj%timer%start("blas")
            call PRECISION_POTRF('U', nblk, a(l_row1,l_col1), lda, info)
            call obj%timer%stop("blas")

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_&
              &MATH_DATATYPE&
#if REALCASE == 1
              &: Error in dpotrf 2: ",info
#endif
#if COMPLEXCASE == 1
              &: Error in zpotrf 2: ",info
#endif
              success = .false.
              return
            endif

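            ! Pack the upper triangle of the factored diagonal block column
            ! by column into tmp1 (nblk*(nblk+1)/2 elements), so that a
            ! single broadcast below suffices to distribute it.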
            nc = 0
            do i=1,nblk
              tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
              nc = nc+i
            enddo
          endif
#ifdef WITH_MPI
          call obj%timer%start("mpi_communication")
          call MPI_Bcast(tmp1, nblk*(nblk+1)/2,    &
#if REALCASE == 1
                         MPI_REAL_PRECISION,       &
#endif
#if COMPLEXCASE == 1
                         MPI_COMPLEX_PRECISION,    &
#endif
                         pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
          call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
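          ! Unpack the broadcast triangle from tmp1 into the full
          ! nblk x nblk work array tmp2 (upper triangle; the rest stays 0)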
          nc = 0
          do i=1,nblk
            tmp2(1:i,i) = tmp1(nc+1:nc+i)
            nc = nc+i
          enddo

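          ! Triangular solve with the factored diagonal block: overwrite the
          ! remaining local part of the block row with
          ! U11**(-T) * A12 (real case) or U11**(-H) * A12 (complex case)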
          call obj%timer%start("blas")
          if (l_cols-l_colx+1>0) &
              call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', nblk, l_cols-l_colx+1, ONE, tmp2, &
                                  ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
          call obj%timer%stop("blas")
        endif

        do i=1,nblk
#if REALCASE == 1
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
#endif
#if COMPLEXCASE == 1
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols))
#endif

#ifdef WITH_MPI
          call obj%timer%start("mpi_communication")
          if (l_cols-l_colx+1>0) &
            call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_MATH_DATATYPE_PRECISION, &
                           prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
          call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
        enddo
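        ! tmatc now holds the (conjugated) block row distributed over the
        ! process columns; transpose it into tmatr, distributed over the
        ! process rows, so the tile update below has both factors locally.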
        ! Note: this has to be re-checked, since it was changed substantially
        ! during the type-safe rework
        call elpa_transpose_vectors_&
        &MATH_DATATYPE&
        &_&
        &PRECISION &
                 (obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
                  tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
                  n, na, nblk, nblk, nrThreads)

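        ! Rank-nblk update of the trailing submatrix, tile by tile:
        ! a(lrs:lre,lcs:lce) = a(lrs:lre,lcs:lce) - tmatr * tmatc**T
        ! (tmatc already holds the conjugate in the complex case)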
        do i=0,(na-1)/tile_size
          lcs = max(l_colx,i*l_cols_tile+1)
          lce = min(l_cols,(i+1)*l_cols_tile)
          lrs = l_rowx
          lre = min(l_rows,(i+1)*l_rows_tile)
          if (lce<lcs .or. lre<lrs) cycle
          call obj%timer%start("blas")
          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre-lrs+1, lce-lcs+1, nblk, -ONE, &
                              tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
                              ONE, a(lrs,lcs), lda)
          call obj%timer%stop("blas")
        enddo

      enddo

      deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when deallocating tmp1, tmp2, tmatr or tmatc "//errorMessage
        stop 1
      endif

      ! Set the lower triangle to 0, it contains garbage (from the above matrix multiplications)

      do i=1,na
        if (my_pcol==pcol(i, nblk, np_cols)) then
          ! column i is on local processor
          l_col1 = local_index(i  , my_pcol, np_cols, nblk, +1) ! local column number
          l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
          a(l_row1:l_rows,l_col1) = 0
        endif
      enddo

      ! restore the original OpenMP settings of the calling function
#ifdef WITH_OPENMP
      call omp_set_num_threads(omp_threads_caller)
#endif

      call obj%timer%stop("elpa_cholesky_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      &")