! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.

#include "../general/sanity.F90"
      use elpa1_compute
      use elpa_utilities
      use elpa_mpi
      use precision
      use elpa_abstract_impl
      use elpa_omp

      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik)                     :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck)              :: a(obj%local_nrows,*)
#else
      MATH_DATATYPE(kind=rck)              :: a(obj%local_nrows,obj%local_ncols)
#endif
      integer(kind=ik)                     :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                     :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
      integer(kind=ik)                     :: n, nc, i, info
      integer(kind=ik)                     :: lcs, lce, lrs, lre
      integer(kind=ik)                     :: tile_size, l_rows_tile, l_cols_tile

      MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
      logical                              :: wantDebug
      logical                              :: success
      integer(kind=ik)                     :: istat, debug, error
      character(200)                       :: errorMessage
      integer(kind=ik)                     :: nrThreads

      call obj%timer%start("elpa_cholesky_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      &")

#ifdef WITH_OPENMP
      ! store the number of OpenMP threads used by the calling function;
      ! it is restored at the end of this routine
      omp_threads_caller = omp_get_max_threads()

      ! check the number of threads that ELPA should use internally
      call obj%get("omp_threads",nrThreads,error)
      call omp_set_num_threads(nrThreads)
#else
      nrThreads = 1
#endif

      na         = obj%na
      lda        = obj%local_nrows
      nblk       = obj%nblk
      matrixCols = obj%local_ncols

      call obj%get("mpi_comm_rows",mpi_comm_rows,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      call obj%get("mpi_comm_cols",mpi_comm_cols,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif

      call obj%get("debug",debug,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      if (debug == 1) then
        wantDebug = .true.
      else
        wantDebug = .false.
      endif

      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call obj%timer%stop("mpi_communication")
      success = .true.

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
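
      ! Work arrays used in every step of the blocked factorization:
      !   tmp1  - upper triangle of the current nblk x nblk diagonal block, packed column by column (broadcast buffer)
      !   tmp2  - the same diagonal block unpacked to full nblk x nblk storage (operand of the triangular solve)
      !   tmatc - local columns of the factored block row (conjugated in the complex case)
      !   tmatr - the same block row redistributed over the process rows for the trailing-matrix update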

      allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmp1 "//errorMessage
        stop 1
      endif

      allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmp2 "//errorMessage
        stop 1
      endif

      tmp1 = 0
      tmp2 = 0

      allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmatr "//errorMessage
        stop 1
      endif

      allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when allocating tmatc "//errorMessage
        stop 1
      endif

      tmatr = 0
      tmatc = 0
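
      ! Blocked, right-looking Cholesky factorization of the upper triangle.
      ! In every step of the loop below:
      !   1) the process owning the current nblk x nblk diagonal block factors it (PRECISION_POTRF),
      !   2) the packed diagonal block is broadcast and the block row to its right is obtained
      !      from a triangular solve with it (PRECISION_TRSM),
      !   3) the factored block row is broadcast (tmatc) and redistributed into row-distributed
      !      form (tmatr) via elpa_transpose_vectors,
      !   4) the remaining tiles on and above the diagonal are updated by a rank-nblk PRECISION_GEMM.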

      do n = 1, na, nblk
        ! Calculate first local row and column of the still remaining matrix
        ! on the local processor

        l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
        l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

        l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
        l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)

        if (n+nblk > na) then

          ! This is the last step, just do a Cholesky-Factorization
          ! of the remaining block

          if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
            call obj%timer%start("blas")

            call PRECISION_POTRF('U', na-n+1, a(l_row1,l_col1), lda, info)
            call obj%timer%stop("blas")

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_&
              &MATH_DATATYPE&

#if REALCASE == 1
              &: Error in dpotrf: ",info
#endif
#if COMPLEXCASE == 1
              &: Error in zpotrf: ",info
#endif
              success = .false.
              return
            endif

          endif

          exit ! Loop

        endif

        if (my_prow==prow(n, nblk, np_rows)) then

          if (my_pcol==pcol(n, nblk, np_cols)) then

            ! The process owning the upper left remaining block does the
            ! Cholesky-Factorization of this block
            call obj%timer%start("blas")

            call PRECISION_POTRF('U', nblk, a(l_row1,l_col1), lda, info)
            call obj%timer%stop("blas")

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_&
              &MATH_DATATYPE&

#if REALCASE == 1
              &: Error in dpotrf 2: ",info
#endif
#if COMPLEXCASE == 1
              &: Error in zpotrf 2: ",info

#endif
              success = .false.
              return
            endif

            nc = 0
            do i=1,nblk
              tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
              nc = nc+i
            enddo
          endif
#ifdef WITH_MPI
          call obj%timer%start("mpi_communication")

          call MPI_Bcast(tmp1, nblk*(nblk+1)/2, &
#if REALCASE == 1
                         MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                         MPI_COMPLEX_PRECISION, &
#endif
                         pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)

          call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
          nc = 0
          do i=1,nblk
            tmp2(1:i,i) = tmp1(nc+1:nc+i)
            nc = nc+i
          enddo

          call obj%timer%start("blas")
          if (l_cols-l_colx+1>0) &
            call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', nblk, l_cols-l_colx+1, ONE, tmp2, &
                                ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
          call obj%timer%stop("blas")
        endif

        do i=1,nblk

#if REALCASE == 1
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
#endif
#if COMPLEXCASE == 1
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols))
#endif

#ifdef WITH_MPI

          call obj%timer%start("mpi_communication")
          if (l_cols-l_colx+1>0) &
            call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_MATH_DATATYPE_PRECISION, &
                           prow(n, nblk, np_rows), mpi_comm_rows, mpierr)

          call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
        enddo
        ! this has to be checked since it was changed substantially when the code was made type safe
        call elpa_transpose_vectors_&
        &MATH_DATATYPE&
        &_&
        &PRECISION &
        (obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
         tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
         n, na, nblk, nblk, nrThreads)

        do i=0,(na-1)/tile_size
          lcs = max(l_colx,i*l_cols_tile+1)
          lce = min(l_cols,(i+1)*l_cols_tile)
          lrs = l_rowx
          lre = min(l_rows,(i+1)*l_rows_tile)
          if (lce<lcs .or. lre<lrs) cycle
          call obj%timer%start("blas")
          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre-lrs+1, lce-lcs+1, nblk, -ONE, &
                              tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
                              ONE, a(lrs,lcs), lda)
          call obj%timer%stop("blas")

        enddo

      enddo

      deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
        &MATH_DATATYPE&
        &: error when deallocating tmp1 "//errorMessage
        stop 1
      endif
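
      ! At this point the Cholesky factor is stored in the upper triangle of a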

      ! Set the lower triangle to 0, it contains garbage (from the above matrix multiplications)

      do i=1,na
        if (my_pcol==pcol(i, nblk, np_cols)) then
          ! column i is on local processor
          l_col1 = local_index(i  , my_pcol, np_cols, nblk, +1) ! local column number
          l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
          a(l_row1:l_rows,l_col1) = 0
        endif
      enddo

      ! restore original OpenMP settings
#ifdef WITH_OPENMP
      call omp_set_num_threads(omp_threads_caller)
#endif

      call obj%timer%stop("elpa_cholesky_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      &")