1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3
4 This file is part of JDFTx.
5
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with JDFTx. If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19
20 #include <core/BlasExtra.h>
21 #include <cstring>
22
eblas_lincomb_sub(int iMin,int iMax,const complex & sX,const complex * X,const int incX,const complex & sY,const complex * Y,const int incY,complex * Z,const int incZ)23 void eblas_lincomb_sub(int iMin, int iMax,
24 const complex& sX, const complex* X, const int incX,
25 const complex& sY, const complex* Y, const int incY,
26 complex* Z, const int incZ)
27 { for(int i=iMin; i<iMax; i++) Z[i*incZ] = sX*X[i*incX] + sY*Y[i*incY];
28 }
eblas_lincomb(const int N,const complex & sX,const complex * X,const int incX,const complex & sY,const complex * Y,const int incY,complex * Z,const int incZ)29 void eblas_lincomb(const int N,
30 const complex& sX, const complex* X, const int incX,
31 const complex& sY, const complex* Y, const int incY,
32 complex* Z, const int incZ)
33 { if(incZ==0) die("incZ cannot be = 0")
34 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
35 eblas_lincomb_sub, N, sX, X, incX, sY, Y, incY, Z, incZ);
36 }
37
38
eblas_zgemm_sub(size_t iMin,size_t iMax,const CBLAS_TRANSPOSE TransA,const CBLAS_TRANSPOSE TransB,const int M,const int N,const int K,const complex * alpha,const complex * A,const int lda,const complex * B,const int ldb,const complex * beta,complex * C,const int ldc)39 void eblas_zgemm_sub(size_t iMin, size_t iMax,
40 const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
41 const complex* alpha, const complex *A, const int lda, const complex *B, const int ldb,
42 const complex* beta, complex *C, const int ldc)
43 {
44 int Msub, Nsub; const complex *Asub, *Bsub; complex *Csub;
45 if(M>N)
46 { Msub = iMax-iMin;
47 Nsub = N;
48 Asub = A+iMin*(TransA==CblasNoTrans ? 1 : lda);
49 Bsub = B;
50 Csub = C+iMin;
51 }
52 else
53 { Msub = M;
54 Nsub = iMax-iMin;
55 Asub = A;
56 Bsub = B+iMin*(TransB==CblasNoTrans ? ldb : 1);
57 Csub = C+iMin*ldc;
58 }
59 cblas_zgemm(CblasColMajor, TransA, TransB, Msub, Nsub, K, alpha, Asub, lda, Bsub, ldb, beta, Csub, ldc);
60 }
eblas_zgemm(const CBLAS_TRANSPOSE TransA,const CBLAS_TRANSPOSE TransB,const int M,const int N,const int K,const complex & alpha,const complex * A,const int lda,const complex * B,const int ldb,const complex & beta,complex * C,const int ldc)61 void eblas_zgemm(
62 const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
63 const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
64 const complex& beta, complex *C, const int ldc)
65 {
66 #ifdef MKL_ENABLED
67 cblas_zgemm(CblasColMajor, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
68 #else
69 threadLaunch(shouldThreadOperators() ? 0 : 1, //disable threads if called from threaded top-level code
70 eblas_zgemm_sub, std::max(M,N), //parallelize along larger dimension of output
71 TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
72 #endif
73 }
74
75
eblas_scatter_axpy_sub(size_t iStart,size_t iStop,double a,const int * index,const scalar * x,scalar * y)76 template<typename scalar> void eblas_scatter_axpy_sub(size_t iStart, size_t iStop, double a, const int* index, const scalar* x, scalar* y)
77 { for(size_t i=iStart; i<iStop; i++) y[index[i]] += a * x[i];
78 }
eblas_scatter_axpy(const int Nindex,double a,const int * index,const scalar * x,scalar * y)79 template<typename scalar> void eblas_scatter_axpy(const int Nindex, double a, const int* index, const scalar* x, scalar* y)
80 { threadLaunch((Nindex<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
81 eblas_scatter_axpy_sub<scalar>, Nindex, a, index, x, y);
82 }
eblas_scatter_zdaxpy(const int Nindex,double a,const int * index,const complex * x,complex * y)83 void eblas_scatter_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y) { eblas_scatter_axpy<complex>(Nindex, a, index, x, y); }
eblas_scatter_daxpy(const int Nindex,double a,const int * index,const double * x,double * y)84 void eblas_scatter_daxpy(const int Nindex, double a, const int* index, const double* x, double* y) { eblas_scatter_axpy<double>(Nindex, a, index, x, y); }
85
86
eblas_gather_axpy_sub(size_t iStart,size_t iStop,double a,const int * index,const scalar * x,scalar * y)87 template<typename scalar> void eblas_gather_axpy_sub(size_t iStart, size_t iStop, double a, const int* index, const scalar* x, scalar* y)
88 { for(size_t i=iStart; i<iStop; i++) y[i] += a * x[index[i]];
89 }
eblas_gather_axpy(const int Nindex,double a,const int * index,const scalar * x,scalar * y)90 template<typename scalar> void eblas_gather_axpy(const int Nindex, double a, const int* index, const scalar* x, scalar* y)
91 { threadLaunch((Nindex<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
92 eblas_gather_axpy_sub<scalar>, Nindex, a, index, x, y);
93 }
eblas_gather_zdaxpy(const int Nindex,double a,const int * index,const complex * x,complex * y)94 void eblas_gather_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y) { eblas_gather_axpy<complex>(Nindex, a, index, x, y); }
eblas_gather_daxpy(const int Nindex,double a,const int * index,const double * x,double * y)95 void eblas_gather_daxpy(const int Nindex, double a, const int* index, const double* x, double* y) { eblas_gather_axpy<double>(Nindex, a, index, x, y); }
96
97
eblas_accumNorm_sub(size_t iStart,size_t iStop,const double & a,const complex * x,double * y)98 void eblas_accumNorm_sub(size_t iStart, size_t iStop, const double& a, const complex* x, double* y)
99 { for(size_t i=iStart; i<iStop; i++) y[i] += a * x[i].norm();
100 }
eblas_accumNorm(int N,const double & a,const complex * x,double * y)101 void eblas_accumNorm(int N, const double& a, const complex* x, double* y)
102 { threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
103 eblas_accumNorm_sub, N, a, x, y);
104 }
105
106
eblas_symmetrize_sub(size_t iStart,size_t iStop,int n,const int * symmIndex,scalar * x)107 template<typename scalar> void eblas_symmetrize_sub(size_t iStart, size_t iStop, int n, const int* symmIndex, scalar* x)
108 { double nInv = 1./n;
109 for(size_t i=iStart; i<iStop; i++)
110 { scalar xSum=0.0;
111 for(int j=0; j<n; j++) xSum += x[symmIndex[n*i+j]];
112 xSum *= nInv; //average n in the equivalence class
113 for(int j=0; j<n; j++) x[symmIndex[n*i+j]] = xSum;
114 }
115 }
eblas_symmetrize(int N,int n,const int * symmIndex,scalar * x)116 template<typename scalar> void eblas_symmetrize(int N, int n, const int* symmIndex, scalar* x)
117 { threadLaunch((N*n<10000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
118 eblas_symmetrize_sub<scalar>, N, n, symmIndex, x);
119 }
eblas_symmetrize(int N,int n,const int * symmIndex,double * x)120 void eblas_symmetrize(int N, int n, const int* symmIndex, double* x) { eblas_symmetrize<double>(N, n, symmIndex, x); }
eblas_symmetrize(int N,int n,const int * symmIndex,complex * x)121 void eblas_symmetrize(int N, int n, const int* symmIndex, complex* x) { eblas_symmetrize<complex>(N, n, symmIndex, x); }
122
123
124 //BLAS-1 threaded wrappers
125 template<typename T>
eblas_zero_sub(size_t iStart,size_t iStop,T * x)126 void eblas_zero_sub(size_t iStart, size_t iStop, T* x)
127 { memset(x+iStart, 0, (iStop-iStart)*sizeof(T));
128 }
eblas_zero(int N,complex * x)129 void eblas_zero(int N, complex* x)
130 { threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
131 eblas_zero_sub<complex>, N, x);
132 }
eblas_zero(int N,double * x)133 void eblas_zero(int N, double* x)
134 { threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
135 eblas_zero_sub<double>, N, x);
136 }
137
eblas_zscal_sub(size_t iStart,size_t iStop,const complex * a,complex * x,int incx)138 void eblas_zscal_sub(size_t iStart, size_t iStop, const complex* a, complex* x, int incx)
139 { cblas_zscal(iStop-iStart, a, x+incx*iStart, incx);
140 }
eblas_zscal(int N,const complex & a,complex * x,int incx)141 void eblas_zscal(int N, const complex& a, complex* x, int incx)
142 {
143 #ifdef MKL_ENABLED
144 cblas_zscal(N, &a, x, incx);
145 #else
146 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
147 eblas_zscal_sub, N, &a, x, incx);
148 #endif
149 }
eblas_zdscal_sub(size_t iStart,size_t iStop,double a,complex * x,int incx)150 void eblas_zdscal_sub(size_t iStart, size_t iStop, double a, complex* x, int incx)
151 { cblas_zdscal(iStop-iStart, a, x+incx*iStart, incx);
152 }
eblas_zdscal(int N,double a,complex * x,int incx)153 void eblas_zdscal(int N, double a, complex* x, int incx)
154 {
155 #ifdef MKL_ENABLED
156 cblas_zdscal(N, a, x, incx);
157 #else
158 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
159 eblas_zdscal_sub, N, a, x, incx);
160 #endif
161 }
eblas_dscal_sub(size_t iStart,size_t iStop,double a,double * x,int incx)162 void eblas_dscal_sub(size_t iStart, size_t iStop, double a, double* x, int incx)
163 { cblas_dscal(iStop-iStart, a, x+incx*iStart, incx);
164 }
eblas_dscal(int N,double a,double * x,int incx)165 void eblas_dscal(int N, double a, double* x, int incx)
166 {
167 #ifdef MKL_ENABLED
168 cblas_dscal(N, a, x, incx);
169 #else
170 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
171 eblas_dscal_sub, N, a, x, incx);
172 #endif
173 }
174
175
eblas_zaxpy_sub(size_t iStart,size_t iStop,const complex * a,const complex * x,int incx,complex * y,int incy)176 void eblas_zaxpy_sub(size_t iStart, size_t iStop, const complex* a, const complex* x, int incx, complex* y, int incy)
177 { cblas_zaxpy(iStop-iStart, a, x+incx*iStart, incx, y+incy*iStart, incy);
178 }
eblas_zaxpy(int N,const complex & a,const complex * x,int incx,complex * y,int incy)179 void eblas_zaxpy(int N, const complex& a, const complex* x, int incx, complex* y, int incy)
180 {
181 #ifdef MKL_ENABLED
182 cblas_zaxpy(N, &a, x, incx, y, incy);
183 #else
184 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
185 eblas_zaxpy_sub, N, &a, x, incx, y, incy);
186 #endif
187 }
eblas_daxpy_sub(size_t iStart,size_t iStop,double a,const double * x,int incx,double * y,int incy)188 void eblas_daxpy_sub(size_t iStart, size_t iStop, double a, const double* x, int incx, double* y, int incy)
189 { cblas_daxpy(iStop-iStart, a, x+incx*iStart, incx, y+incy*iStart, incy);
190 }
eblas_daxpy(int N,double a,const double * x,int incx,double * y,int incy)191 void eblas_daxpy(int N, double a, const double* x, int incx, double* y, int incy)
192 {
193 #ifdef MKL_ENABLED
194 cblas_daxpy(N, a, x, incx, y, incy);
195 #else
196 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
197 eblas_daxpy_sub, N, a, x, incx, y, incy);
198 #endif
199 }
200
201
eblas_zdotc_sub(size_t iStart,size_t iStop,const complex * x,int incx,const complex * y,int incy,complex * ret,std::mutex * lock)202 void eblas_zdotc_sub(size_t iStart, size_t iStop, const complex* x, int incx, const complex* y, int incy,
203 complex* ret, std::mutex* lock)
204 { //Compute this thread's contribution:
205 complex retSub;
206 cblas_zdotc_sub(iStop-iStart, x+incx*iStart, incx, y+incy*iStart, incy, &retSub);
207 //Accumulate over threads (need sync):
208 lock->lock();
209 *ret += retSub;
210 lock->unlock();
211 }
eblas_zdotc(int N,const complex * x,int incx,const complex * y,int incy)212 complex eblas_zdotc(int N, const complex* x, int incx, const complex* y, int incy)
213 { complex ret = 0.;
214 #ifdef MKL_ENABLED
215 cblas_zdotc_sub(N, x, incx, y, incy, &ret);
216 #else
217 std::mutex lock;
218 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
219 eblas_zdotc_sub, N, x, incx, y, incy, &ret, &lock);
220 #endif
221 return ret;
222 }
223
eblas_ddot_sub(size_t iStart,size_t iStop,const double * x,int incx,const double * y,int incy,double * ret,std::mutex * lock)224 void eblas_ddot_sub(size_t iStart, size_t iStop, const double* x, int incx, const double* y, int incy,
225 double* ret, std::mutex* lock)
226 { //Compute this thread's contribution:
227 double retSub = cblas_ddot(iStop-iStart, x+incx*iStart, incx, y+incy*iStart, incy);
228 //Accumulate over threads (need sync):
229 lock->lock();
230 *ret += retSub;
231 lock->unlock();
232 }
eblas_ddot(int N,const double * x,int incx,const double * y,int incy)233 double eblas_ddot(int N, const double* x, int incx, const double* y, int incy)
234 {
235 #ifdef MKL_ENABLED
236 return cblas_ddot(N, x, incx, y, incy);
237 #else
238 double ret = 0.;
239 std::mutex lock;
240 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
241 eblas_ddot_sub, N, x, incx, y, incy, &ret, &lock);
242 return ret;
243 #endif
244 }
245
eblas_dznrm2_sub(size_t iStart,size_t iStop,const complex * x,int incx,double * ret,std::mutex * lock)246 void eblas_dznrm2_sub(size_t iStart, size_t iStop, const complex* x, int incx, double* ret, std::mutex* lock)
247 { //Compute this thread's contribution:
248 double retSub = cblas_dznrm2(iStop-iStart, x+incx*iStart, incx);
249 //Accumulate over threads (need sync):
250 lock->lock();
251 *ret += retSub*retSub;
252 lock->unlock();
253 }
eblas_dznrm2(int N,const complex * x,int incx)254 double eblas_dznrm2(int N, const complex* x, int incx)
255 {
256 #ifdef MKL_ENABLED
257 return cblas_dznrm2(N, x, incx);
258 #else
259 double ret = 0.;
260 std::mutex lock;
261 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, eblas_dznrm2_sub, N, x, incx, &ret, &lock);
262 return sqrt(ret);
263 #endif
264 }
265
eblas_dnrm2_sub(size_t iStart,size_t iStop,const double * x,int incx,double * ret,std::mutex * lock)266 void eblas_dnrm2_sub(size_t iStart, size_t iStop, const double* x, int incx, double* ret, std::mutex* lock)
267 { //Compute this thread's contribution:
268 double retSub = cblas_dnrm2(iStop-iStart, x+incx*iStart, incx);
269 //Accumulate over threads (need sync):
270 lock->lock();
271 *ret += retSub*retSub;
272 lock->unlock();
273 }
eblas_dnrm2(int N,const double * x,int incx)274 double eblas_dnrm2(int N, const double* x, int incx)
275 {
276 #ifdef MKL_ENABLED
277 return cblas_dnrm2(N, x, incx);
278 #else
279 double ret = 0.;
280 std::mutex lock;
281 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, eblas_dnrm2_sub, N, x, incx, &ret, &lock);
282 return sqrt(ret);
283 #endif
284 }
285
286
287
288 //Min-max:
eblas_capMinMax_sub(size_t iStart,size_t iStop,double * x,double * xMin,double * xMax,double capLo,double capHi,std::mutex * lock)289 void eblas_capMinMax_sub(size_t iStart, size_t iStop,
290 double* x, double* xMin, double* xMax, double capLo, double capHi, std::mutex* lock)
291 { double xMinLoc = +DBL_MAX;
292 double xMaxLoc = -DBL_MAX;
293 for(size_t i=iStart; i<iStop; i++)
294 { if(x[i]<xMinLoc) xMinLoc=x[i];
295 if(x[i]>xMaxLoc) xMaxLoc=x[i];
296 if(x[i]<capLo) x[i]=capLo;
297 if(x[i]>capHi) x[i]=capHi;
298 }
299 lock->lock();
300 if(xMinLoc<*xMin) *xMin=xMinLoc;
301 if(xMaxLoc>*xMax) *xMax=xMaxLoc;
302 lock->unlock();
303 }
eblas_capMinMax(const int N,double * x,double & xMin,double & xMax,double capLo,double capHi)304 void eblas_capMinMax(const int N, double* x, double& xMin, double& xMax, double capLo, double capHi)
305 { xMin = +DBL_MAX;
306 xMax = -DBL_MAX;
307 std::mutex lock;
308 threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
309 eblas_capMinMax_sub, N, x, &xMin, &xMax, capLo, capHi, &lock);
310 }
311