1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 #include <core/BlasExtra.h>
21 #include <cstring>
22 
eblas_lincomb_sub(int iMin,int iMax,const complex & sX,const complex * X,const int incX,const complex & sY,const complex * Y,const int incY,complex * Z,const int incZ)23 void eblas_lincomb_sub(int iMin, int iMax,
24 	const complex& sX, const complex* X, const int incX,
25 	const complex& sY, const complex* Y, const int incY,
26 	complex* Z, const int incZ)
27 {	for(int i=iMin; i<iMax; i++) Z[i*incZ] = sX*X[i*incX] + sY*Y[i*incY];
28 }
eblas_lincomb(const int N,const complex & sX,const complex * X,const int incX,const complex & sY,const complex * Y,const int incY,complex * Z,const int incZ)29 void eblas_lincomb(const int N,
30 	const complex& sX, const complex* X, const int incX,
31 	const complex& sY, const complex* Y, const int incY,
32 	complex* Z, const int incZ)
33 {	if(incZ==0) die("incZ cannot be = 0")
34 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
35 		eblas_lincomb_sub, N, sX, X, incX, sY, Y, incY, Z, incZ);
36 }
37 
38 
eblas_zgemm_sub(size_t iMin,size_t iMax,const CBLAS_TRANSPOSE TransA,const CBLAS_TRANSPOSE TransB,const int M,const int N,const int K,const complex * alpha,const complex * A,const int lda,const complex * B,const int ldb,const complex * beta,complex * C,const int ldc)39 void eblas_zgemm_sub(size_t iMin, size_t iMax,
40 	const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
41 	const complex* alpha, const complex *A, const int lda, const complex *B, const int ldb,
42 	const complex* beta, complex *C, const int ldc)
43 {
44 	int Msub, Nsub; const complex *Asub, *Bsub; complex *Csub;
45 	if(M>N)
46 	{	Msub = iMax-iMin;
47 		Nsub = N;
48 		Asub = A+iMin*(TransA==CblasNoTrans ? 1 : lda);
49 		Bsub = B;
50 		Csub = C+iMin;
51 	}
52 	else
53 	{	Msub = M;
54 		Nsub = iMax-iMin;
55 		Asub = A;
56 		Bsub = B+iMin*(TransB==CblasNoTrans ? ldb : 1);
57 		Csub = C+iMin*ldc;
58 	}
59 	cblas_zgemm(CblasColMajor, TransA, TransB, Msub, Nsub, K, alpha, Asub, lda, Bsub, ldb, beta, Csub, ldc);
60 }
eblas_zgemm(const CBLAS_TRANSPOSE TransA,const CBLAS_TRANSPOSE TransB,const int M,const int N,const int K,const complex & alpha,const complex * A,const int lda,const complex * B,const int ldb,const complex & beta,complex * C,const int ldc)61 void eblas_zgemm(
62 	const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
63 	const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
64 	const complex& beta, complex *C, const int ldc)
65 {
66 	#ifdef MKL_ENABLED
67 	cblas_zgemm(CblasColMajor, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
68 	#else
69 	threadLaunch(shouldThreadOperators() ? 0 : 1, //disable threads if called from threaded top-level code
70 		eblas_zgemm_sub, std::max(M,N), //parallelize along larger dimension of output
71  		TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
72 	#endif
73 }
74 
75 
eblas_scatter_axpy_sub(size_t iStart,size_t iStop,double a,const int * index,const scalar * x,scalar * y)76 template<typename scalar> void eblas_scatter_axpy_sub(size_t iStart, size_t iStop, double a, const int* index, const scalar* x, scalar* y)
77 {	for(size_t i=iStart; i<iStop; i++) y[index[i]] += a * x[i];
78 }
eblas_scatter_axpy(const int Nindex,double a,const int * index,const scalar * x,scalar * y)79 template<typename scalar> void eblas_scatter_axpy(const int Nindex, double a, const int* index, const scalar* x, scalar* y)
80 {	threadLaunch((Nindex<100000 || (!shouldThreadOperators())) ? 1 : 0,  //force single threaded for small problem sizes
81 		eblas_scatter_axpy_sub<scalar>, Nindex, a, index, x, y);
82 }
eblas_scatter_zdaxpy(const int Nindex,double a,const int * index,const complex * x,complex * y)83 void eblas_scatter_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y) { eblas_scatter_axpy<complex>(Nindex, a, index, x, y); }
eblas_scatter_daxpy(const int Nindex,double a,const int * index,const double * x,double * y)84 void eblas_scatter_daxpy(const int Nindex, double a, const int* index, const double* x, double* y) { eblas_scatter_axpy<double>(Nindex, a, index, x, y); }
85 
86 
eblas_gather_axpy_sub(size_t iStart,size_t iStop,double a,const int * index,const scalar * x,scalar * y)87 template<typename scalar> void eblas_gather_axpy_sub(size_t iStart, size_t iStop, double a, const int* index, const scalar* x, scalar* y)
88 {	for(size_t i=iStart; i<iStop; i++) y[i] += a * x[index[i]];
89 }
eblas_gather_axpy(const int Nindex,double a,const int * index,const scalar * x,scalar * y)90 template<typename scalar> void eblas_gather_axpy(const int Nindex, double a, const int* index, const scalar* x, scalar* y)
91 {	threadLaunch((Nindex<100000 || (!shouldThreadOperators())) ? 1 : 0,  //force single threaded for small problem sizes
92 		eblas_gather_axpy_sub<scalar>, Nindex, a, index, x, y);
93 }
eblas_gather_zdaxpy(const int Nindex,double a,const int * index,const complex * x,complex * y)94 void eblas_gather_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y) { eblas_gather_axpy<complex>(Nindex, a, index, x, y); }
eblas_gather_daxpy(const int Nindex,double a,const int * index,const double * x,double * y)95 void eblas_gather_daxpy(const int Nindex, double a, const int* index, const double* x, double* y) { eblas_gather_axpy<double>(Nindex, a, index, x, y); }
96 
97 
eblas_accumNorm_sub(size_t iStart,size_t iStop,const double & a,const complex * x,double * y)98 void eblas_accumNorm_sub(size_t iStart, size_t iStop, const double& a, const complex* x, double* y)
99 {	for(size_t i=iStart; i<iStop; i++) y[i] += a * x[i].norm();
100 }
eblas_accumNorm(int N,const double & a,const complex * x,double * y)101 void eblas_accumNorm(int N, const double& a, const complex* x, double* y)
102 {	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
103 		eblas_accumNorm_sub, N, a, x, y);
104 }
105 
106 
eblas_symmetrize_sub(size_t iStart,size_t iStop,int n,const int * symmIndex,scalar * x)107 template<typename scalar> void eblas_symmetrize_sub(size_t iStart, size_t iStop, int n, const int* symmIndex, scalar* x)
108 {	double nInv = 1./n;
109 	for(size_t i=iStart; i<iStop; i++)
110 	{	scalar xSum=0.0;
111 		for(int j=0; j<n; j++) xSum += x[symmIndex[n*i+j]];
112 		xSum *= nInv; //average n in the equivalence class
113 		for(int j=0; j<n; j++) x[symmIndex[n*i+j]] = xSum;
114 	}
115 }
eblas_symmetrize(int N,int n,const int * symmIndex,scalar * x)116 template<typename scalar> void eblas_symmetrize(int N, int n, const int* symmIndex, scalar* x)
117 {	threadLaunch((N*n<10000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
118 		eblas_symmetrize_sub<scalar>, N, n, symmIndex, x);
119 }
eblas_symmetrize(int N,int n,const int * symmIndex,double * x)120 void eblas_symmetrize(int N, int n, const int* symmIndex, double* x) { eblas_symmetrize<double>(N, n, symmIndex, x); }
eblas_symmetrize(int N,int n,const int * symmIndex,complex * x)121 void eblas_symmetrize(int N, int n, const int* symmIndex, complex* x) { eblas_symmetrize<complex>(N, n, symmIndex, x); }
122 
123 
124 //BLAS-1 threaded wrappers
125 template<typename T>
eblas_zero_sub(size_t iStart,size_t iStop,T * x)126 void eblas_zero_sub(size_t iStart, size_t iStop, T* x)
127 {	memset(x+iStart, 0, (iStop-iStart)*sizeof(T));
128 }
eblas_zero(int N,complex * x)129 void eblas_zero(int N, complex* x)
130 {	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
131 		eblas_zero_sub<complex>, N, x);
132 }
eblas_zero(int N,double * x)133 void eblas_zero(int N, double* x)
134 {	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
135 		eblas_zero_sub<double>, N, x);
136 }
137 
eblas_zscal_sub(size_t iStart,size_t iStop,const complex * a,complex * x,int incx)138 void eblas_zscal_sub(size_t iStart, size_t iStop, const complex* a, complex* x, int incx)
139 {	cblas_zscal(iStop-iStart, a, x+incx*iStart, incx);
140 }
eblas_zscal(int N,const complex & a,complex * x,int incx)141 void eblas_zscal(int N, const complex& a, complex* x, int incx)
142 {
143 	#ifdef MKL_ENABLED
144 	cblas_zscal(N, &a, x, incx);
145 	#else
146 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
147 		eblas_zscal_sub, N, &a, x, incx);
148 	#endif
149 }
eblas_zdscal_sub(size_t iStart,size_t iStop,double a,complex * x,int incx)150 void eblas_zdscal_sub(size_t iStart, size_t iStop, double a, complex* x, int incx)
151 {	cblas_zdscal(iStop-iStart, a, x+incx*iStart, incx);
152 }
eblas_zdscal(int N,double a,complex * x,int incx)153 void eblas_zdscal(int N, double a, complex* x, int incx)
154 {
155 	#ifdef MKL_ENABLED
156 	cblas_zdscal(N, a, x, incx);
157 	#else
158 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
159 		eblas_zdscal_sub, N, a, x, incx);
160 	#endif
161 }
eblas_dscal_sub(size_t iStart,size_t iStop,double a,double * x,int incx)162 void eblas_dscal_sub(size_t iStart, size_t iStop, double a, double* x, int incx)
163 {	cblas_dscal(iStop-iStart, a, x+incx*iStart, incx);
164 }
eblas_dscal(int N,double a,double * x,int incx)165 void eblas_dscal(int N, double a, double* x, int incx)
166 {
167 	#ifdef MKL_ENABLED
168 	cblas_dscal(N, a, x, incx);
169 	#else
170 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
171 		eblas_dscal_sub, N, a, x, incx);
172 	#endif
173 }
174 
175 
eblas_zaxpy_sub(size_t iStart,size_t iStop,const complex * a,const complex * x,int incx,complex * y,int incy)176 void eblas_zaxpy_sub(size_t iStart, size_t iStop, const complex* a, const complex* x, int incx, complex* y, int incy)
177 {	cblas_zaxpy(iStop-iStart, a, x+incx*iStart, incx, y+incy*iStart, incy);
178 }
eblas_zaxpy(int N,const complex & a,const complex * x,int incx,complex * y,int incy)179 void eblas_zaxpy(int N, const complex& a, const complex* x, int incx, complex* y, int incy)
180 {
181 	#ifdef MKL_ENABLED
182 	cblas_zaxpy(N, &a, x, incx, y, incy);
183 	#else
184 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
185 		eblas_zaxpy_sub, N, &a, x, incx, y, incy);
186 	#endif
187 }
eblas_daxpy_sub(size_t iStart,size_t iStop,double a,const double * x,int incx,double * y,int incy)188 void eblas_daxpy_sub(size_t iStart, size_t iStop, double a, const double* x, int incx, double* y, int incy)
189 {	cblas_daxpy(iStop-iStart, a, x+incx*iStart, incx, y+incy*iStart, incy);
190 }
eblas_daxpy(int N,double a,const double * x,int incx,double * y,int incy)191 void eblas_daxpy(int N, double a, const double* x, int incx, double* y, int incy)
192 {
193 	#ifdef MKL_ENABLED
194 	cblas_daxpy(N, a, x, incx, y, incy);
195 	#else
196 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
197 		eblas_daxpy_sub, N, a, x, incx, y, incy);
198 	#endif
199 }
200 
201 
eblas_zdotc_sub(size_t iStart,size_t iStop,const complex * x,int incx,const complex * y,int incy,complex * ret,std::mutex * lock)202 void eblas_zdotc_sub(size_t iStart, size_t iStop, const complex* x, int incx, const complex* y, int incy,
203 	complex* ret, std::mutex* lock)
204 {	//Compute this thread's contribution:
205 	complex retSub;
206 	cblas_zdotc_sub(iStop-iStart, x+incx*iStart, incx, y+incy*iStart, incy, &retSub);
207 	//Accumulate over threads (need sync):
208 	lock->lock();
209 	*ret += retSub;
210 	lock->unlock();
211 }
eblas_zdotc(int N,const complex * x,int incx,const complex * y,int incy)212 complex eblas_zdotc(int N, const complex* x, int incx, const complex* y, int incy)
213 {	complex ret = 0.;
214 	#ifdef MKL_ENABLED
215 	cblas_zdotc_sub(N, x, incx, y, incy, &ret);
216 	#else
217 	std::mutex lock;
218 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
219 		eblas_zdotc_sub, N, x, incx, y, incy, &ret, &lock);
220 	#endif
221 	return ret;
222 }
223 
eblas_ddot_sub(size_t iStart,size_t iStop,const double * x,int incx,const double * y,int incy,double * ret,std::mutex * lock)224 void eblas_ddot_sub(size_t iStart, size_t iStop, const double* x, int incx, const double* y, int incy,
225 	double* ret, std::mutex* lock)
226 {	//Compute this thread's contribution:
227 	double retSub = cblas_ddot(iStop-iStart, x+incx*iStart, incx, y+incy*iStart, incy);
228 	//Accumulate over threads (need sync):
229 	lock->lock();
230 	*ret += retSub;
231 	lock->unlock();
232 }
eblas_ddot(int N,const double * x,int incx,const double * y,int incy)233 double eblas_ddot(int N, const double* x, int incx, const double* y, int incy)
234 {
235 	#ifdef MKL_ENABLED
236 	return cblas_ddot(N, x, incx, y, incy);
237 	#else
238 	double ret = 0.;
239 	std::mutex lock;
240 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0,
241 		eblas_ddot_sub, N, x, incx, y, incy, &ret, &lock);
242 	return ret;
243 	#endif
244 }
245 
eblas_dznrm2_sub(size_t iStart,size_t iStop,const complex * x,int incx,double * ret,std::mutex * lock)246 void eblas_dznrm2_sub(size_t iStart, size_t iStop, const complex* x, int incx, double* ret, std::mutex* lock)
247 {	//Compute this thread's contribution:
248 	double retSub = cblas_dznrm2(iStop-iStart, x+incx*iStart, incx);
249 	//Accumulate over threads (need sync):
250 	lock->lock();
251 	*ret += retSub*retSub;
252 	lock->unlock();
253 }
eblas_dznrm2(int N,const complex * x,int incx)254 double eblas_dznrm2(int N, const complex* x, int incx)
255 {
256 	#ifdef MKL_ENABLED
257 	return cblas_dznrm2(N, x, incx);
258 	#else
259 	double ret = 0.;
260 	std::mutex lock;
261 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, eblas_dznrm2_sub, N, x, incx, &ret, &lock);
262 	return sqrt(ret);
263 	#endif
264 }
265 
eblas_dnrm2_sub(size_t iStart,size_t iStop,const double * x,int incx,double * ret,std::mutex * lock)266 void eblas_dnrm2_sub(size_t iStart, size_t iStop, const double* x, int incx, double* ret, std::mutex* lock)
267 {	//Compute this thread's contribution:
268 	double retSub = cblas_dnrm2(iStop-iStart, x+incx*iStart, incx);
269 	//Accumulate over threads (need sync):
270 	lock->lock();
271 	*ret += retSub*retSub;
272 	lock->unlock();
273 }
eblas_dnrm2(int N,const double * x,int incx)274 double eblas_dnrm2(int N, const double* x, int incx)
275 {
276 	#ifdef MKL_ENABLED
277 	return cblas_dnrm2(N, x, incx);
278 	#else
279 	double ret = 0.;
280 	std::mutex lock;
281 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, eblas_dnrm2_sub, N, x, incx, &ret, &lock);
282 	return sqrt(ret);
283 	#endif
284 }
285 
286 
287 
288 //Min-max:
eblas_capMinMax_sub(size_t iStart,size_t iStop,double * x,double * xMin,double * xMax,double capLo,double capHi,std::mutex * lock)289 void eblas_capMinMax_sub(size_t iStart, size_t iStop,
290 	double* x, double* xMin, double* xMax, double capLo, double capHi, std::mutex* lock)
291 {	double xMinLoc = +DBL_MAX;
292 	double xMaxLoc = -DBL_MAX;
293 	for(size_t i=iStart; i<iStop; i++)
294 	{	if(x[i]<xMinLoc) xMinLoc=x[i];
295 		if(x[i]>xMaxLoc) xMaxLoc=x[i];
296 		if(x[i]<capLo) x[i]=capLo;
297 		if(x[i]>capHi) x[i]=capHi;
298 	}
299 	lock->lock();
300 	if(xMinLoc<*xMin) *xMin=xMinLoc;
301 	if(xMaxLoc>*xMax) *xMax=xMaxLoc;
302 	lock->unlock();
303 }
eblas_capMinMax(const int N,double * x,double & xMin,double & xMax,double capLo,double capHi)304 void eblas_capMinMax(const int N, double* x, double& xMin, double& xMax, double capLo, double capHi)
305 {	xMin = +DBL_MAX;
306 	xMax = -DBL_MAX;
307 	std::mutex lock;
308 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
309 		eblas_capMinMax_sub, N, x, &xMin, &xMax, capLo, capHi, &lock);
310 }
311