1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 #ifndef JDFTX_CORE_BLASEXTRA_H
21 #define JDFTX_CORE_BLASEXTRA_H
22 
23 /** @file BlasExtra.h
24 @brief Commonly used BLAS-like routines
25 */
26 
27 #include <gsl/gsl_cblas.h>
28 #include <cstdlib>
29 #include <cstdio>
30 #include <cfloat>
31 #include <core/scalar.h>
32 #include <core/Thread.h>
33 
34 #ifdef GPU_ENABLED
35 #include <cublas.h>
36 #include <cuda_runtime.h>
37 #endif
38 
39 /** @brief Templated elementwise multiply Y *= X for arrays X, Y
40 @tparam Ty primiitive data-type for array Y
41 @tparam Tx primiitive data-type for array X
42 @param N number of elements in X and Y
43 @param X pointer to the first element of array X
44 @param incX stride along the X array
45 @param Y pointer to the first element of array Y
46 @param incY stride along the Y array (must be non-zero)
47 */
48 template<typename Ty, typename Tx> void eblas_mul(const int N, const Tx* X, const int incX, Ty* Y, const int incY);
49 
50 //!@brief Specialization of #eblas_mul for double[] *= double[]
eblas_dmul(const int N,const double * X,const int incX,double * Y,const int incY)51 inline void eblas_dmul(const int N, const double* X, const int incX, double* Y, const int incY) { eblas_mul(N,X,incX,Y,incY); }
52 //!@brief Specialization of #eblas_mul for complex[] *= complex[]
eblas_zmul(const int N,const complex * X,const int incX,complex * Y,const int incY)53 inline void eblas_zmul(const int N, const complex* X, const int incX, complex* Y, const int incY) { eblas_mul(N,X,incX,Y,incY); }
54 //!@brief Specialization of #eblas_mul for complex[] *= double[]
eblas_zmuld(const int N,const double * X,const int incX,complex * Y,const int incY)55 inline void eblas_zmuld(const int N, const double* X, const int incX, complex* Y, const int incY) { eblas_mul(N,X,incX,Y,incY); }
56 #ifdef GPU_ENABLED
57 //GPU versions of the above functions implemented in BlasExtra.cu
58 //!@brief Equivalent of #eblas_dmul for GPU data pointers
59 void eblas_dmul_gpu(const int N, const double* X, const int incX, double* Y, const int incY);
60 //!@brief Equivalent of #eblas_zmul for GPU data pointers
61 void eblas_zmul_gpu(const int N, const complex* X, const int incX, complex* Y, const int incY);
62 //!@brief Equivalent of #eblas_zmuld for GPU data pointers
63 void eblas_zmuld_gpu(const int N, const double* X, const int incX, complex* Y, const int incY);
64 #endif
65 
66 /** @brief Templated elementwise divide Y /= X for arrays X, Y
67 @tparam Ty primiitive data-type for array Y
68 @tparam Tx primiitive data-type for array X
69 @param N number of elements in X and Y
70 @param X pointer to the first element of array X
71 @param incX stride along the X array
72 @param Y pointer to the first element of array Y
73 @param incY stride along the Y array (must be non-zero)
74 */
75 template<typename Ty, typename Tx> void eblas_div(const int N, const Tx* X, const int incX, Ty* Y, const int incY);
76 
77 //!@brief Specialization of #eblas_div for double[] /= double[]
eblas_ddiv(const int N,const double * X,const int incX,double * Y,const int incY)78 inline void eblas_ddiv(const int N, const double* X, const int incX, double* Y, const int incY) { eblas_div(N,X,incX,Y,incY); }
79 //!@brief Specialization of #eblas_div for #complex[] /= #complex[]
eblas_zdiv(const int N,const complex * X,const int incX,complex * Y,const int incY)80 inline void eblas_zdiv(const int N, const complex* X, const int incX, complex* Y, const int incY) { eblas_div(N,X,incX,Y,incY); }
81 //!@brief Specialization of #eblas_div for #complex[] /= double[]
eblas_zdivd(const int N,const double * X,const int incX,complex * Y,const int incY)82 inline void eblas_zdivd(const int N, const double* X, const int incX, complex* Y, const int incY) { eblas_div(N,X,incX,Y,incY); }
83 #ifdef GPU_ENABLED
84 //GPU versions of the above functions implemented in BlasExtra.cu
85 //!@brief Equivalent of #eblas_ddiv for GPU data pointers
86 void eblas_ddiv_gpu(const int N, const double* X, const int incX, double* Y, const int incY);
87 //!@brief Equivalent of #eblas_zdiv for GPU data pointers
88 void eblas_zdiv_gpu(const int N, const complex* X, const int incX, complex* Y, const int incY);
89 //!@brief Equivalent of #eblas_zdivd for GPU data pointers
90 void eblas_zdivd_gpu(const int N, const double* X, const int incX, complex* Y, const int incY);
91 #endif
92 
93 //! Elementwise linear combination Z = sX * X + sY * Y
94 void eblas_lincomb(const int N,
95 	const complex& sX, const complex* X, const int incX,
96 	const complex& sY, const complex* Y, const int incY,
97 	complex* Z, const int incZ);
98 
99 #ifdef GPU_ENABLED
100 //! Elementwise linear combination Z = sX * X + sY * Y (gpu version)
101 void eblas_lincomb_gpu(const int N,
102 	const complex& sX, const complex* X, const int incX,
103 	const complex& sY, const complex* Y, const int incY,
104 	complex* Z, const int incZ);
105 #endif
106 
107 /** @brief Threaded complex matrix multiply (threaded wrapper around zgemm)
108 All the parameters have the same meaning as in zgemm, except element order is always Column Major (FORTRAN order!)
109 */
110 void eblas_zgemm(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K,
111 	const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
112 	const complex& beta, complex *C, const int ldc);
113 #ifdef GPU_ENABLED
114 //! cublasZgemm wrapper to overload eblas_zgemm
115 void eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K,
116 	const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
117 	const complex& beta, complex *C, const int ldc);
118 #endif
119 
120 //Sparse<->dense vector operations:
121 void eblas_scatter_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y); //!< Scatter y(index) = x (in Octave notation)
122 void eblas_scatter_daxpy(const int Nindex, double a, const int* index, const double* x, double* y); //!< Scatter y(index) = x (in Octave notation)
123 void eblas_gather_zdaxpy(const int Nindex, double a, const int* index, const complex* x, complex* y); //!< Gather y = x(index) (in Octave notation)
124 void eblas_gather_daxpy(const int Nindex, double a, const int* index, const double* x, double* y); //!< Gather y = x(index) (in Octave notation)
125 #ifdef GPU_ENABLED
126 void eblas_scatter_zdaxpy_gpu(const int Nindex, double a, const int* index, const complex* x, complex* y); //!< Scatter y(index) += a*x (in Octave notation)
127 void eblas_scatter_daxpy_gpu(const int Nindex, double a, const int* index, const double* x, double* y); //!< Scatter y(index) += a*x (in Octave notation)
128 void eblas_gather_zdaxpy_gpu(const int Nindex, double a, const int* index, const complex* x, complex* y); //!< Gather y += a*x(index) (in Octave notation)
129 void eblas_gather_daxpy_gpu(const int Nindex, double a, const int* index, const double* x, double* y); //!< Gather y += a*x(index) (in Octave notation)
130 #endif
131 
132 
133 //! Elementwise y += a|x|^2
134 void eblas_accumNorm(int N, const double& a, const complex* x, double* y);
135 #ifdef GPU_ENABLED
136 //! Elementwise y += a|x|^2 on the GPU
137 void eblas_accumNorm_gpu(int N, const double& a, const complex* x, double* y);
138 #endif
139 
140 void eblas_symmetrize(int N, int n, const int* symmIndex, double* x); //!< Symmetrize an array x, using N n-fold equivalence classes in symmIndex
141 void eblas_symmetrize(int N, int n, const int* symmIndex, complex* x); //!< Symmetrize an array x, using N n-fold equivalence classes in symmIndex
142 #ifdef GPU_ENABLED
143 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, double* x); //!< Symmetrize an array x, using N n-fold equivalence classes in symmIndex
144 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, complex* x); //!< Symmetrize an array x, using N n-fold equivalence classes in symmIndex
145 #endif
146 
147 //Threaded-wrappers for BLAS1 functions (Cblas)
eblas_copy(T * dest,const T * src,int N)148 template<typename T> void eblas_copy(T* dest, const T* src, int N) { memcpy(dest, src, N*sizeof(T)); }
149 void eblas_zero(int N, double* x); //!< wraps memset
150 void eblas_zero(int N, complex* x); //!< wraps memset
151 void eblas_dscal(int N, double a, double* x, int incx); //!< wraps cblas_dscal
152 void eblas_zdscal(int N, double a, complex* x, int incx); //!< wraps cblas_zdscal
153 void eblas_zscal(int N, const complex& a, complex* x, int incx); //!< wraps cblas_zscal
154 void eblas_daxpy(int N, double a, const double* x, int incx, double* y, int incy); //!< wraps cblas_daxpy
155 void eblas_zaxpy(int N, const complex& a, const complex* x, int incx, complex* y, int incy); //!< wraps cblas_zaxpy
156 complex eblas_zdotc(int N, const complex* x, int incx, const complex* y, int incy); //!< wraps cblas_zdotc_sub
157 double eblas_ddot(int N, const double* x, int incx, const double* y, int incy); //!< wraps cblas_ddot
158 double eblas_dznrm2(int N, const complex* x, int incx); //!< wraps cblas_dznrm2
159 double eblas_dnrm2(int N, const double* x, int incx); //!< wraps cblas_dnrm2
160 
161 //Wrappers/alternate names for CUBLAS functions (auto-selectable with Cblas ones above using callPref)
162 #ifdef GPU_ENABLED
eblas_copy_gpu(T * dest,const T * src,int N)163 template<typename T> void eblas_copy_gpu(T* dest, const T* src, int N) { cudaMemcpy(dest, src, N*sizeof(T), cudaMemcpyDeviceToDevice); }
164 void eblas_zero_gpu(int N, double* x); //!< wraps cudaMemset
165 void eblas_zero_gpu(int N, complex* x); //!< wraps cudaMemset
166 #define eblas_dscal_gpu cublasDscal
167 void eblas_zdscal_gpu(int N, double a, complex* x, int incx); //!< wraps cublasZdscal
168 void eblas_zscal_gpu(int N, const complex& a, complex* x, int incx); //!< wraps cublasZscal
169 #define eblas_daxpy_gpu cublasDaxpy
170 void eblas_zaxpy_gpu(int N, const complex& a, const complex* x, int incx, complex* y, int incy); //!< wraps cublasZaxpy
171 complex eblas_zdotc_gpu(int N, const complex* x, int incx, const complex* y, int incy); //!< wraps cublasZdotc
172 #define eblas_ddot_gpu cublasDdot
173 double eblas_dznrm2_gpu(int N, const complex* x, int incx); //!< wraps cublasDznrm2
174 #define eblas_dnrm2_gpu cublasDnrm2
175 
176 #endif
177 
178 #ifdef GPU_ENABLED
179 	#define callPref(functionName) functionName##_gpu //gpu versions available, so use them preferentially
180 #else
181 	#define callPref(functionName) functionName //only cpu version available
182 #endif
183 
184 
185 //! Find the minimum, xMin, and maximum, xMax, of an array x, and optionally restrict its range to [capLo,capHi]
186 void eblas_capMinMax(const int N, double* x, double& xMin, double& xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX);
187 #ifdef GPU_ENABLED
188 //! Equivalent of eblas_capMinMax for the gpu
189 void eblas_capMinMax_gpu(const int N, double* x, double& xMin, double& xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX);
190 #endif
191 
192 
193 //###################################################################################################
194 //####  Implementation  ####
195 //##########################
196 
197 //!@cond
198 
199 //Elementwise multiply implementation:
200 template<typename Ty, typename Tx>
eblas_mul_sub(size_t iMin,size_t iMax,const Tx * X,const int incX,Ty * Y,const int incY)201 void eblas_mul_sub(size_t iMin, size_t iMax, const Tx* X, const int incX, Ty* Y, const int incY)
202 {	for(size_t i=iMin; i<iMax; i++) Y[incY*i] *= X[incX*i];
203 }
204 
205 template<typename Ty, typename Tx>
eblas_mul(const int N,const Tx * X,const int incX,Ty * Y,const int incY)206 void eblas_mul(const int N, const Tx* X, const int incX, Ty* Y, const int incY)
207 {	if(incY==0) die("incY cannot be = 0")
208 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
209 		eblas_mul_sub<Ty,Tx>, N, X, incX, Y, incY);
210 }
211 
212 //Elementwise divide implementation:
213 template<typename Ty, typename Tx>
eblas_div_sub(size_t iMin,size_t iMax,const Tx * X,const int incX,Ty * Y,const int incY)214 void eblas_div_sub(size_t iMin, size_t iMax, const Tx* X, const int incX, Ty* Y, const int incY)
215 {	for(size_t i=iMin; i<iMax; i++) Y[incY*i] /= X[incX*i];
216 }
217 template<typename Ty, typename Tx>
eblas_div(const int N,const Tx * X,const int incX,Ty * Y,const int incY)218 void eblas_div(const int N, const Tx* X, const int incX, Ty* Y, const int incY)
219 {	if(incY==0) die("incY cannot be = 0")
220 	threadLaunch((N<100000 || (!shouldThreadOperators())) ? 1 : 0, //force single threaded for small problem sizes
221 		eblas_div_sub<Ty,Tx>, N, X, incX, Y, incY);
222 }
223 //!@endcond
224 
225 #endif // JDFTX_CORE_BLASEXTRA_H
226