1 //////////////////////////////////////////////////////////////////
2 // (c) Copyright 2003-  by Ken Esler and Jeongnim Kim           //
3 //////////////////////////////////////////////////////////////////
4 //   National Center for Supercomputing Applications &          //
5 //   Materials Computation Center                               //
6 //   University of Illinois, Urbana-Champaign                   //
7 //   Urbana, IL 61801                                           //
8 //   e-mail: jnkim@ncsa.uiuc.edu                                //
9 //                                                              //
10 // Supported by                                                 //
11 //   National Center for Supercomputing Applications, UIUC      //
12 //   Materials Computation Center, UIUC                         //
13 //////////////////////////////////////////////////////////////////
14 /** helper functions for EinsplineSetBuilder
15  */
16 #ifndef QMCPLUSPLUS_EINSPLINEBUILDER_HELPER_H
17 #define QMCPLUSPLUS_EINSPLINEBUILDER_HELPER_H
18 #include <complex>
19 #include "OhmmsPETE/TinyVector.h"
20 #include "OhmmsPETE/OhmmsVector.h"
21 #include "OhmmsPETE/OhmmsArray.h"
22 #include "mpi/collectives.h"
23 #include "CPU/SIMD/simd.hpp"
24 #include "config/stdlib/math.hpp"
25 
26 namespace qmcplusplus
27 {
28 /** unpack packed cG to fftbox
29    * @param cG packed vector
30    * @param gvecs g-coordinate for cG[i]
31    * @param maxg  fft grid
32    * @param fftbox unpacked data to be transformed
33    */
34 template<typename T>
unpack4fftw(const Vector<std::complex<T>> & cG,const std::vector<TinyVector<int,3>> & gvecs,const TinyVector<int,3> & maxg,Array<std::complex<T>,3> & fftbox)35 inline void unpack4fftw(const Vector<std::complex<T>>& cG,
36                         const std::vector<TinyVector<int, 3>>& gvecs,
37                         const TinyVector<int, 3>& maxg,
38                         Array<std::complex<T>, 3>& fftbox)
39 {
40   fftbox                   = std::complex<T>();
41   const int upper_bound[3] = {(maxg[0] - 1) / 2, (maxg[1] - 1) / 2, (maxg[2] - 1) / 2};
42   const int lower_bound[3] = {upper_bound[0] - maxg[0] + 1, upper_bound[1] - maxg[1] + 1, upper_bound[2] - maxg[2] + 1};
43   //only coefficient indices between [lower_bound,upper_bound] are taken for FFT.
44   //this is rather unsafe
45   //#pragma omp parallel for
46   for (int iG = 0; iG < cG.size(); iG++)
47   {
48     if (gvecs[iG][0] > upper_bound[0] || gvecs[iG][0] < lower_bound[0] || gvecs[iG][1] > upper_bound[1] ||
49         gvecs[iG][1] < lower_bound[1] || gvecs[iG][2] > upper_bound[2] || gvecs[iG][2] < lower_bound[2])
50     {
51       //std::cout << "Warning: cG out of bound "
52       //          << "x " << gvecs[iG][0]    << " y " << gvecs[iG][1]    << " z " << gvecs[iG][2] << std::endl
53       //          << "xu " << upper_bound[0] << " yu " << upper_bound[1] << " zu " << upper_bound[2] << std::endl
54       //          << "xd " << lower_bound[0] << " yd " << lower_bound[1] << " zd " << lower_bound[2] << std::endl;
55       continue;
56     }
57     fftbox((gvecs[iG][0] + maxg[0]) % maxg[0], (gvecs[iG][1] + maxg[1]) % maxg[1], (gvecs[iG][2] + maxg[2]) % maxg[2]) =
58         cG[iG];
59   }
60 }
61 
62 /** fix phase
63    * @param in input complex data
64    * @param out output real data
65    * @param twist vector to correct phase
66    */
67 template<typename T, typename T1, typename T2>
fix_phase_c2r(const Array<std::complex<T>,3> & in,Array<T1,3> & out,const TinyVector<T2,3> & twist)68 inline void fix_phase_c2r(const Array<std::complex<T>, 3>& in, Array<T1, 3>& out, const TinyVector<T2, 3>& twist)
69 {
70   const T1 two_pi = -2.0 * M_PI;
71   const int nx    = in.size(0);
72   const int ny    = in.size(1);
73   const int nz    = in.size(2);
74   T1 nx_i         = static_cast<T1>(twist[0]) / static_cast<T1>(nx);
75   T1 ny_i         = static_cast<T1>(twist[1]) / static_cast<T1>(ny);
76   T1 nz_i         = static_cast<T1>(twist[2]) / static_cast<T1>(nz);
77 
78 #pragma omp parallel for firstprivate(nx_i, ny_i, nz_i)
79   for (int ix = 0; ix < nx; ix++)
80   {
81     T1 s, c;
82     const std::complex<T>* restrict in_ptr = in.data() + ix * ny * nz;
83     T1* restrict out_ptr                   = out.data() + ix * ny * nz;
84 
85     T1 rux = static_cast<T1>(ix) * nx_i;
86     for (int iy = 0; iy < ny; iy++)
87     {
88       T1 ruy = static_cast<T1>(iy) * ny_i;
89       for (int iz = 0; iz < nz; iz++)
90       {
91         T1 ruz = static_cast<T1>(iz) * nz_i;
92         qmcplusplus::sincos(two_pi * (rux + ruy + ruz), &s, &c);
93         *out_ptr = static_cast<T1>(c * in_ptr->real() - s * in_ptr->imag());
94         ++out_ptr;
95         ++in_ptr;
96       }
97     }
98   }
99   //#pragma omp parallel for
100   //          for (int ix=0; ix<nx; ix++) {
101   //            PosType ru;
102   //            ru[0] = (RealType)ix / (RealType)nx;
103   //            for (int iy=0; iy<ny; iy++) {
104   //              ru[1] = (RealType)iy / (RealType)ny;
105   //              for (int iz=0; iz<nz; iz++) {
106   //                ru[2] = (RealType)iz / (RealType)nz;
107   //                double phi = -2.0*M_PI*dot (ru, TwistAngles[ti]);
108   //                double s, c;
109   //                qmcplusplus::sincos(phi, &s, &c);
110   //                std::complex<double> phase(c,s);
111   //                std::complex<double> z = phase*rawData(ix,iy,iz);
112   //                splineData(ix,iy,iz) = z.real();
113   //              }
114   //            }
115   //          }
116 }
117 
118 
119 /** rotate the state after 3dfft
120    *
121    * First, add the eikr phase factor.
122    * Then, rotate the phase of the orbitals so that neither
123    * the real part nor the imaginary part are very near
124    * zero.  This sometimes happens in crystals with high
125    * symmetry at special k-points.
126    */
127 template<typename T, typename T1, typename T2>
fix_phase_rotate_c2r(Array<std::complex<T>,3> & in,Array<T1,3> & out,const TinyVector<T2,3> & twist,T & phase_r,T & phase_i)128 inline void fix_phase_rotate_c2r(Array<std::complex<T>, 3>& in,
129                                  Array<T1, 3>& out,
130                                  const TinyVector<T2, 3>& twist,
131                                  T& phase_r,
132                                  T& phase_i)
133 {
134   const T two_pi = -2.0 * M_PI;
135   const int nx   = in.size(0);
136   const int ny   = in.size(1);
137   const int nz   = in.size(2);
138   T nx_i         = 1.0 / static_cast<T>(nx);
139   T ny_i         = 1.0 / static_cast<T>(ny);
140   T nz_i         = 1.0 / static_cast<T>(nz);
141 
142   T rNorm = 0.0, iNorm = 0.0, riNorm = 0.0;
143 #pragma omp parallel for reduction(+ : rNorm, iNorm, riNorm)
144   for (int ix = 0; ix < nx; ix++)
145   {
146     T s, c;
147     std::complex<T>* restrict in_ptr = in.data() + ix * ny * nz;
148     T rux                            = static_cast<T>(ix) * nx_i * twist[0];
149     for (int iy = 0; iy < ny; iy++)
150     {
151       T ruy = static_cast<T>(iy) * ny_i * twist[1];
152       for (int iz = 0; iz < nz; iz++)
153       {
154         T ruz = static_cast<T>(iz) * nz_i * twist[2];
155         qmcplusplus::sincos(two_pi * (rux + ruy + ruz), &s, &c);
156         std::complex<T> eikr(c, s);
157         *in_ptr *= eikr;
158         rNorm += in_ptr->real() * in_ptr->real();
159         iNorm += in_ptr->imag() * in_ptr->imag();
160         riNorm += in_ptr->real() * in_ptr->imag();
161         ++in_ptr;
162       }
163     }
164   }
165 
166   const T x   = (rNorm - iNorm) / riNorm;
167   const T y   = 1.0 / std::sqrt(x * x + 4.0);
168   const T phs = std::sqrt(0.5 - y);
169   phase_r     = phs;
170   phase_i     = (x < 0) ? std::sqrt(1.0 - phs * phs) : -std::sqrt(1.0 - phs * phs);
171 
172 #pragma omp parallel for
173   for (int ix = 0; ix < nx; ix++)
174   {
175     const std::complex<T>* restrict in_ptr = in.data() + ix * ny * nz;
176     T1* restrict out_ptr                   = out.data() + ix * ny * nz;
177     for (int iy = 0; iy < ny; iy++)
178       for (int iz = 0; iz < nz; iz++)
179       {
180         *out_ptr = static_cast<T1>(phase_r * in_ptr->real() - phase_i * in_ptr->imag());
181         ++in_ptr;
182         ++out_ptr;
183       }
184   }
185 }
186 
187 template<typename T, typename T1, typename T2>
fix_phase_rotate_c2c(const Array<std::complex<T>,3> & in,Array<std::complex<T1>,3> & out,const TinyVector<T2,3> & twist)188 inline void fix_phase_rotate_c2c(const Array<std::complex<T>, 3>& in,
189                                  Array<std::complex<T1>, 3>& out,
190                                  const TinyVector<T2, 3>& twist)
191 {
192   const int nx = in.size(0);
193   const int ny = in.size(1);
194   const int nz = in.size(2);
195   T phase_r, phase_i;
196 
197   compute_phase(in, twist, phase_r, phase_i);
198 
199 #pragma omp parallel for
200   for (int ix = 0; ix < nx; ++ix)
201   {
202     const std::complex<T>* restrict in_ptr = in.data() + ix * ny * nz;
203     std::complex<T1>* restrict out_ptr     = out.data() + ix * ny * nz;
204     for (int iy = 0; iy < ny; ++iy)
205       for (int iz = 0; iz < nz; ++iz)
206       {
207         *out_ptr = std::complex<T1>(static_cast<T1>(phase_r * in_ptr->real() - phase_i * in_ptr->imag()),
208                                     static_cast<T1>(phase_i * in_ptr->real() + phase_r * in_ptr->imag()));
209         ++out_ptr;
210         ++in_ptr;
211       }
212   }
213 }
214 
215 template<typename T, typename T1, typename T2>
fix_phase_rotate_c2c(const Array<std::complex<T>,3> & in,Array<T1,3> & out_r,Array<T1,3> & out_i,const TinyVector<T2,3> & twist,T & phase_r,T & phase_i)216 inline void fix_phase_rotate_c2c(const Array<std::complex<T>, 3>& in,
217                                  Array<T1, 3>& out_r,
218                                  Array<T1, 3>& out_i,
219                                  const TinyVector<T2, 3>& twist,
220                                  T& phase_r,
221                                  T& phase_i)
222 {
223   const int nx = in.size(0);
224   const int ny = in.size(1);
225   const int nz = in.size(2);
226 
227   compute_phase(in, twist, phase_r, phase_i);
228 
229 #pragma omp parallel for
230   for (size_t ix = 0; ix < nx; ++ix)
231     for (size_t iy = 0; iy < ny; ++iy)
232     {
233       const size_t offset                    = ix * ny * nz + iy * nz;
234       const std::complex<T>* restrict in_ptr = in.data() + offset;
235       T1* restrict r_ptr                     = out_r.data() + offset;
236       T1* restrict i_ptr                     = out_i.data() + offset;
237       for (size_t iz = 0; iz < nz; ++iz)
238       {
239         r_ptr[iz] = static_cast<T1>(phase_r * in_ptr[iz].real() - phase_i * in_ptr[iz].imag());
240         i_ptr[iz] = static_cast<T1>(phase_i * in_ptr[iz].real() + phase_r * in_ptr[iz].imag());
241       }
242     }
243 }
244 
245 /** Compute the norm of an orbital.
246    * @param cG the plane wave coefficients
247    * @return norm of the orbital
248    */
249 template<typename T>
compute_norm(const Vector<std::complex<T>> & cG)250 inline T compute_norm(const Vector<std::complex<T>>& cG)
251 {
252   T total_norm2(0);
253 #pragma omp parallel for reduction(+ : total_norm2)
254   for (size_t ig = 0; ig < cG.size(); ++ig)
255     total_norm2 += cG[ig].real() * cG[ig].real() + cG[ig].imag() * cG[ig].imag();
256   return std::sqrt(total_norm2);
257 }
258 
259 /** Compute the phase factor for rotation. The algorithm aims at balanced real and imaginary parts.
260    * @param in the real space orbital value on a 3D grid
261    * @param twist k-point in reduced coordinates
262    * @param phase_r output real part of the phase
263    * @param phase_i output imaginary part of the phase
264    */
265 template<typename T, typename T2>
compute_phase(const Array<std::complex<T>,3> & in,const TinyVector<T2,3> & twist,T & phase_r,T & phase_i)266 inline void compute_phase(const Array<std::complex<T>, 3>& in, const TinyVector<T2, 3>& twist, T& phase_r, T& phase_i)
267 {
268   const T two_pi  = -2.0 * M_PI;
269   const size_t nx = in.size(0);
270   const size_t ny = in.size(1);
271   const size_t nz = in.size(2);
272 
273   const T nx_i = 1.0 / static_cast<T>(nx);
274   const T ny_i = 1.0 / static_cast<T>(ny);
275   const T nz_i = 1.0 / static_cast<T>(nz);
276 
277   T rNorm = 0.0, iNorm = 0.0, riNorm = 0.0;
278 #pragma omp parallel for reduction(+ : rNorm, iNorm, riNorm)
279   for (size_t ix = 0; ix < nx; ++ix)
280   {
281     for (size_t iy = 0; iy < ny; ++iy)
282     {
283       const T rux = static_cast<T>(ix) * nx_i * twist[0];
284       T s, c;
285       T rsum = 0, isum = 0, risum = 0.0;
286       const T ruy                            = static_cast<T>(iy) * ny_i * twist[1];
287       const std::complex<T>* restrict in_ptr = in.data() + ix * ny * nz + iy * nz;
288       for (size_t iz = 0; iz < nz; ++iz)
289       {
290         const T ruz = static_cast<T>(iz) * nz_i * twist[2];
291         qmcplusplus::sincos(two_pi * (rux + ruy + ruz), &s, &c);
292         const T re = c * in_ptr[iz].real() - s * in_ptr[iz].imag();
293         const T im = s * in_ptr[iz].real() + c * in_ptr[iz].imag();
294         rsum += re * re;
295         isum += im * im;
296         risum += re * im;
297       }
298       rNorm += rsum;
299       iNorm += isum;
300       riNorm += risum;
301     }
302   }
303 
304   const T x   = (rNorm - iNorm) / riNorm;
305   const T y   = 1.0 / std::sqrt(x * x + 4.0);
306   const T phs = std::sqrt(0.5 - y);
307   phase_r     = phs;
308   phase_i     = (x < 0) ? std::sqrt(1.0 - phs * phs) : -std::sqrt(1.0 - phs * phs);
309 }
310 
311 /** rotate the state after 3dfft
312    *
313    */
314 template<typename T>
fix_phase_rotate(const Array<std::complex<T>,3> & e2pi,Array<std::complex<T>,3> & in,Array<T,3> & out)315 inline void fix_phase_rotate(const Array<std::complex<T>, 3>& e2pi, Array<std::complex<T>, 3>& in, Array<T, 3>& out)
316 {
317   const int nx = e2pi.size(0);
318   const int ny = e2pi.size(1);
319   const int nz = e2pi.size(2);
320   T rNorm = 0.0, iNorm = 0.0;
321   //#pragma omp parallel for reduction(+:rNorm,iNorm)
322   for (int ix = 0; ix < nx; ix++)
323   {
324     T rpart = 0.0, ipart = 0.0;
325     const std::complex<T>* restrict p_ptr = e2pi.data() + ix * ny * nz;
326     std::complex<T>* restrict in_ptr      = in.data() + ix * ny * nz;
327     for (int iyz = 0; iyz < ny * nz; ++iyz)
328     {
329       in_ptr[iyz] *= p_ptr[iyz];
330       rpart += in_ptr[iyz].real() * in_ptr[iyz].real();
331       ipart += in_ptr[iyz].imag() * in_ptr[iyz].imag();
332     }
333     rNorm += rpart;
334     iNorm += ipart;
335   }
336 
337   //#pragma omp parallel
338   {
339     T arg = std::atan2(iNorm, rNorm);
340     T phase_i, phase_r;
341     qmcplusplus::sincos(0.125 * M_PI - 0.5 * arg, &phase_i, &phase_r);
342     //#pragma omp for
343     for (int ix = 0; ix < nx; ix++)
344     {
345       const std::complex<T>* restrict in_ptr = in.data() + ix * ny * nz;
346       T* restrict out_ptr                    = out.data() + ix * ny * nz;
347       for (int iyz = 0; iyz < ny * nz; iyz++)
348         out_ptr[iyz] = phase_r * in_ptr[iyz].real() - phase_i * in_ptr[iyz].imag();
349     }
350   }
351 }
352 
353 template<typename T>
fix_phase_rotate(const Array<std::complex<T>,3> & e2pi,const Array<std::complex<T>,3> & in,Array<std::complex<T>,3> & out)354 inline void fix_phase_rotate(const Array<std::complex<T>, 3>& e2pi,
355                              const Array<std::complex<T>, 3>& in,
356                              Array<std::complex<T>, 3>& out)
357 {
358   T rNorm = 0.0, iNorm = 0.0, riNorm = 0.0;
359 
360   simd::accumulate_phases(e2pi.size(), e2pi.data(), in.data(), rNorm, iNorm, riNorm);
361 
362   T x   = (rNorm - iNorm) / riNorm;
363   x     = 1.0 / std::sqrt(x * x + 4.0);
364   T phs = (x > 0.5) ? std::sqrt(0.5 + x) : std::sqrt(0.5 - x);
365   std::complex<T> phase_c(phs * phs / (rNorm + iNorm), (1.0 - phs * phs) / (rNorm + iNorm));
366 
367   BLAS::axpy(in.size(), phase_c, in.data(), out.data());
368 }
369 
370 
bcastSortBands(int spin,int n,bool root)371 inline bool EinsplineSetBuilder::bcastSortBands(int spin, int n, bool root)
372 {
373   update_token(__FILE__, __LINE__, "bcastSortBands");
374 
375   std::vector<BandInfo>& SortBands(*FullBands[spin]);
376 
377   TinyVector<int, 4> nbands(int(SortBands.size()), n, NumValenceOrbs, NumCoreOrbs);
378   mpi::bcast(*myComm, nbands);
379 
380   //buffer to serialize BandInfo
381   PooledData<OHMMS_PRECISION_FULL> misc(nbands[0] * 5);
382   bool isCore = false;
383   n = NumDistinctOrbitals = nbands[1];
384   NumValenceOrbs          = nbands[2];
385   NumCoreOrbs             = nbands[3];
386 
387   if (root)
388   {
389     misc.rewind();
390     //misc.put(NumValenceOrbs);
391     //misc.put(NumCoreOrbs);
392     for (int i = 0; i < n; ++i)
393     {
394       misc.put(SortBands[i].TwistIndex);
395       misc.put(SortBands[i].BandIndex);
396       misc.put(SortBands[i].Energy);
397       misc.put(SortBands[i].MakeTwoCopies);
398       misc.put(SortBands[i].IsCoreState);
399 
400       isCore |= SortBands[i].IsCoreState;
401     }
402 
403     for (int i = n; i < SortBands.size(); ++i)
404     {
405       misc.put(SortBands[i].TwistIndex);
406       misc.put(SortBands[i].BandIndex);
407       misc.put(SortBands[i].Energy);
408       misc.put(SortBands[i].MakeTwoCopies);
409       misc.put(SortBands[i].IsCoreState);
410     }
411   }
412   myComm->bcast(misc);
413 
414   if (!root)
415   {
416     SortBands.resize(nbands[0]);
417     misc.rewind();
418     //misc.get(NumValenceOrbs);
419     //misc.get(NumCoreOrbs);
420     for (int i = 0; i < n; ++i)
421     {
422       misc.get(SortBands[i].TwistIndex);
423       misc.get(SortBands[i].BandIndex);
424       misc.get(SortBands[i].Energy);
425       misc.get(SortBands[i].MakeTwoCopies);
426       misc.get(SortBands[i].IsCoreState);
427 
428       isCore |= SortBands[i].IsCoreState;
429     }
430     for (int i = n; i < SortBands.size(); ++i)
431     {
432       misc.get(SortBands[i].TwistIndex);
433       misc.get(SortBands[i].BandIndex);
434       misc.get(SortBands[i].Energy);
435       misc.get(SortBands[i].MakeTwoCopies);
436       misc.get(SortBands[i].IsCoreState);
437     }
438   }
439 
440   //char fname[64];
441   //sprintf(fname,"debug.%d",myComm->rank());
442   //ofstream fout(fname);
443   //fout.setf(std::ios::scientific, std::ios::floatfield);
444   //fout.precision(12);
445   //for(int i=0; i<misc.size();++i)
446   //  fout << misc[i] << std::endl;
447   return isCore;
448 }
449 
450 } // namespace qmcplusplus
451 
452 #endif
453