1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2008 The Regents of the University of California
4 //
5 // This file is part of Qbox
6 //
7 // Qbox is distributed under the terms of the GNU General Public License
8 // as published by the Free Software Foundation, either version 2 of
9 // the License, or (at your option) any later version.
10 // See the file COPYING in the root directory of this distribution
11 // or <http://www.gnu.org/licenses/>.
12 //
13 ////////////////////////////////////////////////////////////////////////////////
14 //
15 // NonLocalPotential.cpp
16 //
17 ////////////////////////////////////////////////////////////////////////////////
18 
19 #include "NonLocalPotential.h"
20 #include "Species.h"
21 #include "blas.h"
22 #include "MPIdata.h"
23 #include <iomanip>
24 using namespace std;
25 
26 #if USE_MASSV
27 extern "C" void vsincos(double *x, double *y, double *z, int *n);
28 #endif
29 
30 
31 ////////////////////////////////////////////////////////////////////////////////
~NonLocalPotential(void)32 NonLocalPotential::~NonLocalPotential(void)
33 {
34 #if TIMING
35   for ( TimerMap::iterator i = tmap.begin(); i != tmap.end(); i++ )
36   {
37     double time = i->second.real();
38     double tmin, tmax;
39     MPI_Reduce(&time,&tmin,1,MPI_DOUBLE,MPI_MIN,0,MPIdata::comm());
40     MPI_Reduce(&time,&tmax,1,MPI_DOUBLE,MPI_MAX,0,MPIdata::comm());
41     if ( MPIdata::onpe0() && (tmax > 0.0) )
42     {
43       string s = "name=\"" + (*i).first + "\"";
44       cout << "<timing " << left << setw(22) << s
45            << " min=\"" << setprecision(3) << tmin << "\""
46            << " max=\"" << setprecision(3) << tmax << "\"/>"
47            << endl;
48     }
49   }
50 #endif
51 }
52 
53 ////////////////////////////////////////////////////////////////////////////////
init(void)54 void NonLocalPotential::init(void)
55 {
56   const int ngwl = basis_.localsize();
57 
58   nsp = atoms_.nsp();
59 
60   nop.resize(nsp);
61   lloc.resize(nsp);
62   lproj.resize(nsp);
63   na.resize(nsp);
64   npr.resize(nsp);
65   nprna.resize(nsp);
66   wt.resize(nsp);
67   twnl.resize(nsp);
68   dtwnl.resize(nsp);
69 
70   nquad.resize(nsp);
71   rquad.resize(nsp);
72   wquad.resize(nsp);
73 
74   nspnl = 0;
75   for ( int is = 0; is < nsp; is++ )
76   {
77     Species *s = atoms_.species_list[is];
78 
79     npr[is] = 0;
80     nprna[is] = 0;
81 
82     if ( s->non_local() )
83     {
84       nspnl++;
85       na[is] = atoms_.na(is);
86       nop[is] = s->nop();
87       lloc[is] = s->llocal();
88       nquad[is] = s->nquad();
89 
90       // compute total number of projectors npr[is]
91       // KB potentials have nlm projectors
92       // semilocal potentials have nlm*nquad projectors
93       if ( s->nquad() == 0 )
94       {
95         npr[is] = s->nlm();
96       }
97       else
98       {
99         npr[is] = s->nlm() * nquad[is];
100       }
101       nprna[is] = npr[is] * na[is];
102 
103       // l value for projector ipr
104       lproj[is].resize(npr[is]);
105 
106       twnl[is].resize(npr[is]*ngwl);
107       dtwnl[is].resize(npr[is]*6*ngwl);
108 
109       // quadrature abcissae and weights
110       rquad[is].resize(nquad[is]);
111       wquad[is].resize(nquad[is]);
112 
113       enum quadrature_rule_type { TRAPEZOID, MODIF_TRAPEZOID,
114       TRAPEZOID_WITH_RIGHT_ENDPOINT,
115       SIMPSON };
116 
117       const quadrature_rule_type quad_rule = TRAPEZOID;
118       //const quadrature_rule_type quad_rule = MODIF_TRAPEZOID;
119       //const quadrature_rule_type quad_rule = TRAPEZOID_WITH_RIGHT_ENDPOINT;
120       //const quadrature_rule_type quad_rule = SIMPSON;
121 
122       if ( quad_rule == TRAPEZOID )
123       {
124         // trapezoidal rule with interior points only
125         // (end points are zero)
126         const double h = s->rquad() / (nquad[is]+1);
127         for ( int iquad = 0; iquad < nquad[is]; iquad++ )
128         {
129           rquad[is][iquad] = (iquad+1) * h;
130           wquad[is][iquad] = h;
131         }
132         //cout << " NonLocalPotential::init: trapezoidal rule (interior)"
133         //     << endl;
134       }
135       else if ( quad_rule == MODIF_TRAPEZOID )
136       {
137         // use modified trapezoidal rule with interior points, and include
138         // correction for first derivative at r=0 as
139         // h^2/12 f'(0) where f'(0) is estimated with f(h)/h
140         // i.e. add correction h/12) * f(h)
141         // See Davis & Rabinowitz, p. 132
142         const double h = s->rquad() / (nquad[is]+1);
143         for ( int iquad = 0; iquad < nquad[is]; iquad++ )
144         {
145           rquad[is][iquad] = (iquad+1) * h;
146           wquad[is][iquad] = h;
147         }
148         wquad[is][0] += h / 12.0;
149         //cout << " NonLocalPotential::init: modified trapezoidal rule"
150         //     << endl;
151       }
152       else if ( quad_rule == TRAPEZOID_WITH_RIGHT_ENDPOINT )
153       {
154         const double h = s->rquad() / nquad[is];
155         for ( int iquad = 0; iquad < nquad[is]; iquad++ )
156         {
157           rquad[is][iquad] = (iquad+1) * h;
158           wquad[is][iquad] = h;
159         }
160         wquad[is][nquad[is]-1] = 0.5 * h;
161         //cout << " NonLocalPotential::init: trapezoidal rule, right endpoint"
162         //     << endl;
163       }
164       else if ( quad_rule == SIMPSON )
165       {
166         // must have 2n+1 points
167         assert(nquad[is]%2==1);
168         const double h = s->rquad() / (nquad[is]-1);
169         for ( int iquad = 0; iquad < nquad[is]; iquad++ )
170         {
171           rquad[is][iquad] = iquad * h;
172           if ( ( iquad == 0 ) || ( iquad == nquad[is]-1 ) )
173             wquad[is][iquad] = h / 3.0;
174           else if ( iquad % 2 == 0 )
175             wquad[is][iquad] = h * 2.0 / 3.0;
176           else
177             wquad[is][iquad] = h * 4.0 / 3.0;
178         }
179         //cout << " NonLocalPotential::init: Simpson rule" << endl;
180       }
181       else
182       {
183         assert(false);
184       }
185 
186       // compute weights wt[is][ipr]
187       wt[is].resize(npr[is]);
188 
189       // compute lproj[is][ipr]
190       int ipr_base = 0;
191       for ( int iop = 0; iop < nop[is]; iop++ )
192       {
193         int l = s->l(iop);
194         if ( l != lloc[is] )
195         {
196           if ( nquad[is] == 0 )
197           {
198             // Kleinman-Bylander form
199             // wt[is][ipr]
200             // index = ipr_base+m
201             for ( int m = 0; m < 2*l+1; m++ )
202             {
203               const int ipr = ipr_base + m;
204               wt[is][ipr] = s->wsg(iop);
205               lproj[is][ipr] = l;
206             }
207             ipr_base += 2*l+1;
208           }
209           else
210           {
211             for ( int iquad = 0; iquad < nquad[is]; iquad++ )
212             {
213               const double r = rquad[is][iquad];
214               double v,dv,vl,dvl;
215               s->dvpsr(l,r,v,dv);
216               s->dvpsr(lloc[is],r,vl,dvl);
217               // wt[is][iquad+ipr*nquad]
218               for ( int m = 0; m < 2*l+1; m++ )
219               {
220                 const int ipr = ipr_base + iquad + nquad[is] * m;
221                 wt[is][ipr] = ( v - vl ) * wquad[is][iquad];
222                 lproj[is][ipr] = l;
223               }
224             }
225             ipr_base += (2*l+1) * nquad[is];
226           }
227         }
228       }
229       assert(ipr_base==npr[is]);
230     } // if s->non_local()
231   }
232 }
233 
234 ////////////////////////////////////////////////////////////////////////////////
update_twnl(void)235 void NonLocalPotential::update_twnl(void)
236 {
237   // update arrays twnl[is][ipr][ig], dtwnl[is][ipr][j][ig],
238   // following a change of cell dimensions
239   // It is assumed that basis_ has been updated
240   // It is assumed that nsp, npr[is], nquad[is] did not change since init
241 
242   tmap["update_twnl"].start();
243 
244   if ( nspnl == 0 ) return;
245 
246   const int ngwl = basis_.localsize();
247   const double pi = M_PI;
248   const double fpi = 4.0 * pi;
249   const double s14pi = sqrt(1.0/fpi);
250   const double s34pi = sqrt(3.0/fpi);
251   const double s54pi = sqrt(5.0/fpi);
252   const double s74pi = sqrt(7.0/fpi);
253   const double s20pi = sqrt(20.0*pi);
254   const double s20pi3 = sqrt(20.0*pi/3.0);
255   const double s3 = sqrt(3.0);
256   const double s32 = sqrt(1.5);
257   const double s52 = sqrt(2.5);
258   const double s15 = sqrt(15.0);
259 
260   const double *kpg   = basis_.kpg_ptr();
261   const double *kpgi  = basis_.kpgi_ptr();
262   const double *kpg_x = basis_.kpgx_ptr(0);
263   const double *kpg_y = basis_.kpgx_ptr(1);
264   const double *kpg_z = basis_.kpgx_ptr(2);
265 
266   // compute twnl and dtwnl
267   for ( int is = 0; is < nsp; is++ )
268   {
269     Species *s = atoms_.species_list[is];
270 
271     int ilm = 0;
272     for ( int iop = 0; iop < nop[is]; iop++ )
273     {
274       int l = s->l(iop);
275       if ( l != lloc[is] )
276       {
277         if ( l == 0 )
278         {
279           if ( nquad[is] == 0 )
280           {
281             // Kleinman-Bylander
282 
283             // twnl[is][ipr][ig]
284             const int ipr = ilm;
285             // index = ig + ngwl*ipr, i.e. index = ig
286             double *t0 = &twnl[is][ngwl*ipr];
287 
288             // dtwnl[is][ipr][ij][ngwl]
289             // index = ig + ngwl * ( ij + 6 * ipr ), ipr = 0
290             // i.e. index = ig + ij * ngwl
291             double *dt0_xx = &dtwnl[is][ngwl*(0+6*ipr)];
292             double *dt0_yy = &dtwnl[is][ngwl*(1+6*ipr)];
293             double *dt0_zz = &dtwnl[is][ngwl*(2+6*ipr)];
294             double *dt0_xy = &dtwnl[is][ngwl*(3+6*ipr)];
295             double *dt0_yz = &dtwnl[is][ngwl*(4+6*ipr)];
296             double *dt0_xz = &dtwnl[is][ngwl*(5+6*ipr)];
297             // Special case k=G=0 is ok since kpgi[0] = 0.0 at k=G=0
298             for ( int ig = 0; ig < ngwl; ig++ )
299             {
300               double v,dv;
301               s->dvnlg(iop,kpg[ig],v,dv);
302 
303               t0[ig] = s14pi * v;
304 
305               const double tgx = kpg_x[ig];
306               const double tgy = kpg_y[ig];
307               const double tgz = kpg_z[ig];
308 
309               const double tmp = kpgi[ig] * s14pi * dv;
310               dt0_xx[ig] = tmp * tgx * tgx;
311               dt0_yy[ig] = tmp * tgy * tgy;
312               dt0_zz[ig] = tmp * tgz * tgz;
313               dt0_xy[ig] = tmp * tgx * tgy;
314               dt0_yz[ig] = tmp * tgy * tgz;
315               dt0_xz[ig] = tmp * tgx * tgz;
316             }
317           }
318           else
319           {
320             // semi-local
321             for ( int iquad = 0; iquad < nquad[is]; iquad++ )
322             {
323               // twnl[is][ipr][ig]
324               // ipr = iquad + nquad[is]*ilm, where ilm=0
325               //     = iquad
326               // index = ig + ngwl*iquad
327               double *t0 = &twnl[is][ngwl*iquad];
328               // dtwnl[is][ipr][j][ngwl]
329               // index = ig + ngwl * ( ij + 6 * iquad)
330               double *dt0_xx = &dtwnl[is][ngwl*(0+6*iquad)];
331               double *dt0_yy = &dtwnl[is][ngwl*(1+6*iquad)];
332               double *dt0_zz = &dtwnl[is][ngwl*(2+6*iquad)];
333               double *dt0_xy = &dtwnl[is][ngwl*(3+6*iquad)];
334               double *dt0_yz = &dtwnl[is][ngwl*(4+6*iquad)];
335               double *dt0_xz = &dtwnl[is][ngwl*(5+6*iquad)];
336               const double r = rquad[is][iquad];
337 
338               // G=0 element must be treated separately if k=0
339               if ( basis_.real() && ctxt_.myrow() == 0)
340               {
341                 // compute G=0 element separately to get
342                 // sin(Gr)/(Gr) limit -> 1
343                 const int ig = 0;
344                 // this task holds the G=0 element
345                 // I(l=0) = 4 pi j_l(G r) r
346                 // twnl[is][ipr][l][ig] = 4 pi j_0(Gr_i) r_i Ylm
347                 // j_0(Gr) = sin(Gr) / (Gr) == 1.0
348                 // Ylm = s14pi
349 
350                 // next line: sin(Gr)/(Gr) replaced by 1
351                 t0[ig] = fpi * s14pi * r;
352 
353                 // dtwnl = fpi s14pi G_i G_j / G (r cos(Gr)/G -sin(Gr)/G^2)
354                 // dtwnl = 0.0;
355                 dt0_xx[ig] = 0.0;
356                 dt0_yy[ig] = 0.0;
357                 dt0_zz[ig] = 0.0;
358                 dt0_xy[ig] = 0.0;
359                 dt0_yz[ig] = 0.0;
360                 dt0_xz[ig] = 0.0;
361               }
362               else
363               {
364                 // Use the normal procedure
365                 const int ig = 0;
366                 // I(l=0) = 4 pi j_l(G r) r
367                 // twnl[is][ipr][l][ig] = 4 pi j_0(Gr_i) r_i Ylm
368                 // j_0(Gr) * r = sin(Gr) / G
369                 // Ylm = s14pi
370                 const double arg = kpg[ig] * r;
371                 // Note: for G=0, gi[0] = 0
372 
373                 const double tgx = kpg_x[ig];
374                 const double tgy = kpg_y[ig];
375                 const double tgz = kpg_z[ig];
376                 const double tgi = kpgi[ig];
377                 const double tgi2 = tgi * tgi;
378 
379                 const double ts = sin(arg);
380                 const double tc = cos(arg);
381 
382                 t0[ig] = fpi * s14pi * ts * tgi;
383 
384                 // dtwnl = fpi s14pi k+G_i k+G_j / k+G
385                 // (r cos(k+Gr)/k+G -sin(k+Gr)/k+G^2)
386                 const double tmp = fpi * s14pi * tgi2 * (r*tc - ts*tgi);
387                 dt0_xx[ig] = tmp * tgx * tgx;
388                 dt0_yy[ig] = tmp * tgy * tgy;
389                 dt0_zz[ig] = tmp * tgz * tgz;
390                 dt0_xy[ig] = tmp * tgx * tgy;
391                 dt0_yz[ig] = tmp * tgy * tgz;
392                 dt0_xz[ig] = tmp * tgx * tgz;
393               }
394 
395               for ( int ig = 1; ig < ngwl; ig++ )
396               {
397                 // I(l=0) = 4 pi j_l(|k+G| r) r
398                 // twnl[is][ipr][l][ig] = 4 pi j_0(|k+G|r_i) r_i Ylm
399                 // j_0(|k+G|r) * r = sin(|k+G|r) / |k+G|
400                 // Ylm = s14pi
401                 const double arg = kpg[ig] * r;
402 
403                 const double tgx = kpg_x[ig];
404                 const double tgy = kpg_y[ig];
405                 const double tgz = kpg_z[ig];
406                 const double tgi = kpgi[ig];
407                 const double tgi2 = tgi * tgi;
408 
409                 const double ts = sin(arg);
410                 const double tc = cos(arg);
411 
412                 t0[ig] = fpi * s14pi * ts * tgi;
413 
414                 // dtwnl = fpi s14pi (k+G)_i (k+G)_j /
415                 // |k+G| (r cos(|k+G|r)/|k+G| -sin(|k+G|r)/|k+G|^2)
416                 const double tmp = fpi * s14pi * tgi2 * (r*tc - ts*tgi);
417                 dt0_xx[ig] = tmp * tgx * tgx;
418                 dt0_yy[ig] = tmp * tgy * tgy;
419                 dt0_zz[ig] = tmp * tgz * tgz;
420                 dt0_xy[ig] = tmp * tgx * tgy;
421                 dt0_yz[ig] = tmp * tgy * tgz;
422                 dt0_xz[ig] = tmp * tgx * tgz;
423               }
424             } // for iquad
425           }
426           ilm += 2*l+1;
427         }
428         else if ( l == 1 )
429         {
430           if ( nquad[is] == 0 )
431           {
432             // Kleinman-Bylander
433 
434             // twnl[is][ipr][ig]
435             // ipr = ilm
436             const int ipr1 = ilm;
437             const int ipr2 = ilm+1;
438             const int ipr3 = ilm+2;
439             // index = ig + ngwl*ilm
440             double *t1 = &twnl[is][ngwl*ipr1];
441             double *t2 = &twnl[is][ngwl*ipr2];
442             double *t3 = &twnl[is][ngwl*ipr3];
443 
444             // dtwnl[is][ipr][ij][ngwl]
445             // index = ig + ngwl * ( ij + 6 * ipr )
446             double *dt1_xx = &dtwnl[is][ngwl*(0+6*ipr1)];
447             double *dt1_yy = &dtwnl[is][ngwl*(1+6*ipr1)];
448             double *dt1_zz = &dtwnl[is][ngwl*(2+6*ipr1)];
449             double *dt1_xy = &dtwnl[is][ngwl*(3+6*ipr1)];
450             double *dt1_yz = &dtwnl[is][ngwl*(4+6*ipr1)];
451             double *dt1_xz = &dtwnl[is][ngwl*(5+6*ipr1)];
452 
453             double *dt2_xx = &dtwnl[is][ngwl*(0+6*ipr2)];
454             double *dt2_yy = &dtwnl[is][ngwl*(1+6*ipr2)];
455             double *dt2_zz = &dtwnl[is][ngwl*(2+6*ipr2)];
456             double *dt2_xy = &dtwnl[is][ngwl*(3+6*ipr2)];
457             double *dt2_yz = &dtwnl[is][ngwl*(4+6*ipr2)];
458             double *dt2_xz = &dtwnl[is][ngwl*(5+6*ipr2)];
459 
460             double *dt3_xx = &dtwnl[is][ngwl*(0+6*ipr3)];
461             double *dt3_yy = &dtwnl[is][ngwl*(1+6*ipr3)];
462             double *dt3_zz = &dtwnl[is][ngwl*(2+6*ipr3)];
463             double *dt3_xy = &dtwnl[is][ngwl*(3+6*ipr3)];
464             double *dt3_yz = &dtwnl[is][ngwl*(4+6*ipr3)];
465             double *dt3_xz = &dtwnl[is][ngwl*(5+6*ipr3)];
466 
467             for ( int ig = 0; ig < ngwl; ig++ )
468             {
469               double v,dv;
470               const double tg = kpg[ig];
471               s->dvnlg(iop,tg,v,dv);
472 
473               const double tgx = kpg_x[ig];
474               const double tgy = kpg_y[ig];
475               const double tgz = kpg_z[ig];
476               const double tgx2 = tgx * tgx;
477               const double tgy2 = tgy * tgy;
478               const double tgz2 = tgz * tgz;
479 
480               const double tgi = kpgi[ig];
481               const double tgi2 = tgi * tgi;
482 
483               const double y1 = s34pi * tgx * tgi;
484               const double y2 = s34pi * tgy * tgi;
485               const double y3 = s34pi * tgz * tgi;
486 
487               t1[ig]  = y1 * v;
488               t2[ig]  = y2 * v;
489               t3[ig]  = y3 * v;
490 
491               const double fac1 = - y1 * ( v - tg * dv ) * tgi2;
492               // m=x
493               dt1_xx[ig] = fac1 * tgx2 + v * y1;
494               dt1_yy[ig] = fac1 * tgy2;
495               dt1_zz[ig] = fac1 * tgz2;
496               dt1_xy[ig] = fac1 * tgx * tgy;
497               dt1_yz[ig] = fac1 * tgy * tgz;
498               dt1_xz[ig] = fac1 * tgx * tgz;
499 
500               const double fac2 = - y2 * ( v - tg * dv ) * tgi2;
501               // m=y
502               dt2_xx[ig] = fac2 * tgx2;
503               dt2_yy[ig] = fac2 * tgy2 + v * y2;
504               dt2_zz[ig] = fac2 * tgz2;
505               dt2_xy[ig] = fac2 * tgx * tgy + v * y1;
506               dt2_yz[ig] = fac2 * tgy * tgz;
507               dt2_xz[ig] = fac2 * tgx * tgz;
508 
509               const double fac3 = - y3 * ( v - tg * dv ) * tgi2;
510               // m=z
511               dt3_xx[ig] = fac3 * tgx2;
512               dt3_yy[ig] = fac3 * tgy2;
513               dt3_zz[ig] = fac3 * tgz2 + v * y3;
514               dt3_xy[ig] = fac3 * tgx * tgy;
515               dt3_yz[ig] = fac3 * tgy * tgz + v * y2;
516               dt3_xz[ig] = fac3 * tgx * tgz + v * y1;
517             }
518           }
519           else
520           {
521             // semi-local
522             for ( int iquad = 0; iquad < nquad[is]; iquad++ )
523             {
524               // twnl[is][ipr][ig]
525               // index = ig + ngwl*(iquad+nquad[is]*ilm)
526               const int ipr1 = iquad+nquad[is]*ilm;
527               const int ipr2 = iquad+nquad[is]*(ilm+1);
528               const int ipr3 = iquad+nquad[is]*(ilm+2);
529               double *t1 = &twnl[is][ngwl*ipr1];
530               double *t2 = &twnl[is][ngwl*ipr2];
531               double *t3 = &twnl[is][ngwl*ipr3];
532 
533               // dtwnl[is][ipr][j][ngwl]
534               // index = ig + ngwl * ( ij + 6 * ipr )
535               double *dt1_xx = &dtwnl[is][ngwl*(0+6*ipr1)];
536               double *dt1_yy = &dtwnl[is][ngwl*(1+6*ipr1)];
537               double *dt1_zz = &dtwnl[is][ngwl*(2+6*ipr1)];
538               double *dt1_xy = &dtwnl[is][ngwl*(3+6*ipr1)];
539               double *dt1_yz = &dtwnl[is][ngwl*(4+6*ipr1)];
540               double *dt1_xz = &dtwnl[is][ngwl*(5+6*ipr1)];
541 
542               double *dt2_xx = &dtwnl[is][ngwl*(0+6*ipr2)];
543               double *dt2_yy = &dtwnl[is][ngwl*(1+6*ipr2)];
544               double *dt2_zz = &dtwnl[is][ngwl*(2+6*ipr2)];
545               double *dt2_xy = &dtwnl[is][ngwl*(3+6*ipr2)];
546               double *dt2_yz = &dtwnl[is][ngwl*(4+6*ipr2)];
547               double *dt2_xz = &dtwnl[is][ngwl*(5+6*ipr2)];
548 
549               double *dt3_xx = &dtwnl[is][ngwl*(0+6*ipr3)];
550               double *dt3_yy = &dtwnl[is][ngwl*(1+6*ipr3)];
551               double *dt3_zz = &dtwnl[is][ngwl*(2+6*ipr3)];
552               double *dt3_xy = &dtwnl[is][ngwl*(3+6*ipr3)];
553               double *dt3_yz = &dtwnl[is][ngwl*(4+6*ipr3)];
554               double *dt3_xz = &dtwnl[is][ngwl*(5+6*ipr3)];
555 
556               const double r = rquad[is][iquad];
557               for ( int ig = 0; ig < ngwl; ig++ )
558               {
559                 double v = 0.0, dv = 0.0;
560                 // j_1(Gr) = (1/(Gr))*(sin(Gr)/(Gr)-cos(Gr))
561                 const double tg = kpg[ig];
562                 const double z = tg * r;
563                 if ( z != 0.0 )
564                 {
565                   const double zi = 1.0 / z;
566                   const double c = cos(z);
567                   const double s = sin(z);
568                   const double j1 = ( s * zi - c ) * zi;
569                   const double dj1 =
570                     ( 2.0 * z * c + ( z*z - 2.0 ) * s ) * zi*zi*zi;
571                   // v = 4 pi j1(Gr) r
572                   v = fpi * j1 * r;
573                   // dv = d/dG v = 4 pi dj1(Gr)/d(Gr) d(Gr)/dG r
574                   //    = 4 pi dj1 r^2
575                   dv = fpi * dj1 * r * r;
576                 }
577 
578                 const double tgx = kpg_x[ig];
579                 const double tgy = kpg_y[ig];
580                 const double tgz = kpg_z[ig];
581                 const double tgx2 = tgx * tgx;
582                 const double tgy2 = tgy * tgy;
583                 const double tgz2 = tgz * tgz;
584 
585                 const double tgi = kpgi[ig];
586                 const double tgi2 = tgi * tgi;
587 
588                 const double y1 = s34pi * tgx * tgi;
589                 const double y2 = s34pi * tgy * tgi;
590                 const double y3 = s34pi * tgz * tgi;
591 
592                 t1[ig]  = y1 * v;
593                 t2[ig]  = y2 * v;
594                 t3[ig]  = y3 * v;
595 
596                 const double fac1 = - y1 * ( v - tg * dv ) * tgi2;
597                 // m=x
598                 dt1_xx[ig] = fac1 * tgx2 + v * y1;
599                 dt1_yy[ig] = fac1 * tgy2;
600                 dt1_zz[ig] = fac1 * tgz2;
601                 dt1_xy[ig] = fac1 * tgx * tgy;
602                 dt1_yz[ig] = fac1 * tgy * tgz;
603                 dt1_xz[ig] = fac1 * tgx * tgz;
604 
605                 const double fac2 = - y2 * ( v - tg * dv ) * tgi2;
606                 // m=y
607                 dt2_xx[ig] = fac2 * tgx2;
608                 dt2_yy[ig] = fac2 * tgy2 + v * y2;
609                 dt2_zz[ig] = fac2 * tgz2;
610                 dt2_xy[ig] = fac2 * tgx * tgy + v * y1;
611                 dt2_yz[ig] = fac2 * tgy * tgz;
612                 dt2_xz[ig] = fac2 * tgx * tgz;
613 
614                 const double fac3 = - y3 * ( v - tg * dv ) * tgi2;
615                 // m=z
616                 dt3_xx[ig] = fac3 * tgx2;
617                 dt3_yy[ig] = fac3 * tgy2;
618                 dt3_zz[ig] = fac3 * tgz2 + v * y3;
619                 dt3_xy[ig] = fac3 * tgx * tgy;
620                 dt3_yz[ig] = fac3 * tgy * tgz + v * y2;
621                 dt3_xz[ig] = fac3 * tgx * tgz + v * y1;
622               } // ig
623             }
624           }
625           ilm += 2*l+1;
626         }
627         else if ( l == 2 )
628         {
629           if ( nquad[is] == 0 )
630           {
631             // Kleinman-Bylander
632             const int ipr4 = ilm;
633             const int ipr5 = ilm+1;
634             const int ipr6 = ilm+2;
635             const int ipr7 = ilm+3;
636             const int ipr8 = ilm+4;
637 
638             double *t4 = &twnl[is][ngwl*ipr4];
639             double *t5 = &twnl[is][ngwl*ipr5];
640             double *t6 = &twnl[is][ngwl*ipr6];
641             double *t7 = &twnl[is][ngwl*ipr7];
642             double *t8 = &twnl[is][ngwl*ipr8];
643 
644             // dtwnl[is][ipr][ij][ngwl]
645             // index = ig + ngwl * ( ij + 6 * ipr )
646             double *dt4_xx = &dtwnl[is][ngwl*(0+6*ipr4)];
647             double *dt4_yy = &dtwnl[is][ngwl*(1+6*ipr4)];
648             double *dt4_zz = &dtwnl[is][ngwl*(2+6*ipr4)];
649             double *dt4_xy = &dtwnl[is][ngwl*(3+6*ipr4)];
650             double *dt4_yz = &dtwnl[is][ngwl*(4+6*ipr4)];
651             double *dt4_xz = &dtwnl[is][ngwl*(5+6*ipr4)];
652 
653             double *dt5_xx = &dtwnl[is][ngwl*(0+6*ipr5)];
654             double *dt5_yy = &dtwnl[is][ngwl*(1+6*ipr5)];
655             double *dt5_zz = &dtwnl[is][ngwl*(2+6*ipr5)];
656             double *dt5_xy = &dtwnl[is][ngwl*(3+6*ipr5)];
657             double *dt5_yz = &dtwnl[is][ngwl*(4+6*ipr5)];
658             double *dt5_xz = &dtwnl[is][ngwl*(5+6*ipr5)];
659 
660             double *dt6_xx = &dtwnl[is][ngwl*(0+6*ipr6)];
661             double *dt6_yy = &dtwnl[is][ngwl*(1+6*ipr6)];
662             double *dt6_zz = &dtwnl[is][ngwl*(2+6*ipr6)];
663             double *dt6_xy = &dtwnl[is][ngwl*(3+6*ipr6)];
664             double *dt6_yz = &dtwnl[is][ngwl*(4+6*ipr6)];
665             double *dt6_xz = &dtwnl[is][ngwl*(5+6*ipr6)];
666 
667             double *dt7_xx = &dtwnl[is][ngwl*(0+6*ipr7)];
668             double *dt7_yy = &dtwnl[is][ngwl*(1+6*ipr7)];
669             double *dt7_zz = &dtwnl[is][ngwl*(2+6*ipr7)];
670             double *dt7_xy = &dtwnl[is][ngwl*(3+6*ipr7)];
671             double *dt7_yz = &dtwnl[is][ngwl*(4+6*ipr7)];
672             double *dt7_xz = &dtwnl[is][ngwl*(5+6*ipr7)];
673 
674             double *dt8_xx = &dtwnl[is][ngwl*(0+6*ipr8)];
675             double *dt8_yy = &dtwnl[is][ngwl*(1+6*ipr8)];
676             double *dt8_zz = &dtwnl[is][ngwl*(2+6*ipr8)];
677             double *dt8_xy = &dtwnl[is][ngwl*(3+6*ipr8)];
678             double *dt8_yz = &dtwnl[is][ngwl*(4+6*ipr8)];
679             double *dt8_xz = &dtwnl[is][ngwl*(5+6*ipr8)];
680 
681             for ( int ig = 0; ig < ngwl; ig++ )
682             {
683               double v,dv;
684               const double tg = kpg[ig];
685 
686               s->dvnlg(iop,tg,v,dv);
687 
688               const double tgx = kpg_x[ig];
689               const double tgy = kpg_y[ig];
690               const double tgz = kpg_z[ig];
691               const double tgx2 = tgx * tgx;
692               const double tgy2 = tgy * tgy;
693               const double tgz2 = tgz * tgz;
694 
695               const double tgi = kpgi[ig];
696               const double tgi2 = tgi * tgi;
697 
698               const double tgxx = tgx2 * tgi2;
699               const double tgyy = tgy2 * tgi2;
700               const double tgzz = tgz2 * tgi2;
701               const double tgxy = tgx * tgy * tgi2;
702               const double tgyz = tgy * tgz * tgi2;
703               const double tgxz = tgx * tgz * tgi2;
704 
705               const double y4 = s54pi * 0.5 * (3.0 * tgzz - 1.0 );
706               const double y5 = s54pi * 0.5 * s3 * ( tgxx - tgyy );
707               const double y6 = s54pi * s3 * tgxy;
708               const double y7 = s54pi * s3 * tgyz;
709               const double y8 = s54pi * s3 * tgxz;
710 
711               const double y1x = s34pi * tgx * tgi;
712               const double y1y = s34pi * tgy * tgi;
713               const double y1z = s34pi * tgz * tgi;
714 
715               const double dx_xx = y1x * tgxx - y1x;
716               const double dx_yy = y1x * tgyy;
717               const double dx_zz = y1x * tgzz;
718               const double dx_xy = y1x * tgxy;
719               const double dx_yz = y1x * tgyz;
720               const double dx_xz = y1x * tgxz;
721 
722               const double dy_xx = y1y * tgxx;
723               const double dy_yy = y1y * tgyy - y1y;
724               const double dy_zz = y1y * tgzz;
725               const double dy_xy = y1y * tgxy - y1x;
726               const double dy_yz = y1y * tgyz;
727               const double dy_xz = y1y * tgxz;
728 
729               const double dz_xx = y1z * tgxx;
730               const double dz_yy = y1z * tgyy;
731               const double dz_zz = y1z * tgzz - y1z;
732               const double dz_xy = y1z * tgxy;
733               const double dz_yz = y1z * tgyz - y1y;
734               const double dz_xz = y1z * tgxz - y1x;
735 
736               t4[ig]  = y4 * v;
737               t5[ig]  = y5 * v;
738               t6[ig]  = y6 * v;
739               t7[ig]  = y7 * v;
740               t8[ig]  = y8 * v;
741 
742               // y4 = s54pi 1/2 ( 3 z^2/r^2 - 1 )
743               dt4_xx[ig] = -(v * s20pi * dz_xx * y1z - y4 * dv * tg * tgxx);
744               dt4_yy[ig] = -(v * s20pi * dz_yy * y1z - y4 * dv * tg * tgyy);
745               dt4_zz[ig] = -(v * s20pi * dz_zz * y1z - y4 * dv * tg * tgzz);
746               dt4_xy[ig] = -(v * s20pi * dz_xy * y1z - y4 * dv * tg * tgxy);
747               dt4_yz[ig] = -(v * s20pi * dz_yz * y1z - y4 * dv * tg * tgyz);
748               dt4_xz[ig] = -(v * s20pi * dz_xz * y1z - y4 * dv * tg * tgxz);
749 
750               // y5 = s54pi sqrt(3)/2 ( x^2 - y^2 ) / r^2
751               dt5_xx[ig] = -(v * s20pi3 * (y1x * dx_xx - y1y * dy_xx) - y5 * dv * tg * tgxx);
752               dt5_yy[ig] = -(v * s20pi3 * (y1x * dx_yy - y1y * dy_yy) - y5 * dv * tg * tgyy);
753               dt5_zz[ig] = -(v * s20pi3 * (y1x * dx_zz - y1y * dy_zz) - y5 * dv * tg * tgzz);
754               dt5_xy[ig] = -(v * s20pi3 * (y1x * dx_xy - y1y * dy_xy) - y5 * dv * tg * tgxy);
755               dt5_yz[ig] = -(v * s20pi3 * (y1x * dx_yz - y1y * dy_yz) - y5 * dv * tg * tgyz);
756               dt5_xz[ig] = -(v * s20pi3 * (y1x * dx_xz - y1y * dy_xz) - y5 * dv * tg * tgxz);
757 
758               // y6 = s54pi sqrt(3) x y / r^2
759               dt6_xx[ig] = -(v * s20pi3 * (dx_xx * y1y + y1x * dy_xx) - y6 * dv * tg * tgxx);
760               dt6_yy[ig] = -(v * s20pi3 * (dx_yy * y1y + y1x * dy_yy) - y6 * dv * tg * tgyy);
761               dt6_zz[ig] = -(v * s20pi3 * (dx_zz * y1y + y1x * dy_zz) - y6 * dv * tg * tgzz);
762               dt6_xy[ig] = -(v * s20pi3 * (dx_xy * y1y + y1x * dy_xy) - y6 * dv * tg * tgxy);
763               dt6_yz[ig] = -(v * s20pi3 * (dx_yz * y1y + y1x * dy_yz) - y6 * dv * tg * tgyz);
764               dt6_xz[ig] = -(v * s20pi3 * (dx_xz * y1y + y1x * dy_xz) - y6 * dv * tg * tgxz);
765 
766               // y7 = s54pi sqrt(3) y z / r^2
767               dt7_xx[ig] = -(v * s20pi3 * (dy_xx * y1z + y1y * dz_xx) - y7 * dv * tg * tgxx);
768               dt7_yy[ig] = -(v * s20pi3 * (dy_yy * y1z + y1y * dz_yy) - y7 * dv * tg * tgyy);
769               dt7_zz[ig] = -(v * s20pi3 * (dy_zz * y1z + y1y * dz_zz) - y7 * dv * tg * tgzz);
770               dt7_xy[ig] = -(v * s20pi3 * (dy_xy * y1z + y1y * dz_xy) - y7 * dv * tg * tgxy);
771               dt7_yz[ig] = -(v * s20pi3 * (dy_yz * y1z + y1y * dz_yz) - y7 * dv * tg * tgyz);
772               dt7_xz[ig] = -(v * s20pi3 * (dy_xz * y1z + y1y * dz_xz) - y7 * dv * tg * tgxz);
773 
774               // y8 = s54pi sqrt(3) z x / r^2
775               dt8_xx[ig] = -(v * s20pi3 * (dx_xx * y1z + y1x * dz_xx) - y8 * dv * tg * tgxx);
776               dt8_yy[ig] = -(v * s20pi3 * (dx_yy * y1z + y1x * dz_yy) - y8 * dv * tg * tgyy);
777               dt8_zz[ig] = -(v * s20pi3 * (dx_zz * y1z + y1x * dz_zz) - y8 * dv * tg * tgzz);
778               dt8_xy[ig] = -(v * s20pi3 * (dx_xy * y1z + y1x * dz_xy) - y8 * dv * tg * tgxy);
779               dt8_yz[ig] = -(v * s20pi3 * (dx_yz * y1z + y1x * dz_yz) - y8 * dv * tg * tgyz);
780               dt8_xz[ig] = -(v * s20pi3 * (dx_xz * y1z + y1x * dz_xz) - y8 * dv * tg * tgxz);
781             }
782           }
783           else
784           {
785             // semi-local
786             for ( int iquad = 0; iquad < nquad[is]; iquad++ )
787             {
788               // twnl[is][ipr][ig]
789               // ipr = iquad+nquad[is]*ilm
790               // index = ig + ngwl*ipr
791               const int ipr4 = iquad+nquad[is]*ilm;
792               const int ipr5 = iquad+nquad[is]*(ilm+1);
793               const int ipr6 = iquad+nquad[is]*(ilm+2);
794               const int ipr7 = iquad+nquad[is]*(ilm+3);
795               const int ipr8 = iquad+nquad[is]*(ilm+4);
796 
797               double *t4 = &twnl[is][ngwl*ipr4];
798               double *t5 = &twnl[is][ngwl*ipr5];
799               double *t6 = &twnl[is][ngwl*ipr6];
800               double *t7 = &twnl[is][ngwl*ipr7];
801               double *t8 = &twnl[is][ngwl*ipr8];
802 
803               // dtwnl[is][ipr][ij][ngwl]
804               // index = ig + ngwl * ( ij + 6 * ipr )
805               double *dt4_xx = &dtwnl[is][ngwl*(0+6*ipr4)];
806               double *dt4_yy = &dtwnl[is][ngwl*(1+6*ipr4)];
807               double *dt4_zz = &dtwnl[is][ngwl*(2+6*ipr4)];
808               double *dt4_xy = &dtwnl[is][ngwl*(3+6*ipr4)];
809               double *dt4_yz = &dtwnl[is][ngwl*(4+6*ipr4)];
810               double *dt4_xz = &dtwnl[is][ngwl*(5+6*ipr4)];
811 
812               double *dt5_xx = &dtwnl[is][ngwl*(0+6*ipr5)];
813               double *dt5_yy = &dtwnl[is][ngwl*(1+6*ipr5)];
814               double *dt5_zz = &dtwnl[is][ngwl*(2+6*ipr5)];
815               double *dt5_xy = &dtwnl[is][ngwl*(3+6*ipr5)];
816               double *dt5_yz = &dtwnl[is][ngwl*(4+6*ipr5)];
817               double *dt5_xz = &dtwnl[is][ngwl*(5+6*ipr5)];
818 
819               double *dt6_xx = &dtwnl[is][ngwl*(0+6*ipr6)];
820               double *dt6_yy = &dtwnl[is][ngwl*(1+6*ipr6)];
821               double *dt6_zz = &dtwnl[is][ngwl*(2+6*ipr6)];
822               double *dt6_xy = &dtwnl[is][ngwl*(3+6*ipr6)];
823               double *dt6_yz = &dtwnl[is][ngwl*(4+6*ipr6)];
824               double *dt6_xz = &dtwnl[is][ngwl*(5+6*ipr6)];
825 
826               double *dt7_xx = &dtwnl[is][ngwl*(0+6*ipr7)];
827               double *dt7_yy = &dtwnl[is][ngwl*(1+6*ipr7)];
828               double *dt7_zz = &dtwnl[is][ngwl*(2+6*ipr7)];
829               double *dt7_xy = &dtwnl[is][ngwl*(3+6*ipr7)];
830               double *dt7_yz = &dtwnl[is][ngwl*(4+6*ipr7)];
831               double *dt7_xz = &dtwnl[is][ngwl*(5+6*ipr7)];
832 
833               double *dt8_xx = &dtwnl[is][ngwl*(0+6*ipr8)];
834               double *dt8_yy = &dtwnl[is][ngwl*(1+6*ipr8)];
835               double *dt8_zz = &dtwnl[is][ngwl*(2+6*ipr8)];
836               double *dt8_xy = &dtwnl[is][ngwl*(3+6*ipr8)];
837               double *dt8_yz = &dtwnl[is][ngwl*(4+6*ipr8)];
838               double *dt8_xz = &dtwnl[is][ngwl*(5+6*ipr8)];
839 
840               const double r = rquad[is][iquad];
841               for ( int ig = 0; ig < ngwl; ig++ )
842               {
843                 double v = 0.0, dv = 0.0;
844                 // j_2(z) = (3/z^3-1/z) sin(z) - 3/z^2 cos(z)
845                 // j_2(z) = (1/z)*((3/z^2-1)*sin(z) - (3/z) cos(z) )
846 
847                 const double tg = kpg[ig];
848                 const double z = tg * r;
849                 if ( z != 0.0 )
850                 {
851                   const double zi = 1.0 / z;
852                   const double c = cos(z);
853                   const double s = sin(z);
854                   const double j2 = ((3.0*zi*zi - 1.0) * s - 3.0*zi * c ) * zi;
855                   const double z2 = z * z;
856                   const double dj2 =
857                     ( (4.0 * z2 - 9.0) * s + z*(9.0-z2) * c ) / (z2*z2) ;
858                   // v = 4 pi j2(Gr) r
859                   v = fpi * j2 * r;
860                   // dv = d/dG v = 4 pi dj2(Gr)/d(Gr) d(Gr)/dG r
861                   //    = 4 pi dj2 r^2
862                   dv = fpi * dj2 * r * r;
863                 }
864 
865                 const double tgx = kpg_x[ig];
866                 const double tgy = kpg_y[ig];
867                 const double tgz = kpg_z[ig];
868                 const double tgx2 = tgx * tgx;
869                 const double tgy2 = tgy * tgy;
870                 const double tgz2 = tgz * tgz;
871 
872                 const double tgi = kpgi[ig];
873                 const double tgi2 = tgi * tgi;
874 
875                 const double tgxx = tgx2 * tgi2;
876                 const double tgyy = tgy2 * tgi2;
877                 const double tgzz = tgz2 * tgi2;
878                 const double tgxy = tgx * tgy * tgi2;
879                 const double tgyz = tgy * tgz * tgi2;
880                 const double tgxz = tgx * tgz * tgi2;
881 
882                 const double y4 = s54pi * 0.5 * (3.0 * tgzz - 1.0 );
883                 const double y5 = s54pi * 0.5 * s3 * ( tgxx - tgyy );
884                 const double y6 = s54pi * s3 * tgxy;
885                 const double y7 = s54pi * s3 * tgyz;
886                 const double y8 = s54pi * s3 * tgxz;
887 
888                 const double y1x = s34pi * tgx * tgi;
889                 const double y1y = s34pi * tgy * tgi;
890                 const double y1z = s34pi * tgz * tgi;
891 
892                 const double dx_xx = y1x * tgxx - y1x;
893                 const double dx_yy = y1x * tgyy;
894                 const double dx_zz = y1x * tgzz;
895                 const double dx_xy = y1x * tgxy;
896                 const double dx_yz = y1x * tgyz;
897                 const double dx_xz = y1x * tgxz;
898 
899                 const double dy_xx = y1y * tgxx;
900                 const double dy_yy = y1y * tgyy - y1y;
901                 const double dy_zz = y1y * tgzz;
902                 const double dy_xy = y1y * tgxy - y1x;
903                 const double dy_yz = y1y * tgyz;
904                 const double dy_xz = y1y * tgxz;
905 
906                 const double dz_xx = y1z * tgxx;
907                 const double dz_yy = y1z * tgyy;
908                 const double dz_zz = y1z * tgzz - y1z;
909                 const double dz_xy = y1z * tgxy;
910                 const double dz_yz = y1z * tgyz - y1y;
911                 const double dz_xz = y1z * tgxz - y1x;
912 
913                 t4[ig]  = y4 * v;
914                 t5[ig]  = y5 * v;
915                 t6[ig]  = y6 * v;
916                 t7[ig]  = y7 * v;
917                 t8[ig]  = y8 * v;
918 
919                 // y4 = s54pi 1/2 ( 3 z^2/r^2 - 1 )
920                 dt4_xx[ig] = -(v * s20pi * dz_xx * y1z - y4 * dv * tg * tgxx);
921                 dt4_yy[ig] = -(v * s20pi * dz_yy * y1z - y4 * dv * tg * tgyy);
922                 dt4_zz[ig] = -(v * s20pi * dz_zz * y1z - y4 * dv * tg * tgzz);
923                 dt4_xy[ig] = -(v * s20pi * dz_xy * y1z - y4 * dv * tg * tgxy);
924                 dt4_yz[ig] = -(v * s20pi * dz_yz * y1z - y4 * dv * tg * tgyz);
925                 dt4_xz[ig] = -(v * s20pi * dz_xz * y1z - y4 * dv * tg * tgxz);
926 
927                 // y5 = s54pi sqrt(3)/2 ( x^2 - y^2 ) / r^2
928                 dt5_xx[ig] = -(v * s20pi3 * (y1x * dx_xx - y1y * dy_xx) - y5 * dv * tg * tgxx);
929                 dt5_yy[ig] = -(v * s20pi3 * (y1x * dx_yy - y1y * dy_yy) - y5 * dv * tg * tgyy);
930                 dt5_zz[ig] = -(v * s20pi3 * (y1x * dx_zz - y1y * dy_zz) - y5 * dv * tg * tgzz);
931                 dt5_xy[ig] = -(v * s20pi3 * (y1x * dx_xy - y1y * dy_xy) - y5 * dv * tg * tgxy);
932                 dt5_yz[ig] = -(v * s20pi3 * (y1x * dx_yz - y1y * dy_yz) - y5 * dv * tg * tgyz);
933                 dt5_xz[ig] = -(v * s20pi3 * (y1x * dx_xz - y1y * dy_xz) - y5 * dv * tg * tgxz);
934 
935                 // y6 = s54pi sqrt(3) x y / r^2
936                 dt6_xx[ig] = -(v * s20pi3 * (dx_xx * y1y + y1x * dy_xx) - y6 * dv * tg * tgxx);
937                 dt6_yy[ig] = -(v * s20pi3 * (dx_yy * y1y + y1x * dy_yy) - y6 * dv * tg * tgyy);
938                 dt6_zz[ig] = -(v * s20pi3 * (dx_zz * y1y + y1x * dy_zz) - y6 * dv * tg * tgzz);
939                 dt6_xy[ig] = -(v * s20pi3 * (dx_xy * y1y + y1x * dy_xy) - y6 * dv * tg * tgxy);
940                 dt6_yz[ig] = -(v * s20pi3 * (dx_yz * y1y + y1x * dy_yz) - y6 * dv * tg * tgyz);
941                 dt6_xz[ig] = -(v * s20pi3 * (dx_xz * y1y + y1x * dy_xz) - y6 * dv * tg * tgxz);
942 
943                 // y7 = s54pi sqrt(3) y z / r^2
944                 dt7_xx[ig] = -(v * s20pi3 * (dy_xx * y1z + y1y * dz_xx) - y7 * dv * tg * tgxx);
945                 dt7_yy[ig] = -(v * s20pi3 * (dy_yy * y1z + y1y * dz_yy) - y7 * dv * tg * tgyy);
946                 dt7_zz[ig] = -(v * s20pi3 * (dy_zz * y1z + y1y * dz_zz) - y7 * dv * tg * tgzz);
947                 dt7_xy[ig] = -(v * s20pi3 * (dy_xy * y1z + y1y * dz_xy) - y7 * dv * tg * tgxy);
948                 dt7_yz[ig] = -(v * s20pi3 * (dy_yz * y1z + y1y * dz_yz) - y7 * dv * tg * tgyz);
949                 dt7_xz[ig] = -(v * s20pi3 * (dy_xz * y1z + y1y * dz_xz) - y7 * dv * tg * tgxz);
950 
951                 // y8 = s54pi sqrt(3) z x / r^2
952                 dt8_xx[ig] = -(v * s20pi3 * (dx_xx * y1z + y1x * dz_xx) - y8 * dv * tg * tgxx);
953                 dt8_yy[ig] = -(v * s20pi3 * (dx_yy * y1z + y1x * dz_yy) - y8 * dv * tg * tgyy);
954                 dt8_zz[ig] = -(v * s20pi3 * (dx_zz * y1z + y1x * dz_zz) - y8 * dv * tg * tgzz);
955                 dt8_xy[ig] = -(v * s20pi3 * (dx_xy * y1z + y1x * dz_xy) - y8 * dv * tg * tgxy);
956                 dt8_yz[ig] = -(v * s20pi3 * (dx_yz * y1z + y1x * dz_yz) - y8 * dv * tg * tgyz);
957                 dt8_xz[ig] = -(v * s20pi3 * (dx_xz * y1z + y1x * dz_xz) - y8 * dv * tg * tgxz);
958               } // ig
959             } // iquad
960           }
961           ilm += 2*l+1;
962         }
963         else if ( l == 3 )
964         {
965           // only implemented for Kleiman-Bylander type
966           assert(nquad[is] == 0);
967           // Kleinman-Bylander
968           const int ipr09 = ilm;
969           const int ipr10 = ilm + 1;
970           const int ipr11 = ilm + 2;
971           const int ipr12 = ilm + 3;
972           const int ipr13 = ilm + 4;
973           const int ipr14 = ilm + 5;
974           const int ipr15 = ilm + 6;
975 
976           double *t09 = &twnl[is][ngwl * ipr09];
977           double *t10 = &twnl[is][ngwl * ipr10];
978           double *t11 = &twnl[is][ngwl * ipr11];
979           double *t12 = &twnl[is][ngwl * ipr12];
980           double *t13 = &twnl[is][ngwl * ipr13];
981           double *t14 = &twnl[is][ngwl * ipr14];
982           double *t15 = &twnl[is][ngwl * ipr15];
983 
984           // dtwnl[is][ipr][ij][ngwl]
985           // index = ig + ngwl * ( ij + 6 * ipr )
986           double *dt09_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr09 )];
987           double *dt09_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr09 )];
988           double *dt09_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr09 )];
989           double *dt09_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr09 )];
990           double *dt09_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr09 )];
991           double *dt09_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr09 )];
992 
993           double *dt10_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr10 )];
994           double *dt10_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr10 )];
995           double *dt10_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr10 )];
996           double *dt10_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr10 )];
997           double *dt10_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr10 )];
998           double *dt10_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr10 )];
999 
1000           double *dt11_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr11 )];
1001           double *dt11_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr11 )];
1002           double *dt11_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr11 )];
1003           double *dt11_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr11 )];
1004           double *dt11_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr11 )];
1005           double *dt11_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr11 )];
1006 
1007           double *dt12_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr12 )];
1008           double *dt12_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr12 )];
1009           double *dt12_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr12 )];
1010           double *dt12_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr12 )];
1011           double *dt12_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr12 )];
1012           double *dt12_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr12 )];
1013 
1014           double *dt13_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr13 )];
1015           double *dt13_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr13 )];
1016           double *dt13_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr13 )];
1017           double *dt13_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr13 )];
1018           double *dt13_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr13 )];
1019           double *dt13_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr13 )];
1020 
1021           double *dt14_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr14 )];
1022           double *dt14_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr14 )];
1023           double *dt14_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr14 )];
1024           double *dt14_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr14 )];
1025           double *dt14_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr14 )];
1026           double *dt14_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr14 )];
1027 
1028           double *dt15_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr15 )];
1029           double *dt15_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr15 )];
1030           double *dt15_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr15 )];
1031           double *dt15_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr15 )];
1032           double *dt15_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr15 )];
1033           double *dt15_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr15 )];
1034 
1035           for ( int ig = 0; ig < ngwl; ig++ )
1036           {
1037             double v, dv;
1038             const double tg = kpg[ig];
1039 
1040             s->dvnlg(iop,tg,v,dv);
1041 
1042             const double tgx = kpg_x[ig];
1043             const double tgy = kpg_y[ig];
1044             const double tgz = kpg_z[ig];
1045             const double tgx2 = tgx * tgx;
1046             const double tgy2 = tgy * tgy;
1047             const double tgz2 = tgz * tgz;
1048             const double tgx3 = tgx * tgx2;
1049             const double tgy3 = tgy * tgy2;
1050             const double tgz3 = tgz * tgz2;
1051 
1052             const double tgi = kpgi[ig];
1053             const double tgi2 = tgi * tgi;
1054             const double tgi3 = tgi * tgi2;
1055 
1056             const double tgxx = tgx2 * tgi2;
1057             const double tgyy = tgy2 * tgi2;
1058             const double tgzz = tgz2 * tgi2;
1059             const double tgxy = tgx * tgy * tgi2;
1060             const double tgyz = tgy * tgz * tgi2;
1061             const double tgxz = tgx * tgz * tgi2;
1062 
1063             const double tgxxx = tgx3 * tgi3;
1064             const double tgyyy = tgy3 * tgi3;
1065             const double tgzzz = tgz3 * tgi3;
1066             const double tgxyy = tgx * tgy2 * tgi3;
1067             const double tgxzz = tgx * tgz2 * tgi3;
1068             const double tgyxx = tgy * tgx2 * tgi3;
1069             const double tgyzz = tgy * tgz2 * tgi3;
1070             const double tgzxx = tgz * tgx2 * tgi3;
1071             const double tgzyy = tgz * tgy2 * tgi3;
1072             const double tgxyz = tgx * tgy * tgz * tgi3;
1073 
1074             const double factor = 0.5 * s74pi;
1075 
1076             const double y09 = factor * s52 * ( 3.0 * tgyxx - tgyyy );
1077             const double y10 = factor * s15 * 2.0 * tgxyz;
1078             const double y11 = factor * s32 * ( 4.0 * tgyzz - tgyxx - tgyyy );
1079             const double y12 = factor
1080               * ( 2.0 * tgzzz - 3.0 * ( tgzxx + tgzyy ) );
1081             const double y13 = factor * s32 * ( 4.0 * tgxzz - tgxxx - tgxyy );
1082             const double y14 = factor * s15 * ( tgzxx - tgzyy );
1083             const double y15 = factor * s52 * ( tgxxx - 3.0 * tgxyy );
1084 
1085             // derivative of x^3/r^3 w.r.t. x, y, and z
1086             const double dx_xxx = 3.0 * tgxx * ( 1.0 - tgxx ) * tgi;
1087             const double dy_xxx = -3.0 * tgxx * tgxy * tgi;
1088             const double dz_xxx = -3.0 * tgxx * tgxz * tgi;
1089             // derivative of y^3/r^3 w.r.t. x, y, and z
1090             const double dx_yyy = -3.0 * tgyy * tgxy * tgi;
1091             const double dy_yyy = 3.0 * tgyy * ( 1.0 - tgyy ) * tgi;
1092             const double dz_yyy = -3.0 * tgyy * tgyz * tgi;
1093             // derivative of z^3/r^3 w.r.t. x, y, and z
1094             const double dx_zzz = -3.0 * tgzz * tgxz * tgi;
1095             const double dy_zzz = -3.0 * tgzz * tgyz * tgi;
1096             const double dz_zzz = 3.0 * tgzz * ( 1.0 - tgzz ) * tgi;
1097             // derivative of xy^2/r^3 w.r.t. x, y, and z
1098             const double dx_xyy = tgyy * ( 1.0 - 3.0 * tgxx ) * tgi;
1099             const double dy_xyy = tgxy * ( 2.0 - 3.0 * tgyy ) * tgi;
1100             const double dz_xyy = -3.0 * tgyy * tgxz * tgi;
1101             // derivative of xz^2/r^3 w.r.t. x, y, and z
1102             const double dx_xzz = tgzz * ( 1.0 - 3.0 * tgxx ) * tgi;
1103             const double dy_xzz = -3.0 * tgzz * tgxy * tgi;
1104             const double dz_xzz = tgxz * ( 2.0 - 3.0 * tgzz ) * tgi;
1105             // derivative of yx^2/r^3 w.r.t. x, y, and z
1106             const double dx_yxx = tgxy * ( 2.0 - 3.0 * tgxx ) * tgi;
1107             const double dy_yxx = tgxx * ( 1.0 - 3.0 * tgyy ) * tgi;
1108             const double dz_yxx = -3.0 * tgxx * tgyz * tgi;
1109             // derivative of yz^2/r^3 w.r.t. x, y, and z
1110             const double dx_yzz = -3.0 * tgzz * tgxy * tgi;
1111             const double dy_yzz = tgzz * ( 1.0 - 3.0 * tgyy ) * tgi;
1112             const double dz_yzz = tgyz * ( 2.0 - 3.0 * tgzz ) * tgi;
1113             // derivative of zx^2/r^3 w.r.t. x, y, and z
1114             const double dx_zxx = tgxz * ( 2.0 - 3.0 * tgxx ) * tgi;
1115             const double dy_zxx = -3.0 * tgxx * tgyz * tgi;
1116             const double dz_zxx = tgxx * ( 1.0 - 3.0 * tgzz ) * tgi;
1117             // derivative of zy^2/r^3 w.r.t. x, y, and z
1118             const double dx_zyy = -3.0 * tgyy * tgxz * tgi;
1119             const double dy_zyy = tgyz * ( 2.0 - 3.0 * tgyy ) * tgi;
1120             const double dz_zyy = tgyy * ( 1.0 - 3.0 * tgzz ) * tgi;
1121             // derivative of xyz/r^3 w.r.t. x, y, and z
1122             const double dx_xyz = tgyz * ( 1 - 3.0 * tgxx ) * tgi;
1123             const double dy_xyz = tgxz * ( 1 - 3.0 * tgyy ) * tgi;
1124             const double dz_xyz = tgxy * ( 1 - 3.0 * tgzz ) * tgi;
1125 
1126             // derivatives of spherical harmonics
1127 
1128             // y9 = factor * s52 * ( 3.0 * tgyxx - tgyyy );
1129             const double dx_y09 = factor * s52 * ( 3.0 * dx_yxx - dx_yyy );
1130             const double dy_y09 = factor * s52 * ( 3.0 * dy_yxx - dy_yyy );
1131             const double dz_y09 = factor * s52 * ( 3.0 * dz_yxx - dz_yyy );
1132             // y10 = factor * s15 * 2.0 * tgxyz;
1133             const double dx_y10 = factor * s15 * 2.0 * dx_xyz;
1134             const double dy_y10 = factor * s15 * 2.0 * dy_xyz;
1135             const double dz_y10 = factor * s15 * 2.0 * dz_xyz;
1136             // y11 = factor * s32 * ( 4.0 * tgyzz - tgyxx - tgyyy );
1137             const double dx_y11 = factor * s32 * ( 4.0 * dx_yzz - dx_yxx - dx_yyy );
1138             const double dy_y11 = factor * s32 * ( 4.0 * dy_yzz - dy_yxx - dy_yyy );
1139             const double dz_y11 = factor * s32 * ( 4.0 * dz_yzz - dz_yxx - dz_yyy );
1140             // y12 = factor * ( 2.0 * tgzzz - 3.0 * ( tgzxx + tgzyy ) );
1141             const double dx_y12 = factor
1142               * ( 2.0 * dx_zzz - 3.0 * ( dx_zxx + dx_zyy ) );
1143             const double dy_y12 = factor
1144               * ( 2.0 * dy_zzz - 3.0 * ( dy_zxx + dy_zyy ) );
1145             const double dz_y12 = factor
1146               * ( 2.0 * dz_zzz - 3.0 * ( dz_zxx + dz_zyy ) );
1147             // y13 = factor * s32 * ( 4.0 * tgxzz - tgxxx - tgxyy );
1148             const double dx_y13 = factor * s32 * ( 4.0 * dx_xzz - dx_xxx - dx_xyy );
1149             const double dy_y13 = factor * s32 * ( 4.0 * dy_xzz - dy_xxx - dy_xyy );
1150             const double dz_y13 = factor * s32 * ( 4.0 * dz_xzz - dz_xxx - dz_xyy );
1151             // y14 = factor * s15 * ( tgzxx - tgzyy );
1152             const double dx_y14 = factor * s15 * ( dx_zxx - dx_zyy );
1153             const double dy_y14 = factor * s15 * ( dy_zxx - dy_zyy );
1154             const double dz_y14 = factor * s15 * ( dz_zxx - dz_zyy );
1155             // y15 = factor * s52 * ( tgxxx - 3.0 * tgxyy );
1156             const double dx_y15 = factor * s52 * ( dx_xxx - 3.0 * dx_xyy );
1157             const double dy_y15 = factor * s52 * ( dy_xxx - 3.0 * dy_xyy );
1158             const double dz_y15 = factor * s52 * ( dz_xxx - 3.0 * dz_xyy );
1159 
1160             t09[ig] = y09 * v;
1161             t10[ig] = y10 * v;
1162             t11[ig] = y11 * v;
1163             t12[ig] = y12 * v;
1164             t13[ig] = y13 * v;
1165             t14[ig] = y14 * v;
1166             t15[ig] = y15 * v;
1167 
1168             // contribution to stress tensor
1169             // sigma_ij = 0.5 * (xi * dj + xj * di)
1170 
1171             dt09_xx[ig] = -( v * dx_y09 * tgx - y09 * dv * tg * tgxx );
1172             dt09_yy[ig] = -( v * dy_y09 * tgy - y09 * dv * tg * tgyy );
1173             dt09_zz[ig] = -( v * dz_y09 * tgz - y09 * dv * tg * tgzz );
1174             dt09_xy[ig] = -( 0.5 * v * ( dx_y09 * tgy + dy_y09 * tgx )
1175               - y09 * dv * tg * tgxy );
1176             dt09_yz[ig] = -( 0.5 * v * ( dy_y09 * tgz + dz_y09 * tgy )
1177               - y09 * dv * tg * tgyz );
1178             dt09_xz[ig] = -( 0.5 * v * ( dx_y09 * tgz + dz_y09 * tgx )
1179               - y09 * dv * tg * tgxz );
1180 
1181             dt10_xx[ig] = -( v * dx_y10 * tgx - y10 * dv * tg * tgxx );
1182             dt10_yy[ig] = -( v * dy_y10 * tgy - y10 * dv * tg * tgyy );
1183             dt10_zz[ig] = -( v * dz_y10 * tgz - y10 * dv * tg * tgzz );
1184             dt10_xy[ig] = -( 0.5 * v * ( dx_y10 * tgy + dy_y10 * tgx )
1185               - y10 * dv * tg * tgxy );
1186             dt10_yz[ig] = -( 0.5 * v * ( dy_y10 * tgz + dz_y10 * tgy )
1187               - y10 * dv * tg * tgyz );
1188             dt10_xz[ig] = -( 0.5 * v * ( dx_y10 * tgz + dz_y10 * tgx )
1189               - y10 * dv * tg * tgxz );
1190 
1191             dt11_xx[ig] = -( v * dx_y11 * tgx - y11 * dv * tg * tgxx );
1192             dt11_yy[ig] = -( v * dy_y11 * tgy - y11 * dv * tg * tgyy );
1193             dt11_zz[ig] = -( v * dz_y11 * tgz - y11 * dv * tg * tgzz );
1194             dt11_xy[ig] = -( 0.5 * v * ( dx_y11 * tgy + dy_y11 * tgx )
1195               - y11 * dv * tg * tgxy );
1196             dt11_yz[ig] = -( 0.5 * v * ( dy_y11 * tgz + dz_y11 * tgy )
1197               - y11 * dv * tg * tgyz );
1198             dt11_xz[ig] = -( 0.5 * v * ( dx_y11 * tgz + dz_y11 * tgx )
1199               - y11 * dv * tg * tgxz );
1200 
1201             dt12_xx[ig] = -( v * dx_y12 * tgx - y12 * dv * tg * tgxx );
1202             dt12_yy[ig] = -( v * dy_y12 * tgy - y12 * dv * tg * tgyy );
1203             dt12_zz[ig] = -( v * dz_y12 * tgz - y12 * dv * tg * tgzz );
1204             dt12_xy[ig] = -( 0.5 * v * ( dx_y12 * tgy + dy_y12 * tgx )
1205               - y12 * dv * tg * tgxy );
1206             dt12_yz[ig] = -( 0.5 * v * ( dy_y12 * tgz + dz_y12 * tgy )
1207               - y12 * dv * tg * tgyz );
1208             dt12_xz[ig] = -( 0.5 * v * ( dx_y12 * tgz + dz_y12 * tgx )
1209               - y12 * dv * tg * tgxz );
1210 
1211             dt13_xx[ig] = -( v * dx_y13 * tgx - y13 * dv * tg * tgxx );
1212             dt13_yy[ig] = -( v * dy_y13 * tgy - y13 * dv * tg * tgyy );
1213             dt13_zz[ig] = -( v * dz_y13 * tgz - y13 * dv * tg * tgzz );
1214             dt13_xy[ig] = -( 0.5 * v * ( dx_y13 * tgy + dy_y13 * tgx )
1215               - y13 * dv * tg * tgxy );
1216             dt13_yz[ig] = -( 0.5 * v * ( dy_y13 * tgz + dz_y13 * tgy )
1217               - y13 * dv * tg * tgyz );
1218             dt13_xz[ig] = -( 0.5 * v * ( dx_y13 * tgz + dz_y13 * tgx )
1219               - y13 * dv * tg * tgxz );
1220 
1221             dt14_xx[ig] = -( v * dx_y14 * tgx - y14 * dv * tg * tgxx );
1222             dt14_yy[ig] = -( v * dy_y14 * tgy - y14 * dv * tg * tgyy );
1223             dt14_zz[ig] = -( v * dz_y14 * tgz - y14 * dv * tg * tgzz );
1224             dt14_xy[ig] = -( 0.5 * v * ( dx_y14 * tgy + dy_y14 * tgx )
1225               - y14 * dv * tg * tgxy );
1226             dt14_yz[ig] = -( 0.5 * v * ( dy_y14 * tgz + dz_y14 * tgy )
1227               - y14 * dv * tg * tgyz );
1228             dt14_xz[ig] = -( 0.5 * v * ( dx_y14 * tgz + dz_y14 * tgx )
1229               - y14 * dv * tg * tgxz );
1230 
1231             dt15_xx[ig] = -( v * dx_y15 * tgx - y15 * dv * tg * tgxx );
1232             dt15_yy[ig] = -( v * dy_y15 * tgy - y15 * dv * tg * tgyy );
1233             dt15_zz[ig] = -( v * dz_y15 * tgz - y15 * dv * tg * tgzz );
1234             dt15_xy[ig] = -( 0.5 * v * ( dx_y15 * tgy + dy_y15 * tgx )
1235               - y15 * dv * tg * tgxy );
1236             dt15_yz[ig] = -( 0.5 * v * ( dy_y15 * tgz + dz_y15 * tgy )
1237               - y15 * dv * tg * tgyz );
1238             dt15_xz[ig] = -( 0.5 * v * ( dx_y15 * tgz + dz_y15 * tgx )
1239               - y15 * dv * tg * tgxz );
1240 
1241           }
1242 
1243           ilm += 2 * l + 1;
1244 
1245         } // l == 3
1246         else
1247         {
1248           assert(false);
1249         }
1250       } // l != lloc[is]
1251     } // for l
1252     assert(ilm == s->nlm());
1253   }
1254   tmap["update_twnl"].stop();
1255 }
1256 
1257 ////////////////////////////////////////////////////////////////////////////////
energy(bool compute_hpsi,SlaterDet & dsd,bool compute_forces,vector<vector<double>> & fion_enl,bool compute_stress,valarray<double> & sigma_enl)1258 double NonLocalPotential::energy(bool compute_hpsi, SlaterDet& dsd,
1259     bool compute_forces, vector<vector<double> >& fion_enl,
1260     bool compute_stress, valarray<double>& sigma_enl)
1261 {
1262   for ( int is = 0; is < fion_enl.size(); is++ )
1263     for ( int i = 0; i < fion_enl[is].size(); i++ )
1264        fion_enl[is][i] = 0.0;
1265   sigma_enl = 0.0;
1266 
1267   if ( nspnl == 0 ) return 0.0;
1268 
1269   double enl = 0.0;
1270   double tsum[6] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
1271   const vector<double>& occ = sd_.occ();
1272   const int ngwl = basis_.localsize();
1273   // define atom block size
1274   const int na_block_size = 32;
1275   valarray<double> gr(na_block_size*ngwl); // gr[ig+ia*ngwl]
1276   valarray<double> cgr(na_block_size*ngwl); // cgr[ig+ia*ngwl]
1277   valarray<double> sgr(na_block_size*ngwl); // sgr[ig+ia*ngwl]
1278   vector<vector<double> > tau;
1279   atoms_.get_positions(tau);
1280 
1281   const double omega = basis_.cell().volume();
1282   assert(omega != 0.0);
1283   const double omega_inv = 1.0 / omega;
1284 
1285   for ( int is = 0; is < nsp; is++ )
1286   {
1287     Species *s = atoms_.species_list[is];
1288     if ( npr[is] > 0 ) // species is is non-local
1289     {
1290       valarray<double> tmpfion(3*na[is]);
1291       tmpfion = 0.0;
1292       // define number of atom blocks
1293       const int na_blocks = na[is] / na_block_size +
1294                             ( na[is] % na_block_size == 0 ? 0 : 1 );
1295 
1296       valarray<double> anl_loc(npr[is]*na_block_size*2*ngwl);
1297       const int nstloc = sd_.nstloc();
1298       // fnl_loc[ipra][n]
1299       // fnl is real if basis is real, complex otherwise
1300       const int fnl_loc_size = basis_.real() ? npr[is]*na_block_size*nstloc :
1301         2*npr[is]*na_block_size*nstloc;
1302       valarray<double> fnl_loc(fnl_loc_size);
1303       valarray<double> fnl_buf(fnl_loc_size);
1304       for ( int ia_block = 0; ia_block < na_blocks; ia_block++ )
1305       {
1306         // process projectors of atoms in block ia_block
1307 
1308         const int iastart = ia_block * na_block_size;
1309         const int iaend = (ia_block+1) * na_block_size < na[is] ?
1310                         (ia_block+1) * na_block_size :
1311                         na[is];
1312         const int ia_block_size = iaend - iastart;
1313 
1314         // compute cgr[is][ia][ig], sgr[is][ia][ig]
1315         tmap["comp_eigr"].start();
1316         int k = 3;
1317         double mone = -1.0, zero = 0.0;
1318         char cn='n';
1319 
1320         // next line: const cast is ok since dgemm_ does not modify argument
1321         double* kpgx = const_cast<double*>(basis_.kpgx_ptr(0));
1322         dgemm(&cn,&cn,(int*)&ngwl,(int*)&ia_block_size,&k,&mone,
1323               kpgx,(int*)&ngwl, &tau[is][3*iastart],&k,
1324               &zero,&gr[0],(int*)&ngwl);
1325 
1326         int len = ia_block_size * ngwl;
1327 #if USE_MASSV
1328         vsincos(&sgr[0],&cgr[0],&gr[0],&len);
1329 #else
1330         for ( int i = 0; i < len; i++ )
1331         {
1332           const double arg = gr[i];
1333           sgr[i] = sin(arg);
1334           cgr[i] = cos(arg);
1335         }
1336 #endif
1337         tmap["comp_eigr"].stop();
1338 
1339         // compute anl_loc
1340         tmap["comp_anl"].start();
1341         for ( int ipr = 0; ipr < npr[is]; ipr++ )
1342         {
1343           // twnl[is][ig+ngwl*ipr]
1344           const double * t = &twnl[is][ngwl*ipr];
1345           const int l = lproj[is][ipr];
1346 
1347           // anl_loc[ig+ipra*ngwl]
1348 
1349           for ( int ia = 0; ia < ia_block_size; ia++ )
1350           {
1351             double* a = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl];
1352             const double* c = &cgr[ia*ngwl];
1353             const double* s = &sgr[ia*ngwl];
1354             if ( l == 0 )
1355             {
1356               for ( int ig = 0; ig < ngwl; ig++ )
1357               {
1358                 a[2*ig]   = t[ig] * c[ig];
1359                 a[2*ig+1] = t[ig] * s[ig];
1360               }
1361             }
1362             else if ( l == 1 )
1363             {
1364               for ( int ig = 0; ig < ngwl; ig++ )
1365               {
1366                 /* Next line: -i * eigr */
1367                 /* -i * (a+i*b) = b - i*a */
1368                 a[2*ig]   =  t[ig] * s[ig];
1369                 a[2*ig+1] = -t[ig] * c[ig];
1370               }
1371             }
1372             else if ( l == 2 )
1373             {
1374               for ( int ig = 0; ig < ngwl; ig++ )
1375               {
1376                 // Next line: (-) sign for -eigr
1377                 a[2*ig]   = -t[ig] * c[ig];
1378                 a[2*ig+1] = -t[ig] * s[ig];
1379               }
1380             }
1381             else if ( l == 3 )
1382             {
1383               for ( int ig = 0; ig < ngwl; ig++ )
1384               {
1385                 // Next line: i * eigr
1386                 // i * (a+i*b) = -b + i*a
1387                 a[2*ig]   = -t[ig] * s[ig];
1388                 a[2*ig+1] =  t[ig] * c[ig];
1389               }
1390             }
1391           }
1392         } // ipr
1393         tmap["comp_anl"].stop();
1394 
1395         // array anl_loc is complete
1396 
1397         // compute fnl[npra][nstloc] = anl^T * c
1398         int nprnaloc = ia_block_size * npr[is];
1399         if ( basis_.real() )
1400         {
1401           double one=1.0;
1402           char ct='t';
1403           int twongwl = 2 * ngwl;
1404           int c_lda = 2*sd_.c().mloc();
1405           const complex<double>* c = sd_.c().cvalptr();
1406           tmap["fnl_gemm"].start();
1407           dgemm(&ct,&cn,&nprnaloc,(int*)&nstloc,&twongwl,&one,
1408                 &anl_loc[0],&twongwl, (double*)c, &c_lda,
1409                 &zero,&fnl_loc[0],&nprnaloc);
1410           tmap["fnl_gemm"].stop();
1411 
1412           // correct for double counting if ctxt_.myrow() == 0
1413           if ( ctxt_.myrow() == 0 )
1414           {
1415             // rank-one update
1416             // dger(m,n,alpha,x,incx,y,incy,a,lda);
1417             // a += alpha * x * transpose(y)
1418             // x = first row of anl_loc
1419             // y^T = first row of c
1420             double alpha = -0.5;
1421             dger(&nprnaloc,(int*)&nstloc,&alpha,&anl_loc[0],&twongwl,
1422                  (double*)c,&c_lda,&fnl_loc[0],&nprnaloc);
1423           }
1424         }
1425         else
1426         {
1427           // fnl is complex
1428           complex<double> cone=1.0;
1429           complex<double> czero=0.0;
1430           char cc='c';
1431           int c_lda = sd_.c().mloc();
1432           const complex<double>* c = sd_.c().cvalptr();
1433           tmap["fnl_zemm"].start();
1434           zgemm(&cc,&cn,&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone,
1435                 (complex<double>*) &anl_loc[0],(int*)&ngwl,
1436                 (complex<double>*) c, &c_lda,&czero,
1437                 (complex<double>*)&fnl_loc[0],&nprnaloc);
1438           tmap["fnl_zemm"].stop();
1439         }
1440 
1441 #if USE_MPI
1442         tmap["fnl_allreduce"].start();
1443         // Allreduce fnl partial sum
1444         MPI_Comm basis_comm = basis_.comm();
1445         int fnl_size = basis_.real() ? nprnaloc*nstloc : 2*nprnaloc*nstloc;
1446         MPI_Allreduce(&fnl_loc[0],&fnl_buf[0],fnl_size,
1447                       MPI_DOUBLE,MPI_SUM,basis_comm);
1448         tmap["fnl_allreduce"].stop();
1449 
1450         // factor 2.0 in next line is: counting G, -G
1451         if ( basis_.real() )
1452           fnl_loc = 2.0 * fnl_buf;
1453         else
1454           fnl_loc = fnl_buf;
1455 #else
1456         // factor 2.0 in next line is: counting G, -G
1457         if ( basis_.real() )
1458           fnl_loc *= 2.0;
1459 #endif
1460 
1461         // if the species has multiple projectors, that are not orthogonal
1462         // multiply with D matrix
1463         if ( s->has_dmatrix() )
1464         {
1465           if ( basis_.real() )
1466           {
1467             // helper variables
1468             double one = 1.0;
1469             double zero = 0.0;
1470             const size_t dsize = nprnaloc * nprnaloc;
1471             //
1472             // construct D matrix
1473             //
1474             // allocate array
1475             valarray<double> dmatrix(zero,dsize);
1476             // loop over all elements
1477             for ( int i = 0; i < nprnaloc; i++ )
1478             {
1479               // extract atom, l, and m corresponding to this index
1480               const int iatom = i % ia_block_size;
1481               const int ipr = i / ia_block_size;
1482               for ( int j = 0; j <= i; j++ )
1483               {
1484                 // extract atom, l, and m corresponding to this index
1485                 const int jatom = j % ia_block_size;
1486                 const int jpr = j / ia_block_size;
1487                 // D matrix is diagonal in atom, l, and m
1488                 if ( iatom != jatom ) continue;
1489                 const double val = s->dmatrix(ipr,jpr);
1490                 dmatrix[nprnaloc * i + j] = val;
1491                 // matrix is symmetric
1492                 if ( i != j ) dmatrix[nprnaloc * j + i] = val;
1493               }
1494             }
1495             // multiply F'_In = D_IJ F_Jn
1496             dgemm(&cn,&cn,&nprnaloc,(int*) &nstloc,&nprnaloc,&one,&dmatrix[0],
1497               &nprnaloc,&fnl_loc[0],&nprnaloc,&zero,&fnl_buf[0],&nprnaloc);
1498           }
1499           else
1500           {
1501             // helper variables
1502             complex<double> one = 1.0;
1503             complex<double> zero = 0.0;
1504             const size_t dsize = nprnaloc * nprnaloc;
1505             //
1506             // construct D matrix
1507             //
1508             // allocate array
1509             valarray < complex<double> > dmatrix(zero,dsize);
1510             // loop over all elements
1511             for ( int i = 0; i < nprnaloc; i++ )
1512             {
1513               // extract atom, l, and m corresponding to this index
1514               const int iatom = i % ia_block_size;
1515               const int ipr = i / ia_block_size;
1516               for ( int j = 0; j <= i; j++ )
1517               {
1518                 // extract atom, l, and m corresponding to this index
1519                 const int jatom = j % ia_block_size;
1520                 const int jpr = j / ia_block_size;
1521                 // D matrix is diagonal in atom, l, and m
1522                 if ( iatom != jatom ) continue;
1523                 const double val = s->dmatrix(ipr,jpr);
1524                 dmatrix[nprnaloc * i + j] = val;
1525                 // matrix is symmetric
1526                 if ( i != j ) dmatrix[nprnaloc * j + i] = val;
1527               }
1528             }
1529             // multiply F'_In = D_IJ F_Jn
1530             zgemm(&cn,&cn,&nprnaloc,(int*) &nstloc,&nprnaloc,&one,&dmatrix[0],
1531               &nprnaloc,(complex<double>*) &fnl_loc[0],&nprnaloc,&zero,
1532               (complex<double>*) &fnl_buf[0],&nprnaloc);
1533           }
1534         }
1535         else
1536         {
1537           fnl_buf = fnl_loc;
1538         }
1539 
1540         // accumulate Enl contribution
1541         const int nbase = ctxt_.mycol() * sd_.c().nb();
1542         if ( basis_.real() )
1543         {
1544           for ( int ipr = 0; ipr < npr[is]; ipr++ )
1545           {
1546             const double fac = wt[is][ipr] * omega_inv;
1547             for ( int n = 0; n < nstloc; n++ )
1548             {
1549               const double facn = fac * occ[n + nbase];
1550               for ( int ia = 0; ia < ia_block_size; ia++ )
1551               {
1552                 const int i = ia + ipr*ia_block_size + n * nprnaloc;
1553                 //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1554                 //     << ",n=" << n << "]: " << fnl_loc[i] << endl;
1555                 enl += facn * fnl_loc[i] * fnl_buf[i];
1556                 fnl_loc[i] = fac * fnl_buf[i];
1557               }
1558             }
1559           }
1560         }
1561         else
1562         {
1563           // fnl is complex
1564           for ( int ipr = 0; ipr < npr[is]; ipr++ )
1565           {
1566             const double fac = wt[is][ipr] * omega_inv;
1567             for ( int n = 0; n < nstloc; n++ )
1568             {
1569               const double facn = fac * occ[n + nbase];
1570               for ( int ia = 0; ia < ia_block_size; ia++ )
1571               {
1572                 const int i = ia + ipr*ia_block_size + n * nprnaloc;
1573                 //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1574                 //     << ",n=" << n << "]: " << fnl_loc[i] << endl;
1575                 const double f_re = fnl_loc[2*i];
1576                 const double f_im = fnl_loc[2*i+1];
1577                 const double fb_re = fnl_buf[2*i];
1578                 const double fb_im = fnl_buf[2*i+1];
1579                 enl += facn * (f_re*fb_re + f_im*fb_im);
1580                 fnl_loc[2*i] = fac * fb_re;
1581                 fnl_loc[2*i+1] = fac * fb_im;
1582               }
1583             }
1584           }
1585         }
1586 
1587         if ( compute_hpsi )
1588         {
1589           if ( basis_.real() )
1590           {
1591             tmap["enl_hpsi"].start();
1592             // compute cp += anl * fnl
1593             complex<double>* cp = dsd.c().valptr();
1594             int cp_lda = 2*dsd.c().mloc();
1595             int twongwl = 2 * ngwl;
1596             double one = 1.0;
1597             dgemm(&cn,&cn,&twongwl,(int*)&nstloc,&nprnaloc,&one,
1598                   &anl_loc[0],&twongwl, &fnl_loc[0],&nprnaloc,
1599                   &one,(double*)cp, &cp_lda);
1600             tmap["enl_hpsi"].stop();
1601           }
1602           else
1603           {
1604             tmap["enl_hpsi"].start();
1605             // compute cp += anl * fnl
1606             complex<double>* cp = dsd.c().valptr();
1607             complex<double> cone = 1.0;
1608             int cp_lda = dsd.c().mloc();
1609             zgemm(&cn,&cn,(int*)&ngwl,(int*)&nstloc,&nprnaloc,&cone,
1610                   (complex<double>*) &anl_loc[0],(int*)&ngwl,
1611                   (complex<double>*) &fnl_loc[0],&nprnaloc,
1612                   &cone,cp, &cp_lda);
1613             tmap["enl_hpsi"].stop();
1614           }
1615         }
1616 
1617         // ionic forces
1618         if ( compute_forces )
1619         {
1620           tmap["enl_fion"].start();
1621 
1622           valarray<double> dfnl_loc(fnl_loc_size);
1623           for ( int j = 0; j < 3; j++ )
1624           {
1625             const double *const kpgxj = basis_.kpgx_ptr(j);
1626 
1627             // compute anl_loc
1628             for ( int ipr = 0; ipr < npr[is]; ipr++ )
1629             {
1630               // twnl[is][ig+ngwl*ipr]
1631               const double * t = &twnl[is][ngwl*ipr];
1632               const int l = lproj[is][ipr];
1633 
1634               // anl_loc[ig+ipra*ngwl]
1635 
1636               for ( int ia = 0; ia < ia_block_size; ia++ )
1637               {
1638                 double* a = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl];
1639                 const double* c = &cgr[ia*ngwl];
1640                 const double* s = &sgr[ia*ngwl];
1641                 if ( l == 0 )
1642                 {
1643                   for ( int ig = 0; ig < ngwl; ig++ )
1644                   {
1645                     const double tt = kpgxj[ig] * t[ig];
1646                     // Next lines: -i * ( a + ib ) = b - ia
1647                     a[2*ig]   =  tt * s[ig];
1648                     a[2*ig+1] = -tt * c[ig];
1649                   }
1650                 }
1651                 else if ( l == 1 )
1652                 {
1653                   for ( int ig = 0; ig < ngwl; ig++ )
1654                   {
1655                     // Next lines: (-i)**2 * ( a + ib ) = - a - ib
1656                     const double tt = - kpgxj[ig] * t[ig];
1657                     a[2*ig]   = tt * c[ig];
1658                     a[2*ig+1] = tt * s[ig];
1659                   }
1660                 }
1661                 else if ( l == 2 )
1662                 {
1663                   for ( int ig = 0; ig < ngwl; ig++ )
1664                   {
1665                     // Next lines: (-i) * - ( a + ib ) = i*(a+ib) = - b + ia
1666                     const double tt = kpgxj[ig] * t[ig];
1667                     a[2*ig]   = -tt * s[ig];
1668                     a[2*ig+1] =  tt * c[ig];
1669                   }
1670                 }
1671                 else if ( l == 3 )
1672                 {
1673                   for ( int ig = 0; ig < ngwl; ig++ )
1674                   {
1675                     // Next lines: (-i)**4 * ( a + ib ) = a + ib
1676                     const double tt = kpgxj[ig] * t[ig];
1677                     a[2*ig]   = tt * c[ig];
1678                     a[2*ig+1] = tt * s[ig];
1679                   }
1680                 }
1681               }
1682             } // ipr
1683 
1684             // array anl_loc is complete
1685 
1686             // compute dfnl[npra][nstloc] = anl^T * c
1687             if ( basis_.real() )
1688             {
1689               // factor 2.0 in next line is: counting G, -G
1690               // Note: no need to correct for double counting of the
1691               // G=0 component which is always zero
1692               double two=2.0;
1693               char ct='t';
1694               const int twongwl = 2 * ngwl;
1695               const int nprnaloc = ia_block_size * npr[is];
1696               int c_lda = 2*sd_.c().mloc();
1697               const complex<double>* c = sd_.c().cvalptr();
1698               dgemm(&ct,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&twongwl,&two,
1699                     &anl_loc[0],(int*)&twongwl, (double*)c,(int*)&c_lda,
1700                     &zero,&dfnl_loc[0],(int*)&nprnaloc);
1701             }
1702             else
1703             {
1704               complex<double> cone=1.0;
1705               complex<double> czero=0.0;
1706               char cc='c';
1707               const int nprnaloc = ia_block_size * npr[is];
1708               int c_lda = sd_.c().mloc();
1709               const complex<double>* c = sd_.c().cvalptr();
1710               zgemm(&cc,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone,
1711                     (complex<double>*) &anl_loc[0],(int*)&ngwl,
1712                     (complex<double>*) c,(int*)&c_lda,
1713                     &czero,(complex<double>*)&dfnl_loc[0],(int*)&nprnaloc);
1714             }
1715 
1716             // accumulate non-local contributions to forces
1717             if ( basis_.real() )
1718             {
1719               for ( int ipr = 0; ipr < npr[is]; ipr++ )
1720               {
1721                 for ( int n = 0; n < nstloc; n++ )
1722                 {
1723                   // Factor 2.0 in next line from derivative of |Fnl|^2
1724                   const double facn = 2.0 * occ[n + nbase];
1725                   for ( int ia = 0; ia < ia_block_size; ia++ )
1726                   {
1727                     const int ia_global = ia + iastart;
1728                     const int i = ia + ipr*ia_block_size + n * nprnaloc;
1729                     //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1730                     //     << ",n=" << n << "]: " << fnl_loc[i] << endl;
1731                     tmpfion[3*ia_global+j] -= facn *
1732                                               fnl_loc[i] * dfnl_loc[i];
1733                   }
1734                 }
1735               }
1736             }
1737             else
1738             {
1739               for ( int ipr = 0; ipr < npr[is]; ipr++ )
1740               {
1741                 for ( int n = 0; n < nstloc; n++ )
1742                 {
1743                   // Factor 2.0 in next line from derivative of |Fnl|^2
1744                   const double facn = 2.0 * occ[n + nbase];
1745                   for ( int ia = 0; ia < ia_block_size; ia++ )
1746                   {
1747                     const int ia_global = ia + iastart;
1748                     const int i = ia + ipr*ia_block_size + n * nprnaloc;
1749                     //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1750                     //     << ",n=" << n << "]: " << fnl_loc[i] << endl;
1751                     double f_re = fnl_loc[2*i];
1752                     double f_im = fnl_loc[2*i+1];
1753                     double df_re = dfnl_loc[2*i];
1754                     double df_im = dfnl_loc[2*i+1];
1755                     tmpfion[3*ia_global+j] -=
1756                       facn * ( f_re * df_re + f_im * df_im );
1757                   }
1758                 }
1759               }
1760             }
1761           } // j
1762 
1763           tmap["enl_fion"].stop();
1764         } // compute_forces
1765 
1766         if ( compute_stress )
1767         {
1768           tmap["enl_sigma"].start();
1769           valarray<double> dfnl_loc(fnl_loc_size);
1770 
1771           for ( int ij = 0; ij < 6; ij++ )
1772           {
1773             // compute anl_loc
1774             int ipr = 0;
1775             while ( ipr < npr[is] )
1776             {
1777               // twnl[is][ig+ngwl*ipr]
1778               const int l = lproj[is][ipr];
1779               if ( l == 0 )
1780               {
1781                 // dtwnl[is][ipr][ij][ngwl]
1782                 // index = ig + ngwl * ( ij + 6 * ipr))
1783                 // ipr = iquad + nquad[is] * ilm, where ilm = 0
1784                 const double *const dt0 = &dtwnl[is][ngwl*(ij+6*ipr)];
1785                 for ( int ia = 0; ia < ia_block_size; ia++ )
1786                 {
1787                   double* a0 = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl];
1788                   const double* c = &cgr[ia*ngwl];
1789                   const double* s = &sgr[ia*ngwl];
1790                   for ( int ig = 0; ig < ngwl; ig++ )
1791                   {
1792                     const double d0 = dt0[ig];
1793                     // Next lines: -i * ( a + ib ) = b - ia
1794                     a0[2*ig]   =  d0 * c[ig];
1795                     a0[2*ig+1] =  d0 * s[ig];
1796                   }
1797                 }
1798               }
1799               else if ( l == 1 )
1800               {
1801                 const int ipr1 = ipr;
1802                 const int ipr2 = ipr + 1;
1803                 const int ipr3 = ipr + 2;
1804                 // dtwnl[is][ipr][ij][ngwl]
1805                 // index = ig + ngwl * ( ij + 6 * iprx ))
1806                 const double *dt1 = &dtwnl[is][ngwl*(ij+6*ipr1)];
1807                 const double *dt2 = &dtwnl[is][ngwl*(ij+6*ipr2)];
1808                 const double *dt3 = &dtwnl[is][ngwl*(ij+6*ipr3)];
1809                 for ( int ia = 0; ia < ia_block_size; ia++ )
1810                 {
1811                   double* a1 = &anl_loc[2*(ia+ipr1*ia_block_size)*ngwl];
1812                   double* a2 = &anl_loc[2*(ia+ipr2*ia_block_size)*ngwl];
1813                   double* a3 = &anl_loc[2*(ia+ipr3*ia_block_size)*ngwl];
1814                   const double* c = &cgr[ia*ngwl];
1815                   const double* s = &sgr[ia*ngwl];
1816                   for ( int ig = 0; ig < ngwl; ig++ )
1817                   {
1818                     const double d1 = dt1[ig];
1819                     const double d2 = dt2[ig];
1820                     const double d3 = dt3[ig];
1821                     // Next line: (-i)^l factor is -i
1822                     // Next line: -i * eigr
1823                     // -i * (a+i*b) = b - i*a
1824                     const double tc = -c[ig]; //  -cosgr[ia][ig]
1825                     const double ts =  s[ig]; //   singr[ia][ig]
1826                     a1[2*ig]   = d1 * ts;
1827                     a1[2*ig+1] = d1 * tc;
1828                     a2[2*ig]   = d2 * ts;
1829                     a2[2*ig+1] = d2 * tc;
1830                     a3[2*ig]   = d3 * ts;
1831                     a3[2*ig+1] = d3 * tc;
1832                   }
1833                 }
1834               }
1835               else if ( l == 2 )
1836               {
1837                 const int ipr4 = ipr;
1838                 const int ipr5 = ipr + 1;
1839                 const int ipr6 = ipr + 2;
1840                 const int ipr7 = ipr + 3;
1841                 const int ipr8 = ipr + 4;
1842                 // dtwnl[is][ipr][iquad][ij][ngwl]
1843                 // index = ig + ngwl * ( ij + 6 * ( iquad + nquad[is] * ipr ))
1844                 const double *dt4 = &dtwnl[is][ngwl*(ij+6*ipr4)];
1845                 const double *dt5 = &dtwnl[is][ngwl*(ij+6*ipr5)];
1846                 const double *dt6 = &dtwnl[is][ngwl*(ij+6*ipr6)];
1847                 const double *dt7 = &dtwnl[is][ngwl*(ij+6*ipr7)];
1848                 const double *dt8 = &dtwnl[is][ngwl*(ij+6*ipr8)];
1849                 for ( int ia = 0; ia < ia_block_size; ia++ )
1850                 {
1851                   double* a4 = &anl_loc[2*(ia+ipr4*ia_block_size)*ngwl];
1852                   double* a5 = &anl_loc[2*(ia+ipr5*ia_block_size)*ngwl];
1853                   double* a6 = &anl_loc[2*(ia+ipr6*ia_block_size)*ngwl];
1854                   double* a7 = &anl_loc[2*(ia+ipr7*ia_block_size)*ngwl];
1855                   double* a8 = &anl_loc[2*(ia+ipr8*ia_block_size)*ngwl];
1856                   const double* c = &cgr[ia*ngwl];
1857                   const double* s = &sgr[ia*ngwl];
1858 
1859                   for ( int ig = 0; ig < ngwl; ig++ )
1860                   {
1861                     const double d4 = dt4[ig];
1862                     const double d5 = dt5[ig];
1863                     const double d6 = dt6[ig];
1864                     const double d7 = dt7[ig];
1865                     const double d8 = dt8[ig];
1866                     // Next lines: (-i)^2 * ( a + ib ) =  - ( a + ib )
1867                     const double tc = -c[ig]; //  -cosgr[ia][ig]
1868                     const double ts = -s[ig]; //  -singr[ia][ig]
1869                     a4[2*ig]   = d4 * tc;
1870                     a4[2*ig+1] = d4 * ts;
1871                     a5[2*ig]   = d5 * tc;
1872                     a5[2*ig+1] = d5 * ts;
1873                     a6[2*ig]   = d6 * tc;
1874                     a6[2*ig+1] = d6 * ts;
1875                     a7[2*ig]   = d7 * tc;
1876                     a7[2*ig+1] = d7 * ts;
1877                     a8[2*ig]   = d8 * tc;
1878                     a8[2*ig+1] = d8 * ts;
1879                   }
1880                 }
1881               }
1882               else if ( l == 3 )
1883               {
1884                 const int ipr09 = ipr;
1885                 const int ipr10= ipr + 1;
1886                 const int ipr11= ipr + 2;
1887                 const int ipr12= ipr + 3;
1888                 const int ipr13= ipr + 4;
1889                 const int ipr14= ipr + 5;
1890                 const int ipr15= ipr + 6;
1891                 // dtwnl[is][ipr][iquad][ij][ngwl]
1892                 // index = ig + ngwl * ( ij + 6 * ( iquad + nquad[is] * ipr ))
1893                 const double *dt09 = &dtwnl[is][ngwl * ( ij + 6 * ipr09 )];
1894                 const double *dt10 = &dtwnl[is][ngwl * ( ij + 6 * ipr10 )];
1895                 const double *dt11 = &dtwnl[is][ngwl * ( ij + 6 * ipr11 )];
1896                 const double *dt12 = &dtwnl[is][ngwl * ( ij + 6 * ipr12 )];
1897                 const double *dt13 = &dtwnl[is][ngwl * ( ij + 6 * ipr13 )];
1898                 const double *dt14 = &dtwnl[is][ngwl * ( ij + 6 * ipr14 )];
1899                 const double *dt15 = &dtwnl[is][ngwl * ( ij + 6 * ipr15 )];
1900                 for ( int ia = 0; ia < ia_block_size; ia++ )
1901                 {
1902                   double* a09 = &anl_loc[2 * ( ia + ipr09 * ia_block_size ) * ngwl];
1903                   double* a10 = &anl_loc[2 * ( ia + ipr10 * ia_block_size ) * ngwl];
1904                   double* a11 = &anl_loc[2 * ( ia + ipr11 * ia_block_size ) * ngwl];
1905                   double* a12 = &anl_loc[2 * ( ia + ipr12 * ia_block_size ) * ngwl];
1906                   double* a13 = &anl_loc[2 * ( ia + ipr13 * ia_block_size ) * ngwl];
1907                   double* a14 = &anl_loc[2 * ( ia + ipr14 * ia_block_size ) * ngwl];
1908                   double* a15 = &anl_loc[2 * ( ia + ipr15 * ia_block_size ) * ngwl];
1909                   const double* c = &cgr[ia * ngwl];
1910                   const double* s = &sgr[ia * ngwl];
1911 
1912                   for ( int ig = 0; ig < ngwl; ig++ )
1913                   {
1914                     const double d09 = dt09[ig];
1915                     const double d10 = dt10[ig];
1916                     const double d11 = dt11[ig];
1917                     const double d12 = dt12[ig];
1918                     const double d13 = dt13[ig];
1919                     const double d14 = dt14[ig];
1920                     const double d15 = dt15[ig];
1921                     // Next lines: (-i)^2 * ( a + ib ) =  - ( a + ib )
1922                     const double tc =  c[ig]; //   cosgr[ia][ig]
1923                     const double ts = -s[ig]; //  -singr[ia][ig]
1924                     a09[2 * ig] = d09 * ts;
1925                     a09[2 * ig + 1] = d09 * tc;
1926                     a10[2 * ig] = d10 * ts;
1927                     a10[2 * ig + 1] = d10 * tc;
1928                     a11[2 * ig] = d11 * ts;
1929                     a11[2 * ig + 1] = d11 * tc;
1930                     a12[2 * ig] = d12 * ts;
1931                     a12[2 * ig + 1] = d12 * tc;
1932                     a13[2 * ig] = d13 * ts;
1933                     a13[2 * ig + 1] = d13 * tc;
1934                     a14[2 * ig] = d14 * ts;
1935                     a14[2 * ig + 1] = d14 * tc;
1936                     a15[2 * ig] = d15 * ts;
1937                     a15[2 * ig + 1] = d15 * tc;
1938                   }
1939                 }
1940               }
1941               else
1942               {
1943                 assert(false);
1944               } // l
1945               ipr += 2*l+1;
1946             } // while ipr
1947 
1948             // array anl_loc is complete
1949 
1950             // compute dfnl[npra][nstloc] = anl^T * c
1951             if ( basis_.real() )
1952             {
1953               // factor 2.0 in next line is: counting G, -G
1954               // Note: no need to correct for double counting of the
1955               // G=0 component which is always zero
1956               double two=2.0;
1957               char ct='t';
1958               const int twongwl = 2 * ngwl;
1959               const int nprnaloc = ia_block_size * npr[is];
1960               int c_lda = 2*sd_.c().mloc();
1961               const complex<double>* c = sd_.c().cvalptr();
1962               dgemm(&ct,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&twongwl,&two,
1963                     &anl_loc[0],(int*)&twongwl, (double*)c,(int*)&c_lda,
1964                     &zero,&dfnl_loc[0],(int*)&nprnaloc);
1965             }
1966             else
1967             {
1968               complex<double> cone=1.0;
1969               complex<double> czero=0.0;
1970               char cc='c';
1971               const int nprnaloc = ia_block_size * npr[is];
1972               int c_lda = sd_.c().mloc();
1973               const complex<double>* c = sd_.c().cvalptr();
1974               zgemm(&cc,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone,
1975                     (complex<double>*)&anl_loc[0],(int*)&ngwl,
1976                     (complex<double>*)c,(int*)&c_lda,
1977                     &czero,(complex<double>*)&dfnl_loc[0],(int*)&nprnaloc);
1978             }
1979 
1980             // accumulate non-local contributions to sigma_ij
1981             if ( basis_.real() )
1982             {
1983               for ( int n = 0; n < nstloc; n++ )
1984               {
1985                 // Factor 2.0 in next line from derivative of |Fnl|^2
1986                 const double facn = 2.0 * occ[n + nbase];
1987                 for ( int ipra = 0; ipra < npr[is]*ia_block_size; ipra++ )
1988                 {
1989                   const int i = ipra + n * nprnaloc;
1990                   tsum[ij] += facn * fnl_loc[i] * dfnl_loc[i];
1991                 }
1992               }
1993             }
1994             else
1995             {
1996               for ( int n = 0; n < nstloc; n++ )
1997               {
1998                 // Factor 2.0 in next line from derivative of |Fnl|^2
1999                 const double facn = 2.0 * occ[n + nbase];
2000                 for ( int ipra = 0; ipra < npr[is]*ia_block_size; ipra++ )
2001                 {
2002                   const int i = ipra + n * nprnaloc;
2003                   double f_re = fnl_loc[2*i];
2004                   double f_im = fnl_loc[2*i+1];
2005                   double df_re = dfnl_loc[2*i];
2006                   double df_im = dfnl_loc[2*i+1];
2007                   tsum[ij] += facn * ( f_re * df_re + f_im * df_im );
2008                 }
2009               }
2010             }
2011           } // ij
2012           tmap["enl_sigma"].stop();
2013         } // compute_stress
2014 
2015       } // ia_block
2016 
2017       if ( compute_forces )
2018       {
2019         ctxt_.dsum(3*na[is],1,&tmpfion[0],3*na[is]);
2020         for ( int ia = 0; ia < na[is]; ia++ )
2021         {
2022           fion_enl[is][3*ia+0] += tmpfion[3*ia];
2023           fion_enl[is][3*ia+1] += tmpfion[3*ia+1];
2024           fion_enl[is][3*ia+2] += tmpfion[3*ia+2];
2025         }
2026       }
2027     } // npr[is]>0
2028   } // is
2029 
2030   // reduction of enl across rows
2031   ctxt_.dsum('r',1,1,&enl,1);
2032 
2033   if ( compute_stress )
2034   {
2035     ctxt_.dsum(6,1,&tsum[0],6);
2036     sigma_enl[0] = ( enl + tsum[0] ) * omega_inv;
2037     sigma_enl[1] = ( enl + tsum[1] ) * omega_inv;
2038     sigma_enl[2] = ( enl + tsum[2] ) * omega_inv;
2039     sigma_enl[3] = + tsum[3] * omega_inv;
2040     sigma_enl[4] = + tsum[4] * omega_inv;
2041     sigma_enl[5] = + tsum[5] * omega_inv;
2042   }
2043 
2044   return enl;
2045 }
2046