1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2008 The Regents of the University of California
4 //
5 // This file is part of Qbox
6 //
7 // Qbox is distributed under the terms of the GNU General Public License
8 // as published by the Free Software Foundation, either version 2 of
9 // the License, or (at your option) any later version.
10 // See the file COPYING in the root directory of this distribution
11 // or <http://www.gnu.org/licenses/>.
12 //
13 ////////////////////////////////////////////////////////////////////////////////
14 //
15 // NonLocalPotential.cpp
16 //
17 ////////////////////////////////////////////////////////////////////////////////
18
19 #include "NonLocalPotential.h"
20 #include "Species.h"
21 #include "blas.h"
22 #include "MPIdata.h"
23 #include <iomanip>
24 using namespace std;
25
26 #if USE_MASSV
27 extern "C" void vsincos(double *x, double *y, double *z, int *n);
28 #endif
29
30
31 ////////////////////////////////////////////////////////////////////////////////
~NonLocalPotential(void)32 NonLocalPotential::~NonLocalPotential(void)
33 {
34 #if TIMING
35 for ( TimerMap::iterator i = tmap.begin(); i != tmap.end(); i++ )
36 {
37 double time = i->second.real();
38 double tmin, tmax;
39 MPI_Reduce(&time,&tmin,1,MPI_DOUBLE,MPI_MIN,0,MPIdata::comm());
40 MPI_Reduce(&time,&tmax,1,MPI_DOUBLE,MPI_MAX,0,MPIdata::comm());
41 if ( MPIdata::onpe0() && (tmax > 0.0) )
42 {
43 string s = "name=\"" + (*i).first + "\"";
44 cout << "<timing " << left << setw(22) << s
45 << " min=\"" << setprecision(3) << tmin << "\""
46 << " max=\"" << setprecision(3) << tmax << "\"/>"
47 << endl;
48 }
49 }
50 #endif
51 }
52
53 ////////////////////////////////////////////////////////////////////////////////
init(void)54 void NonLocalPotential::init(void)
55 {
56 const int ngwl = basis_.localsize();
57
58 nsp = atoms_.nsp();
59
60 nop.resize(nsp);
61 lloc.resize(nsp);
62 lproj.resize(nsp);
63 na.resize(nsp);
64 npr.resize(nsp);
65 nprna.resize(nsp);
66 wt.resize(nsp);
67 twnl.resize(nsp);
68 dtwnl.resize(nsp);
69
70 nquad.resize(nsp);
71 rquad.resize(nsp);
72 wquad.resize(nsp);
73
74 nspnl = 0;
75 for ( int is = 0; is < nsp; is++ )
76 {
77 Species *s = atoms_.species_list[is];
78
79 npr[is] = 0;
80 nprna[is] = 0;
81
82 if ( s->non_local() )
83 {
84 nspnl++;
85 na[is] = atoms_.na(is);
86 nop[is] = s->nop();
87 lloc[is] = s->llocal();
88 nquad[is] = s->nquad();
89
90 // compute total number of projectors npr[is]
91 // KB potentials have nlm projectors
92 // semilocal potentials have nlm*nquad projectors
93 if ( s->nquad() == 0 )
94 {
95 npr[is] = s->nlm();
96 }
97 else
98 {
99 npr[is] = s->nlm() * nquad[is];
100 }
101 nprna[is] = npr[is] * na[is];
102
103 // l value for projector ipr
104 lproj[is].resize(npr[is]);
105
106 twnl[is].resize(npr[is]*ngwl);
107 dtwnl[is].resize(npr[is]*6*ngwl);
108
109 // quadrature abcissae and weights
110 rquad[is].resize(nquad[is]);
111 wquad[is].resize(nquad[is]);
112
113 enum quadrature_rule_type { TRAPEZOID, MODIF_TRAPEZOID,
114 TRAPEZOID_WITH_RIGHT_ENDPOINT,
115 SIMPSON };
116
117 const quadrature_rule_type quad_rule = TRAPEZOID;
118 //const quadrature_rule_type quad_rule = MODIF_TRAPEZOID;
119 //const quadrature_rule_type quad_rule = TRAPEZOID_WITH_RIGHT_ENDPOINT;
120 //const quadrature_rule_type quad_rule = SIMPSON;
121
122 if ( quad_rule == TRAPEZOID )
123 {
124 // trapezoidal rule with interior points only
125 // (end points are zero)
126 const double h = s->rquad() / (nquad[is]+1);
127 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
128 {
129 rquad[is][iquad] = (iquad+1) * h;
130 wquad[is][iquad] = h;
131 }
132 //cout << " NonLocalPotential::init: trapezoidal rule (interior)"
133 // << endl;
134 }
135 else if ( quad_rule == MODIF_TRAPEZOID )
136 {
137 // use modified trapezoidal rule with interior points, and include
138 // correction for first derivative at r=0 as
139 // h^2/12 f'(0) where f'(0) is estimated with f(h)/h
140 // i.e. add correction h/12) * f(h)
141 // See Davis & Rabinowitz, p. 132
142 const double h = s->rquad() / (nquad[is]+1);
143 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
144 {
145 rquad[is][iquad] = (iquad+1) * h;
146 wquad[is][iquad] = h;
147 }
148 wquad[is][0] += h / 12.0;
149 //cout << " NonLocalPotential::init: modified trapezoidal rule"
150 // << endl;
151 }
152 else if ( quad_rule == TRAPEZOID_WITH_RIGHT_ENDPOINT )
153 {
154 const double h = s->rquad() / nquad[is];
155 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
156 {
157 rquad[is][iquad] = (iquad+1) * h;
158 wquad[is][iquad] = h;
159 }
160 wquad[is][nquad[is]-1] = 0.5 * h;
161 //cout << " NonLocalPotential::init: trapezoidal rule, right endpoint"
162 // << endl;
163 }
164 else if ( quad_rule == SIMPSON )
165 {
166 // must have 2n+1 points
167 assert(nquad[is]%2==1);
168 const double h = s->rquad() / (nquad[is]-1);
169 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
170 {
171 rquad[is][iquad] = iquad * h;
172 if ( ( iquad == 0 ) || ( iquad == nquad[is]-1 ) )
173 wquad[is][iquad] = h / 3.0;
174 else if ( iquad % 2 == 0 )
175 wquad[is][iquad] = h * 2.0 / 3.0;
176 else
177 wquad[is][iquad] = h * 4.0 / 3.0;
178 }
179 //cout << " NonLocalPotential::init: Simpson rule" << endl;
180 }
181 else
182 {
183 assert(false);
184 }
185
186 // compute weights wt[is][ipr]
187 wt[is].resize(npr[is]);
188
189 // compute lproj[is][ipr]
190 int ipr_base = 0;
191 for ( int iop = 0; iop < nop[is]; iop++ )
192 {
193 int l = s->l(iop);
194 if ( l != lloc[is] )
195 {
196 if ( nquad[is] == 0 )
197 {
198 // Kleinman-Bylander form
199 // wt[is][ipr]
200 // index = ipr_base+m
201 for ( int m = 0; m < 2*l+1; m++ )
202 {
203 const int ipr = ipr_base + m;
204 wt[is][ipr] = s->wsg(iop);
205 lproj[is][ipr] = l;
206 }
207 ipr_base += 2*l+1;
208 }
209 else
210 {
211 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
212 {
213 const double r = rquad[is][iquad];
214 double v,dv,vl,dvl;
215 s->dvpsr(l,r,v,dv);
216 s->dvpsr(lloc[is],r,vl,dvl);
217 // wt[is][iquad+ipr*nquad]
218 for ( int m = 0; m < 2*l+1; m++ )
219 {
220 const int ipr = ipr_base + iquad + nquad[is] * m;
221 wt[is][ipr] = ( v - vl ) * wquad[is][iquad];
222 lproj[is][ipr] = l;
223 }
224 }
225 ipr_base += (2*l+1) * nquad[is];
226 }
227 }
228 }
229 assert(ipr_base==npr[is]);
230 } // if s->non_local()
231 }
232 }
233
234 ////////////////////////////////////////////////////////////////////////////////
update_twnl(void)235 void NonLocalPotential::update_twnl(void)
236 {
237 // update arrays twnl[is][ipr][ig], dtwnl[is][ipr][j][ig],
238 // following a change of cell dimensions
239 // It is assumed that basis_ has been updated
240 // It is assumed that nsp, npr[is], nquad[is] did not change since init
241
242 tmap["update_twnl"].start();
243
244 if ( nspnl == 0 ) return;
245
246 const int ngwl = basis_.localsize();
247 const double pi = M_PI;
248 const double fpi = 4.0 * pi;
249 const double s14pi = sqrt(1.0/fpi);
250 const double s34pi = sqrt(3.0/fpi);
251 const double s54pi = sqrt(5.0/fpi);
252 const double s74pi = sqrt(7.0/fpi);
253 const double s20pi = sqrt(20.0*pi);
254 const double s20pi3 = sqrt(20.0*pi/3.0);
255 const double s3 = sqrt(3.0);
256 const double s32 = sqrt(1.5);
257 const double s52 = sqrt(2.5);
258 const double s15 = sqrt(15.0);
259
260 const double *kpg = basis_.kpg_ptr();
261 const double *kpgi = basis_.kpgi_ptr();
262 const double *kpg_x = basis_.kpgx_ptr(0);
263 const double *kpg_y = basis_.kpgx_ptr(1);
264 const double *kpg_z = basis_.kpgx_ptr(2);
265
266 // compute twnl and dtwnl
267 for ( int is = 0; is < nsp; is++ )
268 {
269 Species *s = atoms_.species_list[is];
270
271 int ilm = 0;
272 for ( int iop = 0; iop < nop[is]; iop++ )
273 {
274 int l = s->l(iop);
275 if ( l != lloc[is] )
276 {
277 if ( l == 0 )
278 {
279 if ( nquad[is] == 0 )
280 {
281 // Kleinman-Bylander
282
283 // twnl[is][ipr][ig]
284 const int ipr = ilm;
285 // index = ig + ngwl*ipr, i.e. index = ig
286 double *t0 = &twnl[is][ngwl*ipr];
287
288 // dtwnl[is][ipr][ij][ngwl]
289 // index = ig + ngwl * ( ij + 6 * ipr ), ipr = 0
290 // i.e. index = ig + ij * ngwl
291 double *dt0_xx = &dtwnl[is][ngwl*(0+6*ipr)];
292 double *dt0_yy = &dtwnl[is][ngwl*(1+6*ipr)];
293 double *dt0_zz = &dtwnl[is][ngwl*(2+6*ipr)];
294 double *dt0_xy = &dtwnl[is][ngwl*(3+6*ipr)];
295 double *dt0_yz = &dtwnl[is][ngwl*(4+6*ipr)];
296 double *dt0_xz = &dtwnl[is][ngwl*(5+6*ipr)];
297 // Special case k=G=0 is ok since kpgi[0] = 0.0 at k=G=0
298 for ( int ig = 0; ig < ngwl; ig++ )
299 {
300 double v,dv;
301 s->dvnlg(iop,kpg[ig],v,dv);
302
303 t0[ig] = s14pi * v;
304
305 const double tgx = kpg_x[ig];
306 const double tgy = kpg_y[ig];
307 const double tgz = kpg_z[ig];
308
309 const double tmp = kpgi[ig] * s14pi * dv;
310 dt0_xx[ig] = tmp * tgx * tgx;
311 dt0_yy[ig] = tmp * tgy * tgy;
312 dt0_zz[ig] = tmp * tgz * tgz;
313 dt0_xy[ig] = tmp * tgx * tgy;
314 dt0_yz[ig] = tmp * tgy * tgz;
315 dt0_xz[ig] = tmp * tgx * tgz;
316 }
317 }
318 else
319 {
320 // semi-local
321 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
322 {
323 // twnl[is][ipr][ig]
324 // ipr = iquad + nquad[is]*ilm, where ilm=0
325 // = iquad
326 // index = ig + ngwl*iquad
327 double *t0 = &twnl[is][ngwl*iquad];
328 // dtwnl[is][ipr][j][ngwl]
329 // index = ig + ngwl * ( ij + 6 * iquad)
330 double *dt0_xx = &dtwnl[is][ngwl*(0+6*iquad)];
331 double *dt0_yy = &dtwnl[is][ngwl*(1+6*iquad)];
332 double *dt0_zz = &dtwnl[is][ngwl*(2+6*iquad)];
333 double *dt0_xy = &dtwnl[is][ngwl*(3+6*iquad)];
334 double *dt0_yz = &dtwnl[is][ngwl*(4+6*iquad)];
335 double *dt0_xz = &dtwnl[is][ngwl*(5+6*iquad)];
336 const double r = rquad[is][iquad];
337
338 // G=0 element must be treated separately if k=0
339 if ( basis_.real() && ctxt_.myrow() == 0)
340 {
341 // compute G=0 element separately to get
342 // sin(Gr)/(Gr) limit -> 1
343 const int ig = 0;
344 // this task holds the G=0 element
345 // I(l=0) = 4 pi j_l(G r) r
346 // twnl[is][ipr][l][ig] = 4 pi j_0(Gr_i) r_i Ylm
347 // j_0(Gr) = sin(Gr) / (Gr) == 1.0
348 // Ylm = s14pi
349
350 // next line: sin(Gr)/(Gr) replaced by 1
351 t0[ig] = fpi * s14pi * r;
352
353 // dtwnl = fpi s14pi G_i G_j / G (r cos(Gr)/G -sin(Gr)/G^2)
354 // dtwnl = 0.0;
355 dt0_xx[ig] = 0.0;
356 dt0_yy[ig] = 0.0;
357 dt0_zz[ig] = 0.0;
358 dt0_xy[ig] = 0.0;
359 dt0_yz[ig] = 0.0;
360 dt0_xz[ig] = 0.0;
361 }
362 else
363 {
364 // Use the normal procedure
365 const int ig = 0;
366 // I(l=0) = 4 pi j_l(G r) r
367 // twnl[is][ipr][l][ig] = 4 pi j_0(Gr_i) r_i Ylm
368 // j_0(Gr) * r = sin(Gr) / G
369 // Ylm = s14pi
370 const double arg = kpg[ig] * r;
371 // Note: for G=0, gi[0] = 0
372
373 const double tgx = kpg_x[ig];
374 const double tgy = kpg_y[ig];
375 const double tgz = kpg_z[ig];
376 const double tgi = kpgi[ig];
377 const double tgi2 = tgi * tgi;
378
379 const double ts = sin(arg);
380 const double tc = cos(arg);
381
382 t0[ig] = fpi * s14pi * ts * tgi;
383
384 // dtwnl = fpi s14pi k+G_i k+G_j / k+G
385 // (r cos(k+Gr)/k+G -sin(k+Gr)/k+G^2)
386 const double tmp = fpi * s14pi * tgi2 * (r*tc - ts*tgi);
387 dt0_xx[ig] = tmp * tgx * tgx;
388 dt0_yy[ig] = tmp * tgy * tgy;
389 dt0_zz[ig] = tmp * tgz * tgz;
390 dt0_xy[ig] = tmp * tgx * tgy;
391 dt0_yz[ig] = tmp * tgy * tgz;
392 dt0_xz[ig] = tmp * tgx * tgz;
393 }
394
395 for ( int ig = 1; ig < ngwl; ig++ )
396 {
397 // I(l=0) = 4 pi j_l(|k+G| r) r
398 // twnl[is][ipr][l][ig] = 4 pi j_0(|k+G|r_i) r_i Ylm
399 // j_0(|k+G|r) * r = sin(|k+G|r) / |k+G|
400 // Ylm = s14pi
401 const double arg = kpg[ig] * r;
402
403 const double tgx = kpg_x[ig];
404 const double tgy = kpg_y[ig];
405 const double tgz = kpg_z[ig];
406 const double tgi = kpgi[ig];
407 const double tgi2 = tgi * tgi;
408
409 const double ts = sin(arg);
410 const double tc = cos(arg);
411
412 t0[ig] = fpi * s14pi * ts * tgi;
413
414 // dtwnl = fpi s14pi (k+G)_i (k+G)_j /
415 // |k+G| (r cos(|k+G|r)/|k+G| -sin(|k+G|r)/|k+G|^2)
416 const double tmp = fpi * s14pi * tgi2 * (r*tc - ts*tgi);
417 dt0_xx[ig] = tmp * tgx * tgx;
418 dt0_yy[ig] = tmp * tgy * tgy;
419 dt0_zz[ig] = tmp * tgz * tgz;
420 dt0_xy[ig] = tmp * tgx * tgy;
421 dt0_yz[ig] = tmp * tgy * tgz;
422 dt0_xz[ig] = tmp * tgx * tgz;
423 }
424 } // for iquad
425 }
426 ilm += 2*l+1;
427 }
428 else if ( l == 1 )
429 {
430 if ( nquad[is] == 0 )
431 {
432 // Kleinman-Bylander
433
434 // twnl[is][ipr][ig]
435 // ipr = ilm
436 const int ipr1 = ilm;
437 const int ipr2 = ilm+1;
438 const int ipr3 = ilm+2;
439 // index = ig + ngwl*ilm
440 double *t1 = &twnl[is][ngwl*ipr1];
441 double *t2 = &twnl[is][ngwl*ipr2];
442 double *t3 = &twnl[is][ngwl*ipr3];
443
444 // dtwnl[is][ipr][ij][ngwl]
445 // index = ig + ngwl * ( ij + 6 * ipr )
446 double *dt1_xx = &dtwnl[is][ngwl*(0+6*ipr1)];
447 double *dt1_yy = &dtwnl[is][ngwl*(1+6*ipr1)];
448 double *dt1_zz = &dtwnl[is][ngwl*(2+6*ipr1)];
449 double *dt1_xy = &dtwnl[is][ngwl*(3+6*ipr1)];
450 double *dt1_yz = &dtwnl[is][ngwl*(4+6*ipr1)];
451 double *dt1_xz = &dtwnl[is][ngwl*(5+6*ipr1)];
452
453 double *dt2_xx = &dtwnl[is][ngwl*(0+6*ipr2)];
454 double *dt2_yy = &dtwnl[is][ngwl*(1+6*ipr2)];
455 double *dt2_zz = &dtwnl[is][ngwl*(2+6*ipr2)];
456 double *dt2_xy = &dtwnl[is][ngwl*(3+6*ipr2)];
457 double *dt2_yz = &dtwnl[is][ngwl*(4+6*ipr2)];
458 double *dt2_xz = &dtwnl[is][ngwl*(5+6*ipr2)];
459
460 double *dt3_xx = &dtwnl[is][ngwl*(0+6*ipr3)];
461 double *dt3_yy = &dtwnl[is][ngwl*(1+6*ipr3)];
462 double *dt3_zz = &dtwnl[is][ngwl*(2+6*ipr3)];
463 double *dt3_xy = &dtwnl[is][ngwl*(3+6*ipr3)];
464 double *dt3_yz = &dtwnl[is][ngwl*(4+6*ipr3)];
465 double *dt3_xz = &dtwnl[is][ngwl*(5+6*ipr3)];
466
467 for ( int ig = 0; ig < ngwl; ig++ )
468 {
469 double v,dv;
470 const double tg = kpg[ig];
471 s->dvnlg(iop,tg,v,dv);
472
473 const double tgx = kpg_x[ig];
474 const double tgy = kpg_y[ig];
475 const double tgz = kpg_z[ig];
476 const double tgx2 = tgx * tgx;
477 const double tgy2 = tgy * tgy;
478 const double tgz2 = tgz * tgz;
479
480 const double tgi = kpgi[ig];
481 const double tgi2 = tgi * tgi;
482
483 const double y1 = s34pi * tgx * tgi;
484 const double y2 = s34pi * tgy * tgi;
485 const double y3 = s34pi * tgz * tgi;
486
487 t1[ig] = y1 * v;
488 t2[ig] = y2 * v;
489 t3[ig] = y3 * v;
490
491 const double fac1 = - y1 * ( v - tg * dv ) * tgi2;
492 // m=x
493 dt1_xx[ig] = fac1 * tgx2 + v * y1;
494 dt1_yy[ig] = fac1 * tgy2;
495 dt1_zz[ig] = fac1 * tgz2;
496 dt1_xy[ig] = fac1 * tgx * tgy;
497 dt1_yz[ig] = fac1 * tgy * tgz;
498 dt1_xz[ig] = fac1 * tgx * tgz;
499
500 const double fac2 = - y2 * ( v - tg * dv ) * tgi2;
501 // m=y
502 dt2_xx[ig] = fac2 * tgx2;
503 dt2_yy[ig] = fac2 * tgy2 + v * y2;
504 dt2_zz[ig] = fac2 * tgz2;
505 dt2_xy[ig] = fac2 * tgx * tgy + v * y1;
506 dt2_yz[ig] = fac2 * tgy * tgz;
507 dt2_xz[ig] = fac2 * tgx * tgz;
508
509 const double fac3 = - y3 * ( v - tg * dv ) * tgi2;
510 // m=z
511 dt3_xx[ig] = fac3 * tgx2;
512 dt3_yy[ig] = fac3 * tgy2;
513 dt3_zz[ig] = fac3 * tgz2 + v * y3;
514 dt3_xy[ig] = fac3 * tgx * tgy;
515 dt3_yz[ig] = fac3 * tgy * tgz + v * y2;
516 dt3_xz[ig] = fac3 * tgx * tgz + v * y1;
517 }
518 }
519 else
520 {
521 // semi-local
522 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
523 {
524 // twnl[is][ipr][ig]
525 // index = ig + ngwl*(iquad+nquad[is]*ilm)
526 const int ipr1 = iquad+nquad[is]*ilm;
527 const int ipr2 = iquad+nquad[is]*(ilm+1);
528 const int ipr3 = iquad+nquad[is]*(ilm+2);
529 double *t1 = &twnl[is][ngwl*ipr1];
530 double *t2 = &twnl[is][ngwl*ipr2];
531 double *t3 = &twnl[is][ngwl*ipr3];
532
533 // dtwnl[is][ipr][j][ngwl]
534 // index = ig + ngwl * ( ij + 6 * ipr )
535 double *dt1_xx = &dtwnl[is][ngwl*(0+6*ipr1)];
536 double *dt1_yy = &dtwnl[is][ngwl*(1+6*ipr1)];
537 double *dt1_zz = &dtwnl[is][ngwl*(2+6*ipr1)];
538 double *dt1_xy = &dtwnl[is][ngwl*(3+6*ipr1)];
539 double *dt1_yz = &dtwnl[is][ngwl*(4+6*ipr1)];
540 double *dt1_xz = &dtwnl[is][ngwl*(5+6*ipr1)];
541
542 double *dt2_xx = &dtwnl[is][ngwl*(0+6*ipr2)];
543 double *dt2_yy = &dtwnl[is][ngwl*(1+6*ipr2)];
544 double *dt2_zz = &dtwnl[is][ngwl*(2+6*ipr2)];
545 double *dt2_xy = &dtwnl[is][ngwl*(3+6*ipr2)];
546 double *dt2_yz = &dtwnl[is][ngwl*(4+6*ipr2)];
547 double *dt2_xz = &dtwnl[is][ngwl*(5+6*ipr2)];
548
549 double *dt3_xx = &dtwnl[is][ngwl*(0+6*ipr3)];
550 double *dt3_yy = &dtwnl[is][ngwl*(1+6*ipr3)];
551 double *dt3_zz = &dtwnl[is][ngwl*(2+6*ipr3)];
552 double *dt3_xy = &dtwnl[is][ngwl*(3+6*ipr3)];
553 double *dt3_yz = &dtwnl[is][ngwl*(4+6*ipr3)];
554 double *dt3_xz = &dtwnl[is][ngwl*(5+6*ipr3)];
555
556 const double r = rquad[is][iquad];
557 for ( int ig = 0; ig < ngwl; ig++ )
558 {
559 double v = 0.0, dv = 0.0;
560 // j_1(Gr) = (1/(Gr))*(sin(Gr)/(Gr)-cos(Gr))
561 const double tg = kpg[ig];
562 const double z = tg * r;
563 if ( z != 0.0 )
564 {
565 const double zi = 1.0 / z;
566 const double c = cos(z);
567 const double s = sin(z);
568 const double j1 = ( s * zi - c ) * zi;
569 const double dj1 =
570 ( 2.0 * z * c + ( z*z - 2.0 ) * s ) * zi*zi*zi;
571 // v = 4 pi j1(Gr) r
572 v = fpi * j1 * r;
573 // dv = d/dG v = 4 pi dj1(Gr)/d(Gr) d(Gr)/dG r
574 // = 4 pi dj1 r^2
575 dv = fpi * dj1 * r * r;
576 }
577
578 const double tgx = kpg_x[ig];
579 const double tgy = kpg_y[ig];
580 const double tgz = kpg_z[ig];
581 const double tgx2 = tgx * tgx;
582 const double tgy2 = tgy * tgy;
583 const double tgz2 = tgz * tgz;
584
585 const double tgi = kpgi[ig];
586 const double tgi2 = tgi * tgi;
587
588 const double y1 = s34pi * tgx * tgi;
589 const double y2 = s34pi * tgy * tgi;
590 const double y3 = s34pi * tgz * tgi;
591
592 t1[ig] = y1 * v;
593 t2[ig] = y2 * v;
594 t3[ig] = y3 * v;
595
596 const double fac1 = - y1 * ( v - tg * dv ) * tgi2;
597 // m=x
598 dt1_xx[ig] = fac1 * tgx2 + v * y1;
599 dt1_yy[ig] = fac1 * tgy2;
600 dt1_zz[ig] = fac1 * tgz2;
601 dt1_xy[ig] = fac1 * tgx * tgy;
602 dt1_yz[ig] = fac1 * tgy * tgz;
603 dt1_xz[ig] = fac1 * tgx * tgz;
604
605 const double fac2 = - y2 * ( v - tg * dv ) * tgi2;
606 // m=y
607 dt2_xx[ig] = fac2 * tgx2;
608 dt2_yy[ig] = fac2 * tgy2 + v * y2;
609 dt2_zz[ig] = fac2 * tgz2;
610 dt2_xy[ig] = fac2 * tgx * tgy + v * y1;
611 dt2_yz[ig] = fac2 * tgy * tgz;
612 dt2_xz[ig] = fac2 * tgx * tgz;
613
614 const double fac3 = - y3 * ( v - tg * dv ) * tgi2;
615 // m=z
616 dt3_xx[ig] = fac3 * tgx2;
617 dt3_yy[ig] = fac3 * tgy2;
618 dt3_zz[ig] = fac3 * tgz2 + v * y3;
619 dt3_xy[ig] = fac3 * tgx * tgy;
620 dt3_yz[ig] = fac3 * tgy * tgz + v * y2;
621 dt3_xz[ig] = fac3 * tgx * tgz + v * y1;
622 } // ig
623 }
624 }
625 ilm += 2*l+1;
626 }
627 else if ( l == 2 )
628 {
629 if ( nquad[is] == 0 )
630 {
631 // Kleinman-Bylander
632 const int ipr4 = ilm;
633 const int ipr5 = ilm+1;
634 const int ipr6 = ilm+2;
635 const int ipr7 = ilm+3;
636 const int ipr8 = ilm+4;
637
638 double *t4 = &twnl[is][ngwl*ipr4];
639 double *t5 = &twnl[is][ngwl*ipr5];
640 double *t6 = &twnl[is][ngwl*ipr6];
641 double *t7 = &twnl[is][ngwl*ipr7];
642 double *t8 = &twnl[is][ngwl*ipr8];
643
644 // dtwnl[is][ipr][ij][ngwl]
645 // index = ig + ngwl * ( ij + 6 * ipr )
646 double *dt4_xx = &dtwnl[is][ngwl*(0+6*ipr4)];
647 double *dt4_yy = &dtwnl[is][ngwl*(1+6*ipr4)];
648 double *dt4_zz = &dtwnl[is][ngwl*(2+6*ipr4)];
649 double *dt4_xy = &dtwnl[is][ngwl*(3+6*ipr4)];
650 double *dt4_yz = &dtwnl[is][ngwl*(4+6*ipr4)];
651 double *dt4_xz = &dtwnl[is][ngwl*(5+6*ipr4)];
652
653 double *dt5_xx = &dtwnl[is][ngwl*(0+6*ipr5)];
654 double *dt5_yy = &dtwnl[is][ngwl*(1+6*ipr5)];
655 double *dt5_zz = &dtwnl[is][ngwl*(2+6*ipr5)];
656 double *dt5_xy = &dtwnl[is][ngwl*(3+6*ipr5)];
657 double *dt5_yz = &dtwnl[is][ngwl*(4+6*ipr5)];
658 double *dt5_xz = &dtwnl[is][ngwl*(5+6*ipr5)];
659
660 double *dt6_xx = &dtwnl[is][ngwl*(0+6*ipr6)];
661 double *dt6_yy = &dtwnl[is][ngwl*(1+6*ipr6)];
662 double *dt6_zz = &dtwnl[is][ngwl*(2+6*ipr6)];
663 double *dt6_xy = &dtwnl[is][ngwl*(3+6*ipr6)];
664 double *dt6_yz = &dtwnl[is][ngwl*(4+6*ipr6)];
665 double *dt6_xz = &dtwnl[is][ngwl*(5+6*ipr6)];
666
667 double *dt7_xx = &dtwnl[is][ngwl*(0+6*ipr7)];
668 double *dt7_yy = &dtwnl[is][ngwl*(1+6*ipr7)];
669 double *dt7_zz = &dtwnl[is][ngwl*(2+6*ipr7)];
670 double *dt7_xy = &dtwnl[is][ngwl*(3+6*ipr7)];
671 double *dt7_yz = &dtwnl[is][ngwl*(4+6*ipr7)];
672 double *dt7_xz = &dtwnl[is][ngwl*(5+6*ipr7)];
673
674 double *dt8_xx = &dtwnl[is][ngwl*(0+6*ipr8)];
675 double *dt8_yy = &dtwnl[is][ngwl*(1+6*ipr8)];
676 double *dt8_zz = &dtwnl[is][ngwl*(2+6*ipr8)];
677 double *dt8_xy = &dtwnl[is][ngwl*(3+6*ipr8)];
678 double *dt8_yz = &dtwnl[is][ngwl*(4+6*ipr8)];
679 double *dt8_xz = &dtwnl[is][ngwl*(5+6*ipr8)];
680
681 for ( int ig = 0; ig < ngwl; ig++ )
682 {
683 double v,dv;
684 const double tg = kpg[ig];
685
686 s->dvnlg(iop,tg,v,dv);
687
688 const double tgx = kpg_x[ig];
689 const double tgy = kpg_y[ig];
690 const double tgz = kpg_z[ig];
691 const double tgx2 = tgx * tgx;
692 const double tgy2 = tgy * tgy;
693 const double tgz2 = tgz * tgz;
694
695 const double tgi = kpgi[ig];
696 const double tgi2 = tgi * tgi;
697
698 const double tgxx = tgx2 * tgi2;
699 const double tgyy = tgy2 * tgi2;
700 const double tgzz = tgz2 * tgi2;
701 const double tgxy = tgx * tgy * tgi2;
702 const double tgyz = tgy * tgz * tgi2;
703 const double tgxz = tgx * tgz * tgi2;
704
705 const double y4 = s54pi * 0.5 * (3.0 * tgzz - 1.0 );
706 const double y5 = s54pi * 0.5 * s3 * ( tgxx - tgyy );
707 const double y6 = s54pi * s3 * tgxy;
708 const double y7 = s54pi * s3 * tgyz;
709 const double y8 = s54pi * s3 * tgxz;
710
711 const double y1x = s34pi * tgx * tgi;
712 const double y1y = s34pi * tgy * tgi;
713 const double y1z = s34pi * tgz * tgi;
714
715 const double dx_xx = y1x * tgxx - y1x;
716 const double dx_yy = y1x * tgyy;
717 const double dx_zz = y1x * tgzz;
718 const double dx_xy = y1x * tgxy;
719 const double dx_yz = y1x * tgyz;
720 const double dx_xz = y1x * tgxz;
721
722 const double dy_xx = y1y * tgxx;
723 const double dy_yy = y1y * tgyy - y1y;
724 const double dy_zz = y1y * tgzz;
725 const double dy_xy = y1y * tgxy - y1x;
726 const double dy_yz = y1y * tgyz;
727 const double dy_xz = y1y * tgxz;
728
729 const double dz_xx = y1z * tgxx;
730 const double dz_yy = y1z * tgyy;
731 const double dz_zz = y1z * tgzz - y1z;
732 const double dz_xy = y1z * tgxy;
733 const double dz_yz = y1z * tgyz - y1y;
734 const double dz_xz = y1z * tgxz - y1x;
735
736 t4[ig] = y4 * v;
737 t5[ig] = y5 * v;
738 t6[ig] = y6 * v;
739 t7[ig] = y7 * v;
740 t8[ig] = y8 * v;
741
742 // y4 = s54pi 1/2 ( 3 z^2/r^2 - 1 )
743 dt4_xx[ig] = -(v * s20pi * dz_xx * y1z - y4 * dv * tg * tgxx);
744 dt4_yy[ig] = -(v * s20pi * dz_yy * y1z - y4 * dv * tg * tgyy);
745 dt4_zz[ig] = -(v * s20pi * dz_zz * y1z - y4 * dv * tg * tgzz);
746 dt4_xy[ig] = -(v * s20pi * dz_xy * y1z - y4 * dv * tg * tgxy);
747 dt4_yz[ig] = -(v * s20pi * dz_yz * y1z - y4 * dv * tg * tgyz);
748 dt4_xz[ig] = -(v * s20pi * dz_xz * y1z - y4 * dv * tg * tgxz);
749
750 // y5 = s54pi sqrt(3)/2 ( x^2 - y^2 ) / r^2
751 dt5_xx[ig] = -(v * s20pi3 * (y1x * dx_xx - y1y * dy_xx) - y5 * dv * tg * tgxx);
752 dt5_yy[ig] = -(v * s20pi3 * (y1x * dx_yy - y1y * dy_yy) - y5 * dv * tg * tgyy);
753 dt5_zz[ig] = -(v * s20pi3 * (y1x * dx_zz - y1y * dy_zz) - y5 * dv * tg * tgzz);
754 dt5_xy[ig] = -(v * s20pi3 * (y1x * dx_xy - y1y * dy_xy) - y5 * dv * tg * tgxy);
755 dt5_yz[ig] = -(v * s20pi3 * (y1x * dx_yz - y1y * dy_yz) - y5 * dv * tg * tgyz);
756 dt5_xz[ig] = -(v * s20pi3 * (y1x * dx_xz - y1y * dy_xz) - y5 * dv * tg * tgxz);
757
758 // y6 = s54pi sqrt(3) x y / r^2
759 dt6_xx[ig] = -(v * s20pi3 * (dx_xx * y1y + y1x * dy_xx) - y6 * dv * tg * tgxx);
760 dt6_yy[ig] = -(v * s20pi3 * (dx_yy * y1y + y1x * dy_yy) - y6 * dv * tg * tgyy);
761 dt6_zz[ig] = -(v * s20pi3 * (dx_zz * y1y + y1x * dy_zz) - y6 * dv * tg * tgzz);
762 dt6_xy[ig] = -(v * s20pi3 * (dx_xy * y1y + y1x * dy_xy) - y6 * dv * tg * tgxy);
763 dt6_yz[ig] = -(v * s20pi3 * (dx_yz * y1y + y1x * dy_yz) - y6 * dv * tg * tgyz);
764 dt6_xz[ig] = -(v * s20pi3 * (dx_xz * y1y + y1x * dy_xz) - y6 * dv * tg * tgxz);
765
766 // y7 = s54pi sqrt(3) y z / r^2
767 dt7_xx[ig] = -(v * s20pi3 * (dy_xx * y1z + y1y * dz_xx) - y7 * dv * tg * tgxx);
768 dt7_yy[ig] = -(v * s20pi3 * (dy_yy * y1z + y1y * dz_yy) - y7 * dv * tg * tgyy);
769 dt7_zz[ig] = -(v * s20pi3 * (dy_zz * y1z + y1y * dz_zz) - y7 * dv * tg * tgzz);
770 dt7_xy[ig] = -(v * s20pi3 * (dy_xy * y1z + y1y * dz_xy) - y7 * dv * tg * tgxy);
771 dt7_yz[ig] = -(v * s20pi3 * (dy_yz * y1z + y1y * dz_yz) - y7 * dv * tg * tgyz);
772 dt7_xz[ig] = -(v * s20pi3 * (dy_xz * y1z + y1y * dz_xz) - y7 * dv * tg * tgxz);
773
774 // y8 = s54pi sqrt(3) z x / r^2
775 dt8_xx[ig] = -(v * s20pi3 * (dx_xx * y1z + y1x * dz_xx) - y8 * dv * tg * tgxx);
776 dt8_yy[ig] = -(v * s20pi3 * (dx_yy * y1z + y1x * dz_yy) - y8 * dv * tg * tgyy);
777 dt8_zz[ig] = -(v * s20pi3 * (dx_zz * y1z + y1x * dz_zz) - y8 * dv * tg * tgzz);
778 dt8_xy[ig] = -(v * s20pi3 * (dx_xy * y1z + y1x * dz_xy) - y8 * dv * tg * tgxy);
779 dt8_yz[ig] = -(v * s20pi3 * (dx_yz * y1z + y1x * dz_yz) - y8 * dv * tg * tgyz);
780 dt8_xz[ig] = -(v * s20pi3 * (dx_xz * y1z + y1x * dz_xz) - y8 * dv * tg * tgxz);
781 }
782 }
783 else
784 {
785 // semi-local
786 for ( int iquad = 0; iquad < nquad[is]; iquad++ )
787 {
788 // twnl[is][ipr][ig]
789 // ipr = iquad+nquad[is]*ilm
790 // index = ig + ngwl*ipr
791 const int ipr4 = iquad+nquad[is]*ilm;
792 const int ipr5 = iquad+nquad[is]*(ilm+1);
793 const int ipr6 = iquad+nquad[is]*(ilm+2);
794 const int ipr7 = iquad+nquad[is]*(ilm+3);
795 const int ipr8 = iquad+nquad[is]*(ilm+4);
796
797 double *t4 = &twnl[is][ngwl*ipr4];
798 double *t5 = &twnl[is][ngwl*ipr5];
799 double *t6 = &twnl[is][ngwl*ipr6];
800 double *t7 = &twnl[is][ngwl*ipr7];
801 double *t8 = &twnl[is][ngwl*ipr8];
802
803 // dtwnl[is][ipr][ij][ngwl]
804 // index = ig + ngwl * ( ij + 6 * ipr )
805 double *dt4_xx = &dtwnl[is][ngwl*(0+6*ipr4)];
806 double *dt4_yy = &dtwnl[is][ngwl*(1+6*ipr4)];
807 double *dt4_zz = &dtwnl[is][ngwl*(2+6*ipr4)];
808 double *dt4_xy = &dtwnl[is][ngwl*(3+6*ipr4)];
809 double *dt4_yz = &dtwnl[is][ngwl*(4+6*ipr4)];
810 double *dt4_xz = &dtwnl[is][ngwl*(5+6*ipr4)];
811
812 double *dt5_xx = &dtwnl[is][ngwl*(0+6*ipr5)];
813 double *dt5_yy = &dtwnl[is][ngwl*(1+6*ipr5)];
814 double *dt5_zz = &dtwnl[is][ngwl*(2+6*ipr5)];
815 double *dt5_xy = &dtwnl[is][ngwl*(3+6*ipr5)];
816 double *dt5_yz = &dtwnl[is][ngwl*(4+6*ipr5)];
817 double *dt5_xz = &dtwnl[is][ngwl*(5+6*ipr5)];
818
819 double *dt6_xx = &dtwnl[is][ngwl*(0+6*ipr6)];
820 double *dt6_yy = &dtwnl[is][ngwl*(1+6*ipr6)];
821 double *dt6_zz = &dtwnl[is][ngwl*(2+6*ipr6)];
822 double *dt6_xy = &dtwnl[is][ngwl*(3+6*ipr6)];
823 double *dt6_yz = &dtwnl[is][ngwl*(4+6*ipr6)];
824 double *dt6_xz = &dtwnl[is][ngwl*(5+6*ipr6)];
825
826 double *dt7_xx = &dtwnl[is][ngwl*(0+6*ipr7)];
827 double *dt7_yy = &dtwnl[is][ngwl*(1+6*ipr7)];
828 double *dt7_zz = &dtwnl[is][ngwl*(2+6*ipr7)];
829 double *dt7_xy = &dtwnl[is][ngwl*(3+6*ipr7)];
830 double *dt7_yz = &dtwnl[is][ngwl*(4+6*ipr7)];
831 double *dt7_xz = &dtwnl[is][ngwl*(5+6*ipr7)];
832
833 double *dt8_xx = &dtwnl[is][ngwl*(0+6*ipr8)];
834 double *dt8_yy = &dtwnl[is][ngwl*(1+6*ipr8)];
835 double *dt8_zz = &dtwnl[is][ngwl*(2+6*ipr8)];
836 double *dt8_xy = &dtwnl[is][ngwl*(3+6*ipr8)];
837 double *dt8_yz = &dtwnl[is][ngwl*(4+6*ipr8)];
838 double *dt8_xz = &dtwnl[is][ngwl*(5+6*ipr8)];
839
840 const double r = rquad[is][iquad];
841 for ( int ig = 0; ig < ngwl; ig++ )
842 {
843 double v = 0.0, dv = 0.0;
844 // j_2(z) = (3/z^3-1/z) sin(z) - 3/z^2 cos(z)
845 // j_2(z) = (1/z)*((3/z^2-1)*sin(z) - (3/z) cos(z) )
846
847 const double tg = kpg[ig];
848 const double z = tg * r;
849 if ( z != 0.0 )
850 {
851 const double zi = 1.0 / z;
852 const double c = cos(z);
853 const double s = sin(z);
854 const double j2 = ((3.0*zi*zi - 1.0) * s - 3.0*zi * c ) * zi;
855 const double z2 = z * z;
856 const double dj2 =
857 ( (4.0 * z2 - 9.0) * s + z*(9.0-z2) * c ) / (z2*z2) ;
858 // v = 4 pi j2(Gr) r
859 v = fpi * j2 * r;
860 // dv = d/dG v = 4 pi dj2(Gr)/d(Gr) d(Gr)/dG r
861 // = 4 pi dj2 r^2
862 dv = fpi * dj2 * r * r;
863 }
864
865 const double tgx = kpg_x[ig];
866 const double tgy = kpg_y[ig];
867 const double tgz = kpg_z[ig];
868 const double tgx2 = tgx * tgx;
869 const double tgy2 = tgy * tgy;
870 const double tgz2 = tgz * tgz;
871
872 const double tgi = kpgi[ig];
873 const double tgi2 = tgi * tgi;
874
875 const double tgxx = tgx2 * tgi2;
876 const double tgyy = tgy2 * tgi2;
877 const double tgzz = tgz2 * tgi2;
878 const double tgxy = tgx * tgy * tgi2;
879 const double tgyz = tgy * tgz * tgi2;
880 const double tgxz = tgx * tgz * tgi2;
881
882 const double y4 = s54pi * 0.5 * (3.0 * tgzz - 1.0 );
883 const double y5 = s54pi * 0.5 * s3 * ( tgxx - tgyy );
884 const double y6 = s54pi * s3 * tgxy;
885 const double y7 = s54pi * s3 * tgyz;
886 const double y8 = s54pi * s3 * tgxz;
887
888 const double y1x = s34pi * tgx * tgi;
889 const double y1y = s34pi * tgy * tgi;
890 const double y1z = s34pi * tgz * tgi;
891
892 const double dx_xx = y1x * tgxx - y1x;
893 const double dx_yy = y1x * tgyy;
894 const double dx_zz = y1x * tgzz;
895 const double dx_xy = y1x * tgxy;
896 const double dx_yz = y1x * tgyz;
897 const double dx_xz = y1x * tgxz;
898
899 const double dy_xx = y1y * tgxx;
900 const double dy_yy = y1y * tgyy - y1y;
901 const double dy_zz = y1y * tgzz;
902 const double dy_xy = y1y * tgxy - y1x;
903 const double dy_yz = y1y * tgyz;
904 const double dy_xz = y1y * tgxz;
905
906 const double dz_xx = y1z * tgxx;
907 const double dz_yy = y1z * tgyy;
908 const double dz_zz = y1z * tgzz - y1z;
909 const double dz_xy = y1z * tgxy;
910 const double dz_yz = y1z * tgyz - y1y;
911 const double dz_xz = y1z * tgxz - y1x;
912
913 t4[ig] = y4 * v;
914 t5[ig] = y5 * v;
915 t6[ig] = y6 * v;
916 t7[ig] = y7 * v;
917 t8[ig] = y8 * v;
918
919 // y4 = s54pi 1/2 ( 3 z^2/r^2 - 1 )
920 dt4_xx[ig] = -(v * s20pi * dz_xx * y1z - y4 * dv * tg * tgxx);
921 dt4_yy[ig] = -(v * s20pi * dz_yy * y1z - y4 * dv * tg * tgyy);
922 dt4_zz[ig] = -(v * s20pi * dz_zz * y1z - y4 * dv * tg * tgzz);
923 dt4_xy[ig] = -(v * s20pi * dz_xy * y1z - y4 * dv * tg * tgxy);
924 dt4_yz[ig] = -(v * s20pi * dz_yz * y1z - y4 * dv * tg * tgyz);
925 dt4_xz[ig] = -(v * s20pi * dz_xz * y1z - y4 * dv * tg * tgxz);
926
927 // y5 = s54pi sqrt(3)/2 ( x^2 - y^2 ) / r^2
928 dt5_xx[ig] = -(v * s20pi3 * (y1x * dx_xx - y1y * dy_xx) - y5 * dv * tg * tgxx);
929 dt5_yy[ig] = -(v * s20pi3 * (y1x * dx_yy - y1y * dy_yy) - y5 * dv * tg * tgyy);
930 dt5_zz[ig] = -(v * s20pi3 * (y1x * dx_zz - y1y * dy_zz) - y5 * dv * tg * tgzz);
931 dt5_xy[ig] = -(v * s20pi3 * (y1x * dx_xy - y1y * dy_xy) - y5 * dv * tg * tgxy);
932 dt5_yz[ig] = -(v * s20pi3 * (y1x * dx_yz - y1y * dy_yz) - y5 * dv * tg * tgyz);
933 dt5_xz[ig] = -(v * s20pi3 * (y1x * dx_xz - y1y * dy_xz) - y5 * dv * tg * tgxz);
934
935 // y6 = s54pi sqrt(3) x y / r^2
936 dt6_xx[ig] = -(v * s20pi3 * (dx_xx * y1y + y1x * dy_xx) - y6 * dv * tg * tgxx);
937 dt6_yy[ig] = -(v * s20pi3 * (dx_yy * y1y + y1x * dy_yy) - y6 * dv * tg * tgyy);
938 dt6_zz[ig] = -(v * s20pi3 * (dx_zz * y1y + y1x * dy_zz) - y6 * dv * tg * tgzz);
939 dt6_xy[ig] = -(v * s20pi3 * (dx_xy * y1y + y1x * dy_xy) - y6 * dv * tg * tgxy);
940 dt6_yz[ig] = -(v * s20pi3 * (dx_yz * y1y + y1x * dy_yz) - y6 * dv * tg * tgyz);
941 dt6_xz[ig] = -(v * s20pi3 * (dx_xz * y1y + y1x * dy_xz) - y6 * dv * tg * tgxz);
942
943 // y7 = s54pi sqrt(3) y z / r^2
944 dt7_xx[ig] = -(v * s20pi3 * (dy_xx * y1z + y1y * dz_xx) - y7 * dv * tg * tgxx);
945 dt7_yy[ig] = -(v * s20pi3 * (dy_yy * y1z + y1y * dz_yy) - y7 * dv * tg * tgyy);
946 dt7_zz[ig] = -(v * s20pi3 * (dy_zz * y1z + y1y * dz_zz) - y7 * dv * tg * tgzz);
947 dt7_xy[ig] = -(v * s20pi3 * (dy_xy * y1z + y1y * dz_xy) - y7 * dv * tg * tgxy);
948 dt7_yz[ig] = -(v * s20pi3 * (dy_yz * y1z + y1y * dz_yz) - y7 * dv * tg * tgyz);
949 dt7_xz[ig] = -(v * s20pi3 * (dy_xz * y1z + y1y * dz_xz) - y7 * dv * tg * tgxz);
950
951 // y8 = s54pi sqrt(3) z x / r^2
952 dt8_xx[ig] = -(v * s20pi3 * (dx_xx * y1z + y1x * dz_xx) - y8 * dv * tg * tgxx);
953 dt8_yy[ig] = -(v * s20pi3 * (dx_yy * y1z + y1x * dz_yy) - y8 * dv * tg * tgyy);
954 dt8_zz[ig] = -(v * s20pi3 * (dx_zz * y1z + y1x * dz_zz) - y8 * dv * tg * tgzz);
955 dt8_xy[ig] = -(v * s20pi3 * (dx_xy * y1z + y1x * dz_xy) - y8 * dv * tg * tgxy);
956 dt8_yz[ig] = -(v * s20pi3 * (dx_yz * y1z + y1x * dz_yz) - y8 * dv * tg * tgyz);
957 dt8_xz[ig] = -(v * s20pi3 * (dx_xz * y1z + y1x * dz_xz) - y8 * dv * tg * tgxz);
958 } // ig
959 } // iquad
960 }
961 ilm += 2*l+1;
962 }
963 else if ( l == 3 )
964 {
965 // only implemented for Kleiman-Bylander type
966 assert(nquad[is] == 0);
967 // Kleinman-Bylander
968 const int ipr09 = ilm;
969 const int ipr10 = ilm + 1;
970 const int ipr11 = ilm + 2;
971 const int ipr12 = ilm + 3;
972 const int ipr13 = ilm + 4;
973 const int ipr14 = ilm + 5;
974 const int ipr15 = ilm + 6;
975
976 double *t09 = &twnl[is][ngwl * ipr09];
977 double *t10 = &twnl[is][ngwl * ipr10];
978 double *t11 = &twnl[is][ngwl * ipr11];
979 double *t12 = &twnl[is][ngwl * ipr12];
980 double *t13 = &twnl[is][ngwl * ipr13];
981 double *t14 = &twnl[is][ngwl * ipr14];
982 double *t15 = &twnl[is][ngwl * ipr15];
983
984 // dtwnl[is][ipr][ij][ngwl]
985 // index = ig + ngwl * ( ij + 6 * ipr )
986 double *dt09_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr09 )];
987 double *dt09_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr09 )];
988 double *dt09_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr09 )];
989 double *dt09_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr09 )];
990 double *dt09_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr09 )];
991 double *dt09_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr09 )];
992
993 double *dt10_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr10 )];
994 double *dt10_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr10 )];
995 double *dt10_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr10 )];
996 double *dt10_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr10 )];
997 double *dt10_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr10 )];
998 double *dt10_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr10 )];
999
1000 double *dt11_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr11 )];
1001 double *dt11_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr11 )];
1002 double *dt11_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr11 )];
1003 double *dt11_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr11 )];
1004 double *dt11_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr11 )];
1005 double *dt11_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr11 )];
1006
1007 double *dt12_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr12 )];
1008 double *dt12_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr12 )];
1009 double *dt12_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr12 )];
1010 double *dt12_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr12 )];
1011 double *dt12_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr12 )];
1012 double *dt12_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr12 )];
1013
1014 double *dt13_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr13 )];
1015 double *dt13_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr13 )];
1016 double *dt13_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr13 )];
1017 double *dt13_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr13 )];
1018 double *dt13_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr13 )];
1019 double *dt13_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr13 )];
1020
1021 double *dt14_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr14 )];
1022 double *dt14_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr14 )];
1023 double *dt14_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr14 )];
1024 double *dt14_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr14 )];
1025 double *dt14_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr14 )];
1026 double *dt14_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr14 )];
1027
1028 double *dt15_xx = &dtwnl[is][ngwl * ( 0 + 6 * ipr15 )];
1029 double *dt15_yy = &dtwnl[is][ngwl * ( 1 + 6 * ipr15 )];
1030 double *dt15_zz = &dtwnl[is][ngwl * ( 2 + 6 * ipr15 )];
1031 double *dt15_xy = &dtwnl[is][ngwl * ( 3 + 6 * ipr15 )];
1032 double *dt15_yz = &dtwnl[is][ngwl * ( 4 + 6 * ipr15 )];
1033 double *dt15_xz = &dtwnl[is][ngwl * ( 5 + 6 * ipr15 )];
1034
1035 for ( int ig = 0; ig < ngwl; ig++ )
1036 {
1037 double v, dv;
1038 const double tg = kpg[ig];
1039
1040 s->dvnlg(iop,tg,v,dv);
1041
1042 const double tgx = kpg_x[ig];
1043 const double tgy = kpg_y[ig];
1044 const double tgz = kpg_z[ig];
1045 const double tgx2 = tgx * tgx;
1046 const double tgy2 = tgy * tgy;
1047 const double tgz2 = tgz * tgz;
1048 const double tgx3 = tgx * tgx2;
1049 const double tgy3 = tgy * tgy2;
1050 const double tgz3 = tgz * tgz2;
1051
1052 const double tgi = kpgi[ig];
1053 const double tgi2 = tgi * tgi;
1054 const double tgi3 = tgi * tgi2;
1055
1056 const double tgxx = tgx2 * tgi2;
1057 const double tgyy = tgy2 * tgi2;
1058 const double tgzz = tgz2 * tgi2;
1059 const double tgxy = tgx * tgy * tgi2;
1060 const double tgyz = tgy * tgz * tgi2;
1061 const double tgxz = tgx * tgz * tgi2;
1062
1063 const double tgxxx = tgx3 * tgi3;
1064 const double tgyyy = tgy3 * tgi3;
1065 const double tgzzz = tgz3 * tgi3;
1066 const double tgxyy = tgx * tgy2 * tgi3;
1067 const double tgxzz = tgx * tgz2 * tgi3;
1068 const double tgyxx = tgy * tgx2 * tgi3;
1069 const double tgyzz = tgy * tgz2 * tgi3;
1070 const double tgzxx = tgz * tgx2 * tgi3;
1071 const double tgzyy = tgz * tgy2 * tgi3;
1072 const double tgxyz = tgx * tgy * tgz * tgi3;
1073
1074 const double factor = 0.5 * s74pi;
1075
1076 const double y09 = factor * s52 * ( 3.0 * tgyxx - tgyyy );
1077 const double y10 = factor * s15 * 2.0 * tgxyz;
1078 const double y11 = factor * s32 * ( 4.0 * tgyzz - tgyxx - tgyyy );
1079 const double y12 = factor
1080 * ( 2.0 * tgzzz - 3.0 * ( tgzxx + tgzyy ) );
1081 const double y13 = factor * s32 * ( 4.0 * tgxzz - tgxxx - tgxyy );
1082 const double y14 = factor * s15 * ( tgzxx - tgzyy );
1083 const double y15 = factor * s52 * ( tgxxx - 3.0 * tgxyy );
1084
1085 // derivative of x^3/r^3 w.r.t. x, y, and z
1086 const double dx_xxx = 3.0 * tgxx * ( 1.0 - tgxx ) * tgi;
1087 const double dy_xxx = -3.0 * tgxx * tgxy * tgi;
1088 const double dz_xxx = -3.0 * tgxx * tgxz * tgi;
1089 // derivative of y^3/r^3 w.r.t. x, y, and z
1090 const double dx_yyy = -3.0 * tgyy * tgxy * tgi;
1091 const double dy_yyy = 3.0 * tgyy * ( 1.0 - tgyy ) * tgi;
1092 const double dz_yyy = -3.0 * tgyy * tgyz * tgi;
1093 // derivative of z^3/r^3 w.r.t. x, y, and z
1094 const double dx_zzz = -3.0 * tgzz * tgxz * tgi;
1095 const double dy_zzz = -3.0 * tgzz * tgyz * tgi;
1096 const double dz_zzz = 3.0 * tgzz * ( 1.0 - tgzz ) * tgi;
1097 // derivative of xy^2/r^3 w.r.t. x, y, and z
1098 const double dx_xyy = tgyy * ( 1.0 - 3.0 * tgxx ) * tgi;
1099 const double dy_xyy = tgxy * ( 2.0 - 3.0 * tgyy ) * tgi;
1100 const double dz_xyy = -3.0 * tgyy * tgxz * tgi;
1101 // derivative of xz^2/r^3 w.r.t. x, y, and z
1102 const double dx_xzz = tgzz * ( 1.0 - 3.0 * tgxx ) * tgi;
1103 const double dy_xzz = -3.0 * tgzz * tgxy * tgi;
1104 const double dz_xzz = tgxz * ( 2.0 - 3.0 * tgzz ) * tgi;
1105 // derivative of yx^2/r^3 w.r.t. x, y, and z
1106 const double dx_yxx = tgxy * ( 2.0 - 3.0 * tgxx ) * tgi;
1107 const double dy_yxx = tgxx * ( 1.0 - 3.0 * tgyy ) * tgi;
1108 const double dz_yxx = -3.0 * tgxx * tgyz * tgi;
1109 // derivative of yz^2/r^3 w.r.t. x, y, and z
1110 const double dx_yzz = -3.0 * tgzz * tgxy * tgi;
1111 const double dy_yzz = tgzz * ( 1.0 - 3.0 * tgyy ) * tgi;
1112 const double dz_yzz = tgyz * ( 2.0 - 3.0 * tgzz ) * tgi;
1113 // derivative of zx^2/r^3 w.r.t. x, y, and z
1114 const double dx_zxx = tgxz * ( 2.0 - 3.0 * tgxx ) * tgi;
1115 const double dy_zxx = -3.0 * tgxx * tgyz * tgi;
1116 const double dz_zxx = tgxx * ( 1.0 - 3.0 * tgzz ) * tgi;
1117 // derivative of zy^2/r^3 w.r.t. x, y, and z
1118 const double dx_zyy = -3.0 * tgyy * tgxz * tgi;
1119 const double dy_zyy = tgyz * ( 2.0 - 3.0 * tgyy ) * tgi;
1120 const double dz_zyy = tgyy * ( 1.0 - 3.0 * tgzz ) * tgi;
1121 // derivative of xyz/r^3 w.r.t. x, y, and z
1122 const double dx_xyz = tgyz * ( 1 - 3.0 * tgxx ) * tgi;
1123 const double dy_xyz = tgxz * ( 1 - 3.0 * tgyy ) * tgi;
1124 const double dz_xyz = tgxy * ( 1 - 3.0 * tgzz ) * tgi;
1125
1126 // derivatives of spherical harmonics
1127
1128 // y9 = factor * s52 * ( 3.0 * tgyxx - tgyyy );
1129 const double dx_y09 = factor * s52 * ( 3.0 * dx_yxx - dx_yyy );
1130 const double dy_y09 = factor * s52 * ( 3.0 * dy_yxx - dy_yyy );
1131 const double dz_y09 = factor * s52 * ( 3.0 * dz_yxx - dz_yyy );
1132 // y10 = factor * s15 * 2.0 * tgxyz;
1133 const double dx_y10 = factor * s15 * 2.0 * dx_xyz;
1134 const double dy_y10 = factor * s15 * 2.0 * dy_xyz;
1135 const double dz_y10 = factor * s15 * 2.0 * dz_xyz;
1136 // y11 = factor * s32 * ( 4.0 * tgyzz - tgyxx - tgyyy );
1137 const double dx_y11 = factor * s32 * ( 4.0 * dx_yzz - dx_yxx - dx_yyy );
1138 const double dy_y11 = factor * s32 * ( 4.0 * dy_yzz - dy_yxx - dy_yyy );
1139 const double dz_y11 = factor * s32 * ( 4.0 * dz_yzz - dz_yxx - dz_yyy );
1140 // y12 = factor * ( 2.0 * tgzzz - 3.0 * ( tgzxx + tgzyy ) );
1141 const double dx_y12 = factor
1142 * ( 2.0 * dx_zzz - 3.0 * ( dx_zxx + dx_zyy ) );
1143 const double dy_y12 = factor
1144 * ( 2.0 * dy_zzz - 3.0 * ( dy_zxx + dy_zyy ) );
1145 const double dz_y12 = factor
1146 * ( 2.0 * dz_zzz - 3.0 * ( dz_zxx + dz_zyy ) );
1147 // y13 = factor * s32 * ( 4.0 * tgxzz - tgxxx - tgxyy );
1148 const double dx_y13 = factor * s32 * ( 4.0 * dx_xzz - dx_xxx - dx_xyy );
1149 const double dy_y13 = factor * s32 * ( 4.0 * dy_xzz - dy_xxx - dy_xyy );
1150 const double dz_y13 = factor * s32 * ( 4.0 * dz_xzz - dz_xxx - dz_xyy );
1151 // y14 = factor * s15 * ( tgzxx - tgzyy );
1152 const double dx_y14 = factor * s15 * ( dx_zxx - dx_zyy );
1153 const double dy_y14 = factor * s15 * ( dy_zxx - dy_zyy );
1154 const double dz_y14 = factor * s15 * ( dz_zxx - dz_zyy );
1155 // y15 = factor * s52 * ( tgxxx - 3.0 * tgxyy );
1156 const double dx_y15 = factor * s52 * ( dx_xxx - 3.0 * dx_xyy );
1157 const double dy_y15 = factor * s52 * ( dy_xxx - 3.0 * dy_xyy );
1158 const double dz_y15 = factor * s52 * ( dz_xxx - 3.0 * dz_xyy );
1159
1160 t09[ig] = y09 * v;
1161 t10[ig] = y10 * v;
1162 t11[ig] = y11 * v;
1163 t12[ig] = y12 * v;
1164 t13[ig] = y13 * v;
1165 t14[ig] = y14 * v;
1166 t15[ig] = y15 * v;
1167
1168 // contribution to stress tensor
1169 // sigma_ij = 0.5 * (xi * dj + xj * di)
1170
1171 dt09_xx[ig] = -( v * dx_y09 * tgx - y09 * dv * tg * tgxx );
1172 dt09_yy[ig] = -( v * dy_y09 * tgy - y09 * dv * tg * tgyy );
1173 dt09_zz[ig] = -( v * dz_y09 * tgz - y09 * dv * tg * tgzz );
1174 dt09_xy[ig] = -( 0.5 * v * ( dx_y09 * tgy + dy_y09 * tgx )
1175 - y09 * dv * tg * tgxy );
1176 dt09_yz[ig] = -( 0.5 * v * ( dy_y09 * tgz + dz_y09 * tgy )
1177 - y09 * dv * tg * tgyz );
1178 dt09_xz[ig] = -( 0.5 * v * ( dx_y09 * tgz + dz_y09 * tgx )
1179 - y09 * dv * tg * tgxz );
1180
1181 dt10_xx[ig] = -( v * dx_y10 * tgx - y10 * dv * tg * tgxx );
1182 dt10_yy[ig] = -( v * dy_y10 * tgy - y10 * dv * tg * tgyy );
1183 dt10_zz[ig] = -( v * dz_y10 * tgz - y10 * dv * tg * tgzz );
1184 dt10_xy[ig] = -( 0.5 * v * ( dx_y10 * tgy + dy_y10 * tgx )
1185 - y10 * dv * tg * tgxy );
1186 dt10_yz[ig] = -( 0.5 * v * ( dy_y10 * tgz + dz_y10 * tgy )
1187 - y10 * dv * tg * tgyz );
1188 dt10_xz[ig] = -( 0.5 * v * ( dx_y10 * tgz + dz_y10 * tgx )
1189 - y10 * dv * tg * tgxz );
1190
1191 dt11_xx[ig] = -( v * dx_y11 * tgx - y11 * dv * tg * tgxx );
1192 dt11_yy[ig] = -( v * dy_y11 * tgy - y11 * dv * tg * tgyy );
1193 dt11_zz[ig] = -( v * dz_y11 * tgz - y11 * dv * tg * tgzz );
1194 dt11_xy[ig] = -( 0.5 * v * ( dx_y11 * tgy + dy_y11 * tgx )
1195 - y11 * dv * tg * tgxy );
1196 dt11_yz[ig] = -( 0.5 * v * ( dy_y11 * tgz + dz_y11 * tgy )
1197 - y11 * dv * tg * tgyz );
1198 dt11_xz[ig] = -( 0.5 * v * ( dx_y11 * tgz + dz_y11 * tgx )
1199 - y11 * dv * tg * tgxz );
1200
1201 dt12_xx[ig] = -( v * dx_y12 * tgx - y12 * dv * tg * tgxx );
1202 dt12_yy[ig] = -( v * dy_y12 * tgy - y12 * dv * tg * tgyy );
1203 dt12_zz[ig] = -( v * dz_y12 * tgz - y12 * dv * tg * tgzz );
1204 dt12_xy[ig] = -( 0.5 * v * ( dx_y12 * tgy + dy_y12 * tgx )
1205 - y12 * dv * tg * tgxy );
1206 dt12_yz[ig] = -( 0.5 * v * ( dy_y12 * tgz + dz_y12 * tgy )
1207 - y12 * dv * tg * tgyz );
1208 dt12_xz[ig] = -( 0.5 * v * ( dx_y12 * tgz + dz_y12 * tgx )
1209 - y12 * dv * tg * tgxz );
1210
1211 dt13_xx[ig] = -( v * dx_y13 * tgx - y13 * dv * tg * tgxx );
1212 dt13_yy[ig] = -( v * dy_y13 * tgy - y13 * dv * tg * tgyy );
1213 dt13_zz[ig] = -( v * dz_y13 * tgz - y13 * dv * tg * tgzz );
1214 dt13_xy[ig] = -( 0.5 * v * ( dx_y13 * tgy + dy_y13 * tgx )
1215 - y13 * dv * tg * tgxy );
1216 dt13_yz[ig] = -( 0.5 * v * ( dy_y13 * tgz + dz_y13 * tgy )
1217 - y13 * dv * tg * tgyz );
1218 dt13_xz[ig] = -( 0.5 * v * ( dx_y13 * tgz + dz_y13 * tgx )
1219 - y13 * dv * tg * tgxz );
1220
1221 dt14_xx[ig] = -( v * dx_y14 * tgx - y14 * dv * tg * tgxx );
1222 dt14_yy[ig] = -( v * dy_y14 * tgy - y14 * dv * tg * tgyy );
1223 dt14_zz[ig] = -( v * dz_y14 * tgz - y14 * dv * tg * tgzz );
1224 dt14_xy[ig] = -( 0.5 * v * ( dx_y14 * tgy + dy_y14 * tgx )
1225 - y14 * dv * tg * tgxy );
1226 dt14_yz[ig] = -( 0.5 * v * ( dy_y14 * tgz + dz_y14 * tgy )
1227 - y14 * dv * tg * tgyz );
1228 dt14_xz[ig] = -( 0.5 * v * ( dx_y14 * tgz + dz_y14 * tgx )
1229 - y14 * dv * tg * tgxz );
1230
1231 dt15_xx[ig] = -( v * dx_y15 * tgx - y15 * dv * tg * tgxx );
1232 dt15_yy[ig] = -( v * dy_y15 * tgy - y15 * dv * tg * tgyy );
1233 dt15_zz[ig] = -( v * dz_y15 * tgz - y15 * dv * tg * tgzz );
1234 dt15_xy[ig] = -( 0.5 * v * ( dx_y15 * tgy + dy_y15 * tgx )
1235 - y15 * dv * tg * tgxy );
1236 dt15_yz[ig] = -( 0.5 * v * ( dy_y15 * tgz + dz_y15 * tgy )
1237 - y15 * dv * tg * tgyz );
1238 dt15_xz[ig] = -( 0.5 * v * ( dx_y15 * tgz + dz_y15 * tgx )
1239 - y15 * dv * tg * tgxz );
1240
1241 }
1242
1243 ilm += 2 * l + 1;
1244
1245 } // l == 3
1246 else
1247 {
1248 assert(false);
1249 }
1250 } // l != lloc[is]
1251 } // for l
1252 assert(ilm == s->nlm());
1253 }
1254 tmap["update_twnl"].stop();
1255 }
1256
1257 ////////////////////////////////////////////////////////////////////////////////
energy(bool compute_hpsi,SlaterDet & dsd,bool compute_forces,vector<vector<double>> & fion_enl,bool compute_stress,valarray<double> & sigma_enl)1258 double NonLocalPotential::energy(bool compute_hpsi, SlaterDet& dsd,
1259 bool compute_forces, vector<vector<double> >& fion_enl,
1260 bool compute_stress, valarray<double>& sigma_enl)
1261 {
1262 for ( int is = 0; is < fion_enl.size(); is++ )
1263 for ( int i = 0; i < fion_enl[is].size(); i++ )
1264 fion_enl[is][i] = 0.0;
1265 sigma_enl = 0.0;
1266
1267 if ( nspnl == 0 ) return 0.0;
1268
1269 double enl = 0.0;
1270 double tsum[6] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
1271 const vector<double>& occ = sd_.occ();
1272 const int ngwl = basis_.localsize();
1273 // define atom block size
1274 const int na_block_size = 32;
1275 valarray<double> gr(na_block_size*ngwl); // gr[ig+ia*ngwl]
1276 valarray<double> cgr(na_block_size*ngwl); // cgr[ig+ia*ngwl]
1277 valarray<double> sgr(na_block_size*ngwl); // sgr[ig+ia*ngwl]
1278 vector<vector<double> > tau;
1279 atoms_.get_positions(tau);
1280
1281 const double omega = basis_.cell().volume();
1282 assert(omega != 0.0);
1283 const double omega_inv = 1.0 / omega;
1284
1285 for ( int is = 0; is < nsp; is++ )
1286 {
1287 Species *s = atoms_.species_list[is];
1288 if ( npr[is] > 0 ) // species is is non-local
1289 {
1290 valarray<double> tmpfion(3*na[is]);
1291 tmpfion = 0.0;
1292 // define number of atom blocks
1293 const int na_blocks = na[is] / na_block_size +
1294 ( na[is] % na_block_size == 0 ? 0 : 1 );
1295
1296 valarray<double> anl_loc(npr[is]*na_block_size*2*ngwl);
1297 const int nstloc = sd_.nstloc();
1298 // fnl_loc[ipra][n]
1299 // fnl is real if basis is real, complex otherwise
1300 const int fnl_loc_size = basis_.real() ? npr[is]*na_block_size*nstloc :
1301 2*npr[is]*na_block_size*nstloc;
1302 valarray<double> fnl_loc(fnl_loc_size);
1303 valarray<double> fnl_buf(fnl_loc_size);
1304 for ( int ia_block = 0; ia_block < na_blocks; ia_block++ )
1305 {
1306 // process projectors of atoms in block ia_block
1307
1308 const int iastart = ia_block * na_block_size;
1309 const int iaend = (ia_block+1) * na_block_size < na[is] ?
1310 (ia_block+1) * na_block_size :
1311 na[is];
1312 const int ia_block_size = iaend - iastart;
1313
1314 // compute cgr[is][ia][ig], sgr[is][ia][ig]
1315 tmap["comp_eigr"].start();
1316 int k = 3;
1317 double mone = -1.0, zero = 0.0;
1318 char cn='n';
1319
1320 // next line: const cast is ok since dgemm_ does not modify argument
1321 double* kpgx = const_cast<double*>(basis_.kpgx_ptr(0));
1322 dgemm(&cn,&cn,(int*)&ngwl,(int*)&ia_block_size,&k,&mone,
1323 kpgx,(int*)&ngwl, &tau[is][3*iastart],&k,
1324 &zero,&gr[0],(int*)&ngwl);
1325
1326 int len = ia_block_size * ngwl;
1327 #if USE_MASSV
1328 vsincos(&sgr[0],&cgr[0],&gr[0],&len);
1329 #else
1330 for ( int i = 0; i < len; i++ )
1331 {
1332 const double arg = gr[i];
1333 sgr[i] = sin(arg);
1334 cgr[i] = cos(arg);
1335 }
1336 #endif
1337 tmap["comp_eigr"].stop();
1338
1339 // compute anl_loc
1340 tmap["comp_anl"].start();
1341 for ( int ipr = 0; ipr < npr[is]; ipr++ )
1342 {
1343 // twnl[is][ig+ngwl*ipr]
1344 const double * t = &twnl[is][ngwl*ipr];
1345 const int l = lproj[is][ipr];
1346
1347 // anl_loc[ig+ipra*ngwl]
1348
1349 for ( int ia = 0; ia < ia_block_size; ia++ )
1350 {
1351 double* a = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl];
1352 const double* c = &cgr[ia*ngwl];
1353 const double* s = &sgr[ia*ngwl];
1354 if ( l == 0 )
1355 {
1356 for ( int ig = 0; ig < ngwl; ig++ )
1357 {
1358 a[2*ig] = t[ig] * c[ig];
1359 a[2*ig+1] = t[ig] * s[ig];
1360 }
1361 }
1362 else if ( l == 1 )
1363 {
1364 for ( int ig = 0; ig < ngwl; ig++ )
1365 {
1366 /* Next line: -i * eigr */
1367 /* -i * (a+i*b) = b - i*a */
1368 a[2*ig] = t[ig] * s[ig];
1369 a[2*ig+1] = -t[ig] * c[ig];
1370 }
1371 }
1372 else if ( l == 2 )
1373 {
1374 for ( int ig = 0; ig < ngwl; ig++ )
1375 {
1376 // Next line: (-) sign for -eigr
1377 a[2*ig] = -t[ig] * c[ig];
1378 a[2*ig+1] = -t[ig] * s[ig];
1379 }
1380 }
1381 else if ( l == 3 )
1382 {
1383 for ( int ig = 0; ig < ngwl; ig++ )
1384 {
1385 // Next line: i * eigr
1386 // i * (a+i*b) = -b + i*a
1387 a[2*ig] = -t[ig] * s[ig];
1388 a[2*ig+1] = t[ig] * c[ig];
1389 }
1390 }
1391 }
1392 } // ipr
1393 tmap["comp_anl"].stop();
1394
1395 // array anl_loc is complete
1396
1397 // compute fnl[npra][nstloc] = anl^T * c
1398 int nprnaloc = ia_block_size * npr[is];
1399 if ( basis_.real() )
1400 {
1401 double one=1.0;
1402 char ct='t';
1403 int twongwl = 2 * ngwl;
1404 int c_lda = 2*sd_.c().mloc();
1405 const complex<double>* c = sd_.c().cvalptr();
1406 tmap["fnl_gemm"].start();
1407 dgemm(&ct,&cn,&nprnaloc,(int*)&nstloc,&twongwl,&one,
1408 &anl_loc[0],&twongwl, (double*)c, &c_lda,
1409 &zero,&fnl_loc[0],&nprnaloc);
1410 tmap["fnl_gemm"].stop();
1411
1412 // correct for double counting if ctxt_.myrow() == 0
1413 if ( ctxt_.myrow() == 0 )
1414 {
1415 // rank-one update
1416 // dger(m,n,alpha,x,incx,y,incy,a,lda);
1417 // a += alpha * x * transpose(y)
1418 // x = first row of anl_loc
1419 // y^T = first row of c
1420 double alpha = -0.5;
1421 dger(&nprnaloc,(int*)&nstloc,&alpha,&anl_loc[0],&twongwl,
1422 (double*)c,&c_lda,&fnl_loc[0],&nprnaloc);
1423 }
1424 }
1425 else
1426 {
1427 // fnl is complex
1428 complex<double> cone=1.0;
1429 complex<double> czero=0.0;
1430 char cc='c';
1431 int c_lda = sd_.c().mloc();
1432 const complex<double>* c = sd_.c().cvalptr();
1433 tmap["fnl_zemm"].start();
1434 zgemm(&cc,&cn,&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone,
1435 (complex<double>*) &anl_loc[0],(int*)&ngwl,
1436 (complex<double>*) c, &c_lda,&czero,
1437 (complex<double>*)&fnl_loc[0],&nprnaloc);
1438 tmap["fnl_zemm"].stop();
1439 }
1440
1441 #if USE_MPI
1442 tmap["fnl_allreduce"].start();
1443 // Allreduce fnl partial sum
1444 MPI_Comm basis_comm = basis_.comm();
1445 int fnl_size = basis_.real() ? nprnaloc*nstloc : 2*nprnaloc*nstloc;
1446 MPI_Allreduce(&fnl_loc[0],&fnl_buf[0],fnl_size,
1447 MPI_DOUBLE,MPI_SUM,basis_comm);
1448 tmap["fnl_allreduce"].stop();
1449
1450 // factor 2.0 in next line is: counting G, -G
1451 if ( basis_.real() )
1452 fnl_loc = 2.0 * fnl_buf;
1453 else
1454 fnl_loc = fnl_buf;
1455 #else
1456 // factor 2.0 in next line is: counting G, -G
1457 if ( basis_.real() )
1458 fnl_loc *= 2.0;
1459 #endif
1460
1461 // if the species has multiple projectors, that are not orthogonal
1462 // multiply with D matrix
1463 if ( s->has_dmatrix() )
1464 {
1465 if ( basis_.real() )
1466 {
1467 // helper variables
1468 double one = 1.0;
1469 double zero = 0.0;
1470 const size_t dsize = nprnaloc * nprnaloc;
1471 //
1472 // construct D matrix
1473 //
1474 // allocate array
1475 valarray<double> dmatrix(zero,dsize);
1476 // loop over all elements
1477 for ( int i = 0; i < nprnaloc; i++ )
1478 {
1479 // extract atom, l, and m corresponding to this index
1480 const int iatom = i % ia_block_size;
1481 const int ipr = i / ia_block_size;
1482 for ( int j = 0; j <= i; j++ )
1483 {
1484 // extract atom, l, and m corresponding to this index
1485 const int jatom = j % ia_block_size;
1486 const int jpr = j / ia_block_size;
1487 // D matrix is diagonal in atom, l, and m
1488 if ( iatom != jatom ) continue;
1489 const double val = s->dmatrix(ipr,jpr);
1490 dmatrix[nprnaloc * i + j] = val;
1491 // matrix is symmetric
1492 if ( i != j ) dmatrix[nprnaloc * j + i] = val;
1493 }
1494 }
1495 // multiply F'_In = D_IJ F_Jn
1496 dgemm(&cn,&cn,&nprnaloc,(int*) &nstloc,&nprnaloc,&one,&dmatrix[0],
1497 &nprnaloc,&fnl_loc[0],&nprnaloc,&zero,&fnl_buf[0],&nprnaloc);
1498 }
1499 else
1500 {
1501 // helper variables
1502 complex<double> one = 1.0;
1503 complex<double> zero = 0.0;
1504 const size_t dsize = nprnaloc * nprnaloc;
1505 //
1506 // construct D matrix
1507 //
1508 // allocate array
1509 valarray < complex<double> > dmatrix(zero,dsize);
1510 // loop over all elements
1511 for ( int i = 0; i < nprnaloc; i++ )
1512 {
1513 // extract atom, l, and m corresponding to this index
1514 const int iatom = i % ia_block_size;
1515 const int ipr = i / ia_block_size;
1516 for ( int j = 0; j <= i; j++ )
1517 {
1518 // extract atom, l, and m corresponding to this index
1519 const int jatom = j % ia_block_size;
1520 const int jpr = j / ia_block_size;
1521 // D matrix is diagonal in atom, l, and m
1522 if ( iatom != jatom ) continue;
1523 const double val = s->dmatrix(ipr,jpr);
1524 dmatrix[nprnaloc * i + j] = val;
1525 // matrix is symmetric
1526 if ( i != j ) dmatrix[nprnaloc * j + i] = val;
1527 }
1528 }
1529 // multiply F'_In = D_IJ F_Jn
1530 zgemm(&cn,&cn,&nprnaloc,(int*) &nstloc,&nprnaloc,&one,&dmatrix[0],
1531 &nprnaloc,(complex<double>*) &fnl_loc[0],&nprnaloc,&zero,
1532 (complex<double>*) &fnl_buf[0],&nprnaloc);
1533 }
1534 }
1535 else
1536 {
1537 fnl_buf = fnl_loc;
1538 }
1539
1540 // accumulate Enl contribution
1541 const int nbase = ctxt_.mycol() * sd_.c().nb();
1542 if ( basis_.real() )
1543 {
1544 for ( int ipr = 0; ipr < npr[is]; ipr++ )
1545 {
1546 const double fac = wt[is][ipr] * omega_inv;
1547 for ( int n = 0; n < nstloc; n++ )
1548 {
1549 const double facn = fac * occ[n + nbase];
1550 for ( int ia = 0; ia < ia_block_size; ia++ )
1551 {
1552 const int i = ia + ipr*ia_block_size + n * nprnaloc;
1553 //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1554 // << ",n=" << n << "]: " << fnl_loc[i] << endl;
1555 enl += facn * fnl_loc[i] * fnl_buf[i];
1556 fnl_loc[i] = fac * fnl_buf[i];
1557 }
1558 }
1559 }
1560 }
1561 else
1562 {
1563 // fnl is complex
1564 for ( int ipr = 0; ipr < npr[is]; ipr++ )
1565 {
1566 const double fac = wt[is][ipr] * omega_inv;
1567 for ( int n = 0; n < nstloc; n++ )
1568 {
1569 const double facn = fac * occ[n + nbase];
1570 for ( int ia = 0; ia < ia_block_size; ia++ )
1571 {
1572 const int i = ia + ipr*ia_block_size + n * nprnaloc;
1573 //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1574 // << ",n=" << n << "]: " << fnl_loc[i] << endl;
1575 const double f_re = fnl_loc[2*i];
1576 const double f_im = fnl_loc[2*i+1];
1577 const double fb_re = fnl_buf[2*i];
1578 const double fb_im = fnl_buf[2*i+1];
1579 enl += facn * (f_re*fb_re + f_im*fb_im);
1580 fnl_loc[2*i] = fac * fb_re;
1581 fnl_loc[2*i+1] = fac * fb_im;
1582 }
1583 }
1584 }
1585 }
1586
1587 if ( compute_hpsi )
1588 {
1589 if ( basis_.real() )
1590 {
1591 tmap["enl_hpsi"].start();
1592 // compute cp += anl * fnl
1593 complex<double>* cp = dsd.c().valptr();
1594 int cp_lda = 2*dsd.c().mloc();
1595 int twongwl = 2 * ngwl;
1596 double one = 1.0;
1597 dgemm(&cn,&cn,&twongwl,(int*)&nstloc,&nprnaloc,&one,
1598 &anl_loc[0],&twongwl, &fnl_loc[0],&nprnaloc,
1599 &one,(double*)cp, &cp_lda);
1600 tmap["enl_hpsi"].stop();
1601 }
1602 else
1603 {
1604 tmap["enl_hpsi"].start();
1605 // compute cp += anl * fnl
1606 complex<double>* cp = dsd.c().valptr();
1607 complex<double> cone = 1.0;
1608 int cp_lda = dsd.c().mloc();
1609 zgemm(&cn,&cn,(int*)&ngwl,(int*)&nstloc,&nprnaloc,&cone,
1610 (complex<double>*) &anl_loc[0],(int*)&ngwl,
1611 (complex<double>*) &fnl_loc[0],&nprnaloc,
1612 &cone,cp, &cp_lda);
1613 tmap["enl_hpsi"].stop();
1614 }
1615 }
1616
1617 // ionic forces
1618 if ( compute_forces )
1619 {
1620 tmap["enl_fion"].start();
1621
1622 valarray<double> dfnl_loc(fnl_loc_size);
1623 for ( int j = 0; j < 3; j++ )
1624 {
1625 const double *const kpgxj = basis_.kpgx_ptr(j);
1626
1627 // compute anl_loc
1628 for ( int ipr = 0; ipr < npr[is]; ipr++ )
1629 {
1630 // twnl[is][ig+ngwl*ipr]
1631 const double * t = &twnl[is][ngwl*ipr];
1632 const int l = lproj[is][ipr];
1633
1634 // anl_loc[ig+ipra*ngwl]
1635
1636 for ( int ia = 0; ia < ia_block_size; ia++ )
1637 {
1638 double* a = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl];
1639 const double* c = &cgr[ia*ngwl];
1640 const double* s = &sgr[ia*ngwl];
1641 if ( l == 0 )
1642 {
1643 for ( int ig = 0; ig < ngwl; ig++ )
1644 {
1645 const double tt = kpgxj[ig] * t[ig];
1646 // Next lines: -i * ( a + ib ) = b - ia
1647 a[2*ig] = tt * s[ig];
1648 a[2*ig+1] = -tt * c[ig];
1649 }
1650 }
1651 else if ( l == 1 )
1652 {
1653 for ( int ig = 0; ig < ngwl; ig++ )
1654 {
1655 // Next lines: (-i)**2 * ( a + ib ) = - a - ib
1656 const double tt = - kpgxj[ig] * t[ig];
1657 a[2*ig] = tt * c[ig];
1658 a[2*ig+1] = tt * s[ig];
1659 }
1660 }
1661 else if ( l == 2 )
1662 {
1663 for ( int ig = 0; ig < ngwl; ig++ )
1664 {
1665 // Next lines: (-i) * - ( a + ib ) = i*(a+ib) = - b + ia
1666 const double tt = kpgxj[ig] * t[ig];
1667 a[2*ig] = -tt * s[ig];
1668 a[2*ig+1] = tt * c[ig];
1669 }
1670 }
1671 else if ( l == 3 )
1672 {
1673 for ( int ig = 0; ig < ngwl; ig++ )
1674 {
1675 // Next lines: (-i)**4 * ( a + ib ) = a + ib
1676 const double tt = kpgxj[ig] * t[ig];
1677 a[2*ig] = tt * c[ig];
1678 a[2*ig+1] = tt * s[ig];
1679 }
1680 }
1681 }
1682 } // ipr
1683
1684 // array anl_loc is complete
1685
1686 // compute dfnl[npra][nstloc] = anl^T * c
1687 if ( basis_.real() )
1688 {
1689 // factor 2.0 in next line is: counting G, -G
1690 // Note: no need to correct for double counting of the
1691 // G=0 component which is always zero
1692 double two=2.0;
1693 char ct='t';
1694 const int twongwl = 2 * ngwl;
1695 const int nprnaloc = ia_block_size * npr[is];
1696 int c_lda = 2*sd_.c().mloc();
1697 const complex<double>* c = sd_.c().cvalptr();
1698 dgemm(&ct,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&twongwl,&two,
1699 &anl_loc[0],(int*)&twongwl, (double*)c,(int*)&c_lda,
1700 &zero,&dfnl_loc[0],(int*)&nprnaloc);
1701 }
1702 else
1703 {
1704 complex<double> cone=1.0;
1705 complex<double> czero=0.0;
1706 char cc='c';
1707 const int nprnaloc = ia_block_size * npr[is];
1708 int c_lda = sd_.c().mloc();
1709 const complex<double>* c = sd_.c().cvalptr();
1710 zgemm(&cc,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone,
1711 (complex<double>*) &anl_loc[0],(int*)&ngwl,
1712 (complex<double>*) c,(int*)&c_lda,
1713 &czero,(complex<double>*)&dfnl_loc[0],(int*)&nprnaloc);
1714 }
1715
1716 // accumulate non-local contributions to forces
1717 if ( basis_.real() )
1718 {
1719 for ( int ipr = 0; ipr < npr[is]; ipr++ )
1720 {
1721 for ( int n = 0; n < nstloc; n++ )
1722 {
1723 // Factor 2.0 in next line from derivative of |Fnl|^2
1724 const double facn = 2.0 * occ[n + nbase];
1725 for ( int ia = 0; ia < ia_block_size; ia++ )
1726 {
1727 const int ia_global = ia + iastart;
1728 const int i = ia + ipr*ia_block_size + n * nprnaloc;
1729 //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1730 // << ",n=" << n << "]: " << fnl_loc[i] << endl;
1731 tmpfion[3*ia_global+j] -= facn *
1732 fnl_loc[i] * dfnl_loc[i];
1733 }
1734 }
1735 }
1736 }
1737 else
1738 {
1739 for ( int ipr = 0; ipr < npr[is]; ipr++ )
1740 {
1741 for ( int n = 0; n < nstloc; n++ )
1742 {
1743 // Factor 2.0 in next line from derivative of |Fnl|^2
1744 const double facn = 2.0 * occ[n + nbase];
1745 for ( int ia = 0; ia < ia_block_size; ia++ )
1746 {
1747 const int ia_global = ia + iastart;
1748 const int i = ia + ipr*ia_block_size + n * nprnaloc;
1749 //cout << "fnl_loc[ipr=" << ipr << ",ia=" << ia
1750 // << ",n=" << n << "]: " << fnl_loc[i] << endl;
1751 double f_re = fnl_loc[2*i];
1752 double f_im = fnl_loc[2*i+1];
1753 double df_re = dfnl_loc[2*i];
1754 double df_im = dfnl_loc[2*i+1];
1755 tmpfion[3*ia_global+j] -=
1756 facn * ( f_re * df_re + f_im * df_im );
1757 }
1758 }
1759 }
1760 }
1761 } // j
1762
1763 tmap["enl_fion"].stop();
1764 } // compute_forces
1765
1766 if ( compute_stress )
1767 {
1768 tmap["enl_sigma"].start();
1769 valarray<double> dfnl_loc(fnl_loc_size);
1770
1771 for ( int ij = 0; ij < 6; ij++ )
1772 {
1773 // compute anl_loc
1774 int ipr = 0;
1775 while ( ipr < npr[is] )
1776 {
1777 // twnl[is][ig+ngwl*ipr]
1778 const int l = lproj[is][ipr];
1779 if ( l == 0 )
1780 {
1781 // dtwnl[is][ipr][ij][ngwl]
1782 // index = ig + ngwl * ( ij + 6 * ipr))
1783 // ipr = iquad + nquad[is] * ilm, where ilm = 0
1784 const double *const dt0 = &dtwnl[is][ngwl*(ij+6*ipr)];
1785 for ( int ia = 0; ia < ia_block_size; ia++ )
1786 {
1787 double* a0 = &anl_loc[2*(ia+ipr*ia_block_size)*ngwl];
1788 const double* c = &cgr[ia*ngwl];
1789 const double* s = &sgr[ia*ngwl];
1790 for ( int ig = 0; ig < ngwl; ig++ )
1791 {
1792 const double d0 = dt0[ig];
1793 // Next lines: -i * ( a + ib ) = b - ia
1794 a0[2*ig] = d0 * c[ig];
1795 a0[2*ig+1] = d0 * s[ig];
1796 }
1797 }
1798 }
1799 else if ( l == 1 )
1800 {
1801 const int ipr1 = ipr;
1802 const int ipr2 = ipr + 1;
1803 const int ipr3 = ipr + 2;
1804 // dtwnl[is][ipr][ij][ngwl]
1805 // index = ig + ngwl * ( ij + 6 * iprx ))
1806 const double *dt1 = &dtwnl[is][ngwl*(ij+6*ipr1)];
1807 const double *dt2 = &dtwnl[is][ngwl*(ij+6*ipr2)];
1808 const double *dt3 = &dtwnl[is][ngwl*(ij+6*ipr3)];
1809 for ( int ia = 0; ia < ia_block_size; ia++ )
1810 {
1811 double* a1 = &anl_loc[2*(ia+ipr1*ia_block_size)*ngwl];
1812 double* a2 = &anl_loc[2*(ia+ipr2*ia_block_size)*ngwl];
1813 double* a3 = &anl_loc[2*(ia+ipr3*ia_block_size)*ngwl];
1814 const double* c = &cgr[ia*ngwl];
1815 const double* s = &sgr[ia*ngwl];
1816 for ( int ig = 0; ig < ngwl; ig++ )
1817 {
1818 const double d1 = dt1[ig];
1819 const double d2 = dt2[ig];
1820 const double d3 = dt3[ig];
1821 // Next line: (-i)^l factor is -i
1822 // Next line: -i * eigr
1823 // -i * (a+i*b) = b - i*a
1824 const double tc = -c[ig]; // -cosgr[ia][ig]
1825 const double ts = s[ig]; // singr[ia][ig]
1826 a1[2*ig] = d1 * ts;
1827 a1[2*ig+1] = d1 * tc;
1828 a2[2*ig] = d2 * ts;
1829 a2[2*ig+1] = d2 * tc;
1830 a3[2*ig] = d3 * ts;
1831 a3[2*ig+1] = d3 * tc;
1832 }
1833 }
1834 }
1835 else if ( l == 2 )
1836 {
1837 const int ipr4 = ipr;
1838 const int ipr5 = ipr + 1;
1839 const int ipr6 = ipr + 2;
1840 const int ipr7 = ipr + 3;
1841 const int ipr8 = ipr + 4;
1842 // dtwnl[is][ipr][iquad][ij][ngwl]
1843 // index = ig + ngwl * ( ij + 6 * ( iquad + nquad[is] * ipr ))
1844 const double *dt4 = &dtwnl[is][ngwl*(ij+6*ipr4)];
1845 const double *dt5 = &dtwnl[is][ngwl*(ij+6*ipr5)];
1846 const double *dt6 = &dtwnl[is][ngwl*(ij+6*ipr6)];
1847 const double *dt7 = &dtwnl[is][ngwl*(ij+6*ipr7)];
1848 const double *dt8 = &dtwnl[is][ngwl*(ij+6*ipr8)];
1849 for ( int ia = 0; ia < ia_block_size; ia++ )
1850 {
1851 double* a4 = &anl_loc[2*(ia+ipr4*ia_block_size)*ngwl];
1852 double* a5 = &anl_loc[2*(ia+ipr5*ia_block_size)*ngwl];
1853 double* a6 = &anl_loc[2*(ia+ipr6*ia_block_size)*ngwl];
1854 double* a7 = &anl_loc[2*(ia+ipr7*ia_block_size)*ngwl];
1855 double* a8 = &anl_loc[2*(ia+ipr8*ia_block_size)*ngwl];
1856 const double* c = &cgr[ia*ngwl];
1857 const double* s = &sgr[ia*ngwl];
1858
1859 for ( int ig = 0; ig < ngwl; ig++ )
1860 {
1861 const double d4 = dt4[ig];
1862 const double d5 = dt5[ig];
1863 const double d6 = dt6[ig];
1864 const double d7 = dt7[ig];
1865 const double d8 = dt8[ig];
1866 // Next lines: (-i)^2 * ( a + ib ) = - ( a + ib )
1867 const double tc = -c[ig]; // -cosgr[ia][ig]
1868 const double ts = -s[ig]; // -singr[ia][ig]
1869 a4[2*ig] = d4 * tc;
1870 a4[2*ig+1] = d4 * ts;
1871 a5[2*ig] = d5 * tc;
1872 a5[2*ig+1] = d5 * ts;
1873 a6[2*ig] = d6 * tc;
1874 a6[2*ig+1] = d6 * ts;
1875 a7[2*ig] = d7 * tc;
1876 a7[2*ig+1] = d7 * ts;
1877 a8[2*ig] = d8 * tc;
1878 a8[2*ig+1] = d8 * ts;
1879 }
1880 }
1881 }
1882 else if ( l == 3 )
1883 {
1884 const int ipr09 = ipr;
1885 const int ipr10= ipr + 1;
1886 const int ipr11= ipr + 2;
1887 const int ipr12= ipr + 3;
1888 const int ipr13= ipr + 4;
1889 const int ipr14= ipr + 5;
1890 const int ipr15= ipr + 6;
1891 // dtwnl[is][ipr][iquad][ij][ngwl]
1892 // index = ig + ngwl * ( ij + 6 * ( iquad + nquad[is] * ipr ))
1893 const double *dt09 = &dtwnl[is][ngwl * ( ij + 6 * ipr09 )];
1894 const double *dt10 = &dtwnl[is][ngwl * ( ij + 6 * ipr10 )];
1895 const double *dt11 = &dtwnl[is][ngwl * ( ij + 6 * ipr11 )];
1896 const double *dt12 = &dtwnl[is][ngwl * ( ij + 6 * ipr12 )];
1897 const double *dt13 = &dtwnl[is][ngwl * ( ij + 6 * ipr13 )];
1898 const double *dt14 = &dtwnl[is][ngwl * ( ij + 6 * ipr14 )];
1899 const double *dt15 = &dtwnl[is][ngwl * ( ij + 6 * ipr15 )];
1900 for ( int ia = 0; ia < ia_block_size; ia++ )
1901 {
1902 double* a09 = &anl_loc[2 * ( ia + ipr09 * ia_block_size ) * ngwl];
1903 double* a10 = &anl_loc[2 * ( ia + ipr10 * ia_block_size ) * ngwl];
1904 double* a11 = &anl_loc[2 * ( ia + ipr11 * ia_block_size ) * ngwl];
1905 double* a12 = &anl_loc[2 * ( ia + ipr12 * ia_block_size ) * ngwl];
1906 double* a13 = &anl_loc[2 * ( ia + ipr13 * ia_block_size ) * ngwl];
1907 double* a14 = &anl_loc[2 * ( ia + ipr14 * ia_block_size ) * ngwl];
1908 double* a15 = &anl_loc[2 * ( ia + ipr15 * ia_block_size ) * ngwl];
1909 const double* c = &cgr[ia * ngwl];
1910 const double* s = &sgr[ia * ngwl];
1911
1912 for ( int ig = 0; ig < ngwl; ig++ )
1913 {
1914 const double d09 = dt09[ig];
1915 const double d10 = dt10[ig];
1916 const double d11 = dt11[ig];
1917 const double d12 = dt12[ig];
1918 const double d13 = dt13[ig];
1919 const double d14 = dt14[ig];
1920 const double d15 = dt15[ig];
1921 // Next lines: (-i)^2 * ( a + ib ) = - ( a + ib )
1922 const double tc = c[ig]; // cosgr[ia][ig]
1923 const double ts = -s[ig]; // -singr[ia][ig]
1924 a09[2 * ig] = d09 * ts;
1925 a09[2 * ig + 1] = d09 * tc;
1926 a10[2 * ig] = d10 * ts;
1927 a10[2 * ig + 1] = d10 * tc;
1928 a11[2 * ig] = d11 * ts;
1929 a11[2 * ig + 1] = d11 * tc;
1930 a12[2 * ig] = d12 * ts;
1931 a12[2 * ig + 1] = d12 * tc;
1932 a13[2 * ig] = d13 * ts;
1933 a13[2 * ig + 1] = d13 * tc;
1934 a14[2 * ig] = d14 * ts;
1935 a14[2 * ig + 1] = d14 * tc;
1936 a15[2 * ig] = d15 * ts;
1937 a15[2 * ig + 1] = d15 * tc;
1938 }
1939 }
1940 }
1941 else
1942 {
1943 assert(false);
1944 } // l
1945 ipr += 2*l+1;
1946 } // while ipr
1947
1948 // array anl_loc is complete
1949
1950 // compute dfnl[npra][nstloc] = anl^T * c
1951 if ( basis_.real() )
1952 {
1953 // factor 2.0 in next line is: counting G, -G
1954 // Note: no need to correct for double counting of the
1955 // G=0 component which is always zero
1956 double two=2.0;
1957 char ct='t';
1958 const int twongwl = 2 * ngwl;
1959 const int nprnaloc = ia_block_size * npr[is];
1960 int c_lda = 2*sd_.c().mloc();
1961 const complex<double>* c = sd_.c().cvalptr();
1962 dgemm(&ct,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&twongwl,&two,
1963 &anl_loc[0],(int*)&twongwl, (double*)c,(int*)&c_lda,
1964 &zero,&dfnl_loc[0],(int*)&nprnaloc);
1965 }
1966 else
1967 {
1968 complex<double> cone=1.0;
1969 complex<double> czero=0.0;
1970 char cc='c';
1971 const int nprnaloc = ia_block_size * npr[is];
1972 int c_lda = sd_.c().mloc();
1973 const complex<double>* c = sd_.c().cvalptr();
1974 zgemm(&cc,&cn,(int*)&nprnaloc,(int*)&nstloc,(int*)&ngwl,&cone,
1975 (complex<double>*)&anl_loc[0],(int*)&ngwl,
1976 (complex<double>*)c,(int*)&c_lda,
1977 &czero,(complex<double>*)&dfnl_loc[0],(int*)&nprnaloc);
1978 }
1979
1980 // accumulate non-local contributions to sigma_ij
1981 if ( basis_.real() )
1982 {
1983 for ( int n = 0; n < nstloc; n++ )
1984 {
1985 // Factor 2.0 in next line from derivative of |Fnl|^2
1986 const double facn = 2.0 * occ[n + nbase];
1987 for ( int ipra = 0; ipra < npr[is]*ia_block_size; ipra++ )
1988 {
1989 const int i = ipra + n * nprnaloc;
1990 tsum[ij] += facn * fnl_loc[i] * dfnl_loc[i];
1991 }
1992 }
1993 }
1994 else
1995 {
1996 for ( int n = 0; n < nstloc; n++ )
1997 {
1998 // Factor 2.0 in next line from derivative of |Fnl|^2
1999 const double facn = 2.0 * occ[n + nbase];
2000 for ( int ipra = 0; ipra < npr[is]*ia_block_size; ipra++ )
2001 {
2002 const int i = ipra + n * nprnaloc;
2003 double f_re = fnl_loc[2*i];
2004 double f_im = fnl_loc[2*i+1];
2005 double df_re = dfnl_loc[2*i];
2006 double df_im = dfnl_loc[2*i+1];
2007 tsum[ij] += facn * ( f_re * df_re + f_im * df_im );
2008 }
2009 }
2010 }
2011 } // ij
2012 tmap["enl_sigma"].stop();
2013 } // compute_stress
2014
2015 } // ia_block
2016
2017 if ( compute_forces )
2018 {
2019 ctxt_.dsum(3*na[is],1,&tmpfion[0],3*na[is]);
2020 for ( int ia = 0; ia < na[is]; ia++ )
2021 {
2022 fion_enl[is][3*ia+0] += tmpfion[3*ia];
2023 fion_enl[is][3*ia+1] += tmpfion[3*ia+1];
2024 fion_enl[is][3*ia+2] += tmpfion[3*ia+2];
2025 }
2026 }
2027 } // npr[is]>0
2028 } // is
2029
2030 // reduction of enl across rows
2031 ctxt_.dsum('r',1,1,&enl,1);
2032
2033 if ( compute_stress )
2034 {
2035 ctxt_.dsum(6,1,&tsum[0],6);
2036 sigma_enl[0] = ( enl + tsum[0] ) * omega_inv;
2037 sigma_enl[1] = ( enl + tsum[1] ) * omega_inv;
2038 sigma_enl[2] = ( enl + tsum[2] ) * omega_inv;
2039 sigma_enl[3] = + tsum[3] * omega_inv;
2040 sigma_enl[4] = + tsum[4] * omega_inv;
2041 sigma_enl[5] = + tsum[5] * omega_inv;
2042 }
2043
2044 return enl;
2045 }
2046