1 //
2 // transform_123inds.cc
3 //
4 // Copyright (C) 2001 Edward Valeev
5 //
6 // Author: Edward Valeev <edward.valeev@chemistry.gatech.edu>
7 // Maintainer: EV
8 //
9 // This file is part of the SC Toolkit.
10 //
11 // The SC Toolkit is free software; you can redistribute it and/or modify
12 // it under the terms of the GNU Library General Public License as published by
13 // the Free Software Foundation; either version 2, or (at your option)
14 // any later version.
15 //
16 // The SC Toolkit is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 // GNU Library General Public License for more details.
20 //
21 // You should have received a copy of the GNU Library General Public License
22 // along with the SC Toolkit; see the file COPYING.LIB.  If not, write to
23 // the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24 //
25 // The U.S. Government is granted a limited license as per AL 91-7.
26 //
27 
28 #ifdef __GNUC__
29 #pragma implementation
30 #endif
31 
32 #include <cmath>
33 #include <ostream>
34 #include <string>
35 #include <stdexcept>
36 
37 #include <util/misc/formio.h>
38 #include <util/misc/timer.h>
39 #include <chemistry/qc/basis/gpetite.h>
40 #include <chemistry/qc/mbpt/bzerofast.h>
41 #include <chemistry/qc/mbpt/util.h>
42 #include <chemistry/qc/basis/distshpair.h>
43 #include <chemistry/qc/mbptr12/blas.h>
44 #include <chemistry/qc/mbptr12/transform_123inds.h>
45 
46 using namespace std;
47 using namespace sc;
48 
49 #define PRINT1Q 0
50 #define PRINT2Q 0
51 #define PRINT_NUM_TE_TYPES 1
52 
53 // The FAST_BUT_WRONG flags is useful for exercising the communications
54 // layer.  It causes the first and second quarter transformation to be
55 // omitted, but all communication is still performed.  This permits
56 // problems in communications libraries to be more quickly identified.
57 #define FAST_BUT_WRONG 0
58 
TwoBodyMOIntsTransform_123Inds(const Ref<TwoBodyMOIntsTransform> & tform,int mythread,int nthread,const Ref<ThreadLock> & lock,const Ref<TwoBodyInt> & tbint,double tol,int debug)59 TwoBodyMOIntsTransform_123Inds::TwoBodyMOIntsTransform_123Inds(
60   const Ref<TwoBodyMOIntsTransform>& tform, int mythread, int nthread,
61   const Ref<ThreadLock>& lock, const Ref<TwoBodyInt> &tbint, double tol, int debug) :
62   tform_(tform), mythread_(mythread), nthread_(nthread), lock_(lock), tbint_(tbint),
63   tol_(tol), debug_(debug)
64 {
65   timer_ = new RegionTimer();
66   aoint_computed_ = 0;
67   ni_ = tform_->batchsize();
68   i_offset_ = 0;
69 }
70 
~TwoBodyMOIntsTransform_123Inds()71 TwoBodyMOIntsTransform_123Inds::~TwoBodyMOIntsTransform_123Inds()
72 {
73   timer_ = 0;
74 }
75 
76 /*
77   Distribute work by SR
78 
79    for all PQ
80     compute unique (PQ|RS)
81     transform to (RS|IM) where M are all AOs for basis set 2
82    end PQ
83 
84    use BLAS to transform each rsIM to rsIX
85    transform RSIX to IJXS and accumulate to the tasks that holds respective ij-pairs.
86 
87   end SR
88 */
89 
90 void
run()91 TwoBodyMOIntsTransform_123Inds::run()
92 {
93   Ref<MemoryGrp> mem = tform_->mem();
94   Ref<MessageGrp> msg = tform_->msg();
95   Ref<R12IntsAcc> ints_acc = tform_->ints_acc();
96   const int me = msg->me();
97   const int nproc = msg->n();
98   Ref<MOIndexSpace> space1 = tform_->space1();
99   Ref<MOIndexSpace> space2 = tform_->space2();
100   Ref<MOIndexSpace> space3 = tform_->space3();
101   Ref<MOIndexSpace> space4 = tform_->space4();
102 
103   Ref<GaussianBasisSet> bs1 = space1->basis();
104   Ref<GaussianBasisSet> bs2 = space2->basis();
105   Ref<GaussianBasisSet> bs3 = space3->basis();
106   Ref<GaussianBasisSet> bs4 = space4->basis();
107   const bool bs1_eq_bs2 = (bs1 == bs2);
108   const bool bs3_eq_bs4 = (bs3 == bs4);
109 
110   const bool dynamic = tform_->dynamic();
111   const double print_percent = tform_->print_percent();
112 
113   const int ni = ni_;
114   const int rank1 = space1->rank();
115   const int rank2 = space2->rank();
116   const int rank3 = space3->rank();
117   const int nfuncmax1 = bs1->max_nfunction_in_shell();
118   const int nfuncmax2 = bs2->max_nfunction_in_shell();
119   const int nfuncmax3 = bs3->max_nfunction_in_shell();
120   const int nfuncmax4 = bs4->max_nfunction_in_shell();
121   const int nsh1 = bs1->nshell();
122   const int nsh2 = bs2->nshell();
123   const int nsh3 = bs3->nshell();
124   const int nsh4 = bs4->nshell();
125   const int nbasis1 = bs1->nbasis();
126   const int nbasis2 = bs2->nbasis();
127   const int nbasis3 = bs3->nbasis();
128   const int nbasis4 = bs4->nbasis();
129   double dtol = pow(2.0,tol_);
130   const size_t memgrp_blksize = tform_->memgrp_blksize()/sizeof(double);
131 
132   double** vector1 = new double*[nbasis1];
133   double** vector2 = new double*[nbasis2];
134   double** vector3 = new double*[nbasis3];
135   vector1[0] = new double[rank1*nbasis1];
136   vector2[0] = new double[rank2*nbasis2];
137   vector3[0] = new double[rank3*nbasis3];
138   for(int i=1; i<nbasis1; i++) vector1[i] = vector1[i-1] + rank1;
139   for(int i=1; i<nbasis2; i++) vector2[i] = vector2[i-1] + rank2;
140   for(int i=1; i<nbasis3; i++) vector3[i] = vector3[i-1] + rank3;
141   space1->coefs().convert(vector1);
142   space2->coefs().convert(vector2);
143   space3->coefs().convert(vector3);
144 
145   /*-------------------------------------------------------------
146     Find integrals buffers to 1/r12, r12, and [r12,T1] integrals
147    -------------------------------------------------------------*/
148   const int num_te_types = tform_->num_te_types();
149   const double *intbuf[TwoBodyInt::num_tbint_types];
150   intbuf[TwoBodyInt::eri] = tbint_->buffer(TwoBodyInt::eri);
151   intbuf[TwoBodyInt::r12] = tbint_->buffer(TwoBodyInt::r12);
152   intbuf[TwoBodyInt::r12t1] = tbint_->buffer(TwoBodyInt::r12t1);
153   intbuf[TwoBodyInt::r12t2] = tbint_->buffer(TwoBodyInt::r12t2);
154 
155   /*-----------------------------------------------------
156     Allocate buffers for partially transformed integrals
157    -----------------------------------------------------*/
158   double *ijsx_contrib;  // local contributions to integral_ijsx
159   double *ijrx_contrib;  // local contributions to integral_ijrx
160   double **rsiq_ints = new double*[num_te_types];     // quarter-transformed integrals for each RS pair
161   double **rsix_ints = new double*[num_te_types];     // 2 quarter-transformed integrals for each RS pair
162   for(int te_type=0;te_type<num_te_types;te_type++) {
163     rsiq_ints[te_type] = new double[ni*nbasis2*nfuncmax3*nfuncmax4];
164     rsix_ints[te_type] = new double[ni*rank2*nfuncmax3*nfuncmax4];
165   }
166   ijsx_contrib  = mem->malloc_local_double(rank2*nfuncmax4);
167   if (bs3_eq_bs4)
168     ijrx_contrib  = mem->malloc_local_double(rank2*nfuncmax4);
169   else
170     ijrx_contrib  = NULL;
171 
172   /*-----------------------------
173     Initialize work distribution
174    -----------------------------*/
175   DistShellPair shellpairs(msg,nthread_,mythread_,lock_,bs4,bs3,dynamic,
176                            tform_->shell_pair_data());
177   shellpairs.set_debug(debug_);
178   if (debug_) shellpairs.set_print_percent(print_percent/10.0);
179   else shellpairs.set_print_percent(print_percent);
180   int work_per_thread = bs3_eq_bs4 ?
181     ((nsh3*(nsh3+1))/2)/(nproc*nthread_) :
182     (nsh3*nsh4)/(nproc*nthread_) ;
183   int print_interval = work_per_thread/100;
184   int time_interval = work_per_thread/10;
185   int print_index = 0;
186   if (print_interval == 0) print_interval = 1;
187   if (time_interval == 0) time_interval = 1;
188   if (work_per_thread == 0) work_per_thread = 1;
189 
190   if (debug_) {
191     lock_->lock();
192     ExEnv::outn() << scprintf("%d:%d: starting get_task loop",me,mythread_) << endl;
193     lock_->unlock();
194   }
195 
196   Ref<GenPetite4> p4list
197     = construct_gpetite(bs1,bs2,bs3,bs4);
198 
199 #if FAST_BUT_WRONG
200   for(int te_type=0;te_type<num_te_types;te_type++) {
201     bzerofast(rsiq_ints[te_type], ni*nbasis2*nfuncmax3*nfuncmax4);
202     bzerofast(rsij_ints[te_type], ni*rank2*nfuncmax3*nfuncmax4);
203   }
204   bzerofast(ijsx_contrib, rank2*nfuncmax4);
205   if (bs3_eq_bs4)
206     bzerofast(ijrx_contrib, rank2*nfuncmax4);
207 #endif
208 
209   int R = 0;
210   int S = 0;
211   int RS_count = 0;
212   while (shellpairs.get_task(S,R)) {
213     // if bs3_eq_bs4 then S >= R always (see sc::DistShellPair)
214     int nr = bs3->shell(R).nfunction();
215     int r_offset = bs3->shell_to_function(R);
216 
217     int ns = bs4->shell(S).nfunction();
218     int s_offset = bs4->shell_to_function(S);
219 
220     int nrs = nr*ns;
221 
222     if (debug_ > 1 && (print_index++)%print_interval == 0) {
223       lock_->lock();
224       ExEnv::outn() << scprintf("%d:%d: (PQ|%d %d) %d%%",
225 			       me,mythread_,R,S,(100*print_index)/work_per_thread)
226 		   << endl;
227       lock_->unlock();
228     }
229     if (debug_ > 1 && (print_index)%time_interval == 0) {
230       lock_->lock();
231       ExEnv::outn() << scprintf("timer for %d:%d:",me,mythread_) << endl;
232       timer_->print();
233       lock_->unlock();
234     }
235 
236 #if !FAST_BUT_WRONG
237     // Zero out 1 q.t. storage
238     for(int te_type=0;te_type<num_te_types;te_type++)
239       bzerofast(rsiq_ints[te_type], nrs*ni*nbasis2);
240 
241     for (int P=0; P<nsh1; P++) {
242       int np = bs1->shell(P).nfunction();
243       int p_offset = bs1->shell_to_function(P);
244 
245       int Qmax = (bs1_eq_bs2 ? P : nsh2-1);
246       for (int Q=0; Q<=Qmax; Q++) {
247 	int nq = bs2->shell(Q).nfunction();
248 	int q_offset = bs2->shell_to_function(Q);
249 
250 	// check if symmetry unique and compute degeneracy
251 	int deg = p4list->in_p4(P,Q,R,S);
252 	if (deg == 0)
253 	  continue;
254 	double symfac = (double) deg;
255 
256         if (tbint_->log2_shell_bound(P,Q,R,S) < tol_) {
257           continue;  // skip shell quartets less than tol
258 	}
259 
260         aoint_computed_++;
261 
262         timer_->enter("AO integrals");
263         tbint_->compute_shell(P,Q,R,S);
264         timer_->exit("AO integrals");
265 
266         timer_->enter("1. q.t.");
267 
268         // Begin first quarter transformation;
269         // generate (iq|rs) for i active
270         // if bs1_eq_bs2 then (ip|rs) are also generated
271         // store the integrals as rsiq
272 	for(int te_type=0; te_type<num_te_types; te_type++) {
273 	  const double *pqrs_ptr = intbuf[te_type];
274 
275 	  for (int bf1 = 0; bf1 < np; bf1++) {
276 	    int p = p_offset + bf1;
277             int qmax = (bs1_eq_bs2 && P == Q) ? bf1 : nq-1;
278 
279 	    for (int bf2 = 0; bf2 <= qmax; bf2++) {
280 	      int q = q_offset + bf2;
281 
282 	      for (int bf3 = 0; bf3 < nr; bf3++) {
283                 int smin = (bs3_eq_bs4 && R == S) ? bf3 : 0;
284                 pqrs_ptr += smin;
285 
286 		for (int bf4 = smin; bf4 <ns; bf4++) {
287 
288                   // Only transform integrals larger than the threshold
289 		  if (fabs(*pqrs_ptr) > dtol) {
290 
291 		    double* rsiq_ptr = &rsiq_ints[te_type][q + nbasis2*(0 + ni*(bf4 + ns*bf3))];
292 		    const double* c_pi = vector1[p] + i_offset_;
293 
294                     double* rsip_ptr;
295 		    const double* c_qi;
296                     if (bs1_eq_bs2) {
297 		      rsip_ptr = &rsiq_ints[te_type][p + nbasis2*(0 + ni*(bf4 + ns*bf3))];
298 		      c_qi = vector1[q] + i_offset_;
299                     }
300 
301 		    double rsiq_int_contrib = *pqrs_ptr;
302 		    // multiply each integral by its symmetry degeneracy factor
303 		    rsiq_int_contrib *= symfac;
304 
305                     if (bs1_eq_bs2) {
306 
307                       double rsip_int_contrib = rsiq_int_contrib;
308                       if (te_type == TwoBodyInt::r12t1)
309                       rsip_int_contrib = -1.0*rsiq_int_contrib;
310 
311                       if (p == q) {
312                         for (int i=0; i<ni; i++) {
313                           *rsiq_ptr += *c_pi++ * rsiq_int_contrib;
314                           rsiq_ptr += nbasis2;
315                         }
316                       }
317                       else {
318                         // p != q
319                         for (int i=0; i<ni; i++) {
320                           *rsip_ptr += *c_qi++ * rsip_int_contrib;
321                           rsip_ptr += nbasis2;
322                           *rsiq_ptr += *c_pi++ * rsiq_int_contrib;
323                           rsiq_ptr += nbasis2;
324                         }
325                       }
326 
327                     }
328                     else {
329 
330                       for (int i=0; i<ni; i++) {
331                         *rsiq_ptr += *c_pi++ * rsiq_int_contrib;
332                         rsiq_ptr += nbasis2;
333                       }
334 
335                     } // endif bs1_eq_bs2
336                   }   // endif dtol
337 
338 		  pqrs_ptr++;
339                 } // exit bf4 loop
340               }   // exit bf3 loop
341             }     // exit bf2 loop
342             pqrs_ptr += (nq - qmax - 1) * nrs;
343           }       // exit bf1 loop
344 	  // end of first quarter transformation
345 	}
346 	timer_->exit("1. q.t.");
347 
348         }           // exit P loop
349       }             // exit Q loop
350 #endif // !FAST_BUT_WRONG
351 
352 #if PRINT1Q
353     {
354       if ( me == 0 ) {
355         lock_->lock();
356         string filename = tform_->type() + "." + tform_->name() + ".1q.dat";
357         ios_base::openmode mode = ios_base::app;
358         if (RS_count == 0)
359           mode = ios_base::trunc;
360         ofstream ints_file(filename.c_str(),mode);
361 
362         for(int te_type=0; te_type<PRINT_NUM_TE_TYPES; te_type++) {
363           for (int i = 0; i<ni; i++) {
364             for (int q = 0; q<nbasis2; q++) {
365               for (int r = 0; r<nr; r++) {
366                 int rr = r+r_offset;
367                 for (int s = 0; s<ns; s++) {
368                   int ss = s+s_offset;
369                   double value = rsiq_ints[te_type][q+nbasis2*(i+ni*(s+ns*r))];
370                   ints_file << scprintf("1Q: type = %d (%d %d|%d %d) = %12.8f\n",
371                                         te_type,i+i_offset_,q,rr,ss,value);
372                 }
373               }
374             }
375           }
376         }
377         ints_file.close();
378         lock_->unlock();
379       }
380     }
381 #endif
382 
383     const int nix = ni*rank2;
384     const int niq = ni*nbasis2;
385 
386     timer_->enter("2. q.t.");
387     // Begin second quarter transformation;
388     // generate (ix|rs) stored as rsix
389 
390     for(int te_type=0; te_type<num_te_types; te_type++) {
391       const double *rsiq_ptr = rsiq_ints[te_type];
392       double *rsix_ptr = rsix_ints[te_type];
393 
394       for (int bf3 = 0; bf3 < nr; bf3++) {
395         int smin = (bs3_eq_bs4 && R == S) ? bf3 : 0;
396         rsiq_ptr += smin*niq;
397         rsix_ptr += smin*nix;
398 
399         for (int bf4 = smin; bf4 <ns; bf4++) {
400 
401           // second quarter transform
402           // ix = iq * qx
403           const char notransp = 'n';
404           const double one = 1.0;
405           const double zero = 0.0;
406           F77_DGEMM(&notransp,&notransp,&rank2,&ni,&nbasis2,&one,vector2[0],&rank2,
407                     rsiq_ptr,&nbasis2,&zero,rsix_ptr,&rank2);
408 
409           rsiq_ptr += niq;
410           rsix_ptr += nix;
411 
412         }
413       }
414     }
415     timer_->exit("2. q.t.");
416 
417 #if PRINT2Q
418     {
419       if ( me == 0 ) {
420         lock_->lock();
421         string filename = tform_->type() + "." + tform_->name() + ".2q.dat";
422         ios_base::openmode mode = ios_base::app;
423         if (RS_count == 0)
424           mode = ios_base::trunc;
425         ofstream ints_file(filename.c_str(),mode);
426 
427         for(int te_type=0; te_type<PRINT_NUM_TE_TYPES; te_type++) {
428           for (int i = 0; i<ni; i++) {
429             for (int x = 0; x<rank2; x++) {
430               for (int r = 0; r<nr; r++) {
431                 int rr = r+r_offset;
432                 for (int s = 0; s<ns; s++) {
433                   int ss = s+s_offset;
434                   double value = rsix_ints[te_type][x+rank2*(i+ni*(s+ns*r))];
435                   ints_file << scprintf("2Q: type = %d (%d %d|%d %d) = %12.8f\n",
436                                         te_type,i+i_offset_,x,rr,ss,value);
437                 }
438               }
439             }
440           }
441         }
442         ints_file.close();
443         lock_->unlock();
444       }
445     }
446 #endif
447 
448     timer_->enter("3. q.t.");
449     // Begin third quarter transformation;
450     // generate (ix|js) stored as ijsx (also generate (ix|jr), if needed)
451 
452     for(int te_type=0; te_type<num_te_types; te_type++) {
453       for (int i=0; i<ni; i++) {
454         for (int j=0; j<rank3; j++) {
455 
456 #if !FAST_BUT_WRONG
457           bzerofast(ijsx_contrib, rank2*ns);
458           if (bs3_eq_bs4)
459             bzerofast(ijrx_contrib, rank2*nr);
460 
461           int ij_proc =  (i*rank3 + j)%nproc;
462           int ij_index = (i*rank3 + j)/nproc;
463           const size_t ijsx_start = (size_t)(num_te_types*ij_index + te_type) * memgrp_blksize;
464           const double *rsix_ptr = rsix_ints[te_type] + i*rank2;
465 
466           if (bs3_eq_bs4) {
467 
468             for (int bf3 = 0; bf3 < nr; bf3++) {
469               int r = r_offset + bf3;
470               int smin = (bs3_eq_bs4 && R == S) ? bf3 : 0;
471               rsix_ptr += smin*nix;
472 
473               for (int bf4 = smin; bf4 <ns; bf4++, rsix_ptr+=nix) {
474                 int s = s_offset + bf4;
475 
476                 // third quarter transform
477                 // rs = js
478                 // rs = jr
479 
480                 double* ijsx_ptr = ijsx_contrib + bf4*rank2;
481                 double* ijrx_ptr = ijrx_contrib + bf3*rank2;
482                 const double* i_ptr = rsix_ptr;
483 
484                 const double c_rj = vector3[r][j];
485                 const double c_sj = vector3[s][j];
486 
487                 if (r != s) {
488                   for (int x=0; x<rank2; x++) {
489 
490                     double value = *i_ptr++;
491                     *ijsx_ptr++ += c_rj * value;
492                     *ijrx_ptr++ += c_sj * value;
493 
494                   }
495                 }
496                 else {
497                   for (int x=0; x<rank2; x++) {
498 
499                     double value = *i_ptr++;
500                     *ijsx_ptr++ += c_rj * value;
501 
502                   }
503                 }
504               }
505             }
506           }
507           else {
508 
509             for (int bf3 = 0; bf3 < nr; bf3++) {
510               int r= r_offset + bf3;
511               for (int bf4 = 0; bf4 <ns; bf4++, rsix_ptr+=nix) {
512 
513                 // third quarter transform
514                 // rs = js
515                 double* ijsx_ptr = ijsx_contrib + bf4*rank2;
516                 const double* i_ptr = rsix_ptr;
517 
518                 const double c_rj = vector3[r][j];
519 
520                 for (int x=0; x<rank2; x++) {
521 
522                   double value = *i_ptr++;
523                   *ijsx_ptr++ += c_rj * value;
524 
525                 }
526               }
527             }
528           }
529 
530           // We now have contributions to ijxs (and ijxr) for one pair i,j,
531           // all x, and s in S (r in R); send ijxs (and ijxr) to the node
532           // (ij_proc) which is going to have this ij pair
533 #endif // !FAST_BUT_WRONG
534 
535           // Sum the ijxs_contrib to the appropriate place
536           size_t ij_offset = (size_t)rank2*s_offset + ijsx_start;
537           mem->sum_reduction_on_node(ijsx_contrib,
538                                      ij_offset, ns*rank2, ij_proc);
539 
540           if (bs3_eq_bs4) {
541             size_t ij_offset = (size_t)rank2*r_offset + ijsx_start;
542             mem->sum_reduction_on_node(ijrx_contrib,
543                                        ij_offset, nr*rank2, ij_proc);
544           }
545 
546         } // endif j
547       } // endif i
548     }  // endif te_type
549     timer_->exit("3. q.t.");
550 
551     ++RS_count;
552   }         // exit while get_task
553 
554   if (debug_) {
555     lock_->lock();
556     ExEnv::outn() << scprintf("%d:%d: done with get_task loop",me,mythread_) << endl;
557     lock_->unlock();
558   }
559 
560   for(int te_type=0; te_type<num_te_types; te_type++) {
561     delete[] rsiq_ints[te_type];
562     delete[] rsix_ints[te_type];
563   }
564   delete[] rsiq_ints;
565   delete[] rsix_ints;
566   mem->free_local_double(ijsx_contrib);
567   if (bs3_eq_bs4)
568     mem->free_local_double(ijrx_contrib);
569   delete[] vector1[0]; delete[] vector1;
570   delete[] vector2[0]; delete[] vector2;
571   delete[] vector3[0]; delete[] vector3;
572 }
573 
574 ////////////////////////////////////////////////////////////////////////////
575 
576 // Local Variables:
577 // mode: c++
578 // c-file-style: "CLJ-CONDENSED"
579 // End:
580