1c
2c  This routine returns the Coulomb and exchange integral
3c  operator matrices for the range of MO-indices as mo_indx_hi, mo_indx_lo
4c  The g_coul, g_exch global arrays are ordered as
5c
6c
7c
8c               ij
9c  (ij|ab) = ( J  )  = g_coul[ ij : (a-1)*N2 + b ] = g_coul [ ij : (b-1)*N2 + a ]
10c                  ab
11c
12c               ij
13c  (ia|jb) = ( K  )  = g_exch[ ij : (a-1)*N2 + b ]
14c                  ab
15c
16c
17c
18c
19c
20c Note: if number MO-indices in the 2nd pair is equal to Nbf
21c then the half-transformed arrays will be aliased to the
22c passed globals, g_coul and g_exch.
23c Otherwise, temporary storage for the half-transformed
24c integrals will be allocated and destroyed.
25c
26c
27c
28       subroutine moints_build_JK( rtdb, basis,
29     $                             mo1_lo, mo1_hi,
30     $                             mo2_lo, mo2_hi,
31     $                             g_movecs,
32     $                             g_coul, ocoul,
33     $                             g_exch, oexch,
34     $                             oprintstats,
35     $                             chunk_factor )
36       implicit none
37#include "global.fh"
38#include "tcgmsg.fh"
39#include "bas.fh"
40#include "rtdb.fh"
41#include "context.fh"
42#include "mafdecls.fh"
43c
44c
45c
46       integer rtdb, basis               ! [input] Handles to environments
47       integer mo1_lo, mo1_hi            ! [input] Range of MO indices to transform (1st pair)
48       integer mo2_lo, mo2_hi            ! [input] Range of MO indices to transform (2nd pair)
49       integer g_movecs                  ! [input] MO vectors
50       integer g_coul                    ! [output] Coulomb operator
51       integer g_exch                    ! [output] exchange operator
52       logical ocoul, oexch              ! [input] Type selection
53       logical oprintstats               ! [input] Print flag (ignored)
54       double precision chunk_factor
55c
56c Local
57c
58       integer nbf,noper,my_id,n2,noper_t,n2n2,jlo,jhi
59#ifdef OLD_ALGORITHM
60       integer g_jhalf, g_khalf
61#endif
62       logical status, oalias
63       double precision ttotal
64c
65       integer ga_create_moblocked
66       logical ga_check_moblocked
67       external ga_create_moblocked,ga_check_moblocked
68c
69c
70c
71       if ((.not.(ocoul)).and.(.not.(oexch))) return
72       if (.not. context_push('moints_build_JK'))
73     $    call errquit('moints_build_JK: context_push failed',0)
74       ttotal = tcgtime()
75       noper = mo1_hi - mo1_lo + 1
76       noper_t = (noper*(noper+1))/2
77       n2 = mo2_hi - mo2_lo + 1
78       n2n2 = n2*n2
79       if (.not.bas_numbf(basis,nbf))
80     $     call errquit('moints_build_JK: cannot get basis info',0)
81       oalias = ((mo2_hi.eq.nbf).and.(mo2_lo.eq.1))
82       my_id = ga_nodeid()
83c
84c Check dimension and distribution of passed global arrays
85c
86       if ((ocoul).and.(.not.
87     $      ga_check_moblocked(g_coul,noper,n2,jlo,jhi)))
88     $      call errquit(
89     $      'moint_build_JK: wrong dimensions/distrib for Coul',my_id)
90       if ((oexch).and.(.not.
91     $      ga_check_moblocked(g_exch,noper,n2,jlo,jhi)))
92     $      call errquit(
93     $     'moint_build_JK: wrong dimensions/distrib. for exchange',0)
94#ifndef OLD_ALGORITHM
95       call moints_build(basis,
96     $                   mo1_lo, mo1_hi,
97     $                   mo2_lo, mo2_hi,
98     $                   g_movecs,
99     $                   g_coul, ocoul,
100     $                   g_exch, oexch,
101     $                   oprintstats,
102     $                   chunk_factor)
103#else
104c
105c Create temporary globals or alias if possible
106c
107       if (oalias) then
108         if (ocoul) g_jhalf = g_coul
109         if (oexch) g_khalf = g_exch
110       else
111         if (ocoul) g_jhalf =
112     $        ga_create_moblocked(noper,nbf,'J half-transformed')
113         if (oexch) g_khalf =
114     $        ga_create_moblocked(noper,nbf,'K half-transformed')
115       endif
116c
117c 1st half transformation
118c
119#ifdef WHITESIDE
120       call moints_1st_half_w(basis,mo1_lo,mo1_hi,g_movecs,
121     $                        g_jhalf,ocoul,g_khalf,oexch)
122#else
123       call moints_1st_half(basis,mo1_lo,mo1_hi,g_movecs,
124     $                      g_jhalf,ocoul,g_khalf,oexch)
125#endif
126c
127c 2nd half transformation
128c
129       call moints_2nd_half(noper,nbf,mo2_lo,mo2_hi,g_movecs,
130     $                      g_jhalf,g_coul,ocoul,
131     $                      g_khalf,g_exch,oexch)
132c
133c Clean up
134c
135       if (.not.(oalias)) then
136         status = .true.
137         if (ocoul) status = status.and.ga_destroy(g_jhalf)
138         if (oexch) status = status.and.ga_destroy(g_khalf)
139         if (.not.(status)) call
140     $      errquit('moints_build_JK: cannot free temp globals',0)
141       endif
142#endif
143       ttotal = tcgtime() - ttotal
144       if (ga_nodeid().eq.0) then
145         write(6,971) ttotal
146 971     format(/,10x,'Total transformation time:',5x,f10.2)
147         call util_flush(6)
148       endif
149
150       if (.not.context_pop('moints_build_JK'))
151     $      call errquit('moints: context_pop failed',0)
152       return
153       end
154
155
156
157
158
159
160
161
162
163c---------------------------------------------------------------------------------
164c---------------------------------------------------------------------------------
165c---------------------------------------------------------------------------------
166c---------------------------------------------------------------------------------
167c---------------------------------------------------------------------------------
168c
169c
170c   atw - 7 Sep 94
171c   Original crappy algorithm
172c
173c
174c---------------------------------------------------------------------------------
175c---------------------------------------------------------------------------------
176c---------------------------------------------------------------------------------
177c---------------------------------------------------------------------------------
178c---------------------------------------------------------------------------------
179
180
181#ifdef OLD_ALGORITHM
182#ifndef WHITESIDE
183       subroutine moints_1st_half(basis,mo_indx_lo,mo_indx_hi,
184     $                            g_movecs,
185     $                            g_jhalf,ocoul,
186     $                            g_khalf,oexch)
187       implicit none
188#include "tcgmsg.fh"
189#include "global.fh"
190#include "mafdecls.fh"
191#include "bas.fh"
192#include "schwarz.fh"
193       integer MOINTS_SCHWARZ_STAT
194       parameter(MOINTS_SCHWARZ_STAT=1921)
195c
196c
197       integer basis                          ! Basis handle
198       integer mo_indx_lo, mo_indx_hi         ! 1st pair of MO indices
199       integer g_movecs                       ! MO coefficients
200       integer g_jhalf                        ! Half-transformed Coulomb operator
201       integer g_khalf                        ! Half-transformed exchange operator
202       logical ocoul,oexch                    ! Type selection
203c
204c Local
205c
206       integer noper,nbf
207       integer g_mocols
208       integer g_j1idx, g_k1idx
209       integer g_j2idx, g_k2idx
210       integer l_jsssi,k_jsssi,l_tsssi,k_tsssi,l_ksssi,k_ksssi
211       integer l_jssni,k_jssni
212       integer l_iscr,k_iscr,l_eri,k_eri,l_erit,k_erit
213       integer l_mov,k_mov
214       integer mem2,max2e,sssi_block,ssni_block,maxbf_shell
215       integer nsh,ish,jsh,ksh,lsh
216       integer ish_bflo,ish_bfhi,jsh_bflo,jsh_bfhi
217       integer ksh_bflo,ksh_bfhi,lsh_bflo,lsh_bfhi
218       integer ileng,jleng,kleng
219       integer ncol_tij,row_dist,col_dist,nbf_sh_oper
220       integer num_nodes, jk, next
221       double precision tol2e,dschw_done,schw_ratio
222       double precision tt,t1qtr,t2qtr,thalf,txyz,tint
223       logical status
224       data tol2e/1.d-10/
225c
226c
227c Get general info
228c
229       call ga_sync()
230       thalf = tcgtime()
231       status = bas_numbf(basis,nbf)
232       status = status.and.bas_numcont(basis,nsh)
233       status = status.and.bas_nbf_cn_max(basis,maxbf_shell)
234       if (.not.status) call errquit('moints: cannot get basis info',0)
235       noper = mo_indx_hi - mo_indx_lo + 1
236       if (ga_nodeid().eq.0) then
237         write(6,927) nsh,maxbf_shell
238 927     format(10x,'Number of shells:',19x,i5,/,
239     $          10x,'Maximum shell length:',15x,i5)
240         call util_flush(6)
241       endif
242c
243c Ints allocation
244c
245       call int_mem_2e4c(max2e, mem2)
246       status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri)
247       status = status.and.ma_push_get(MT_DBL,max2e,'moints: buf2',
248     $          l_erit,k_erit)
249       status = status.and.ma_push_get(MT_DBL, mem2,
250     $          'moints: scr', l_iscr, k_iscr)
251c
252c Allocate and get local & global MO coefficients !!!! FIX with ga_patch routines !!!!
253c
254       status = status.and.ma_push_get(MT_DBL,(noper*nbf),
255     $          'movecs cols',l_mov,k_mov)
256       status = status.and.ga_create(MT_DBL,nbf,noper,'MO cols',
257     $          nbf,1,g_mocols)
258       if (.not.(status)) call errquit('failed allocate MO vectors',0)
259       call ga_get(g_movecs,1,nbf,mo_indx_lo,mo_indx_hi,
260     $             dbl_mb(k_mov),nbf)
261       call ga_copy_patch('n',g_movecs,1,nbf,mo_indx_lo,mo_indx_hi,
262     $                        g_mocols,1,nbf,1,noper)
263c
264c Allocate local shell-shell-shell-active blocks
265c
266       sssi_block = noper*maxbf_shell*maxbf_shell*maxbf_shell
267       ssni_block = noper*maxbf_shell*maxbf_shell*nbf
268       status = status.and.ma_push_get(MT_DBL,sssi_block,
269     $          'J sssi block',l_jsssi,k_jsssi)
270       status = status.and.ma_push_get(MT_DBL,sssi_block,
271     $          'J Tsssi block',l_tsssi,k_tsssi)
272       status = status.and.ma_push_get(MT_DBL,sssi_block,
273     $          'K sssi block',l_ksssi,k_ksssi)
274       status = status.and.ma_push_get(MT_DBL,ssni_block,
275     $          'J ssni block',l_jssni,k_jssni)
276       if (.not.status) call
277     $          errquit('moints: cannot allocate local memory',0)
278c
279c Global arrays for accumulating intermediates
280c
281       nbf_sh_oper = nbf*maxbf_shell*noper
282       col_dist = 1
283       row_dist = 1
284       if (ocoul) then
285         status = status.and.ga_create(MT_DBL,nbf,nbf_sh_oper,
286     $        'J 1indx',row_dist,col_dist,g_j1idx)
287         status = status.and.ga_create(MT_DBL,nbf_sh_oper,noper,
288     $        'J 2indx',nbf_sh_oper,col_dist,g_j2idx)
289       endif
290       if (oexch) then
291         status = status.and.ga_create(MT_DBL,nbf,nbf_sh_oper,
292     $        'K 1indx',row_dist,col_dist,g_k1idx)
293         status = status.and.ga_create(MT_DBL,nbf_sh_oper,noper,
294     $        'K 2indx',nbf_sh_oper,col_dist,g_k2idx)
295       endif
296       if (.not.(status)) call
297     $      errquit('moints_1st_half: cannot allocate global memory',0)
298c
299c Start 4-fold shell loop
300c
301       t1qtr = 0.d0
302       t2qtr = 0.d0
303       tint = 0.d0
304       dschw_done = 0.d0
305       num_nodes = ga_nnodes()
306       do ish=1,nsh
307         if (.not. bas_cn2bfr(basis,ish,ish_bflo,ish_bfhi))
308     $        call errquit('mo_ints: bas_cn2bfr',ish)
309         ileng = ish_bfhi - ish_bflo + 1
310         if (ocoul) call ga_zero(g_j1idx)
311         if (oexch) call ga_zero(g_k1idx)
312         call ga_sync()
313         tt = tcgtime()
314         jk = 0
315         next = nxtval(num_nodes)
316         do jsh=1,nsh
317           if (.not. bas_cn2bfr(basis,jsh,jsh_bflo,jsh_bfhi))
318     $          call errquit('mo_ints: bas_cn2bfr',jsh)
319           jleng = jsh_bfhi - jsh_bflo + 1
320           if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then
321             do ksh=1,nsh
322c
323c Parallelize over JK index here.
324c
325               if (jk.eq.next) then
326                 if (.not. bas_cn2bfr(basis,ksh,ksh_bflo,ksh_bfhi))
327     $                call errquit('mo_ints: bas_cn2bfr',ksh)
328                 kleng = ksh_bfhi - ksh_bflo + 1
329                 call dfill(sssi_block,0.d0,dbl_mb(k_jsssi),1)
330                 call dfill(ssni_block,0.d0,dbl_mb(k_jssni),1)
331#ifdef PERMUT_SYMM
332                 do lsh=1,ksh
333#else
334                 do lsh=1,nsh
335#endif
336                   if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh)
337     $               .ge.tol2e) then
338                     call dfill(sssi_block,0.d0,dbl_mb(k_tsssi),1)
339                     dschw_done = dschw_done + 1.d0
340                     if (.not. bas_cn2bfr(basis,lsh,lsh_bflo,lsh_bfhi))
341     $                    call errquit('mo_ints: bas_cn2bfr',lsh)
342                     txyz = tcgtime()
343                     call int_2e4c(basis, ish, jsh, basis, ksh, lsh,
344     $                    mem2, dbl_mb(k_iscr), max2e, dbl_mb(k_eri))
345                     tint = tint + tcgtime() - txyz
346#ifndef MOINTS_INTS_DUMMY
347c
348c Transform 1st index for range lsh_bflo<->lsh_bfhi
349c
350                     call moints_1idx_trf( ish_bflo, ish_bfhi,
351     $                                     jsh_bflo, jsh_bfhi,
352     $                                     ksh_bflo, ksh_bfhi,
353     $                                     lsh_bflo, lsh_bfhi,
354     $                                     dbl_mb(k_eri),
355     $                                     dbl_mb(k_mov), nbf,
356     $                                     mo_indx_lo, mo_indx_hi,
357     $                                     dbl_mb(k_jsssi) )
358#ifdef PERMUT_SYMM
359c
360c Transform 1st index for range ksh_bflo<->ksh_bfhi
361c
362                     if (lsh.ne.ksh) then
363                       call eri_tr( ish_bflo, ish_bfhi,
364     $                              jsh_bflo, jsh_bfhi,
365     $                              ksh_bflo, ksh_bfhi,
366     $                              lsh_bflo, lsh_bfhi,
367     $                              dbl_mb(k_eri),
368     $                              dbl_mb(k_erit))
369
370                       call moints_1idx_trf( ish_bflo, ish_bfhi,
371     $                                       jsh_bflo, jsh_bfhi,
372     $                                       lsh_bflo, lsh_bfhi,
373     $                                       ksh_bflo, ksh_bfhi,
374     $                                       dbl_mb(k_erit),
375     $                                       dbl_mb(k_mov), nbf,
376     $                                       mo_indx_lo, mo_indx_hi,
377     $                                       dbl_mb(k_tsssi) )
378
379                       call moints_1idx_move( nbf, ish_bflo, ish_bfhi,
380     $                                        jsh_bflo, jsh_bfhi,
381     $                                        lsh_bflo, lsh_bfhi,
382     $                                        mo_indx_lo, mo_indx_hi,
383     $                                        dbl_mb(k_tsssi),
384     $                                        dbl_mb(k_jssni) )
385                     endif
386#endif
387                   endif
388#endif
389                 enddo
390
391c
392c Push into global memory
393c
394#ifndef MOINTS_INTS_DUMMY
395                 if (oexch) then
396                   call moints_ktransp(kleng,jleng,ileng,noper,
397     $                                 dbl_mb(k_jsssi),dbl_mb(k_ksssi))
398                   call moints_push_1idx( nbf, ish_bflo, ish_bfhi,
399     $                                    ksh_bflo, ksh_bfhi,
400     $                                    jsh_bflo, jsh_bfhi,
401     $                                    mo_indx_lo, mo_indx_hi,
402     $                                    dbl_mb(k_ksssi), g_k1idx )
403#ifdef PERMUT_SYMM
404                   call moints_push_1idx_b( nbf, ish_bflo, ish_bfhi,
405     $                                     ksh_bflo, ksh_bfhi,
406     $                                     mo_indx_lo, mo_indx_hi,
407     $                                     dbl_mb(k_jssni), g_k1idx )
408#endif
409                 endif
410                 if (ocoul) then
411                   call moints_push_1idx( nbf, ish_bflo, ish_bfhi,
412     $                                    jsh_bflo, jsh_bfhi,
413     $                                    ksh_bflo, ksh_bfhi,
414     $                                    mo_indx_lo, mo_indx_hi,
415     $                                    dbl_mb(k_jsssi), g_j1idx )
416#ifdef PERMUT_SYMM
417                   call moints_push_1idx_a( nbf, ish_bflo, ish_bfhi,
418     $                                     jsh_bflo, jsh_bfhi,
419     $                                     mo_indx_lo, mo_indx_hi,
420     $                                     dbl_mb(k_jssni), g_j1idx )
421#endif
422                 endif
423#endif
424                 next = nxtval(num_nodes)
425               endif
426               jk = jk + 1
427
428c
429c End ksh-loop & jsh-loop
430c
431             enddo
432           endif
433         enddo
434         next = nxtval(-num_nodes)
435         t1qtr = t1qtr + tcgtime() - tt
436#ifndef MOINTS_INTS_DUMMY
437c
438c 2nd Index (global) transformation
439c Push half-transformed integrals (for i-shell) into global
440c operator matrices
441c
442         tt = tcgtime()
443         ncol_tij = noper*ileng*nbf
444         if (ocoul) then
445           call ga_dgemm('t','n',ncol_tij,noper,nbf,1.0d0,g_j1idx,
446     $                   g_mocols,0.d0,g_j2idx)
447           call moints_push_halftr(nbf,ish_bflo,ish_bfhi,noper,
448     $                             g_j2idx,g_jhalf)
449         endif
450         if (oexch) then
451           call ga_dgemm('t','n',ncol_tij,noper,nbf,1.0d0,g_k1idx,
452     $                   g_mocols,0.d0,g_k2idx)
453           call moints_push_halftr(nbf,ish_bflo,ish_bfhi,noper,
454     $                             g_k2idx,g_khalf)
455         endif
456         t2qtr = t2qtr + tcgtime() - tt
457#endif
458c
459c End loop over ish
460c
461       enddo
462       call ga_sync()
463       call ga_dgop(MOINTS_SCHWARZ_STAT,dschw_done,1,'+')
464#ifdef PERMUT_SYMM
465       schw_ratio = dschw_done/(nsh*nsh*((nsh*(nsh+1))/2))*100.d0
466#else
467       schw_ratio = dschw_done/(nsh*nsh*nsh*nsh)*100.d0
468#endif
469       thalf = tcgtime() - thalf
470       if (ga_nodeid().eq.0) then
471         write(6,986) schw_ratio,t1qtr,t2qtr,tint,thalf
472 986     format(10x,'Shell-quartets percentage:',4x,f10.1,'%',//,
473     $          10x,'1st quarter time:',14x,f10.2,/,
474     $          10x,'2nd quarter time:',14x,f10.2,/,
475     $          10x,'Integrals only time:',11x,f10.2,/,
476     $          10x,'Total half time:',15x,f10.2,/)
477         call util_flush(6)
478       endif
479#ifdef DEBUG_PRINT
480C       if (ocoul) call moints_print_opermatrix(noper,nbf,g_jhalf)
481C       if (oexch) call moints_print_opermatrix(noper,nbf,g_khalf)
482#endif
483c
484c
485c
486c Clean up
487c
488       status = ga_destroy(g_mocols)
489       if (oexch) then
490         status = status.and.ga_destroy(g_k2idx)
491         status = status.and.ga_destroy(g_k1idx)
492       endif
493       if (ocoul) then
494         status = status.and.ga_destroy(g_j2idx)
495         status = status.and.ga_destroy(g_j1idx)
496       endif
497       if (.not.(status))
498     $   call errquit('moints_1st_half: cannot release globals',0)
499c
500      if (.not. ma_pop_stack(l_jssni))
501     $     call errquit('moints: failed to pop temp transf', l_jssni)
502      if (.not. ma_pop_stack(l_ksssi))
503     $     call errquit('moints: failed to pop temp transf', l_ksssi)
504      if (.not. ma_pop_stack(l_tsssi))
505     $     call errquit('moints: failed to pop temp transf', l_tsssi)
506      if (.not. ma_pop_stack(l_jsssi))
507     $     call errquit('moints: failed to pop temp transf', l_jsssi)
508      if (.not. ma_pop_stack(l_mov))
509     $     call errquit('moints: failed to pop movectors', l_mov)
510      if (.not. ma_pop_stack(l_iscr))
511     $     call errquit('moints: failed to pop int scratch', l_iscr)
512      if (.not. ma_pop_stack(l_erit))
513     $     call errquit('moints: failed to pop int buffer', l_erit)
514      if (.not. ma_pop_stack(l_eri))
515     $     call errquit('moints: failed to pop int buffer', l_eri)
516
517       return
518       end
519
520#endif /* !WHITESIDE */
521
522
523
524
525
526
527
528
529c
530c Performs straightforward two-index transformation on
531c half-transformed integrals.
532c
533c This routines permits the aliasing of
534c        g_jhalf -> g_coul
535c        g_khalf -> g_exch
536c Provided the distributions and array sizes are
537c compatible.
538c
539c
540c
541        subroutine moints_2nd_half(noper,nbf,mo_lo,mo_hi,g_movecs,
542     $                             g_jhalf,g_coul,ocoul,
543     $                             g_khalf,g_exch,oexch)
544        implicit none
545#include "global.fh"
546#include "mafdecls.fh"
547#include "tcgmsg.fh"
548        integer MOINTS_J_SUM, MOINTS_K_SUM
549        parameter(MOINTS_J_SUM=1973,MOINTS_K_SUM=1974)
550        integer noper
551        integer nbf
552        integer mo_lo,mo_hi
553        integer g_movecs
554        integer g_jhalf
555        integer g_khalf
556        integer g_coul
557        integer g_exch
558        logical ocoul, oexch
559c
560c Local
561c
562        integer l_tmp, k_tmp, l_mov, k_mov
563        integer ij, jnonzero, knonzero, my_id, jlo, jhi
564        integer k_src, ld_src, k_dest, ld_dest
565        integer nn,n2,n2n2
566        double precision tt,t3qtr,t4qtr,thalf
567        logical status,oalias
568        integer moints_integ_count
569        external moints_integ_count
570        logical ga_check_moblocked
571        external ga_check_moblocked
572c
573c Allocate local scratch space
574c
575        nn = nbf*nbf
576        n2 = mo_hi - mo_lo + 1
577        n2n2 = n2*n2
578        status = ma_push_get(MT_DBL,(n2*nbf),'scratch 1',l_tmp,k_tmp)
579        status = status.and.ma_push_get(MT_DBL,(n2*nbf),'MO vectors',
580     $                                 l_mov,k_mov)
581        if (.not.(status))
582     $      call errquit('moints_2nd_half: not enough local memory',0)
583        call ga_get(g_movecs,1,nbf,mo_lo,mo_hi,dbl_mb(k_mov),nbf)
584        jnonzero = 0
585        knonzero = 0
586        my_id = ga_nodeid()
587        thalf = tcgtime()
588        t3qtr = 0.d0
589        t4qtr = 0.d0
590c
591c Coulomb
592c
593        if (ocoul) then
594          oalias = g_jhalf.eq.g_coul
595          if (.not.ga_check_moblocked(g_jhalf,noper,nbf,jlo,jhi))
596     $      call errquit(
597     $      'moints_2nd_half: wrong dimensions/distrib half Coul',0)
598          if ((.not.(oalias)).and.
599     $        (.not.ga_check_moblocked(g_coul,noper,n2,jlo,jhi)))
600     $      call errquit(
601     $      'moints_2nd_half: wrong dimensions/distrib Coul',0)
602          do ij=jlo,jhi
603            tt = tcgtime()
604            call ga_access(g_jhalf,1,nn,ij,ij,k_src,ld_src)
605            call dgemm('n','n',nbf,n2,nbf,1.d0,dbl_mb(k_src),nbf,
606     $                 dbl_mb(k_mov),nbf,0.d0,dbl_mb(k_tmp),nbf)
607            t3qtr = t3qtr + tcgtime() - tt
608            tt = tcgtime()
609            if (oalias) then
610              k_dest = k_src
611            else
612              call ga_release(g_jhalf,1,nn,ij,ij)
613              call ga_access(g_coul,1,n2n2,ij,ij,k_dest,ld_dest)
614            endif
615            call dgemm('t','n',n2,n2,nbf,1.d0,dbl_mb(k_mov),nbf,
616     $                 dbl_mb(k_tmp),nbf,0.d0,dbl_mb(k_dest),n2)
617            jnonzero = jnonzero +
618     $                 moints_integ_count(n2,dbl_mb(k_dest))
619            call ga_release_update(g_coul,1,n2n2,ij,ij)
620            t4qtr = t4qtr + tcgtime() - tt
621          enddo
622          call ga_sync()
623        endif
624c
625c Exchange
626c
627        if (oexch) then
628          oalias = g_khalf.eq.g_exch
629          if (.not.ga_check_moblocked(g_khalf,noper,nbf,jlo,jhi))
630     $      call errquit(
631     $      'moints_2nd_half: wrong dimensions/distrib half exch',0)
632          if ((.not.(oalias)).and.
633     $        (.not.ga_check_moblocked(g_exch,noper,n2,jlo,jhi)))
634     $      call errquit(
635     $      'moints_2nd_half: wrong dimensions/distrib exch',0)
636          do ij=jlo,jhi
637            tt = tcgtime()
638            call ga_access(g_khalf,1,nn,ij,ij,k_src,ld_src)
639            call dgemm('n','n',nbf,n2,nbf,1.d0,dbl_mb(k_src),nbf,
640     $                 dbl_mb(k_mov),nbf,0.d0,dbl_mb(k_tmp),nbf)
641            t3qtr = t3qtr + tcgtime() - tt
642            tt = tcgtime()
643            if (oalias) then
644              k_dest = k_src
645            else
646              call ga_release(g_khalf,1,nn,ij,ij)
647              call ga_access(g_exch,1,n2n2,ij,ij,k_dest,ld_dest)
648            endif
649            call dgemm('t','n',n2,n2,nbf,1.d0,dbl_mb(k_mov),nbf,
650     $                 dbl_mb(k_tmp),nbf,0.d0,dbl_mb(k_dest),n2)
651            knonzero = knonzero +
652     $                 moints_integ_count(n2,dbl_mb(k_dest))
653            call ga_release_update(g_exch,1,n2n2,ij,ij)
654            t4qtr = t4qtr + tcgtime() - tt
655          enddo
656          call ga_sync()
657        endif
658c
659c Clean up
660c
661        if (.not. ma_pop_stack(l_mov))
662     $     call errquit('moints: failed to pop temp transf', l_mov)
663        if (.not. ma_pop_stack(l_tmp))
664     $     call errquit('moints: failed to pop temp transf', l_tmp)
665        thalf = tcgtime() - thalf
666c
667c Statistics
668c
669        if (ocoul) call ga_igop(MOINTS_J_SUM,jnonzero,1,'+')
670        if (oexch) call ga_igop(MOINTS_K_SUM,knonzero,1,'+')
671        if (ga_nodeid().eq.0) then
672          write(6,973) t3qtr,t4qtr,thalf
673 973      format(10x,'3rd quarter time:',14x,f10.2,/,
674     $           10x,'4th quarter time:',14x,f10.2,/,
675     $           10x,'Total half time:',15x,f10.2)
676          call util_flush(6)
677          ij = n2*n2*(noper*(noper+1))/2
678          if ((ocoul).and.(oexch)) ij = ij*2
679          write(6,901) ij
680          if (ocoul) write(6,902) jnonzero
681          if (oexch) write(6,903) knonzero
682 901      format(/,10x,'Total number of integrals:',5x,i10)
683 902      format(  10x,'Non-zero Coulomb integrals:',4x,i10)
684 903      format(  10x,'Non-zero exchange integrals:',3x,i10)
685        endif
686        return
687        end
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708c
709c Auxiliary routines to manage and manipulate memory
710c
711
712       subroutine moints_1idx_trf( ilo, ihi, jlo, jhi, klo, khi,
713     $                             llo, lhi, eri, c, nbf,
714     $                             mo_lo, mo_hi, x )
715       implicit none
716       integer nbf
717       integer ilo,ihi,jlo,jhi,klo,khi,llo,lhi,mo_lo,mo_hi
718       double precision eri( llo:lhi, klo:khi, jlo:jhi, ilo:ihi )
719       double precision c( 1:nbf, mo_lo: mo_hi )
720       double precision x( klo:khi, jlo:jhi, ilo:ihi, mo_lo:mo_hi )
721       integer nn,mm,ll
722
723       nn = (ihi-ilo+1)*(jhi-jlo+1)*(khi-klo+1)
724       mm = mo_hi-mo_lo+1
725       ll = lhi - llo + 1
726       call dgemm('t','n',nn,mm,ll,1.d0,eri,ll,c(llo,1),nbf,1.d0,x,nn)
727       return
728       end
729
730
731
732
733
734
735       subroutine moints_push_halftr(nbf,mulo,muhi,noper,g_j2idx,
736     $                               g_joper)
737       implicit none
738#include "global.fh"
739#include "mafdecls.fh"
740       integer nbf, mulo, muhi, noper
741       integer g_j2idx, g_joper
742       integer i,j,ij,mu,muleng,nopert
743       integer l_scr, k_scr, k_local, ld_local
744       integer j_off,mu_off,nu_off
745       integer jlo,jhi
746       logical ga_check_moblocked
747       external ga_check_moblocked
748
749       muleng = muhi - mulo + 1
750       nopert = (noper*(noper+1))/2
751       if (.not.ga_check_moblocked(g_joper,noper,nbf,jlo,jhi))
752     $      call errquit('moints_push_halftr: wrong distrib',0)
753       if (.not.ma_push_get(MT_DBL,nbf,'scr',l_scr,k_scr))
754     $      call errquit('moints_push_halftr: cannot allocate',0)
755       do i=1,noper
756         do j=1,i
757           ij = (i*(i-1))/2 + j
758           if ((ij.ge.jlo).and.(ij.le.jhi)) then
759             call ga_access(g_joper,1,(nbf*nbf),ij,ij,k_local,ld_local)
760             j_off = (j-1)*muleng
761             do mu=mulo,muhi
762               nu_off = (j_off+(mu-mulo))*nbf
763               mu_off = (mu-1)*nbf
764               call ga_get(g_j2idx,nu_off+1,nu_off+nbf,i,i,
765     $                     dbl_mb(k_scr),nbf)
766               call dcopy(nbf,dbl_mb(k_scr),1,dbl_mb(k_local+mu_off),1)
767C               call ga_put(g_joper,mu_off+1,mu_off+nbf,ij,ij,
768C     $                     dbl_mb(k_scr),nbf)
769             enddo
770             call ga_release_update(g_joper,1,(nbf*nbf),ij,ij)
771           endif
772         enddo
773       enddo
774       call ga_sync()
775       if (.not.ma_pop_stack(l_scr))
776     $   call errquit('moints_push_halftr: pop stack failed',0)
777       return
778       end
779
780
781
782
783
784       subroutine moints_push_1idx( nbf, ilo, ihi, jlo, jhi, klo, khi,
785     $                              mo_lo, mo_hi, x, g_j1idx )
786       implicit none
787#include "global.fh"
788#include "mafdecls.fh"
789       integer nbf,ilo,ihi,jlo,jhi,klo,khi,mo_lo,mo_hi
790       double precision x(klo:khi, jlo:jhi, ilo:ihi, mo_lo:mo_hi)
791       integer g_j1idx
792       integer t, i, j
793       integer ileng, kleng, ti, tij
794
795       ileng = ihi - ilo + 1
796       kleng = khi - klo + 1
797       do t=mo_lo,mo_hi
798         do i=ilo,ihi
799           ti = (t-mo_lo)*ileng + (i-ilo+1)
800           do j=jlo,jhi
801             tij = (ti-1)*nbf + j
802             call ga_acc(g_j1idx,klo,khi,tij,tij,x(klo,j,i,t),
803     $                   kleng,1.d0)
804C             call ga_put(g_j1idx,klo,khi,tij,tij,x(klo,j,i,t),kleng)
805           enddo
806         enddo
807       enddo
808       end
809
810
811
812
813
814       subroutine moints_ktransp(k,l,m,n,x,y)
815       implicit none
816       integer k,l,m,n
817       double precision x(k,l,m,n),y(l,k,m,n)
818       integer a,b,c,d
819
820       do a=1,n
821         do b=1,m
822           do c=1,k
823             do d=1,l
824               y(d,c,b,a) = x(c,d,b,a)
825             enddo
826           enddo
827         enddo
828       enddo
829       return
830       end
831
832
833
834
835
836
837
838
839
840
841
842
843        integer function moints_integ_count(nbf,xx)
844        implicit none
845        integer nbf
846        double precision xx(nbf,nbf)
847        integer i,j,count
848
849        count = 0
850        do i=1,nbf
851          do j=1,nbf
852            if (abs(xx(j,i)).gt.1.d-12) then
853              count = count + 1
854            endif
855          enddo
856        enddo
857        moints_integ_count = count
858        return
859        end
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877       subroutine eri_tr(ilo,ihi,jlo,jhi,klo,khi,llo,lhi,x,y)
878       implicit none
879       integer ilo,ihi,jlo,jhi,klo,khi,llo,lhi
880       double precision x(llo:lhi,klo:khi,jlo:jhi,ilo:ihi)
881       double precision y(klo:khi,llo:lhi,jlo:jhi,ilo:ihi)
882       integer i,j,k,l
883
884       do i=ilo,ihi
885         do j=jlo,jhi
886           do k=klo,khi
887             do l=llo,lhi
888               y(k,l,j,i) = x(l,k,j,i)
889             enddo
890           enddo
891         enddo
892       enddo
893       return
894       end
895
896
897
898
899
900
901
902       subroutine moints_1idx_move(nbf,ilo,ihi,jlo,jhi,llo,lhi,mo_lo,
903     $                             mo_hi,x,y)
904       implicit none
905       integer nbf,ilo,ihi,jhi,jlo,llo,lhi,mo_lo,mo_hi
906       double precision x( llo:lhi, jlo:jhi, ilo:ihi, mo_lo:mo_hi)
907       double precision y( nbf, jlo:jhi, ilo:ihi, mo_lo:mo_hi )
908       integer t,i,j,l
909
910       do t=mo_lo,mo_hi
911         do i=ilo,ihi
912           do j=jlo,jhi
913             do l=llo,lhi
914               y(l,j,i,t) = x(l,j,i,t)
915             enddo
916           enddo
917         enddo
918       enddo
919       return
920       end
921
922
923
924
925
926
927       subroutine moints_push_1idx_a(nbf,ilo,ihi,jlo,jhi,
928     $                               mo_lo,mo_hi,x,g_j1idx)
929       implicit none
930       integer nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi
931       double precision x(nbf,jlo:jhi,ilo:ihi,mo_lo:mo_hi)
932       integer g_j1idx
933       integer ileng
934       integer t,i,j,ti,tij
935
936       ileng = ihi - ilo + 1
937       do t=mo_lo,mo_hi
938         do i=ilo,ihi
939           ti = (t-mo_lo)*ileng + (i-ilo+1)
940           do j=jlo,jhi
941             tij = (ti-1)*nbf + j
942             call ga_acc(g_j1idx,1,nbf,tij,tij,x(1,j,i,t),
943     $                   nbf,1.d0)
944           enddo
945         enddo
946       enddo
947       end
948
949
950
951
952
953       subroutine moints_push_1idx_b( nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi,
954     $                                x, g_k1idx )
955       implicit none
956       integer nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi
957       double precision x(nbf,jlo:jhi,ilo:ihi,mo_lo:mo_hi)
958       integer g_k1idx
959       integer ileng
960       integer t,i,j,l,ti,til
961
962       ileng = ihi - ilo + 1
963       do t=mo_lo,mo_hi
964         do i=ilo,ihi
965           ti = (t-mo_lo)*ileng + (i-ilo+1)
966           do l=1,nbf
967             til = (ti-1)*nbf + l
968             do j=jlo,jhi
969               call ga_acc(g_k1idx,j,j,til,til,x(l,j,i,t),
970     $                     1,1.d0)
971             enddo
972           enddo
973         enddo
974       enddo
975       return
976       end
977
978#endif
979
980
981
982
983
984
985
986#ifdef DEBUG_PRINT
987       subroutine moints_print_opermatrix(noper,nbf,g_a)
988       implicit none
989#include "global.fh"
990#include "mafdecls.fh"
991       integer noper,nbf
992       integer g_a
993       integer j,k,jk
994       integer my_id, ilo, ihi, jlo, jhi
995       integer k_local, ld_local
996
997       my_id = ga_nodeid()
998       call ga_distribution(g_a,my_id,ilo,ihi,jlo,jhi)
999       do j=1,noper
1000         do k=1,j
1001           jk = (j*(j-1))/2 + k
1002           if ((jk.ge.jlo).and.(jk.le.jhi)) then
1003             write(6,901) j,k
1004 901         format(//,'Operator: [',i2,',',i2,']',/)
1005             print*,my_id,'  ',ilo,ihi,'  ',jk
1006             call ga_access(g_a,ilo,ihi,jk,jk,k_local,ld_local)
1007             call moints_print_square(nbf,dbl_mb(k_local))
1008           endif
1009           call ga_sync()
1010         enddo
1011       enddo
1012
1013       return
1014       end
1015
1016
1017
1018
1019       subroutine moints_print_square(n,x)
1020       implicit none
1021       integer n,i,j
1022       double precision x(n,n)
1023
1024       do i=1,n
1025         write(6,901) (x(i,j),j=1,n)
1026 901     format(15f9.4)
1027       enddo
1028       call util_flush(6)
1029       end
1030
1031
1032
1033
1034
1035
1036       subroutine moints_print_1(ilo,ihi,jlo,jhi,
1037     $                           klo,khi,mo_lo,mo_hi,x)
1038       implicit none
1039       integer ilo,ihi,jlo,jhi,klo,khi,mo_lo,mo_hi
1040       double precision x(klo:khi,jlo:jhi,ilo:ihi,mo_lo:mo_hi)
1041       integer m,i,j,k
1042
1043       do m=mo_lo,mo_hi
1044         do i=ilo,ihi
1045           write(6,933) ((x(k,j,i,m),k=klo,khi),j=jlo,jhi)
1046 933       format(15f9.4)
1047         enddo
1048         write(6,*)
1049       enddo
1050
1051       return
1052       end
1053
1054#endif
1055
1056
1057
1058
1059
1060
1061#ifdef OLD_ALGORITHM
1062#ifdef WHITESIDE
1063C========================================================================
1064C========================================================================
1065C========================================================================
1066C========================================================================
1067C========================================================================
1068C========================================================================
1069C========================================================================
1070c
1071c
1072c atw - 8/25/94
1073c
1074c  This is an alternate algorithm for the
1075c  half-transformed Coulomb operator, (the
1076c  obvious algorithm from Whiteside et al).
1077c
1078c
1079C========================================================================
1080C========================================================================
1081C========================================================================
1082C========================================================================
1083C========================================================================
1084C========================================================================
1085C========================================================================
1086       subroutine moints_1st_half_w(basis,mo_indx_lo,mo_indx_hi,
1087     $                              g_movecs,
1088     $                              g_jhalf,ocoul,
1089     $                              g_khalf,oexch)
1090       implicit none
1091#include "tcgmsg.fh"
1092#include "global.fh"
1093#include "mafdecls.fh"
1094#include "bas.fh"
1095#include "schwarz.fh"
1096       integer MOINTS_SCHWARZ_STAT
1097       parameter(MOINTS_SCHWARZ_STAT=1921)
1098c
1099c
1100       integer basis                          ! Basis handle
1101       integer mo_indx_lo, mo_indx_hi         ! 1st pair of MO indices
1102       integer g_movecs                       ! MO coefficients
1103       integer g_jhalf                        ! Half-transformed Coulomb operator
1104       integer g_khalf                        ! Half-transformed exchange operator (UNUSED !)
1105       logical ocoul,oexch                    ! Type selection
1106c
1107c Local
1108c
1109       integer noper,nbf
1110       integer ssbb_block, ssba_block, ssaa_block
1111       integer l_jssbb,k_jssbb
1112       integer l_jssba,k_jssba
1113       integer l_jssaa,k_jssaa
1114       integer l_zssaa,k_zssaa, l_yssaa, k_yssaa
1115       integer ssn, ssa
1116       integer l_iscr,k_iscr,l_eri,k_eri
1117       integer l_mov,k_mov
1118       integer mem2,max2e,maxbf_shell
1119       integer nsh,ish,jsh,ksh,lsh
1120       integer ish_bflo,ish_bfhi,jsh_bflo,jsh_bfhi
1121       integer ksh_bflo,ksh_bfhi,lsh_bflo,lsh_bfhi
1122       integer ileng,jleng
1123       integer num_nodes, next, ij
1124       double precision tol2e,dschw_done,schw_ratio
1125       double precision t1qtr,t2qtr,thalf,tzz,tint,txy
1126       logical status
1127       data tol2e/1.d-10/
1128c
1129c
1130c Get general info
1131c
1132       call ga_sync()
1133       thalf = tcgtime()
1134       status = bas_numbf(basis,nbf)
1135       status = status.and.bas_numcont(basis,nsh)
1136       status = status.and.bas_nbf_cn_max(basis,maxbf_shell)
1137       if (.not.status) call errquit('moints: cannot get basis info',0)
1138       noper = mo_indx_hi - mo_indx_lo + 1
1139       if (ga_nodeid().eq.0) then
1140         write(6,927) nsh,maxbf_shell
1141 927     format(10x,'Number of shells:',19x,i5,/,
1142     $          10x,'Maximum shell length:',15x,i5)
1143         call util_flush(6)
1144       endif
1145c
1146c Ints allocation
1147c
1148       call int_mem_2e4c(max2e, mem2)
1149       status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri)
1150       status = status.and.ma_push_get(MT_DBL, mem2,
1151     $          'moints: scr', l_iscr, k_iscr)
1152c
1153c Allocate and get local
1154c
1155       status = status.and.ma_push_get(MT_DBL,(noper*nbf),
1156     $          'movecs cols',l_mov,k_mov)
1157       if (.not.(status)) call errquit('failed allocate MO vectors',0)
1158       call ga_get(g_movecs,1,nbf,mo_indx_lo,mo_indx_hi,
1159     $             dbl_mb(k_mov),nbf)
1160c
1161c Allocate local shell-shell-basis-basis blocks
1162c
1163       ssbb_block = maxbf_shell*maxbf_shell*nbf*nbf
1164       ssba_block = maxbf_shell*maxbf_shell*nbf*noper
1165       ssaa_block = maxbf_shell*maxbf_shell*noper*noper
1166       status = status.and.ma_push_get(MT_DBL,ssbb_block,
1167     $          'J ssbb block',l_jssbb,k_jssbb)
1168       status = status.and.ma_push_get(MT_DBL,ssba_block,
1169     $          'J ssba block',l_jssba,k_jssba)
1170       status = status.and.ma_push_get(MT_DBL,ssaa_block,
1171     $          'J ssaa block',l_jssaa,k_jssaa)
1172       status = status.and.ma_push_get(MT_DBL,ssaa_block,
1173     $          'J transp 1 ssaa block',l_zssaa,k_zssaa)
1174       status = status.and.ma_push_get(MT_DBL,ssaa_block,
1175     $          'J transp 2 ssaa block',l_yssaa,k_yssaa)
1176       if (.not.status) call
1177     $          errquit('moints: cannot allocate local memory',0)
1178c
1179c Start 4-fold shell loop
1180c
1181       if (ocoul) call ga_zero(g_jhalf)
1182       t1qtr = 0.d0
1183       t2qtr = 0.d0
1184       tint = 0.d0
1185       dschw_done = 0.d0
1186       num_nodes = ga_nnodes()
1187       ij = 0
1188       next = nxtval(num_nodes)
1189c
1190c Four-fold shell loop
1191c
1192       do ish=1,nsh
1193         if (.not. bas_cn2bfr(basis,ish,ish_bflo,ish_bfhi))
1194     $        call errquit('mo_ints: bas_cn2bfr',ish)
1195         ileng = ish_bfhi - ish_bflo + 1
1196         do jsh=1,ish
1197           if (.not. bas_cn2bfr(basis,jsh,jsh_bflo,jsh_bfhi))
1198     $          call errquit('mo_ints: bas_cn2bfr',jsh)
1199           jleng = jsh_bfhi - jsh_bflo + 1
1200           ssn = ileng*jleng*nbf
1201           ssa = ileng*jleng*noper
1202c
1203c Parallelize on (ij)
1204c
1205           if (ij.eq.next) then
1206             if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then
1207               call dfill(ssbb_block,0.d0,dbl_mb(k_jssbb),1)
1208               tzz = tcgtime()
1209               do ksh=1,nsh
1210                 if (.not. bas_cn2bfr(basis,ksh,ksh_bflo,ksh_bfhi))
1211     $                call errquit('mo_ints: bas_cn2bfr',ksh)
1212                 do lsh=1,ksh
1213                   if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh)
1214     $               .ge.tol2e) then
1215c
1216c Compute shell-quartet integral block
1217c
1218                     dschw_done = dschw_done + 1.d0
1219                     if (.not. bas_cn2bfr(basis,lsh,lsh_bflo,lsh_bfhi))
1220     $                    call errquit('mo_ints: bas_cn2bfr',lsh)
1221                     txy = tcgtime()
1222                     call int_2e4c(basis, ish, jsh, basis, ksh, lsh,
1223     $                    mem2, dbl_mb(k_iscr), max2e, dbl_mb(k_eri))
1224                     tint = tint + tcgtime() - txy
1225                     call moints_push_ints1( ish_bflo, ish_bfhi,
1226     $                                       jsh_bflo, jsh_bfhi,
1227     $                                       ksh_bflo, ksh_bfhi,
1228     $                                       lsh_bflo, lsh_bfhi,
1229     $                                       dbl_mb(k_eri),
1230     $                                       nbf, dbl_mb(k_jssbb))
1231                   endif
1232                 enddo
1233               enddo
1234c
1235c First half-transformation
1236c
1237               call sgemm('t','n',noper,ssn,nbf,1.d0,dbl_mb(k_mov),
1238     $                    nbf,dbl_mb(k_jssbb),nbf,0.d0,
1239     $                    dbl_mb(k_jssba),noper)
1240               t1qtr = t1qtr + tcgtime() - tzz
1241               tzz = tcgtime()
1242c
1243c Second half-transformation
1244c
1245               call moints_2ndi_tr( nbf, noper, jleng, ileng,
1246     $                              dbl_mb(k_jssba), dbl_mb(k_mov),
1247     $                              dbl_mb(k_jssaa))
1248               call moints_transp_ss( noper, ileng, jleng,
1249     $                                dbl_mb(k_jssaa),
1250     $                                dbl_mb(k_zssaa),
1251     $                                dbl_mb(k_yssaa) )
1252c
1253c Push half-transformed into global
1254c
1255               call moints_push_ints2( ish_bflo, ish_bfhi,
1256     $                                 jsh_bflo, jsh_bfhi,
1257     $                                 nbf, noper,
1258     $                                 dbl_mb(k_zssaa),
1259     $                                 dbl_mb(k_yssaa),
1260     $                                 g_jhalf )
1261               t2qtr = t2qtr + tcgtime() - tzz
1262             endif
1263             next = nxtval(num_nodes)
1264           endif
1265c
1266c End parallel task
1267c
1268           ij = ij + 1
1269         enddo
1270       enddo
1271       next = nxtval(-num_nodes)
1272
1273       call ga_sync()
1274       call ga_dgop(MOINTS_SCHWARZ_STAT,dschw_done,1,'+')
1275       schw_ratio = dschw_done/(nsh*nsh*nsh*nsh)*100.d0
1276       thalf = tcgtime() - thalf
1277       if (ga_nodeid().eq.0) then
1278         write(6,986) schw_ratio,tint,t1qtr,t2qtr,thalf
1279 986     format(10x,'Shell-quartets percentage:',4x,f10.1,'%',//,
1280     $          10x,'Integrals time:',16x,f10.2,/,
1281     $          10x,'1st quarter time:',14x,f10.2,/,
1282     $          10x,'2nd quarter time:',14x,f10.2,/,
1283     $          10x,'Total half time:',15x,f10.2,/)
1284         call util_flush(6)
1285       endif
1286c
1287c
1288c
1289c Clean up
1290c
1291c
1292      if (.not. ma_pop_stack(l_yssaa))
1293     $     call errquit('moints: failed to pop temp transf', l_yssaa)
1294      if (.not. ma_pop_stack(l_zssaa))
1295     $     call errquit('moints: failed to pop temp transf', l_zssaa)
1296      if (.not. ma_pop_stack(l_jssaa))
1297     $     call errquit('moints: failed to pop temp transf', l_jssaa)
1298      if (.not. ma_pop_stack(l_jssba))
1299     $     call errquit('moints: failed to pop temp transf', l_jssba)
1300      if (.not. ma_pop_stack(l_jssbb))
1301     $     call errquit('moints: failed to pop temp transf', l_jssbb)
1302      if (.not. ma_pop_stack(l_mov))
1303     $     call errquit('moints: failed to pop movectors', l_mov)
1304      if (.not. ma_pop_stack(l_iscr))
1305     $     call errquit('moints: failed to pop int scratch', l_iscr)
1306      if (.not. ma_pop_stack(l_eri))
1307     $     call errquit('moints: failed to pop int buffer', l_eri)
1308
1309       return
1310       end
1311
1312
1313
1314
1315
1316
1317c
1318c ====================================================================
1319c      Auxiliary routines to help manage array
1320c      addressing, transposition, etc.
1321c ====================================================================
1322c
1323c
1324c
1325
1326c
1327c Copy integrals from shell-quartet offset addressing
1328c to absolute AO indices
1329c
1330       subroutine moints_push_ints1( ilo, ihi, jlo, jhi, klo, khi,
1331     $                               llo, lhi, eri, nbf, buf )
1332       implicit none
1333       integer ilo, ihi, jlo, jhi
1334       integer klo, khi, llo, lhi
1335       integer nbf
1336       double precision eri(llo:lhi,klo:khi,jlo:jhi,ilo:ihi)
1337       double precision buf(nbf,nbf,jlo:jhi,ilo:ihi)
1338       integer i,j,k,l
1339
1340       do i=ilo,ihi
1341         do j=jlo,jhi
1342           do k=klo,khi
1343             do l=llo,lhi
1344               buf(l,k,j,i) = eri(l,k,j,i)
1345               buf(k,l,j,i) = eri(l,k,j,i)
1346             enddo
1347           enddo
1348         enddo
1349       enddo
1350       return
1351       end
1352
1353
1354
1355c
1356c Push half-transformed integrals into global
1357c array. Copies strips of length: (jhi-jlo+1)
1358c
1359       subroutine moints_push_ints2( ilo, ihi, jlo, jhi,
1360     $                               nbf, noper, x, y, g_half )
1361       implicit none
1362       integer ilo, ihi, jlo, jhi, nbf, noper
1363       double precision x(jlo:jhi,ilo:ihi,noper,noper)
1364       double precision y(ilo:ihi,jlo:jhi,noper,noper)
1365       integer g_half
1366       integer i,j,t,u,ij1,ij2,tu,ileng,jleng
1367
1368       do t=1,noper
1369         do u=1,t
1370           tu = ((t-1)*t)/2 + u
1371           do i=ilo,ihi
1372             ij1 = (i-1)*nbf+jlo
1373             ij2 = (i-1)*nbf+jhi
1374             jleng = jhi - jlo + 1
1375             call ga_put(g_half,ij1,ij2,tu,tu,x(jlo,i,u,t),jleng)
1376           enddo
1377           do j=jlo,jhi
1378             ij1 = (j-1)*nbf+ilo
1379             ij2 = (j-1)*nbf+ihi
1380             ileng = ihi - ilo + 1
1381             call ga_put(g_half,ij1,ij2,tu,tu,y(ilo,j,u,t),ileng)
1382           enddo
1383         enddo
1384       enddo
1385       return
1386       end
1387
1388
1389
1390
1391
1392c
1393c Loops over (ij)-shells and performs 2nd index
1394c transformation. (This cannot be done as in a
1395c single dgemm)
1396c
1397c
1398       subroutine moints_2ndi_tr( nbf, noper, jleng, ileng,
1399     $                            xx, yy, zz )
1400       implicit none
1401       integer nbf,noper,jleng,ileng
1402       double precision xx(noper,nbf,jleng,ileng)
1403       double precision yy(nbf,noper)
1404       double precision zz(noper,noper,jleng,ileng)
1405       integer i,j
1406
1407
1408       do i=1,ileng
1409         do j=1,jleng
1410           call sgemm('n','n',noper,noper,nbf,1.d0,xx(1,1,j,i),
1411     $                 noper,yy,nbf,0.d0,zz(1,1,j,i),noper)
1412         enddo
1413       enddo
1414       return
1415       end
1416
1417
1418
1419
1420
1421
1422c
1423c Transpose half-transformed integrals so AO-indices
1424c varies faster, to accomodate ga_put() using
1425c shell-strips.
1426c
1427       subroutine moints_transp_ss( noper, ileng, jleng, x, y, z)
1428       implicit none
1429       integer noper, ileng, jleng
1430       double precision x(noper,noper,jleng,ileng)
1431       double precision y(jleng,ileng,noper,noper)
1432       double precision z(ileng,jleng,noper,noper)
1433       integer i,j,k,l
1434
1435       do i=1,ileng
1436         do j=1,jleng
1437           do k=1,noper
1438             do l=1,noper
1439               y(j,i,l,k) = x(l,k,j,i)
1440               z(i,j,l,k) = x(l,k,j,i)
1441             enddo
1442           enddo
1443         enddo
1444       enddo
1445       return
1446       end
1447
1448
1449
1450
1451#endif /* WHITESIDE */
1452#endif /* OLD_ALGORITHM */
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464C========================================================================
1465C========================================================================
1466C========================================================================
1467C========================================================================
1468C========================================================================
1469C========================================================================
1470C========================================================================
1471c
1472c
1473c atw - 9/7/94
1474c
1475c  This is a variation of Whiteside and
1476c  is currently the one implemented.
1477c  Uses 4-fold symmetry.
1478c      Split-loop:         Outer ish loop is split on
1479c                          for load-balancing
1480c      Segmented-task:     Tasks do not span across (nsh*(nsh+1))/2
1481c                          to minimise comms
1482c      UseGemm:            Use the BLAS calls for transformation steps
1483c                          (very slow)
1484c
1485c
1486C========================================================================
1487C========================================================================
1488C========================================================================
1489C========================================================================
1490C========================================================================
1491C========================================================================
1492C========================================================================
1493#ifndef OLD_ALGORITHM
1494       subroutine moints_build(basis,
1495     $                         mo1_lo,mo1_hi,
1496     $                         mo2_lo,mo2_hi,
1497     $                         g_movecs,
1498     $                         g_coul,ocoul,
1499     $                         g_exch,oexch,
1500     $                         oprintstats,
1501     $                         chunk_factor)
1502       implicit none
1503#include "tcgmsg.fh"
1504#include "global.fh"
1505#include "mafdecls.fh"
1506#include "bas.fh"
1507#include "schwarz.fh"
1508#include "msgids.fh"
1509c
1510c Arguments
1511c
1512       integer basis                          ! Basis handle
1513       integer mo1_lo, mo1_hi                 ! 1st Pair Index range
1514       integer mo2_lo, mo2_hi                 ! 2nd Pair Index range
1515       integer g_movecs                       ! MO coefficients
1516       integer g_coul                         ! Coulomb operator
1517       integer g_exch                         ! Exchange operator
1518       logical ocoul,oexch                    ! Type selection
1519       logical oprintstats                    ! Print flag
1520       double precision chunk_factor
1521c
1522c Local variables
1523c
1524       integer nmo1,nmo2,nbf,nsh,maxbfsh
1525       integer ish,jsh,ish2,msh
1526       integer ibflo1,ibfhi1,ibflo2,ibfhi2
1527       integer g_k2idx,g_j2idx
1528       integer l_sssi,k_sssi
1529       integer l_ssni,k_ssni
1530       integer l_ssai,k_ssai
1531       integer l_ssji,k_ssji
1532       integer l_ab1, k_ab1, l_ab2, k_ab2
1533       integer l_eri,k_eri,l_erit,k_erit
1534       integer l_iscr,k_iscr
1535       integer l_mo,k_mo
1536       integer n_sssi,n_ssni,n_ssai,n_ssji,n_ab,nrow,ncol
1537       integer mem2,max2e
1538       integer offset
1539       integer num_nodes,ploop,next,chunksiz,tri_shells
1540       integer i
1541       double precision tol2e
1542       double precision tt,t4sh,tyy,twait,tavg,tyyy,t12
1543       double precision schw_ratio
1544       logical status
1545       integer istatv(1000)
1546       double precision dstatv(100)
1547       double precision tpush_avg,tint_avg,twait_avg
1548#ifdef DEBUG_PARALLEL
1549       integer nptasks,npcomm
1550       double precision schw_done
1551       common/pstats/nptasks,npcomm,schw_done
1552       double precision t1qtr,t2qtr,t34,tpush,tint,tloop,t11
1553       common/ztime/t1qtr,t2qtr,t34,tpush,tint,tloop,t11
1554#endif
1555c
1556c
1557       integer nxtask,nxtseg
1558       external nxtask,nxtseg
1559c
1560c
1561c
1562       data tol2e/1.d-12/
1563c
1564c
1565c  General basis info
1566c
1567       num_nodes = ga_nnodes()
1568       status = bas_numbf(basis,nbf)
1569       status = status.and.bas_numcont(basis,nsh)
1570       status = status.and.bas_nbf_cn_max(basis,maxbfsh)
1571       if (.not.status) call errquit('moints: cannot get basis info',0)
1572       nmo1 = mo1_hi - mo1_lo + 1
1573       nmo2 = mo2_hi - mo2_lo + 1
1574c
1575c  Integrals allocation
1576c
1577       call int_mem_2e4c(max2e, mem2)
1578       status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri)
1579       status = status.and.ma_push_get(MT_DBL, max2e,
1580     $          'moints: buf', l_erit, k_erit)
1581       status = status.and.ma_push_get(MT_DBL, mem2,
1582     $          'moints: scr', l_iscr, k_iscr)
1583c
1584c  Local MO coefficients
1585c
1586       status = status.and.ma_push_get(MT_DBL,(nbf*nbf),
1587     $                                 'movecs cols',l_mo,k_mo)
1588       call ga_get(g_movecs,1,nbf,1,nbf,dbl_mb(k_mo),nbf)
1589c
1590c  Temporary partially-transformed arrays
1591c
1592       n_sssi = maxbfsh*maxbfsh*maxbfsh*nmo1
1593       n_ssni = maxbfsh*maxbfsh*nbf*nmo1
1594       n_ssai = maxbfsh*maxbfsh*nmo2*nmo1
1595       n_ssji = max((maxbfsh*maxbfsh*nmo1*nmo1),(maxbfsh*nmo2))
1596       n_ab = max(nmo1,nmo2)*max(nmo1,nmo2)
1597       status = status.and.ma_push_get(MT_DBL,n_sssi,
1598     $          'K sssi block',l_sssi,k_sssi)
1599       status = status.and.ma_push_get(MT_DBL,n_ssni,
1600     $          'K ssni block',l_ssni,k_ssni)
1601       if (oexch) then
1602         status = status.and.ma_push_get(MT_DBL,n_ssai,
1603     $          'K ssai block',l_ssai,k_ssai)
1604       endif
1605       if (ocoul) then
1606         status = status.and.ma_push_get(MT_DBL,n_ssji,
1607     $          'K ssai block',l_ssji,k_ssji)
1608       endif
1609       status = status.and.ma_push_get(MT_DBL,n_ab,
1610     $          'K ab1 block',l_ab1,k_ab1)
1611       status = status.and.ma_push_get(MT_DBL,n_ab,
1612     $          'K ab2 block',l_ab2,k_ab2)
1613       if (.not.(status)) call errquit('cannot allocate local memory',0)
1614c
1615c  Globals for accumulating partially-transformed (& transposing)
1616c
1617       nrow = maxbfsh*(nbf+1)
1618       ncol = nmo1*nmo2
1619       if (oexch) then
1620         if (.not.ga_create(MT_DBL,nrow,ncol,'K2idx',nrow,1,g_k2idx))
1621     $    call errquit('moints: cannot allocate global',0)
1622       endif
1623       ncol = (nmo1*(nmo1+1))/2
1624       if (ocoul) then
1625         if (.not.ga_create(MT_DBL,nrow,ncol,'J2idx',nrow,1,g_j2idx))
1626     $     call errquit('moints: cannot allocate global',0)
1627       endif
1628c
1629c Initialize
1630c
1631       tri_shells = (nsh*(nsh+1))/2
1632       chunksiz = tri_shells/chunk_factor
1633       t1qtr = 0.d0
1634       t2qtr = 0.d0
1635       t34 = 0.d0
1636       tint = 0.d0
1637       tpush = 0.d0
1638       twait = 0.d0
1639       tloop = 0.d0
1640       t11 = 0.d0
1641       schw_done = 0.d0
1642       nptasks = 0
1643       npcomm = 0
1644       if (oexch) call ga_zero(g_exch)
1645       if (ocoul) call ga_zero(g_coul)
1646c
1647c  4-fold shell loop
1648c
1649       t4sh = tcgtime()
1650       msh = nsh/2 + mod(nsh,2)
1651#ifdef SPLIT_LOOP
1652       do ish=1,msh
1653#else
1654       do ish=1,nsh
1655#endif
1656         tyyy = tcgtime()
1657#ifdef SEGMENT_TASK
1658         next = nxtseg(num_nodes,chunksiz,tri_shells)
1659#else
1660         next = nxtask(num_nodes,chunksiz)
1661#endif
1662         ploop = 0
1663         offset = 0
1664         if (oexch) call ga_zero(g_k2idx)
1665         if (ocoul) call ga_zero(g_j2idx)
1666         if (.not. bas_cn2bfr(basis,ish,ibflo1,ibfhi1))
1667     $        call errquit('mo_ints: bas_cn2bfr',ish)
1668         do jsh=1,ish
1669           if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then
1670             call moints_inner( basis, nbf, nsh, ish, jsh,
1671     $                          ibflo1, ibfhi1,
1672     $                          next, ploop, chunksiz, offset,
1673     $                          ocoul, oexch, mo1_lo, mo1_hi,
1674     $                          mo2_lo, mo2_hi, dbl_mb(k_mo),
1675     $                          max2e, dbl_mb(k_eri), dbl_mb(k_erit),
1676     $                          mem2, dbl_mb(k_iscr),
1677     $                          dbl_mb(k_sssi), n_ssni,
1678     $                          dbl_mb(k_ssni), dbl_mb(k_ssai),
1679     $                          dbl_mb(k_ssji), g_k2idx, g_j2idx )
1680           endif
1681         enddo
1682#ifdef SPLIT_LOOP
1683         ish2 = nsh - ish + 1
1684         if (ish2.eq.ish) goto 100
1685         if (.not. bas_cn2bfr(basis,ish2,ibflo2,ibfhi2))
1686     $        call errquit('mo_ints: bas_cn2bfr',ish2)
1687         offset = (ibfhi1 - ibflo1 + 1)*ibfhi1
1688          do jsh=1,ish2
1689           if (schwarz_shell(ish2,jsh)*schwarz_max().ge.tol2e) then
1690             call moints_inner( basis, nbf, nsh, ish2, jsh,
1691     $                          ibflo2, ibfhi2,
1692     $                          next, ploop, chunksiz, offset,
1693     $                          ocoul, oexch, mo1_lo, mo1_hi,
1694     $                          mo2_lo, mo2_hi, dbl_mb(k_mo),
1695     $                          max2e, dbl_mb(k_eri), dbl_mb(k_erit),
1696     $                          mem2, dbl_mb(k_iscr),
1697     $                          dbl_mb(k_sssi), n_ssni,
1698     $                          dbl_mb(k_ssni), dbl_mb(k_ssai),
1699     $                          dbl_mb(k_ssji), g_k2idx, g_j2idx )
1700           endif
1701         enddo
1702 100     continue
1703         t12 = t12 + tcgtime() - tyyy
1704#endif
1705c
1706c Third and fourth qtr transformations can only be done
1707c when all 'jsh' contributions have been completed
1708c for some 'ish'. Must sync() and wait for all parallel
1709c tasks to finish.
1710c
1711
1712         tyy = tcgtime()
1713#ifdef SEGMENT_TASK
1714         next = nxtseg(-num_nodes,chunksiz,tri_shells)
1715#else
1716         next = nxtask(-num_nodes,chunksiz)
1717#endif
1718         tt = tcgtime()
1719         twait = twait + tt - tyy
1720         offset = 0
1721         if (oexch)
1722     $     call moints_trf34idx_K( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
1723     $                             ibflo1, ibfhi1, offset, dbl_mb(k_mo),
1724     $                             g_k2idx, dbl_mb(k_ab1),
1725     $                             dbl_mb(k_ab2), dbl_mb(k_ssai),
1726     $                             g_exch )
1727         if (ocoul)
1728     $     call moints_trf34idx_J( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
1729     $                             ibflo1, ibfhi1, offset, dbl_mb(k_mo),
1730     $                             g_j2idx, dbl_mb(k_ab1),
1731     $                             dbl_mb(k_ab2), dbl_mb(k_ssji),
1732     $                             g_coul )
1733#ifdef SPLIT_LOOP
1734         if (ish.eq.ish2) goto 101
1735         offset = (ibfhi1 - ibflo1 + 1)*ibfhi1
1736         if (oexch)
1737     $     call moints_trf34idx_K( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
1738     $                             ibflo2, ibfhi2, offset, dbl_mb(k_mo),
1739     $                             g_k2idx, dbl_mb(k_ab1),
1740     $                             dbl_mb(k_ab2), dbl_mb(k_ssai),
1741     $                             g_exch )
1742         if (ocoul)
1743     $     call moints_trf34idx_J( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
1744     $                             ibflo2, ibfhi2, offset, dbl_mb(k_mo),
1745     $                             g_j2idx, dbl_mb(k_ab1),
1746     $                             dbl_mb(k_ab2), dbl_mb(k_ssji),
1747     $                             g_coul )
1748 101     continue
1749#endif
1750         t34 = t34 + tcgtime() - tt
1751       enddo
1752       call ga_sync()
1753       t4sh = tcgtime() - t4sh
1754c
1755C       call moints_print_opermatrix(nmo1,nmo2,g_exch)
1756C       call moints_print_opermatrix(nmo1,nmo2,g_coul)
1757c
1758c Print statistics
1759c
1760       if (oprintstats) then
1761         call ga_dgop(Msg_MOSch,schw_done,1,'+')
1762         schw_ratio = (schw_done/(tri_shells*tri_shells))*100.d0
1763         if (ga_nodeid().eq.0) then
1764           write(6,986) schw_done,schw_ratio,chunksiz,
1765     $                  t1qtr,t2qtr,t11,t12,t34,t4sh,
1766     $                  tint,tpush,twait
1767 986       format(10x,'Shell-quartets done:',11x,f10.0,/,
1768     $            10x,'Shell-quartets percentage:',4x,f10.1,'%',//,
1769     $            10x,'Task chunksize:',19x,i7,/,
1770     $            10x,'1st quarter time:',14x,f10.2,/,
1771     $            10x,'2nd quarter time:',14x,f10.2,/,
1772     $            10x,'1st quarter total time:',8x,f10.2,/,
1773     $            10x,'Combined 1st and 2nd time:',5x,f10.2,/,
1774     $            10x,'3rd and 4th quarter time:',6x,f10.2,/,
1775     $            10x,'Four-fold shell loop time:',5x,f10.2,/,
1776     $            10x,'Integral evaluation time:',6x,f10.2,/,
1777     $            10x,'Transposition time:',12x,f10.2,/
1778     $            10x,'Synchronization time:',10x,f10.2)
1779           call util_flush(6)
1780         endif
1781       endif
1782
1783
1784#ifdef DEBUG_PARALLEL
1785       call ga_sync()
1786c$$$       call brdcst_ivec( nptasks, istatv )
1787c$$$       if (ga_nodeid().eq.0) then
1788c$$$         write(6,966)
1789c$$$ 966     format(/,'Number of tasks:')
1790c$$$         write(6,901) (istatv(i),i=1,num_nodes)
1791c$$$       endif
1792c$$$       call brdcst_ivec( npcomm, istatv )
1793c$$$       if (ga_nodeid().eq.0) then
1794c$$$         write(6,967)
1795c$$$ 967     format(/,'Number of comms:')
1796c$$$         write(6,901) (istatv(i),i=1,num_nodes)
1797c$$$ 901     format(10i10)
1798c$$$       endif
1799c$$$       call brdcst_dvec( twait, dstatv, tavg )
1800c$$$       if (ga_nodeid().eq.0) then
1801c$$$         write(6,968)
1802c$$$ 968     format(/,'Sync times:')
1803c$$$         write(6,921) (dstatv(i),i=1,num_nodes)
1804c$$$ 921     format(10f10.3)
1805c$$$       endif
1806c$$$       call brdcst_dvec( tloop, dstatv, tavg )
1807c$$$       if (ga_nodeid().eq.0) then
1808c$$$         write(6,969)
1809c$$$ 969     format(/,'Loop times:')
1810c$$$         write(6,921) (dstatv(i),i=1,num_nodes)
1811c$$$         write(6,*)
1812c$$$         write(6,*)
1813c$$$       endif
1814       call brdcst_dvec( tpush, dstatv, tpush_avg )
1815       call brdcst_dvec( twait, dstatv, twait_avg )
1816       call brdcst_dvec( tint, dstatv, tint_avg )
1817       if (ga_nodeid().eq.0) write(6,956) num_nodes,chunksiz,
1818     $                                    tpush_avg,
1819     $                                    twait_avg,tint_avg,
1820     $                                    t4sh
1821 956   format(//,'Average times:',
1822     $        /,':: Number of nodes:',i10,
1823     $        /,':: Chunksize:',6x,i10,
1824     $        /,':: Communication:',2x,f10.3,
1825     $        /,':: Synchronization:',f10.3,
1826     $        /,':: Integrals:',6x,f10.3,
1827     $        /,':: Total:',10x,f10.3)
1828
1829#endif
1830c
1831c Clean-up
1832c
1833      if (.not. ma_pop_stack(l_ab2))
1834     $     call errquit('moints: failed to pop', l_ab2)
1835      if (.not. ma_pop_stack(l_ab1))
1836     $     call errquit('moints: failed to pop', l_ab1)
1837      if ((ocoul).and.(.not. ma_pop_stack(l_ssji)))
1838     $     call errquit('moints: failed to pop', l_ssji)
1839      if ((oexch).and.(.not.ma_pop_stack(l_ssai)))
1840     $     call errquit('moints: failed to pop', l_ssai)
1841      if (.not. ma_pop_stack(l_ssni))
1842     $     call errquit('moints: failed to pop', l_ssni)
1843      if (.not. ma_pop_stack(l_sssi))
1844     $     call errquit('moints: failed to pop', l_sssi)
1845      if (.not. ma_pop_stack(l_mo))
1846     $     call errquit('moints: failed to pop', l_mo)
1847      if (.not. ma_pop_stack(l_iscr))
1848     $     call errquit('moints: failed to pop', l_iscr)
1849      if (.not. ma_pop_stack(l_erit))
1850     $     call errquit('moints: failed to pop', l_erit)
1851      if (.not. ma_pop_stack(l_eri))
1852     $     call errquit('moints: failed to pop', l_eri)
1853       if ((oexch).and.
1854     $    (.not.ga_destroy(g_k2idx)))
1855     $      call errquit('moints: cannot destroy global',g_k2idx)
1856      if ((ocoul).and.
1857     $   (.not.ga_destroy(g_j2idx)))
1858     $     call errquit('moints: cannot destroy global',g_j2idx)
1859
1860       return
1861       end
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871c
1872c
1873c
1874c     ---------------------------
1875c         Auxiliary Routines
1876c     ---------------------------
1877c
1878c
1879c
1880       subroutine moints_inttrsp( ni, nj, nk, nl, x, xt )
1881       implicit none
1882       integer ni,nj,nk,nl
1883       double precision x(nl,nk,nj,ni)
1884       double precision xt(nk,nl,nj,ni)
1885       integer i,j,k,l
1886
1887       do i=1,ni
1888         do j=1,nj
1889           do l=1,nl
1890             do k=1,nk
1891               xt(k,l,j,i) = x(l,k,j,i)
1892             enddo
1893           enddo
1894         enddo
1895       enddo
1896       return
1897       end
1898
1899
1900
1901
1902
1903#ifdef USEGEMM
1904       subroutine moints_trf1idx( nbf, a1, a2, ni, nj,
1905     $                            klo, khi, llo, lhi,
1906     $                            scale, eri, c, x, y )
1907       implicit none
1908       integer nbf,a1,a2,ni,nj,klo,khi,llo,lhi
1909       double precision scale
1910       double precision eri(llo:lhi,klo:khi,nj,ni)
1911       double precision c(nbf,nbf)
1912       double precision x(klo:khi,nj,ni,a1:a2)
1913       double precision y(nbf,ni,nj,a1:a2)
1914       integer na,nl,nsss,i,j,nk,a
1915
1916       na = a2 - a1 + 1
1917       nl = lhi - llo + 1
1918       nk = khi - klo + 1
1919       nsss = ni*nj*nk
1920       call sgemm('t','n',nsss,na,nl,scale,eri,nl,c(llo,a1),nbf,
1921     $            0.d0,x,nsss)
1922       do a=a1,a2
1923         do i=1,ni
1924           do j=1,nj
1925             call saxpy(nk,1.d0,x(klo,j,i,a),1,y(klo,i,j,a),1)
1926           enddo
1927         enddo
1928       enddo
1929       return
1930       end
1931
1932#endif    /* USEGEMM */
1933
1934
1935
1936
1937
1938
1939
1940       subroutine moints_trf2idx( nbf, a1, a2, nb, ni, nj,
1941     $                            scale, y, c, z )
1942       implicit none
1943       integer nbf,a1,a2,nb,ni,nj
1944       double precision scale
1945       double precision c(nbf,nbf)
1946       double precision y(nbf,ni,nj,nb)
1947       double precision z(ni,nj,nb,a1:a2)
1948       integer ssb,na
1949
1950       ssb = nj*ni*nb
1951       na = a2 - a1 + 1
1952       call sgemm('t','n',ssb,na,nbf,scale,y,nbf,c(1,a1),nbf,
1953     $            0.d0,z,ssb)
1954       return
1955       end
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965       subroutine moints_push2idxK( nbf, alo, ahi, blo, bhi,
1966     $                             ilo, ihi, jlo, jhi, koffset,
1967     $                             x, g_k )
1968       implicit none
1969#include "global.fh"
1970       integer nbf,alo,ahi,blo,bhi
1971       integer ilo,ihi,jlo,jhi,koffset
1972       double precision x(ilo:ihi,jlo:jhi,alo:ahi,blo:bhi)
1973       integer g_k
1974       integer a,b,ijlo,ijhi,bleng,ab,ij_leng,ileng
1975
1976       ileng = ihi - ilo + 1
1977       ij_leng = ileng*(jhi - jlo + 1)
1978       bleng = bhi - blo + 1
1979       ijlo = koffset + (jlo-1)*ileng + 1
1980       ijhi = koffset + jhi*ileng
1981       do a=alo,ahi
1982         do b=blo,bhi
1983           ab = (a-alo)*bleng + (b-blo+1)
1984           call ga_acc(g_k,ijlo,ijhi,ab,ab,x(ilo,jlo,a,b),ij_leng,1.d0)
1985         enddo
1986       enddo
1987       return
1988       end
1989
1990
1991
1992
1993       subroutine moints_push2idxJ( nbf, alo, ahi, blo, bhi,
1994     $                              ilo, ihi, jlo, jhi, joffset,
1995     $                              x, g_j )
1996       implicit none
1997#include "global.fh"
1998       integer nbf,alo,ahi,blo,bhi
1999       integer ilo,ihi,jlo,jhi
2000       integer joffset
2001       double precision x(ilo:ihi,jlo:jhi,alo:ahi,alo:ahi)
2002       integer a,b,ijlo,ijhi,ab,ij_leng,ileng
2003       integer g_j
2004
2005       ileng = ihi - ilo + 1
2006       ij_leng = ileng*(jhi - jlo + 1)
2007       ijlo = joffset + (jlo-1)*ileng + 1
2008       ijhi = joffset + jhi*ileng
2009       do a=alo,ahi
2010         do b=alo,ahi
2011           ab = ((a-alo+1)*(a-alo))/2 + (b-alo+1)
2012           call ga_acc(g_j,ijlo,ijhi,ab,ab,x(ilo,jlo,a,b),ij_leng,1.d0)
2013         enddo
2014       enddo
2015       return
2016       end
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029       subroutine moints_trf34idx_K( nbf, alo, ahi, blo, bhi,
2030     $                               ilo, ihi, offset, c, g_k,
2031     $                               t1, t2, tmp, g_exch )
2032       implicit none
2033#include "global.fh"
2034#include "mafdecls.fh"
2035       integer nbf,alo,ahi,blo,bhi,ilo,ihi
2036       integer g_k,g_exch
2037       integer offset
2038       double precision c(nbf,nbf)
2039       double precision t1(alo:ahi,blo:bhi)
2040       double precision t2(blo:bhi,alo:ahi)
2041       double precision tmp(*)
2042       integer my_id,rlo,rhi,clo,chi,ileng
2043       integer k_local,ld_local
2044       integer a,b,ia,i,j,ij,bleng,jblo,jbhi
2045
2046
2047       ileng = ihi - ilo + 1
2048       bleng = bhi - blo + 1
2049       my_id = ga_nodeid()
2050       call ga_distribution(g_k,my_id,rlo,rhi,clo,chi)
2051       do i=alo,ahi
2052         do j=blo,bhi
2053           ij = (i-alo)*(bhi-blo+1) + (j-blo+1)
2054           if ((ij.ge.clo).and.(ij.le.chi)) then
2055             call ga_access(g_k,rlo,rhi,ij,ij,k_local,ld_local)
2056             call moints_trf34idx_a(nbf,ilo,ihi,alo,ahi,blo,bhi,c,
2057     $                       dbl_mb(k_local+offset),tmp,t1)
2058             call moints_trf34idx_a(nbf,ilo,ihi,blo,bhi,alo,ahi,c,
2059     $                       dbl_mb(k_local+offset),tmp,t2)
2060             call ga_release(g_k,rlo,rhi,ij,ij)
2061             jblo = (j-blo)*bleng + 1
2062             jbhi = (j-blo)*bleng + bleng
2063             do a=alo,i
2064               do b=blo,bhi
2065                 t2(b,a) = t2(b,a) + t1(a,b)
2066               enddo
2067               ia = ((i-alo)*(i-alo+1))/2 + (a-alo+1)
2068               call ga_acc(g_exch,jblo,jbhi,ia,ia,t2(blo,a),bleng,1.d0)
2069             enddo
2070           endif
2071         enddo
2072       enddo
2073
2074
2075       return
2076       end
2077
2078
2079
2080
2081
2082
2083
2084
2085       subroutine moints_trf34idx_J( nbf, alo, ahi, blo, bhi,
2086     $                               ilo, ihi, offset, c, g_j,
2087     $                               t1, t2, tmp, g_coul )
2088       implicit none
2089#include "global.fh"
2090#include "mafdecls.fh"
2091       integer nbf,alo,ahi,blo,bhi,ilo,ihi
2092       integer offset
2093       integer g_j,g_coul
2094       double precision c(nbf,nbf)
2095       double precision t1(blo:bhi,blo:bhi)
2096       double precision t2(blo:bhi,blo:bhi)
2097       double precision tmp(*)
2098       integer my_id,rlo,rhi,clo,chi,ileng
2099       integer k_local,ld_local
2100       integer a,b,i,j,ij,bleng,bbleng,bblo,bbhi
2101
2102       ileng = ihi - ilo + 1
2103       bleng = bhi - blo + 1
2104       bbleng = bleng*bleng
2105       bblo = 1
2106       bbhi = bbleng
2107       my_id = ga_nodeid()
2108       call ga_distribution(g_j,my_id,rlo,rhi,clo,chi)
2109       do i=alo,ahi
2110         do j=alo,i
2111           ij = ((i-alo)*(i-alo+1))/2 + (j-alo+1)
2112           if ((ij.ge.clo).and.(ij.le.chi)) then
2113             call ga_access(g_j,rlo,rhi,ij,ij,k_local,ld_local)
2114             call moints_trf34idx_a(nbf,ilo,ihi,blo,bhi,blo,bhi,c,
2115     $                              dbl_mb(k_local+offset),tmp,t1)
2116             call ga_release(g_j,rlo,rhi,ij,ij)
2117             do a=blo,bhi
2118               do b=blo,a
2119                 t2(a,b) = t1(a,b) + t1(b,a)
2120                 t2(b,a) = t2(a,b)
2121               enddo
2122             enddo
2123             call ga_acc(g_coul,bblo,bbhi,ij,ij,t2,bbleng,1.d0)
2124           endif
2125         enddo
2126       enddo
2127       return
2128       end
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139       subroutine moints_trf34idx_a( nbf, ilo, ihi, alo, ahi,
2140     $                               blo, bhi, c, x, y, z )
2141       implicit none
2142       integer nbf,ilo,ihi,alo,ahi,blo,bhi
2143       double precision c(nbf,nbf)
2144       double precision x(ilo:ihi,ihi)
2145       double precision y(alo:ahi,ilo:ihi)
2146       double precision z(alo:ahi,blo:bhi)
2147       integer na,nb,ileng
2148
2149       na = ahi - alo + 1
2150       nb = bhi - blo + 1
2151       ileng = ihi - ilo + 1
2152
2153       call sgemm('t','t',na,ileng,ihi,1.d0,c(1,alo),nbf,x,ileng,
2154     $            0.d0,y,na)
2155       call sgemm('n','n',na,nb,ileng,1.d0,y,na,c(ilo,blo),nbf,
2156     $            0.d0,z,na)
2157       end
2158
2159
2160
2161
2162
2163
2164
2165       subroutine moints_inner( basis, nbf, nsh, ish, jsh,
2166     $                          ibflo, ibfhi,
2167     $                          next, ploop, tasksiz, offset,
2168     $                          ocoul, oexch,
2169     $                          mo1_lo, mo1_hi, mo2_lo, mo2_hi,
2170     $                          mo, max2e, eri, erit, mem2, scr,
2171     $                          sssi, n_ssni, ssni, ssai, ssji,
2172     $                          g_k2, g_j2 )
2173       implicit none
2174#include "tcgmsg.fh"
2175#include "global.fh"
2176#include "bas.fh"
2177#include "schwarz.fh"
2178       integer basis
2179       integer nbf, nsh, ish, jsh
2180       integer ibflo, ibfhi
2181       integer next, ploop, tasksiz
2182       integer mo1_lo, mo1_hi, mo2_lo, mo2_hi
2183       integer mem2, max2e, n_ssni
2184       integer g_k2, g_j2
2185       integer offset
2186       logical oexch, ocoul
2187       double precision mo(nbf,nbf)
2188       double precision eri(max2e),erit(max2e),scr(mem2)
2189       double precision ssni(n_ssni),sssi(*),ssai(*),ssji(*)
2190c
2191c
2192       integer nmo1
2193       integer ksh,lsh
2194       integer jbflo,jbfhi,kbflo,kbfhi,lbflo,lbfhi
2195       integer ilen,jlen,klen,llen
2196       integer numnodes,next0,tri_shells
2197       double precision tol2e,permscale
2198       double precision tt,txy,tzz
2199#ifdef DEBUG_PARALLEL
2200       integer nptasks,npcomm
2201       double precision schw_done
2202       common/pstats/nptasks,npcomm,schw_done
2203       double precision t1qtr,t2qtr,t34,tpush,tint,tloop,t11
2204       common/ztime/t1qtr,t2qtr,t34,tpush,tint,tloop,t11
2205#endif
2206
2207c
2208c
2209       integer nxtask,nxtseg
2210       external nxtask,nxtseg
2211c
2212c
2213       data tol2e/1.e-12/
2214
2215
2216       txy = tcgtime()
2217       ilen = ibfhi - ibflo + 1
2218       if (.not. bas_cn2bfr(basis,jsh,jbflo,jbfhi))
2219     $     call errquit('mo_ints: bas_cn2bfr',jsh)
2220       jlen = jbfhi - jbflo + 1
2221       numnodes = ga_nnodes()
2222
2223       next0 = next
2224       tri_shells = (nsh*(nsh+1))/2
2225       nmo1 = mo1_hi - mo1_lo + 1
2226       call dfill(n_ssni,0.d0,ssni,1)
2227       do ksh=1,nsh
2228         if (.not. bas_cn2bfr(basis,ksh,kbflo,kbfhi))
2229     $        call errquit('mo_ints: bas_cn2bfr',ksh)
2230         klen = kbfhi - kbflo + 1
2231         do lsh=1,ksh
2232           if (next.eq.ploop) then
2233             nptasks = nptasks + 1
2234             if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh)
2235     $            .ge.tol2e) then
2236               schw_done = schw_done + 1.d0
2237               if (.not. bas_cn2bfr(basis,lsh,lbflo,lbfhi))
2238     $              call errquit('mo_ints: bas_cn2bfr',lsh)
2239               llen = lbfhi - lbflo + 1
2240               tt = tcgtime()
2241               call int_2e4c(basis, ish, jsh, basis, ksh, lsh,
2242     $                       mem2, scr, max2e, eri )
2243               tint = tint + tcgtime() - tt
2244               permscale = 1.d0
2245               if (ksh.eq.lsh) permscale = 0.5d0
2246#ifdef USEGEMM
2247               call moints_trf1idx( nbf, mo1_lo, mo1_hi,
2248     $                              ilen, jlen, kbflo, kbfhi,
2249     $                              lbflo, lbfhi, permscale,
2250     $                              eri, mo, sssi, ssni )
2251               call moints_inttrsp( ilen, jlen, klen, llen, eri, erit )
2252               call moints_trf1idx( nbf, mo1_lo, mo1_hi,
2253     $                              ilen, jlen, lbflo, lbfhi,
2254     $                              kbflo, kbfhi, permscale,
2255     $                              erit, mo, sssi, ssni )
2256#else
2257               call moints_ytrf1idx( nbf, mo1_lo, mo1_hi,
2258     $                              ilen, jlen, kbflo, kbfhi,
2259     $                              lbflo, lbfhi, permscale,
2260     $                              eri, mo, ssni )
2261#endif
2262               t1qtr = t1qtr + tcgtime() - tt
2263             endif
2264#ifdef SEGMENT_TASK
2265             next = nxtseg(numnodes,tasksiz,tri_shells)
2266#else
2267             next = nxtask(numnodes,tasksiz)
2268#endif
2269           endif
2270           ploop = ploop + 1
2271         enddo
2272       enddo
2273       if (next.eq.next0) tloop = tloop + tcgtime() - txy
2274       t11 = t11 + tcgtime() - txy
2275c
2276c 2nd qtr transformation
2277c
2278       if (next.ne.next0) then
2279         tt = tcgtime()
2280         permscale = 1.d0
2281         if (jsh.eq.ish) permscale = 0.5d0
2282         if (oexch)
2283     $     call moints_trf2idx( nbf, mo2_lo, mo2_hi, nmo1,
2284     $                          ilen, jlen, permscale,
2285     $                          ssni, mo, ssai )
2286         if (ocoul)
2287     $     call moints_trf2idx( nbf, mo1_lo, mo1_hi, nmo1,
2288     $                          ilen, jlen, permscale,
2289     $                          ssni, mo, ssji )
2290c
2291c Push intermediates into global
2292c
2293         npcomm = npcomm + 1
2294         tzz = tcgtime()
2295         if (oexch)
2296     $     call moints_push2idxK( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
2297     $                            ibflo, ibfhi, jbflo, jbfhi, offset,
2298     $                            ssai, g_k2 )
2299         if (ocoul)
2300     $     call moints_push2idxJ( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
2301     $                            ibflo, ibfhi, jbflo, jbfhi, offset,
2302     $                            ssji, g_j2 )
2303         tpush = tpush + tcgtime() - tzz
2304         t2qtr = t2qtr + tcgtime() - tt
2305       endif
2306       return
2307       end
2308
2309
2310
2311#endif /* OLD_ALGORITHM */
2312
2313
2314
2315#ifndef USEGEMM
2316       subroutine moints_ytrf1idx( nbf, a1, a2, ni, nj,
2317     $                            klo, khi, llo, lhi,
2318     $                            scale, eri, c, y )
2319       implicit none
2320       integer nbf,a1,a2,ni,nj,klo,khi,llo,lhi
2321       double precision scale
2322       double precision eri(llo:lhi,klo:khi,nj,ni)
2323       double precision c(nbf,nbf)
2324       double precision y(nbf,ni,nj,a1:a2)
2325       integer i,j,a,k,l,nn
2326       double precision xx
2327
2328       if (scale.ne.1.d0) then
2329         nn = ni*nj*(khi-klo+1)*(lhi-llo+1)
2330         call sscal(nn,scale,eri,1)
2331       endif
2332
2333       do a=a1,a2
2334         do j=1,nj
2335           do i=1,ni
2336             do k=klo,khi
2337               xx = 0.d0
2338               do l=llo,lhi
2339                 xx = xx + eri(l,k,j,i)*c(l,a)
2340               enddo
2341               y(k,i,j,a) = y(k,i,j,a) + xx
2342             enddo
2343             do l=llo,lhi
2344               xx = 0.d0
2345               do k=klo,khi
2346                 xx = xx + eri(l,k,j,i)*c(k,a)
2347               enddo
2348               y(l,i,j,a) = y(l,i,j,a) + xx
2349             enddo
2350           enddo
2351         enddo
2352       enddo
2353
2354       return
2355       end
2356
2357#endif /* ! USEGEMM */
2358c
2359c
2360c
2361c Unused routines (may come in handy later)
2362c
2363c
2364c
2365c
2366#if defined(MOINTS_PAR_STATS)
2367
2368c
2369c Broadcast all nodes' value. Result are
2370c returned in ordered vector
2371c
2372
2373       subroutine brdcst_ivec( value, v )
2374C$Id$
2375       implicit none
2376       integer intlen
2377#ifdef KSR
2378       parameter (intlen=8)
2379#else
2380       parameter (intlen=4)
2381#endif
2382#include "tcgmsg.fh"
2383#include "global.fh"
2384#include "msgids.fh"
2385       integer value, v(*)
2386       integer myid, numnodes, itmp, i
2387
2388       numnodes = ga_nnodes()
2389       myid = ga_nodeid()
2390       do i=0,numnodes-1
2391         if (i.eq.myid) itmp = value
2392         call ga_brdcst(Msg_BcstVec,itmp,intlen,i)
2393         v(i+1) = itmp
2394       enddo
2395       return
2396       end
2397
2398
2399
2400       subroutine brdcst_dvec( value, v, avg )
2401       implicit none
2402       integer msgtype,dbllen
2403       parameter (msgtype=1973)
2404       parameter (dbllen=8)
2405#include "tcgmsg.fh"
2406#include "global.fh"
2407       double precision value, v(*), avg
2408       integer myid, numnodes, i
2409       double precision dtmp
2410
2411       avg = 0.d0
2412       numnodes = ga_nnodes()
2413       myid = ga_nodeid()
2414       do i=0,numnodes-1
2415         if (i.eq.myid) dtmp = value
2416         call ga_brdcst(msgtype,dtmp,dbllen,i)
2417         v(i+1) = dtmp
2418         avg = avg + dtmp
2419       enddo
2420       avg = avg/numnodes
2421       return
2422       end
2423#else
2424       subroutine brdcst_ivec( value, v )
2425       implicit none
2426       integer value, v(*)
2427       return
2428       end
2429#endif
2430
2431
2432
2433      subroutine moints_trf2Kynew
2434     $     ( nbf, qlo, qhi, oseglo, oseghi,
2435     $     vlo, vhi, ilo, ihi, jlo, jhi, gloc,
2436     $     ssni, h, h2, ct, g_buf )
2437      implicit none
2438#include "global.fh"
2439      integer nbf, qlo, qhi, oseglo, oseghi, vlo, vhi
2440      integer ilo, ihi, jlo, jhi
2441      integer gloc(nbf,nbf)
2442      double precision ssni(nbf,jlo:jhi,ilo:ihi,oseglo:oseghi)
2443      double precision h(vlo:vhi,jlo:jhi,ilo:ihi),h2(*)
2444      double precision ct(qlo:qhi,nbf),s, sum
2445      integer g_buf
2446      integer ilen, jlen, ijlen, vlen, qlen
2447      integer glo, ghi, i, j, ij,k, jtop, klo, khi
2448      integer o, v, vo, vtop, vbot
2449      integer kchunk, vchunk
2450      parameter (kchunk=32, vchunk=96) ! For cache reuse
2451
2452*      integer ilen, jlen, ijlen
2453*      integer glo, ghi, i, j, ij,k
2454*      integer o, v, vo, jtop
2455
2456c
2457      ilen = ihi - ilo + 1
2458      jlen = jhi - jlo + 1
2459      vlen = vhi - vlo + 1
2460      qlen = qhi - qlo + 1
2461      glo  = gloc(ilo,jlo)
2462      ghi  = gloc(ihi,jhi)
2463      ijlen = ilen*jlen
2464
2465      do o = oseglo, oseghi
2466*     call dgemm('n','n',vlen,ijlen,nbf,1.0d0,ct(vlo,1),qlen,
2467*     $        ssni(1,1,1,o),nbf,0.0d0,h,vlen)
2468c
2469c     Following is the same as the above dgemm but using
2470c     sparsity of the integrals and restricting diagonal where possible
2471c
2472         call dfill((vhi-vlo+1)*ilen*jlen,0.0d0,h,1)
2473         do klo = 1, nbf, kchunk ! Blocking for cache
2474            khi = min(nbf,klo+kchunk-1)
2475            do vbot = vlo, vhi, vchunk ! Blocking for cache
2476               vtop = min(vhi,vbot+vchunk-1)
2477               do i = ilo, ihi
2478                  jtop = jhi
2479                  if (ilo .eq. jlo) jtop = i
2480                  do j = jlo, jtop
2481                     do k = klo, khi
2482                        s = ssni(k,j,i,o)
2483                        if (abs(s) .gt. 1d-12) then
2484                           do v = vbot,vtop
2485                              h(v,j,i) = h(v,j,i) + s*ct(v,k)
2486                           enddo
2487                        endif
2488                     enddo
2489                  enddo
2490               enddo
2491            enddo
2492         enddo
2493         do v=vlo,vhi
2494            vo = (v-vlo)*(oseghi-oseglo+1) + o - oseglo + 1
2495            ij = 0
2496            sum = 0.0d0
2497            do i=ilo,ihi
2498               jtop = jhi
2499               if (ihi.eq.jhi) jtop = i
2500               do j=jlo,jtop
2501                  ij =   ij + 1
2502                  h2(ij) = h(v,j,i)
2503                  sum = sum + h(v,j,i)*h(v,j,i)
2504               enddo
2505            enddo
2506c     Note that sum is the sum of squares hence test against tol**2
2507            if (sum .gt. 1d-20) call ga_put(g_buf,glo,ghi,vo,vo,h2,1)
2508         enddo
2509      enddo
2510c
2511      return
2512      end
2513
2514
2515
2516
2517
2518
2519