1 #include "simint/boys/boys.h"
2 #include "simint/ostei/gen/ostei_generated.h"
3 #include "simint/vectorization/vectorization.h"
4 #include <math.h>
5 #include <string.h>
6
7
ostei_s_p_p_p(struct simint_multi_shellpair const P,struct simint_multi_shellpair const Q,double screen_tol,double * const restrict work,double * const restrict INT__s_p_p_p)8 int ostei_s_p_p_p(struct simint_multi_shellpair const P,
9 struct simint_multi_shellpair const Q,
10 double screen_tol,
11 double * const restrict work,
12 double * const restrict INT__s_p_p_p)
13 {
14
15 SIMINT_ASSUME_ALIGN_DBL(work);
16 SIMINT_ASSUME_ALIGN_DBL(INT__s_p_p_p);
17 int ab, cd, abcd;
18 int istart, jstart;
19 int iprimcd, nprim_icd, icd;
20 const int check_screen = (screen_tol > 0.0);
21 int i, j;
22 int n;
23 int not_screened;
24 int real_abcd;
25 int ibra;
26
27 // partition workspace
28 double * const INT__s_p_p_s = work + (SIMINT_NSHELL_SIMD * 0);
29 double * const INT__s_p_d_s = work + (SIMINT_NSHELL_SIMD * 9);
30 SIMINT_DBLTYPE * const primwork = (SIMINT_DBLTYPE *)(work + SIMINT_NSHELL_SIMD*27);
31 SIMINT_DBLTYPE * const restrict PRIM_INT__s_s_s_s = primwork + 0;
32 SIMINT_DBLTYPE * const restrict PRIM_INT__s_s_p_s = primwork + 4;
33 SIMINT_DBLTYPE * const restrict PRIM_INT__s_s_d_s = primwork + 13;
34 SIMINT_DBLTYPE * const restrict PRIM_INT__s_p_p_s = primwork + 25;
35 SIMINT_DBLTYPE * const restrict PRIM_INT__s_p_d_s = primwork + 34;
36 double * const hrrwork = (double *)(primwork + 52);
37
38
39 // Create constants
40 const SIMINT_DBLTYPE const_1 = SIMINT_DBLSET1(1);
41 const SIMINT_DBLTYPE const_2 = SIMINT_DBLSET1(2);
42 const SIMINT_DBLTYPE one_half = SIMINT_DBLSET1(0.5);
43
44
45 ////////////////////////////////////////
46 // Loop over shells and primitives
47 ////////////////////////////////////////
48
49 real_abcd = 0;
50 istart = 0;
51 for(ab = 0; ab < P.nshell12_clip; ++ab)
52 {
53 const int iend = istart + P.nprim12[ab];
54
55 cd = 0;
56 jstart = 0;
57
58 for(cd = 0; cd < Q.nshell12_clip; cd += SIMINT_NSHELL_SIMD)
59 {
60 const int nshellbatch = ((cd + SIMINT_NSHELL_SIMD) > Q.nshell12_clip) ? Q.nshell12_clip - cd : SIMINT_NSHELL_SIMD;
61 int jend = jstart;
62 for(i = 0; i < nshellbatch; i++)
63 jend += Q.nprim12[cd+i];
64
65 // Clear the beginning of the workspace (where we are accumulating integrals)
66 memset(work, 0, SIMINT_NSHELL_SIMD * 27 * sizeof(double));
67 abcd = 0;
68
69
70 for(i = istart; i < iend; ++i)
71 {
72 SIMINT_DBLTYPE bra_screen_max; // only used if check_screen
73
74 if(check_screen)
75 {
76 // Skip this whole thing if always insignificant
77 if((P.screen[i] * Q.screen_max) < screen_tol)
78 continue;
79 bra_screen_max = SIMINT_DBLSET1(P.screen[i]);
80 }
81
82 icd = 0;
83 iprimcd = 0;
84 nprim_icd = Q.nprim12[cd];
85 double * restrict PRIM_PTR_INT__s_p_p_s = INT__s_p_p_s + abcd * 9;
86 double * restrict PRIM_PTR_INT__s_p_d_s = INT__s_p_d_s + abcd * 18;
87
88
89
90 // Load these one per loop over i
91 const SIMINT_DBLTYPE P_alpha = SIMINT_DBLSET1(P.alpha[i]);
92 const SIMINT_DBLTYPE P_prefac = SIMINT_DBLSET1(P.prefac[i]);
93 const SIMINT_DBLTYPE Pxyz[3] = { SIMINT_DBLSET1(P.x[i]), SIMINT_DBLSET1(P.y[i]), SIMINT_DBLSET1(P.z[i]) };
94
95 const SIMINT_DBLTYPE P_PB[3] = { SIMINT_DBLSET1(P.PB_x[i]), SIMINT_DBLSET1(P.PB_y[i]), SIMINT_DBLSET1(P.PB_z[i]) };
96
97 for(j = jstart; j < jend; j += SIMINT_SIMD_LEN)
98 {
99 // calculate the shell offsets
100 // these are the offset from the shell pointed to by cd
101 // for each element
102 int shelloffsets[SIMINT_SIMD_LEN] = {0};
103 int lastoffset = 0;
104 const int nlane = ( ((j + SIMINT_SIMD_LEN) < jend) ? SIMINT_SIMD_LEN : (jend - j));
105
106 if((iprimcd + SIMINT_SIMD_LEN) >= nprim_icd)
107 {
108 // Handle if the first element of the vector is a new shell
109 if(iprimcd >= nprim_icd && ((icd+1) < nshellbatch))
110 {
111 nprim_icd += Q.nprim12[cd + (++icd)];
112 PRIM_PTR_INT__s_p_p_s += 9;
113 PRIM_PTR_INT__s_p_d_s += 18;
114 }
115 iprimcd++;
116 for(n = 1; n < SIMINT_SIMD_LEN; ++n)
117 {
118 if(iprimcd >= nprim_icd && ((icd+1) < nshellbatch))
119 {
120 shelloffsets[n] = shelloffsets[n-1] + 1;
121 lastoffset++;
122 nprim_icd += Q.nprim12[cd + (++icd)];
123 }
124 else
125 shelloffsets[n] = shelloffsets[n-1];
126 iprimcd++;
127 }
128 }
129 else
130 iprimcd += SIMINT_SIMD_LEN;
131
132 // Do we have to compute this vector (or has it been screened out)?
133 // (not_screened != 0 means we have to do this vector)
134 if(check_screen)
135 {
136 const double vmax = vector_max(SIMINT_MUL(bra_screen_max, SIMINT_DBLLOAD(Q.screen, j)));
137 if(vmax < screen_tol)
138 {
139 PRIM_PTR_INT__s_p_p_s += lastoffset*9;
140 PRIM_PTR_INT__s_p_d_s += lastoffset*18;
141 continue;
142 }
143 }
144
145 const SIMINT_DBLTYPE Q_alpha = SIMINT_DBLLOAD(Q.alpha, j);
146 const SIMINT_DBLTYPE PQalpha_mul = SIMINT_MUL(P_alpha, Q_alpha);
147 const SIMINT_DBLTYPE PQalpha_sum = SIMINT_ADD(P_alpha, Q_alpha);
148 const SIMINT_DBLTYPE one_over_PQalpha_sum = SIMINT_DIV(const_1, PQalpha_sum);
149
150
151 /* construct R2 = (Px - Qx)**2 + (Py - Qy)**2 + (Pz -Qz)**2 */
152 SIMINT_DBLTYPE PQ[3];
153 PQ[0] = SIMINT_SUB(Pxyz[0], SIMINT_DBLLOAD(Q.x, j));
154 PQ[1] = SIMINT_SUB(Pxyz[1], SIMINT_DBLLOAD(Q.y, j));
155 PQ[2] = SIMINT_SUB(Pxyz[2], SIMINT_DBLLOAD(Q.z, j));
156 SIMINT_DBLTYPE R2 = SIMINT_MUL(PQ[0], PQ[0]);
157 R2 = SIMINT_FMADD(PQ[1], PQ[1], R2);
158 R2 = SIMINT_FMADD(PQ[2], PQ[2], R2);
159
160 const SIMINT_DBLTYPE alpha = SIMINT_MUL(PQalpha_mul, one_over_PQalpha_sum); // alpha from MEST
161 const SIMINT_DBLTYPE one_over_p = SIMINT_DIV(const_1, P_alpha);
162 const SIMINT_DBLTYPE one_over_q = SIMINT_DIV(const_1, Q_alpha);
163 const SIMINT_DBLTYPE one_over_2p = SIMINT_MUL(one_half, one_over_p);
164 const SIMINT_DBLTYPE one_over_2q = SIMINT_MUL(one_half, one_over_q);
165 const SIMINT_DBLTYPE one_over_2pq = SIMINT_MUL(one_half, one_over_PQalpha_sum);
166 const SIMINT_DBLTYPE Q_PA[3] = { SIMINT_DBLLOAD(Q.PA_x, j), SIMINT_DBLLOAD(Q.PA_y, j), SIMINT_DBLLOAD(Q.PA_z, j) };
167
168 // NOTE: Minus sign!
169 const SIMINT_DBLTYPE a_over_p = SIMINT_MUL(SIMINT_NEG(alpha), one_over_p);
170 SIMINT_DBLTYPE aop_PQ[3];
171 aop_PQ[0] = SIMINT_MUL(a_over_p, PQ[0]);
172 aop_PQ[1] = SIMINT_MUL(a_over_p, PQ[1]);
173 aop_PQ[2] = SIMINT_MUL(a_over_p, PQ[2]);
174
175 SIMINT_DBLTYPE a_over_q = SIMINT_MUL(alpha, one_over_q);
176 SIMINT_DBLTYPE aoq_PQ[3];
177 aoq_PQ[0] = SIMINT_MUL(a_over_q, PQ[0]);
178 aoq_PQ[1] = SIMINT_MUL(a_over_q, PQ[1]);
179 aoq_PQ[2] = SIMINT_MUL(a_over_q, PQ[2]);
180 // Put a minus sign here so we don't have to in RR routines
181 a_over_q = SIMINT_NEG(a_over_q);
182
183
184 //////////////////////////////////////////////
185 // Fjt function section
186 // Maximum v value: 3
187 //////////////////////////////////////////////
188 // The parameter to the Fjt function
189 const SIMINT_DBLTYPE F_x = SIMINT_MUL(R2, alpha);
190
191
192 const SIMINT_DBLTYPE Q_prefac = mask_load(nlane, Q.prefac + j);
193
194
195 boys_F_split(PRIM_INT__s_s_s_s, F_x, 3);
196 SIMINT_DBLTYPE prefac = SIMINT_SQRT(one_over_PQalpha_sum);
197 prefac = SIMINT_MUL(SIMINT_MUL(P_prefac, Q_prefac), prefac);
198 for(n = 0; n <= 3; n++)
199 PRIM_INT__s_s_s_s[n] = SIMINT_MUL(PRIM_INT__s_s_s_s[n], prefac);
200
201 //////////////////////////////////////////////
202 // Primitive integrals: Vertical recurrance
203 //////////////////////////////////////////////
204
205 const SIMINT_DBLTYPE vrr_const_1_over_2q = one_over_2q;
206 const SIMINT_DBLTYPE vrr_const_1_over_2pq = one_over_2pq;
207 const SIMINT_DBLTYPE vrr_const_2_over_2pq = SIMINT_MUL(const_2, one_over_2pq);
208
209
210
211 // Forming PRIM_INT__s_s_p_s[3 * 3];
212 for(n = 0; n < 3; ++n) // loop over orders of auxiliary function
213 {
214
215 PRIM_INT__s_s_p_s[n * 3 + 0] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_s_s_s[n * 1 + 0]);
216 PRIM_INT__s_s_p_s[n * 3 + 0] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_p_s[n * 3 + 0]);
217
218 PRIM_INT__s_s_p_s[n * 3 + 1] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_s_s_s[n * 1 + 0]);
219 PRIM_INT__s_s_p_s[n * 3 + 1] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_p_s[n * 3 + 1]);
220
221 PRIM_INT__s_s_p_s[n * 3 + 2] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_s_s_s[n * 1 + 0]);
222 PRIM_INT__s_s_p_s[n * 3 + 2] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_p_s[n * 3 + 2]);
223
224 }
225
226
227
228 // Forming PRIM_INT__s_p_p_s[1 * 9];
229 for(n = 0; n < 1; ++n) // loop over orders of auxiliary function
230 {
231
232 PRIM_INT__s_p_p_s[n * 9 + 0] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_p_s[n * 3 + 0]);
233 PRIM_INT__s_p_p_s[n * 9 + 0] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_p_p_s[n * 9 + 0]);
234 PRIM_INT__s_p_p_s[n * 9 + 0] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_p_p_s[n * 9 + 0]);
235
236 PRIM_INT__s_p_p_s[n * 9 + 1] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_p_s[n * 3 + 1]);
237 PRIM_INT__s_p_p_s[n * 9 + 1] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_p_p_s[n * 9 + 1]);
238
239 PRIM_INT__s_p_p_s[n * 9 + 2] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_p_s[n * 3 + 2]);
240 PRIM_INT__s_p_p_s[n * 9 + 2] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_p_p_s[n * 9 + 2]);
241
242 PRIM_INT__s_p_p_s[n * 9 + 3] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_p_s[n * 3 + 0]);
243 PRIM_INT__s_p_p_s[n * 9 + 3] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_p_p_s[n * 9 + 3]);
244
245 PRIM_INT__s_p_p_s[n * 9 + 4] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_p_s[n * 3 + 1]);
246 PRIM_INT__s_p_p_s[n * 9 + 4] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_p_p_s[n * 9 + 4]);
247 PRIM_INT__s_p_p_s[n * 9 + 4] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_p_p_s[n * 9 + 4]);
248
249 PRIM_INT__s_p_p_s[n * 9 + 5] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_p_s[n * 3 + 2]);
250 PRIM_INT__s_p_p_s[n * 9 + 5] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_p_p_s[n * 9 + 5]);
251
252 PRIM_INT__s_p_p_s[n * 9 + 6] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_p_s[n * 3 + 0]);
253 PRIM_INT__s_p_p_s[n * 9 + 6] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_p_p_s[n * 9 + 6]);
254
255 PRIM_INT__s_p_p_s[n * 9 + 7] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_p_s[n * 3 + 1]);
256 PRIM_INT__s_p_p_s[n * 9 + 7] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_p_p_s[n * 9 + 7]);
257
258 PRIM_INT__s_p_p_s[n * 9 + 8] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_p_s[n * 3 + 2]);
259 PRIM_INT__s_p_p_s[n * 9 + 8] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_p_p_s[n * 9 + 8]);
260 PRIM_INT__s_p_p_s[n * 9 + 8] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_p_p_s[n * 9 + 8]);
261
262 }
263
264
265
266 // Forming PRIM_INT__s_s_d_s[2 * 6];
267 for(n = 0; n < 2; ++n) // loop over orders of auxiliary function
268 {
269
270 PRIM_INT__s_s_d_s[n * 6 + 0] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_s_p_s[n * 3 + 0]);
271 PRIM_INT__s_s_d_s[n * 6 + 0] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_s_d_s[n * 6 + 0]);
272 PRIM_INT__s_s_d_s[n * 6 + 0] = SIMINT_FMADD( vrr_const_1_over_2q, SIMINT_FMADD(a_over_q, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_s_s[n * 1 + 0]), PRIM_INT__s_s_d_s[n * 6 + 0]);
273
274 PRIM_INT__s_s_d_s[n * 6 + 1] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_s_p_s[n * 3 + 0]);
275 PRIM_INT__s_s_d_s[n * 6 + 1] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_s_d_s[n * 6 + 1]);
276
277 PRIM_INT__s_s_d_s[n * 6 + 2] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_s_p_s[n * 3 + 0]);
278 PRIM_INT__s_s_d_s[n * 6 + 2] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_s_d_s[n * 6 + 2]);
279
280 PRIM_INT__s_s_d_s[n * 6 + 3] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_s_p_s[n * 3 + 1]);
281 PRIM_INT__s_s_d_s[n * 6 + 3] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_s_d_s[n * 6 + 3]);
282 PRIM_INT__s_s_d_s[n * 6 + 3] = SIMINT_FMADD( vrr_const_1_over_2q, SIMINT_FMADD(a_over_q, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_s_s[n * 1 + 0]), PRIM_INT__s_s_d_s[n * 6 + 3]);
283
284 PRIM_INT__s_s_d_s[n * 6 + 4] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_s_p_s[n * 3 + 1]);
285 PRIM_INT__s_s_d_s[n * 6 + 4] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_s_d_s[n * 6 + 4]);
286
287 PRIM_INT__s_s_d_s[n * 6 + 5] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_s_p_s[n * 3 + 2]);
288 PRIM_INT__s_s_d_s[n * 6 + 5] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_s_d_s[n * 6 + 5]);
289 PRIM_INT__s_s_d_s[n * 6 + 5] = SIMINT_FMADD( vrr_const_1_over_2q, SIMINT_FMADD(a_over_q, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_s_s[n * 1 + 0]), PRIM_INT__s_s_d_s[n * 6 + 5]);
290
291 }
292
293
294
295 // Forming PRIM_INT__s_p_d_s[1 * 18];
296 for(n = 0; n < 1; ++n) // loop over orders of auxiliary function
297 {
298
299 PRIM_INT__s_p_d_s[n * 18 + 0] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_d_s[n * 6 + 0]);
300 PRIM_INT__s_p_d_s[n * 18 + 0] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_d_s[(n+1) * 6 + 0], PRIM_INT__s_p_d_s[n * 18 + 0]);
301 PRIM_INT__s_p_d_s[n * 18 + 0] = SIMINT_FMADD( vrr_const_2_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_p_d_s[n * 18 + 0]);
302
303 PRIM_INT__s_p_d_s[n * 18 + 1] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_d_s[n * 6 + 1]);
304 PRIM_INT__s_p_d_s[n * 18 + 1] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_d_s[(n+1) * 6 + 1], PRIM_INT__s_p_d_s[n * 18 + 1]);
305 PRIM_INT__s_p_d_s[n * 18 + 1] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_p_d_s[n * 18 + 1]);
306
307 PRIM_INT__s_p_d_s[n * 18 + 2] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_d_s[n * 6 + 2]);
308 PRIM_INT__s_p_d_s[n * 18 + 2] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_d_s[(n+1) * 6 + 2], PRIM_INT__s_p_d_s[n * 18 + 2]);
309 PRIM_INT__s_p_d_s[n * 18 + 2] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_p_d_s[n * 18 + 2]);
310
311 PRIM_INT__s_p_d_s[n * 18 + 3] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_d_s[n * 6 + 3]);
312 PRIM_INT__s_p_d_s[n * 18 + 3] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_d_s[(n+1) * 6 + 3], PRIM_INT__s_p_d_s[n * 18 + 3]);
313
314 PRIM_INT__s_p_d_s[n * 18 + 4] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_d_s[n * 6 + 4]);
315 PRIM_INT__s_p_d_s[n * 18 + 4] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_d_s[(n+1) * 6 + 4], PRIM_INT__s_p_d_s[n * 18 + 4]);
316
317 PRIM_INT__s_p_d_s[n * 18 + 5] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_d_s[n * 6 + 5]);
318 PRIM_INT__s_p_d_s[n * 18 + 5] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_d_s[(n+1) * 6 + 5], PRIM_INT__s_p_d_s[n * 18 + 5]);
319
320 PRIM_INT__s_p_d_s[n * 18 + 6] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_d_s[n * 6 + 0]);
321 PRIM_INT__s_p_d_s[n * 18 + 6] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_d_s[(n+1) * 6 + 0], PRIM_INT__s_p_d_s[n * 18 + 6]);
322
323 PRIM_INT__s_p_d_s[n * 18 + 7] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_d_s[n * 6 + 1]);
324 PRIM_INT__s_p_d_s[n * 18 + 7] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_d_s[(n+1) * 6 + 1], PRIM_INT__s_p_d_s[n * 18 + 7]);
325 PRIM_INT__s_p_d_s[n * 18 + 7] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_p_d_s[n * 18 + 7]);
326
327 PRIM_INT__s_p_d_s[n * 18 + 8] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_d_s[n * 6 + 2]);
328 PRIM_INT__s_p_d_s[n * 18 + 8] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_d_s[(n+1) * 6 + 2], PRIM_INT__s_p_d_s[n * 18 + 8]);
329
330 PRIM_INT__s_p_d_s[n * 18 + 9] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_d_s[n * 6 + 3]);
331 PRIM_INT__s_p_d_s[n * 18 + 9] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_d_s[(n+1) * 6 + 3], PRIM_INT__s_p_d_s[n * 18 + 9]);
332 PRIM_INT__s_p_d_s[n * 18 + 9] = SIMINT_FMADD( vrr_const_2_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_p_d_s[n * 18 + 9]);
333
334 PRIM_INT__s_p_d_s[n * 18 + 10] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_d_s[n * 6 + 4]);
335 PRIM_INT__s_p_d_s[n * 18 + 10] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_d_s[(n+1) * 6 + 4], PRIM_INT__s_p_d_s[n * 18 + 10]);
336 PRIM_INT__s_p_d_s[n * 18 + 10] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_p_d_s[n * 18 + 10]);
337
338 PRIM_INT__s_p_d_s[n * 18 + 11] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_d_s[n * 6 + 5]);
339 PRIM_INT__s_p_d_s[n * 18 + 11] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_d_s[(n+1) * 6 + 5], PRIM_INT__s_p_d_s[n * 18 + 11]);
340
341 PRIM_INT__s_p_d_s[n * 18 + 12] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_d_s[n * 6 + 0]);
342 PRIM_INT__s_p_d_s[n * 18 + 12] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_d_s[(n+1) * 6 + 0], PRIM_INT__s_p_d_s[n * 18 + 12]);
343
344 PRIM_INT__s_p_d_s[n * 18 + 13] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_d_s[n * 6 + 1]);
345 PRIM_INT__s_p_d_s[n * 18 + 13] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_d_s[(n+1) * 6 + 1], PRIM_INT__s_p_d_s[n * 18 + 13]);
346
347 PRIM_INT__s_p_d_s[n * 18 + 14] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_d_s[n * 6 + 2]);
348 PRIM_INT__s_p_d_s[n * 18 + 14] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_d_s[(n+1) * 6 + 2], PRIM_INT__s_p_d_s[n * 18 + 14]);
349 PRIM_INT__s_p_d_s[n * 18 + 14] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 0], PRIM_INT__s_p_d_s[n * 18 + 14]);
350
351 PRIM_INT__s_p_d_s[n * 18 + 15] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_d_s[n * 6 + 3]);
352 PRIM_INT__s_p_d_s[n * 18 + 15] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_d_s[(n+1) * 6 + 3], PRIM_INT__s_p_d_s[n * 18 + 15]);
353
354 PRIM_INT__s_p_d_s[n * 18 + 16] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_d_s[n * 6 + 4]);
355 PRIM_INT__s_p_d_s[n * 18 + 16] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_d_s[(n+1) * 6 + 4], PRIM_INT__s_p_d_s[n * 18 + 16]);
356 PRIM_INT__s_p_d_s[n * 18 + 16] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 1], PRIM_INT__s_p_d_s[n * 18 + 16]);
357
358 PRIM_INT__s_p_d_s[n * 18 + 17] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_d_s[n * 6 + 5]);
359 PRIM_INT__s_p_d_s[n * 18 + 17] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_d_s[(n+1) * 6 + 5], PRIM_INT__s_p_d_s[n * 18 + 17]);
360 PRIM_INT__s_p_d_s[n * 18 + 17] = SIMINT_FMADD( vrr_const_2_over_2pq, PRIM_INT__s_s_p_s[(n+1) * 3 + 2], PRIM_INT__s_p_d_s[n * 18 + 17]);
361
362 }
363
364
365
366
367 ////////////////////////////////////
368 // Accumulate contracted integrals
369 ////////////////////////////////////
370 if(lastoffset == 0)
371 {
372 contract_all(9, PRIM_INT__s_p_p_s, PRIM_PTR_INT__s_p_p_s);
373 contract_all(18, PRIM_INT__s_p_d_s, PRIM_PTR_INT__s_p_d_s);
374 }
375 else
376 {
377 contract(9, shelloffsets, PRIM_INT__s_p_p_s, PRIM_PTR_INT__s_p_p_s);
378 contract(18, shelloffsets, PRIM_INT__s_p_d_s, PRIM_PTR_INT__s_p_d_s);
379 PRIM_PTR_INT__s_p_p_s += lastoffset*9;
380 PRIM_PTR_INT__s_p_d_s += lastoffset*18;
381 }
382
383 } // close loop over j
384 } // close loop over i
385
386 //Advance to the next batch
387 jstart = SIMINT_SIMD_ROUND(jend);
388
389 //////////////////////////////////////////////
390 // Contracted integrals: Horizontal recurrance
391 //////////////////////////////////////////////
392
393
394
395
396 for(abcd = 0; abcd < nshellbatch; ++abcd, ++real_abcd)
397 {
398 const double hCD[3] = { Q.AB_x[cd+abcd], Q.AB_y[cd+abcd], Q.AB_z[cd+abcd] };
399
400 // set up HRR pointers
401 double const * restrict HRR_INT__s_p_p_s = INT__s_p_p_s + abcd * 9;
402 double const * restrict HRR_INT__s_p_d_s = INT__s_p_d_s + abcd * 18;
403 double * restrict HRR_INT__s_p_p_p = INT__s_p_p_p + real_abcd * 27;
404
405 // form INT__s_p_p_p
406 for(ibra = 0; ibra < 3; ++ibra)
407 {
408 HRR_INT__s_p_p_p[ibra * 9 + 0] = HRR_INT__s_p_d_s[ibra * 6 + 0] + ( hCD[0] * HRR_INT__s_p_p_s[ibra * 3 + 0] );
409
410 HRR_INT__s_p_p_p[ibra * 9 + 1] = HRR_INT__s_p_d_s[ibra * 6 + 1] + ( hCD[1] * HRR_INT__s_p_p_s[ibra * 3 + 0] );
411
412 HRR_INT__s_p_p_p[ibra * 9 + 2] = HRR_INT__s_p_d_s[ibra * 6 + 2] + ( hCD[2] * HRR_INT__s_p_p_s[ibra * 3 + 0] );
413
414 HRR_INT__s_p_p_p[ibra * 9 + 3] = HRR_INT__s_p_d_s[ibra * 6 + 1] + ( hCD[0] * HRR_INT__s_p_p_s[ibra * 3 + 1] );
415
416 HRR_INT__s_p_p_p[ibra * 9 + 4] = HRR_INT__s_p_d_s[ibra * 6 + 3] + ( hCD[1] * HRR_INT__s_p_p_s[ibra * 3 + 1] );
417
418 HRR_INT__s_p_p_p[ibra * 9 + 5] = HRR_INT__s_p_d_s[ibra * 6 + 4] + ( hCD[2] * HRR_INT__s_p_p_s[ibra * 3 + 1] );
419
420 HRR_INT__s_p_p_p[ibra * 9 + 6] = HRR_INT__s_p_d_s[ibra * 6 + 2] + ( hCD[0] * HRR_INT__s_p_p_s[ibra * 3 + 2] );
421
422 HRR_INT__s_p_p_p[ibra * 9 + 7] = HRR_INT__s_p_d_s[ibra * 6 + 4] + ( hCD[1] * HRR_INT__s_p_p_s[ibra * 3 + 2] );
423
424 HRR_INT__s_p_p_p[ibra * 9 + 8] = HRR_INT__s_p_d_s[ibra * 6 + 5] + ( hCD[2] * HRR_INT__s_p_p_s[ibra * 3 + 2] );
425
426 }
427
428
429 } // close HRR loop
430
431
432 } // close loop cdbatch
433
434 istart = iend;
435 } // close loop over ab
436
437 return P.nshell12_clip * Q.nshell12_clip;
438 }
439
440