1 #include "simint/boys/boys.h"
2 #include "simint/ostei/gen/ostei_generated.h"
3 #include "simint/vectorization/vectorization.h"
4 #include <math.h>
5 #include <string.h>
6
7
ostei_s_f_p_p(struct simint_multi_shellpair const P,struct simint_multi_shellpair const Q,double screen_tol,double * const restrict work,double * const restrict INT__s_f_p_p)8 int ostei_s_f_p_p(struct simint_multi_shellpair const P,
9 struct simint_multi_shellpair const Q,
10 double screen_tol,
11 double * const restrict work,
12 double * const restrict INT__s_f_p_p)
13 {
14
15 SIMINT_ASSUME_ALIGN_DBL(work);
16 SIMINT_ASSUME_ALIGN_DBL(INT__s_f_p_p);
17 int ab, cd, abcd;
18 int istart, jstart;
19 int iprimcd, nprim_icd, icd;
20 const int check_screen = (screen_tol > 0.0);
21 int i, j;
22 int n;
23 int not_screened;
24 int real_abcd;
25 int ibra;
26
27 // partition workspace
28 double * const INT__s_f_p_s = work + (SIMINT_NSHELL_SIMD * 0);
29 double * const INT__s_f_d_s = work + (SIMINT_NSHELL_SIMD * 30);
30 SIMINT_DBLTYPE * const primwork = (SIMINT_DBLTYPE *)(work + SIMINT_NSHELL_SIMD*90);
31 SIMINT_DBLTYPE * const restrict PRIM_INT__s_s_s_s = primwork + 0;
32 SIMINT_DBLTYPE * const restrict PRIM_INT__s_p_s_s = primwork + 6;
33 SIMINT_DBLTYPE * const restrict PRIM_INT__s_d_s_s = primwork + 21;
34 SIMINT_DBLTYPE * const restrict PRIM_INT__s_d_p_s = primwork + 45;
35 SIMINT_DBLTYPE * const restrict PRIM_INT__s_f_s_s = primwork + 81;
36 SIMINT_DBLTYPE * const restrict PRIM_INT__s_f_p_s = primwork + 111;
37 SIMINT_DBLTYPE * const restrict PRIM_INT__s_f_d_s = primwork + 171;
38 double * const hrrwork = (double *)(primwork + 231);
39
40
41 // Create constants
42 const SIMINT_DBLTYPE const_1 = SIMINT_DBLSET1(1);
43 const SIMINT_DBLTYPE const_2 = SIMINT_DBLSET1(2);
44 const SIMINT_DBLTYPE const_3 = SIMINT_DBLSET1(3);
45 const SIMINT_DBLTYPE one_half = SIMINT_DBLSET1(0.5);
46
47
48 ////////////////////////////////////////
49 // Loop over shells and primitives
50 ////////////////////////////////////////
51
52 real_abcd = 0;
53 istart = 0;
54 for(ab = 0; ab < P.nshell12_clip; ++ab)
55 {
56 const int iend = istart + P.nprim12[ab];
57
58 cd = 0;
59 jstart = 0;
60
61 for(cd = 0; cd < Q.nshell12_clip; cd += SIMINT_NSHELL_SIMD)
62 {
63 const int nshellbatch = ((cd + SIMINT_NSHELL_SIMD) > Q.nshell12_clip) ? Q.nshell12_clip - cd : SIMINT_NSHELL_SIMD;
64 int jend = jstart;
65 for(i = 0; i < nshellbatch; i++)
66 jend += Q.nprim12[cd+i];
67
68 // Clear the beginning of the workspace (where we are accumulating integrals)
69 memset(work, 0, SIMINT_NSHELL_SIMD * 90 * sizeof(double));
70 abcd = 0;
71
72
73 for(i = istart; i < iend; ++i)
74 {
75 SIMINT_DBLTYPE bra_screen_max; // only used if check_screen
76
77 if(check_screen)
78 {
79 // Skip this whole thing if always insignificant
80 if((P.screen[i] * Q.screen_max) < screen_tol)
81 continue;
82 bra_screen_max = SIMINT_DBLSET1(P.screen[i]);
83 }
84
85 icd = 0;
86 iprimcd = 0;
87 nprim_icd = Q.nprim12[cd];
88 double * restrict PRIM_PTR_INT__s_f_p_s = INT__s_f_p_s + abcd * 30;
89 double * restrict PRIM_PTR_INT__s_f_d_s = INT__s_f_d_s + abcd * 60;
90
91
92
93 // Load these one per loop over i
94 const SIMINT_DBLTYPE P_alpha = SIMINT_DBLSET1(P.alpha[i]);
95 const SIMINT_DBLTYPE P_prefac = SIMINT_DBLSET1(P.prefac[i]);
96 const SIMINT_DBLTYPE Pxyz[3] = { SIMINT_DBLSET1(P.x[i]), SIMINT_DBLSET1(P.y[i]), SIMINT_DBLSET1(P.z[i]) };
97
98 const SIMINT_DBLTYPE P_PB[3] = { SIMINT_DBLSET1(P.PB_x[i]), SIMINT_DBLSET1(P.PB_y[i]), SIMINT_DBLSET1(P.PB_z[i]) };
99
100 for(j = jstart; j < jend; j += SIMINT_SIMD_LEN)
101 {
102 // calculate the shell offsets
103 // these are the offset from the shell pointed to by cd
104 // for each element
105 int shelloffsets[SIMINT_SIMD_LEN] = {0};
106 int lastoffset = 0;
107 const int nlane = ( ((j + SIMINT_SIMD_LEN) < jend) ? SIMINT_SIMD_LEN : (jend - j));
108
109 if((iprimcd + SIMINT_SIMD_LEN) >= nprim_icd)
110 {
111 // Handle if the first element of the vector is a new shell
112 if(iprimcd >= nprim_icd && ((icd+1) < nshellbatch))
113 {
114 nprim_icd += Q.nprim12[cd + (++icd)];
115 PRIM_PTR_INT__s_f_p_s += 30;
116 PRIM_PTR_INT__s_f_d_s += 60;
117 }
118 iprimcd++;
119 for(n = 1; n < SIMINT_SIMD_LEN; ++n)
120 {
121 if(iprimcd >= nprim_icd && ((icd+1) < nshellbatch))
122 {
123 shelloffsets[n] = shelloffsets[n-1] + 1;
124 lastoffset++;
125 nprim_icd += Q.nprim12[cd + (++icd)];
126 }
127 else
128 shelloffsets[n] = shelloffsets[n-1];
129 iprimcd++;
130 }
131 }
132 else
133 iprimcd += SIMINT_SIMD_LEN;
134
135 // Do we have to compute this vector (or has it been screened out)?
136 // (not_screened != 0 means we have to do this vector)
137 if(check_screen)
138 {
139 const double vmax = vector_max(SIMINT_MUL(bra_screen_max, SIMINT_DBLLOAD(Q.screen, j)));
140 if(vmax < screen_tol)
141 {
142 PRIM_PTR_INT__s_f_p_s += lastoffset*30;
143 PRIM_PTR_INT__s_f_d_s += lastoffset*60;
144 continue;
145 }
146 }
147
148 const SIMINT_DBLTYPE Q_alpha = SIMINT_DBLLOAD(Q.alpha, j);
149 const SIMINT_DBLTYPE PQalpha_mul = SIMINT_MUL(P_alpha, Q_alpha);
150 const SIMINT_DBLTYPE PQalpha_sum = SIMINT_ADD(P_alpha, Q_alpha);
151 const SIMINT_DBLTYPE one_over_PQalpha_sum = SIMINT_DIV(const_1, PQalpha_sum);
152
153
154 /* construct R2 = (Px - Qx)**2 + (Py - Qy)**2 + (Pz -Qz)**2 */
155 SIMINT_DBLTYPE PQ[3];
156 PQ[0] = SIMINT_SUB(Pxyz[0], SIMINT_DBLLOAD(Q.x, j));
157 PQ[1] = SIMINT_SUB(Pxyz[1], SIMINT_DBLLOAD(Q.y, j));
158 PQ[2] = SIMINT_SUB(Pxyz[2], SIMINT_DBLLOAD(Q.z, j));
159 SIMINT_DBLTYPE R2 = SIMINT_MUL(PQ[0], PQ[0]);
160 R2 = SIMINT_FMADD(PQ[1], PQ[1], R2);
161 R2 = SIMINT_FMADD(PQ[2], PQ[2], R2);
162
163 const SIMINT_DBLTYPE alpha = SIMINT_MUL(PQalpha_mul, one_over_PQalpha_sum); // alpha from MEST
164 const SIMINT_DBLTYPE one_over_p = SIMINT_DIV(const_1, P_alpha);
165 const SIMINT_DBLTYPE one_over_q = SIMINT_DIV(const_1, Q_alpha);
166 const SIMINT_DBLTYPE one_over_2p = SIMINT_MUL(one_half, one_over_p);
167 const SIMINT_DBLTYPE one_over_2q = SIMINT_MUL(one_half, one_over_q);
168 const SIMINT_DBLTYPE one_over_2pq = SIMINT_MUL(one_half, one_over_PQalpha_sum);
169 const SIMINT_DBLTYPE Q_PA[3] = { SIMINT_DBLLOAD(Q.PA_x, j), SIMINT_DBLLOAD(Q.PA_y, j), SIMINT_DBLLOAD(Q.PA_z, j) };
170
171 // NOTE: Minus sign!
172 const SIMINT_DBLTYPE a_over_p = SIMINT_MUL(SIMINT_NEG(alpha), one_over_p);
173 SIMINT_DBLTYPE aop_PQ[3];
174 aop_PQ[0] = SIMINT_MUL(a_over_p, PQ[0]);
175 aop_PQ[1] = SIMINT_MUL(a_over_p, PQ[1]);
176 aop_PQ[2] = SIMINT_MUL(a_over_p, PQ[2]);
177
178 SIMINT_DBLTYPE a_over_q = SIMINT_MUL(alpha, one_over_q);
179 SIMINT_DBLTYPE aoq_PQ[3];
180 aoq_PQ[0] = SIMINT_MUL(a_over_q, PQ[0]);
181 aoq_PQ[1] = SIMINT_MUL(a_over_q, PQ[1]);
182 aoq_PQ[2] = SIMINT_MUL(a_over_q, PQ[2]);
183 // Put a minus sign here so we don't have to in RR routines
184 a_over_q = SIMINT_NEG(a_over_q);
185
186
187 //////////////////////////////////////////////
188 // Fjt function section
189 // Maximum v value: 5
190 //////////////////////////////////////////////
191 // The parameter to the Fjt function
192 const SIMINT_DBLTYPE F_x = SIMINT_MUL(R2, alpha);
193
194
195 const SIMINT_DBLTYPE Q_prefac = mask_load(nlane, Q.prefac + j);
196
197
198 boys_F_split(PRIM_INT__s_s_s_s, F_x, 5);
199 SIMINT_DBLTYPE prefac = SIMINT_SQRT(one_over_PQalpha_sum);
200 prefac = SIMINT_MUL(SIMINT_MUL(P_prefac, Q_prefac), prefac);
201 for(n = 0; n <= 5; n++)
202 PRIM_INT__s_s_s_s[n] = SIMINT_MUL(PRIM_INT__s_s_s_s[n], prefac);
203
204 //////////////////////////////////////////////
205 // Primitive integrals: Vertical recurrance
206 //////////////////////////////////////////////
207
208 const SIMINT_DBLTYPE vrr_const_1_over_2p = one_over_2p;
209 const SIMINT_DBLTYPE vrr_const_2_over_2p = SIMINT_MUL(const_2, one_over_2p);
210 const SIMINT_DBLTYPE vrr_const_1_over_2q = one_over_2q;
211 const SIMINT_DBLTYPE vrr_const_1_over_2pq = one_over_2pq;
212 const SIMINT_DBLTYPE vrr_const_2_over_2pq = SIMINT_MUL(const_2, one_over_2pq);
213 const SIMINT_DBLTYPE vrr_const_3_over_2pq = SIMINT_MUL(const_3, one_over_2pq);
214
215
216
217 // Forming PRIM_INT__s_p_s_s[5 * 3];
218 for(n = 0; n < 5; ++n) // loop over orders of auxiliary function
219 {
220
221 PRIM_INT__s_p_s_s[n * 3 + 0] = SIMINT_MUL(P_PB[0], PRIM_INT__s_s_s_s[n * 1 + 0]);
222 PRIM_INT__s_p_s_s[n * 3 + 0] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_p_s_s[n * 3 + 0]);
223
224 PRIM_INT__s_p_s_s[n * 3 + 1] = SIMINT_MUL(P_PB[1], PRIM_INT__s_s_s_s[n * 1 + 0]);
225 PRIM_INT__s_p_s_s[n * 3 + 1] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_p_s_s[n * 3 + 1]);
226
227 PRIM_INT__s_p_s_s[n * 3 + 2] = SIMINT_MUL(P_PB[2], PRIM_INT__s_s_s_s[n * 1 + 0]);
228 PRIM_INT__s_p_s_s[n * 3 + 2] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_p_s_s[n * 3 + 2]);
229
230 }
231
232
233
234 // Forming PRIM_INT__s_d_s_s[4 * 6];
235 for(n = 0; n < 4; ++n) // loop over orders of auxiliary function
236 {
237
238 PRIM_INT__s_d_s_s[n * 6 + 0] = SIMINT_MUL(P_PB[0], PRIM_INT__s_p_s_s[n * 3 + 0]);
239 PRIM_INT__s_d_s_s[n * 6 + 0] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_d_s_s[n * 6 + 0]);
240 PRIM_INT__s_d_s_s[n * 6 + 0] = SIMINT_FMADD( vrr_const_1_over_2p, SIMINT_FMADD(a_over_p, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_s_s[n * 1 + 0]), PRIM_INT__s_d_s_s[n * 6 + 0]);
241
242 PRIM_INT__s_d_s_s[n * 6 + 1] = SIMINT_MUL(P_PB[1], PRIM_INT__s_p_s_s[n * 3 + 0]);
243 PRIM_INT__s_d_s_s[n * 6 + 1] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_d_s_s[n * 6 + 1]);
244
245 PRIM_INT__s_d_s_s[n * 6 + 2] = SIMINT_MUL(P_PB[2], PRIM_INT__s_p_s_s[n * 3 + 0]);
246 PRIM_INT__s_d_s_s[n * 6 + 2] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_d_s_s[n * 6 + 2]);
247
248 PRIM_INT__s_d_s_s[n * 6 + 3] = SIMINT_MUL(P_PB[1], PRIM_INT__s_p_s_s[n * 3 + 1]);
249 PRIM_INT__s_d_s_s[n * 6 + 3] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_p_s_s[(n+1) * 3 + 1], PRIM_INT__s_d_s_s[n * 6 + 3]);
250 PRIM_INT__s_d_s_s[n * 6 + 3] = SIMINT_FMADD( vrr_const_1_over_2p, SIMINT_FMADD(a_over_p, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_s_s[n * 1 + 0]), PRIM_INT__s_d_s_s[n * 6 + 3]);
251
252 PRIM_INT__s_d_s_s[n * 6 + 4] = SIMINT_MUL(P_PB[2], PRIM_INT__s_p_s_s[n * 3 + 1]);
253 PRIM_INT__s_d_s_s[n * 6 + 4] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_p_s_s[(n+1) * 3 + 1], PRIM_INT__s_d_s_s[n * 6 + 4]);
254
255 PRIM_INT__s_d_s_s[n * 6 + 5] = SIMINT_MUL(P_PB[2], PRIM_INT__s_p_s_s[n * 3 + 2]);
256 PRIM_INT__s_d_s_s[n * 6 + 5] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_p_s_s[(n+1) * 3 + 2], PRIM_INT__s_d_s_s[n * 6 + 5]);
257 PRIM_INT__s_d_s_s[n * 6 + 5] = SIMINT_FMADD( vrr_const_1_over_2p, SIMINT_FMADD(a_over_p, PRIM_INT__s_s_s_s[(n+1) * 1 + 0], PRIM_INT__s_s_s_s[n * 1 + 0]), PRIM_INT__s_d_s_s[n * 6 + 5]);
258
259 }
260
261
262
263 // Forming PRIM_INT__s_f_s_s[3 * 10];
264 for(n = 0; n < 3; ++n) // loop over orders of auxiliary function
265 {
266
267 PRIM_INT__s_f_s_s[n * 10 + 0] = SIMINT_MUL(P_PB[0], PRIM_INT__s_d_s_s[n * 6 + 0]);
268 PRIM_INT__s_f_s_s[n * 10 + 0] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 0], PRIM_INT__s_f_s_s[n * 10 + 0]);
269 PRIM_INT__s_f_s_s[n * 10 + 0] = SIMINT_FMADD( vrr_const_2_over_2p, SIMINT_FMADD(a_over_p, PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_p_s_s[n * 3 + 0]), PRIM_INT__s_f_s_s[n * 10 + 0]);
270
271 PRIM_INT__s_f_s_s[n * 10 + 1] = SIMINT_MUL(P_PB[1], PRIM_INT__s_d_s_s[n * 6 + 0]);
272 PRIM_INT__s_f_s_s[n * 10 + 1] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 0], PRIM_INT__s_f_s_s[n * 10 + 1]);
273
274 PRIM_INT__s_f_s_s[n * 10 + 2] = SIMINT_MUL(P_PB[2], PRIM_INT__s_d_s_s[n * 6 + 0]);
275 PRIM_INT__s_f_s_s[n * 10 + 2] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 0], PRIM_INT__s_f_s_s[n * 10 + 2]);
276
277 PRIM_INT__s_f_s_s[n * 10 + 3] = SIMINT_MUL(P_PB[0], PRIM_INT__s_d_s_s[n * 6 + 3]);
278 PRIM_INT__s_f_s_s[n * 10 + 3] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 3], PRIM_INT__s_f_s_s[n * 10 + 3]);
279
280 PRIM_INT__s_f_s_s[n * 10 + 4] = SIMINT_MUL(P_PB[2], PRIM_INT__s_d_s_s[n * 6 + 1]);
281 PRIM_INT__s_f_s_s[n * 10 + 4] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 1], PRIM_INT__s_f_s_s[n * 10 + 4]);
282
283 PRIM_INT__s_f_s_s[n * 10 + 5] = SIMINT_MUL(P_PB[0], PRIM_INT__s_d_s_s[n * 6 + 5]);
284 PRIM_INT__s_f_s_s[n * 10 + 5] = SIMINT_FMADD( aop_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 5], PRIM_INT__s_f_s_s[n * 10 + 5]);
285
286 PRIM_INT__s_f_s_s[n * 10 + 6] = SIMINT_MUL(P_PB[1], PRIM_INT__s_d_s_s[n * 6 + 3]);
287 PRIM_INT__s_f_s_s[n * 10 + 6] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 3], PRIM_INT__s_f_s_s[n * 10 + 6]);
288 PRIM_INT__s_f_s_s[n * 10 + 6] = SIMINT_FMADD( vrr_const_2_over_2p, SIMINT_FMADD(a_over_p, PRIM_INT__s_p_s_s[(n+1) * 3 + 1], PRIM_INT__s_p_s_s[n * 3 + 1]), PRIM_INT__s_f_s_s[n * 10 + 6]);
289
290 PRIM_INT__s_f_s_s[n * 10 + 7] = SIMINT_MUL(P_PB[2], PRIM_INT__s_d_s_s[n * 6 + 3]);
291 PRIM_INT__s_f_s_s[n * 10 + 7] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 3], PRIM_INT__s_f_s_s[n * 10 + 7]);
292
293 PRIM_INT__s_f_s_s[n * 10 + 8] = SIMINT_MUL(P_PB[1], PRIM_INT__s_d_s_s[n * 6 + 5]);
294 PRIM_INT__s_f_s_s[n * 10 + 8] = SIMINT_FMADD( aop_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 5], PRIM_INT__s_f_s_s[n * 10 + 8]);
295
296 PRIM_INT__s_f_s_s[n * 10 + 9] = SIMINT_MUL(P_PB[2], PRIM_INT__s_d_s_s[n * 6 + 5]);
297 PRIM_INT__s_f_s_s[n * 10 + 9] = SIMINT_FMADD( aop_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 5], PRIM_INT__s_f_s_s[n * 10 + 9]);
298 PRIM_INT__s_f_s_s[n * 10 + 9] = SIMINT_FMADD( vrr_const_2_over_2p, SIMINT_FMADD(a_over_p, PRIM_INT__s_p_s_s[(n+1) * 3 + 2], PRIM_INT__s_p_s_s[n * 3 + 2]), PRIM_INT__s_f_s_s[n * 10 + 9]);
299
300 }
301
302
303 VRR_K_s_f_p_s(
304 PRIM_INT__s_f_p_s,
305 PRIM_INT__s_f_s_s,
306 PRIM_INT__s_d_s_s,
307 Q_PA,
308 aoq_PQ,
309 one_over_2pq,
310 2);
311
312
313
314 // Forming PRIM_INT__s_d_p_s[2 * 18];
315 for(n = 0; n < 2; ++n) // loop over orders of auxiliary function
316 {
317
318 PRIM_INT__s_d_p_s[n * 18 + 0] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_d_s_s[n * 6 + 0]);
319 PRIM_INT__s_d_p_s[n * 18 + 0] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 0], PRIM_INT__s_d_p_s[n * 18 + 0]);
320 PRIM_INT__s_d_p_s[n * 18 + 0] = SIMINT_FMADD( vrr_const_2_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_d_p_s[n * 18 + 0]);
321
322 PRIM_INT__s_d_p_s[n * 18 + 1] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_d_s_s[n * 6 + 0]);
323 PRIM_INT__s_d_p_s[n * 18 + 1] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 0], PRIM_INT__s_d_p_s[n * 18 + 1]);
324
325 PRIM_INT__s_d_p_s[n * 18 + 2] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_d_s_s[n * 6 + 0]);
326 PRIM_INT__s_d_p_s[n * 18 + 2] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 0], PRIM_INT__s_d_p_s[n * 18 + 2]);
327
328 PRIM_INT__s_d_p_s[n * 18 + 3] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_d_s_s[n * 6 + 1]);
329 PRIM_INT__s_d_p_s[n * 18 + 3] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 1], PRIM_INT__s_d_p_s[n * 18 + 3]);
330 PRIM_INT__s_d_p_s[n * 18 + 3] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 1], PRIM_INT__s_d_p_s[n * 18 + 3]);
331
332 PRIM_INT__s_d_p_s[n * 18 + 4] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_d_s_s[n * 6 + 1]);
333 PRIM_INT__s_d_p_s[n * 18 + 4] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 1], PRIM_INT__s_d_p_s[n * 18 + 4]);
334 PRIM_INT__s_d_p_s[n * 18 + 4] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_d_p_s[n * 18 + 4]);
335
336 PRIM_INT__s_d_p_s[n * 18 + 5] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_d_s_s[n * 6 + 1]);
337 PRIM_INT__s_d_p_s[n * 18 + 5] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 1], PRIM_INT__s_d_p_s[n * 18 + 5]);
338
339 PRIM_INT__s_d_p_s[n * 18 + 6] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_d_s_s[n * 6 + 2]);
340 PRIM_INT__s_d_p_s[n * 18 + 6] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 2], PRIM_INT__s_d_p_s[n * 18 + 6]);
341 PRIM_INT__s_d_p_s[n * 18 + 6] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 2], PRIM_INT__s_d_p_s[n * 18 + 6]);
342
343 PRIM_INT__s_d_p_s[n * 18 + 7] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_d_s_s[n * 6 + 2]);
344 PRIM_INT__s_d_p_s[n * 18 + 7] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 2], PRIM_INT__s_d_p_s[n * 18 + 7]);
345
346 PRIM_INT__s_d_p_s[n * 18 + 8] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_d_s_s[n * 6 + 2]);
347 PRIM_INT__s_d_p_s[n * 18 + 8] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 2], PRIM_INT__s_d_p_s[n * 18 + 8]);
348 PRIM_INT__s_d_p_s[n * 18 + 8] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 0], PRIM_INT__s_d_p_s[n * 18 + 8]);
349
350 PRIM_INT__s_d_p_s[n * 18 + 9] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_d_s_s[n * 6 + 3]);
351 PRIM_INT__s_d_p_s[n * 18 + 9] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 3], PRIM_INT__s_d_p_s[n * 18 + 9]);
352
353 PRIM_INT__s_d_p_s[n * 18 + 10] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_d_s_s[n * 6 + 3]);
354 PRIM_INT__s_d_p_s[n * 18 + 10] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 3], PRIM_INT__s_d_p_s[n * 18 + 10]);
355 PRIM_INT__s_d_p_s[n * 18 + 10] = SIMINT_FMADD( vrr_const_2_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 1], PRIM_INT__s_d_p_s[n * 18 + 10]);
356
357 PRIM_INT__s_d_p_s[n * 18 + 11] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_d_s_s[n * 6 + 3]);
358 PRIM_INT__s_d_p_s[n * 18 + 11] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 3], PRIM_INT__s_d_p_s[n * 18 + 11]);
359
360 PRIM_INT__s_d_p_s[n * 18 + 12] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_d_s_s[n * 6 + 4]);
361 PRIM_INT__s_d_p_s[n * 18 + 12] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 4], PRIM_INT__s_d_p_s[n * 18 + 12]);
362
363 PRIM_INT__s_d_p_s[n * 18 + 13] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_d_s_s[n * 6 + 4]);
364 PRIM_INT__s_d_p_s[n * 18 + 13] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 4], PRIM_INT__s_d_p_s[n * 18 + 13]);
365 PRIM_INT__s_d_p_s[n * 18 + 13] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 2], PRIM_INT__s_d_p_s[n * 18 + 13]);
366
367 PRIM_INT__s_d_p_s[n * 18 + 14] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_d_s_s[n * 6 + 4]);
368 PRIM_INT__s_d_p_s[n * 18 + 14] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 4], PRIM_INT__s_d_p_s[n * 18 + 14]);
369 PRIM_INT__s_d_p_s[n * 18 + 14] = SIMINT_FMADD( vrr_const_1_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 1], PRIM_INT__s_d_p_s[n * 18 + 14]);
370
371 PRIM_INT__s_d_p_s[n * 18 + 15] = SIMINT_MUL(Q_PA[0], PRIM_INT__s_d_s_s[n * 6 + 5]);
372 PRIM_INT__s_d_p_s[n * 18 + 15] = SIMINT_FMADD( aoq_PQ[0], PRIM_INT__s_d_s_s[(n+1) * 6 + 5], PRIM_INT__s_d_p_s[n * 18 + 15]);
373
374 PRIM_INT__s_d_p_s[n * 18 + 16] = SIMINT_MUL(Q_PA[1], PRIM_INT__s_d_s_s[n * 6 + 5]);
375 PRIM_INT__s_d_p_s[n * 18 + 16] = SIMINT_FMADD( aoq_PQ[1], PRIM_INT__s_d_s_s[(n+1) * 6 + 5], PRIM_INT__s_d_p_s[n * 18 + 16]);
376
377 PRIM_INT__s_d_p_s[n * 18 + 17] = SIMINT_MUL(Q_PA[2], PRIM_INT__s_d_s_s[n * 6 + 5]);
378 PRIM_INT__s_d_p_s[n * 18 + 17] = SIMINT_FMADD( aoq_PQ[2], PRIM_INT__s_d_s_s[(n+1) * 6 + 5], PRIM_INT__s_d_p_s[n * 18 + 17]);
379 PRIM_INT__s_d_p_s[n * 18 + 17] = SIMINT_FMADD( vrr_const_2_over_2pq, PRIM_INT__s_p_s_s[(n+1) * 3 + 2], PRIM_INT__s_d_p_s[n * 18 + 17]);
380
381 }
382
383
384 VRR_K_s_f_d_s(
385 PRIM_INT__s_f_d_s,
386 PRIM_INT__s_f_p_s,
387 PRIM_INT__s_f_s_s,
388 PRIM_INT__s_d_p_s,
389 Q_PA,
390 a_over_q,
391 aoq_PQ,
392 one_over_2pq,
393 one_over_2q,
394 1);
395
396
397
398
399 ////////////////////////////////////
400 // Accumulate contracted integrals
401 ////////////////////////////////////
402 if(lastoffset == 0)
403 {
404 contract_all(30, PRIM_INT__s_f_p_s, PRIM_PTR_INT__s_f_p_s);
405 contract_all(60, PRIM_INT__s_f_d_s, PRIM_PTR_INT__s_f_d_s);
406 }
407 else
408 {
409 contract(30, shelloffsets, PRIM_INT__s_f_p_s, PRIM_PTR_INT__s_f_p_s);
410 contract(60, shelloffsets, PRIM_INT__s_f_d_s, PRIM_PTR_INT__s_f_d_s);
411 PRIM_PTR_INT__s_f_p_s += lastoffset*30;
412 PRIM_PTR_INT__s_f_d_s += lastoffset*60;
413 }
414
415 } // close loop over j
416 } // close loop over i
417
418 //Advance to the next batch
419 jstart = SIMINT_SIMD_ROUND(jend);
420
421 //////////////////////////////////////////////
422 // Contracted integrals: Horizontal recurrance
423 //////////////////////////////////////////////
424
425
426
427
428 for(abcd = 0; abcd < nshellbatch; ++abcd, ++real_abcd)
429 {
430 const double hCD[3] = { Q.AB_x[cd+abcd], Q.AB_y[cd+abcd], Q.AB_z[cd+abcd] };
431
432 // set up HRR pointers
433 double const * restrict HRR_INT__s_f_p_s = INT__s_f_p_s + abcd * 30;
434 double const * restrict HRR_INT__s_f_d_s = INT__s_f_d_s + abcd * 60;
435 double * restrict HRR_INT__s_f_p_p = INT__s_f_p_p + real_abcd * 90;
436
437 // form INT__s_f_p_p
438 for(ibra = 0; ibra < 10; ++ibra)
439 {
440 HRR_INT__s_f_p_p[ibra * 9 + 0] = HRR_INT__s_f_d_s[ibra * 6 + 0] + ( hCD[0] * HRR_INT__s_f_p_s[ibra * 3 + 0] );
441
442 HRR_INT__s_f_p_p[ibra * 9 + 1] = HRR_INT__s_f_d_s[ibra * 6 + 1] + ( hCD[1] * HRR_INT__s_f_p_s[ibra * 3 + 0] );
443
444 HRR_INT__s_f_p_p[ibra * 9 + 2] = HRR_INT__s_f_d_s[ibra * 6 + 2] + ( hCD[2] * HRR_INT__s_f_p_s[ibra * 3 + 0] );
445
446 HRR_INT__s_f_p_p[ibra * 9 + 3] = HRR_INT__s_f_d_s[ibra * 6 + 1] + ( hCD[0] * HRR_INT__s_f_p_s[ibra * 3 + 1] );
447
448 HRR_INT__s_f_p_p[ibra * 9 + 4] = HRR_INT__s_f_d_s[ibra * 6 + 3] + ( hCD[1] * HRR_INT__s_f_p_s[ibra * 3 + 1] );
449
450 HRR_INT__s_f_p_p[ibra * 9 + 5] = HRR_INT__s_f_d_s[ibra * 6 + 4] + ( hCD[2] * HRR_INT__s_f_p_s[ibra * 3 + 1] );
451
452 HRR_INT__s_f_p_p[ibra * 9 + 6] = HRR_INT__s_f_d_s[ibra * 6 + 2] + ( hCD[0] * HRR_INT__s_f_p_s[ibra * 3 + 2] );
453
454 HRR_INT__s_f_p_p[ibra * 9 + 7] = HRR_INT__s_f_d_s[ibra * 6 + 4] + ( hCD[1] * HRR_INT__s_f_p_s[ibra * 3 + 2] );
455
456 HRR_INT__s_f_p_p[ibra * 9 + 8] = HRR_INT__s_f_d_s[ibra * 6 + 5] + ( hCD[2] * HRR_INT__s_f_p_s[ibra * 3 + 2] );
457
458 }
459
460
461 } // close HRR loop
462
463
464 } // close loop cdbatch
465
466 istart = iend;
467 } // close loop over ab
468
469 return P.nshell12_clip * Q.nshell12_clip;
470 }
471
472