1 /* $Id: CoinAbcHelperFunctions.hpp 2385 2019-01-06 19:43:06Z unxusr $ */
2 // Copyright (C) 2003, International Business Machines
3 // Corporation and others, Copyright (C) 2012, FasterCoin.  All Rights Reserved.
4 // This code is licensed under the terms of the Eclipse Public License (EPL).
5 
6 #ifndef CoinAbcHelperFunctions_H
7 #define CoinAbcHelperFunctions_H
8 
9 #include "ClpConfig.h"
10 #ifdef HAVE_CMATH
11 #include <cmath>
12 #else
13 #ifdef HAVE_MATH_H
14 #include <math.h>
15 #else
16 #include <cmath>
17 #endif
18 #endif
19 #include "CoinAbcCommon.hpp"
20 #ifndef abc_assert
21 #define abc_assert(condition)                               \
22   {                                                         \
23     if (!condition) {                                       \
24       printf("abc_assert in %s at line %d - %s is false\n", \
25         __FILE__, __LINE__, __STRING(condition));           \
26       abort();                                              \
27     }                                                       \
28   }
29 #endif
30 // cilk_for granularity.
31 #define CILK_FOR_GRAINSIZE 128
32 //#define AVX2 2
33 #if AVX2 == 1
34 #include "emmintrin.h"
35 #elif AVX2 == 2
36 #include <immintrin.h>
37 #elif AVX2 == 3
38 #include "avx2intrin.h"
39 #endif
40 //#define __AVX__ 1
41 //#define __AVX2__ 1
42 /**
43     Note (JJF) I have added some operations on arrays even though they may
44     duplicate CoinDenseVector.
45 
46 */
47 
48 #define UNROLL_SCATTER 2
49 #define INLINE_SCATTER 1
50 #if INLINE_SCATTER == 0
51 void CoinAbcScatterUpdate(int number, CoinFactorizationDouble pivotValue,
52   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
53   const int *COIN_RESTRICT thisIndex,
54   CoinFactorizationDouble *COIN_RESTRICT region);
55 #else
CoinAbcScatterUpdate(int number,CoinFactorizationDouble pivotValue,const CoinFactorizationDouble * COIN_RESTRICT thisElement,const int * COIN_RESTRICT thisIndex,CoinFactorizationDouble * COIN_RESTRICT region)56 void ABC_INLINE inline CoinAbcScatterUpdate(int number, CoinFactorizationDouble pivotValue,
57   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
58   const int *COIN_RESTRICT thisIndex,
59   CoinFactorizationDouble *COIN_RESTRICT region)
60 {
61 #if UNROLL_SCATTER == 0
62   for (CoinBigIndex j = number - 1; j >= 0; j--) {
63     CoinSimplexInt iRow = thisIndex[j];
64     CoinFactorizationDouble regionValue = region[iRow];
65     CoinFactorizationDouble value = thisElement[j];
66     assert(value);
67     region[iRow] = regionValue - value * pivotValue;
68   }
69 #elif UNROLL_SCATTER == 1
70   if ((number & 1) != 0) {
71     number--;
72     CoinSimplexInt iRow = thisIndex[number];
73     CoinFactorizationDouble regionValue = region[iRow];
74     CoinFactorizationDouble value = thisElement[number];
75     region[iRow] = regionValue - value * pivotValue;
76   }
77   for (CoinBigIndex j = number - 1; j >= 0; j -= 2) {
78     CoinSimplexInt iRow0 = thisIndex[j];
79     CoinSimplexInt iRow1 = thisIndex[j - 1];
80     CoinFactorizationDouble regionValue0 = region[iRow0];
81     CoinFactorizationDouble regionValue1 = region[iRow1];
82     region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
83     region[iRow1] = regionValue1 - thisElement[j - 1] * pivotValue;
84   }
85 #elif UNROLL_SCATTER == 2
86   if ((number & 1) != 0) {
87     number--;
88     CoinSimplexInt iRow = thisIndex[number];
89     CoinFactorizationDouble regionValue = region[iRow];
90     CoinFactorizationDouble value = thisElement[number];
91     region[iRow] = regionValue - value * pivotValue;
92   }
93   if ((number & 2) != 0) {
94     CoinSimplexInt iRow0 = thisIndex[number - 1];
95     CoinFactorizationDouble regionValue0 = region[iRow0];
96     CoinFactorizationDouble value0 = thisElement[number - 1];
97     CoinSimplexInt iRow1 = thisIndex[number - 2];
98     CoinFactorizationDouble regionValue1 = region[iRow1];
99     CoinFactorizationDouble value1 = thisElement[number - 2];
100     region[iRow0] = regionValue0 - value0 * pivotValue;
101     region[iRow1] = regionValue1 - value1 * pivotValue;
102     number -= 2;
103   }
104 #pragma cilk grainsize = CILK_FOR_GRAINSIZE
105   cilk_for(CoinBigIndex j = number - 1; j >= 0; j -= 4)
106   {
107     CoinSimplexInt iRow0 = thisIndex[j];
108     CoinSimplexInt iRow1 = thisIndex[j - 1];
109     CoinFactorizationDouble regionValue0 = region[iRow0];
110     CoinFactorizationDouble regionValue1 = region[iRow1];
111     region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
112     region[iRow1] = regionValue1 - thisElement[j - 1] * pivotValue;
113     CoinSimplexInt iRow2 = thisIndex[j - 2];
114     CoinSimplexInt iRow3 = thisIndex[j - 3];
115     CoinFactorizationDouble regionValue2 = region[iRow2];
116     CoinFactorizationDouble regionValue3 = region[iRow3];
117     region[iRow2] = regionValue2 - thisElement[j - 2] * pivotValue;
118     region[iRow3] = regionValue3 - thisElement[j - 3] * pivotValue;
119   }
120 #elif UNROLL_SCATTER == 3
121   CoinSimplexInt iRow0;
122   CoinSimplexInt iRow1;
123   CoinFactorizationDouble regionValue0;
124   CoinFactorizationDouble regionValue1;
125   switch (static_cast< unsigned int >(number)) {
126   case 0:
127     break;
128   case 1:
129     iRow0 = thisIndex[0];
130     regionValue0 = region[iRow0];
131     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
132     break;
133   case 2:
134     iRow0 = thisIndex[0];
135     iRow1 = thisIndex[1];
136     regionValue0 = region[iRow0];
137     regionValue1 = region[iRow1];
138     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
139     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
140     break;
141   case 3:
142     iRow0 = thisIndex[0];
143     iRow1 = thisIndex[1];
144     regionValue0 = region[iRow0];
145     regionValue1 = region[iRow1];
146     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
147     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
148     iRow0 = thisIndex[2];
149     regionValue0 = region[iRow0];
150     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
151     break;
152   case 4:
153     iRow0 = thisIndex[0];
154     iRow1 = thisIndex[1];
155     regionValue0 = region[iRow0];
156     regionValue1 = region[iRow1];
157     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
158     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
159     iRow0 = thisIndex[2];
160     iRow1 = thisIndex[3];
161     regionValue0 = region[iRow0];
162     regionValue1 = region[iRow1];
163     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
164     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
165     break;
166   case 5:
167     iRow0 = thisIndex[0];
168     iRow1 = thisIndex[1];
169     regionValue0 = region[iRow0];
170     regionValue1 = region[iRow1];
171     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
172     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
173     iRow0 = thisIndex[2];
174     iRow1 = thisIndex[3];
175     regionValue0 = region[iRow0];
176     regionValue1 = region[iRow1];
177     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
178     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
179     iRow0 = thisIndex[4];
180     regionValue0 = region[iRow0];
181     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
182     break;
183   case 6:
184     iRow0 = thisIndex[0];
185     iRow1 = thisIndex[1];
186     regionValue0 = region[iRow0];
187     regionValue1 = region[iRow1];
188     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
189     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
190     iRow0 = thisIndex[2];
191     iRow1 = thisIndex[3];
192     regionValue0 = region[iRow0];
193     regionValue1 = region[iRow1];
194     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
195     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
196     iRow0 = thisIndex[4];
197     iRow1 = thisIndex[5];
198     regionValue0 = region[iRow0];
199     regionValue1 = region[iRow1];
200     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
201     region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
202     break;
203   case 7:
204     iRow0 = thisIndex[0];
205     iRow1 = thisIndex[1];
206     regionValue0 = region[iRow0];
207     regionValue1 = region[iRow1];
208     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
209     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
210     iRow0 = thisIndex[2];
211     iRow1 = thisIndex[3];
212     regionValue0 = region[iRow0];
213     regionValue1 = region[iRow1];
214     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
215     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
216     iRow0 = thisIndex[4];
217     iRow1 = thisIndex[5];
218     regionValue0 = region[iRow0];
219     regionValue1 = region[iRow1];
220     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
221     region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
222     iRow0 = thisIndex[6];
223     regionValue0 = region[iRow0];
224     region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
225     break;
226   case 8:
227     iRow0 = thisIndex[0];
228     iRow1 = thisIndex[1];
229     regionValue0 = region[iRow0];
230     regionValue1 = region[iRow1];
231     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
232     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
233     iRow0 = thisIndex[2];
234     iRow1 = thisIndex[3];
235     regionValue0 = region[iRow0];
236     regionValue1 = region[iRow1];
237     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
238     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
239     iRow0 = thisIndex[4];
240     iRow1 = thisIndex[5];
241     regionValue0 = region[iRow0];
242     regionValue1 = region[iRow1];
243     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
244     region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
245     iRow0 = thisIndex[6];
246     iRow1 = thisIndex[7];
247     regionValue0 = region[iRow0];
248     regionValue1 = region[iRow1];
249     region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
250     region[iRow1] = regionValue1 - thisElement[7] * pivotValue;
251     break;
252   default:
253     if ((number & 1) != 0) {
254       number--;
255       CoinSimplexInt iRow = thisIndex[number];
256       CoinFactorizationDouble regionValue = region[iRow];
257       CoinFactorizationDouble value = thisElement[number];
258       region[iRow] = regionValue - value * pivotValue;
259     }
260     for (CoinBigIndex j = number - 1; j >= 0; j -= 2) {
261       CoinSimplexInt iRow0 = thisIndex[j];
262       CoinSimplexInt iRow1 = thisIndex[j - 1];
263       CoinFactorizationDouble regionValue0 = region[iRow0];
264       CoinFactorizationDouble regionValue1 = region[iRow1];
265       region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
266       region[iRow1] = regionValue1 - thisElement[j - 1] * pivotValue;
267     }
268     break;
269   }
270 #endif
271 }
CoinAbcScatterUpdate(int number,CoinFactorizationDouble pivotValue,const CoinFactorizationDouble * COIN_RESTRICT thisElement,CoinFactorizationDouble * COIN_RESTRICT region)272 void ABC_INLINE inline CoinAbcScatterUpdate(int number, CoinFactorizationDouble pivotValue,
273   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
274   CoinFactorizationDouble *COIN_RESTRICT region)
275 {
276 #if UNROLL_SCATTER == 0
277   const int *COIN_RESTRICT thisIndex = reinterpret_cast< const int * >(thisElement + number);
278   for (CoinBigIndex j = number - 1; j >= 0; j--) {
279     CoinSimplexInt iRow = thisIndex[j];
280     CoinFactorizationDouble regionValue = region[iRow];
281     CoinFactorizationDouble value = thisElement[j];
282     assert(value);
283     region[iRow] = regionValue - value * pivotValue;
284   }
285 #elif UNROLL_SCATTER == 1
286   const int *COIN_RESTRICT thisIndex = reinterpret_cast< const int * >(thisElement + number);
287   if ((number & 1) != 0) {
288     number--;
289     CoinSimplexInt iRow = thisIndex[number];
290     CoinFactorizationDouble regionValue = region[iRow];
291     CoinFactorizationDouble value = thisElement[number];
292     region[iRow] = regionValue - value * pivotValue;
293   }
294   for (CoinBigIndex j = number - 1; j >= 0; j -= 2) {
295     CoinSimplexInt iRow0 = thisIndex[j];
296     CoinSimplexInt iRow1 = thisIndex[j - 1];
297     CoinFactorizationDouble regionValue0 = region[iRow0];
298     CoinFactorizationDouble regionValue1 = region[iRow1];
299     region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
300     region[iRow1] = regionValue1 - thisElement[j - 1] * pivotValue;
301   }
302 #elif UNROLL_SCATTER == 2
303   const int *COIN_RESTRICT thisIndex = reinterpret_cast< const int * >(thisElement + number);
304   if ((number & 1) != 0) {
305     number--;
306     CoinSimplexInt iRow = thisIndex[number];
307     CoinFactorizationDouble regionValue = region[iRow];
308     CoinFactorizationDouble value = thisElement[number];
309     region[iRow] = regionValue - value * pivotValue;
310   }
311   if ((number & 2) != 0) {
312     CoinSimplexInt iRow0 = thisIndex[number - 1];
313     CoinFactorizationDouble regionValue0 = region[iRow0];
314     CoinFactorizationDouble value0 = thisElement[number - 1];
315     CoinSimplexInt iRow1 = thisIndex[number - 2];
316     CoinFactorizationDouble regionValue1 = region[iRow1];
317     CoinFactorizationDouble value1 = thisElement[number - 2];
318     region[iRow0] = regionValue0 - value0 * pivotValue;
319     region[iRow1] = regionValue1 - value1 * pivotValue;
320     number -= 2;
321   }
322 #if AVX2 == 22
323   CoinFactorizationDouble temp[4] __attribute__((aligned(32)));
324   __m256d pv = _mm256_broadcast_sd(&pivotValue);
325   for (CoinBigIndex j = number - 1; j >= 0; j -= 4) {
326     __m256d elements = _mm256_loadu_pd(thisElement + j - 3);
327     CoinSimplexInt iRow0 = thisIndex[j - 3];
328     CoinSimplexInt iRow1 = thisIndex[j - 2];
329     CoinSimplexInt iRow2 = thisIndex[j - 1];
330     CoinSimplexInt iRow3 = thisIndex[j - 0];
331     temp[0] = region[iRow0];
332     temp[1] = region[iRow1];
333     temp[2] = region[iRow2];
334     temp[3] = region[iRow3];
335     __m256d t0 = _mm256_load_pd(temp);
336     t0 -= pv * elements;
337     _mm256_store_pd(temp, t0);
338     region[iRow0] = temp[0];
339     region[iRow1] = temp[1];
340     region[iRow2] = temp[2];
341     region[iRow3] = temp[3];
342   }
343 #else
344 #pragma cilk grainsize = CILK_FOR_GRAINSIZE
345   cilk_for(CoinBigIndex j = number - 1; j >= 0; j -= 4)
346   {
347     CoinSimplexInt iRow0 = thisIndex[j];
348     CoinSimplexInt iRow1 = thisIndex[j - 1];
349     CoinFactorizationDouble regionValue0 = region[iRow0];
350     CoinFactorizationDouble regionValue1 = region[iRow1];
351     region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
352     region[iRow1] = regionValue1 - thisElement[j - 1] * pivotValue;
353     CoinSimplexInt iRow2 = thisIndex[j - 2];
354     CoinSimplexInt iRow3 = thisIndex[j - 3];
355     CoinFactorizationDouble regionValue2 = region[iRow2];
356     CoinFactorizationDouble regionValue3 = region[iRow3];
357     region[iRow2] = regionValue2 - thisElement[j - 2] * pivotValue;
358     region[iRow3] = regionValue3 - thisElement[j - 3] * pivotValue;
359   }
360 #endif
361 #elif UNROLL_SCATTER == 3
362   const int *COIN_RESTRICT thisIndex = reinterpret_cast< const int * >(thisElement + number);
363   CoinSimplexInt iRow0;
364   CoinSimplexInt iRow1;
365   CoinFactorizationDouble regionValue0;
366   CoinFactorizationDouble regionValue1;
367   switch (static_cast< unsigned int >(number)) {
368   case 0:
369     break;
370   case 1:
371     iRow0 = thisIndex[0];
372     regionValue0 = region[iRow0];
373     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
374     break;
375   case 2:
376     iRow0 = thisIndex[0];
377     iRow1 = thisIndex[1];
378     regionValue0 = region[iRow0];
379     regionValue1 = region[iRow1];
380     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
381     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
382     break;
383   case 3:
384     iRow0 = thisIndex[0];
385     iRow1 = thisIndex[1];
386     regionValue0 = region[iRow0];
387     regionValue1 = region[iRow1];
388     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
389     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
390     iRow0 = thisIndex[2];
391     regionValue0 = region[iRow0];
392     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
393     break;
394   case 4:
395     iRow0 = thisIndex[0];
396     iRow1 = thisIndex[1];
397     regionValue0 = region[iRow0];
398     regionValue1 = region[iRow1];
399     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
400     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
401     iRow0 = thisIndex[2];
402     iRow1 = thisIndex[3];
403     regionValue0 = region[iRow0];
404     regionValue1 = region[iRow1];
405     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
406     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
407     break;
408   case 5:
409     iRow0 = thisIndex[0];
410     iRow1 = thisIndex[1];
411     regionValue0 = region[iRow0];
412     regionValue1 = region[iRow1];
413     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
414     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
415     iRow0 = thisIndex[2];
416     iRow1 = thisIndex[3];
417     regionValue0 = region[iRow0];
418     regionValue1 = region[iRow1];
419     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
420     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
421     iRow0 = thisIndex[4];
422     regionValue0 = region[iRow0];
423     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
424     break;
425   case 6:
426     iRow0 = thisIndex[0];
427     iRow1 = thisIndex[1];
428     regionValue0 = region[iRow0];
429     regionValue1 = region[iRow1];
430     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
431     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
432     iRow0 = thisIndex[2];
433     iRow1 = thisIndex[3];
434     regionValue0 = region[iRow0];
435     regionValue1 = region[iRow1];
436     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
437     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
438     iRow0 = thisIndex[4];
439     iRow1 = thisIndex[5];
440     regionValue0 = region[iRow0];
441     regionValue1 = region[iRow1];
442     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
443     region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
444     break;
445   case 7:
446     iRow0 = thisIndex[0];
447     iRow1 = thisIndex[1];
448     regionValue0 = region[iRow0];
449     regionValue1 = region[iRow1];
450     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
451     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
452     iRow0 = thisIndex[2];
453     iRow1 = thisIndex[3];
454     regionValue0 = region[iRow0];
455     regionValue1 = region[iRow1];
456     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
457     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
458     iRow0 = thisIndex[4];
459     iRow1 = thisIndex[5];
460     regionValue0 = region[iRow0];
461     regionValue1 = region[iRow1];
462     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
463     region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
464     iRow0 = thisIndex[6];
465     regionValue0 = region[iRow0];
466     region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
467     break;
468   case 8:
469     iRow0 = thisIndex[0];
470     iRow1 = thisIndex[1];
471     regionValue0 = region[iRow0];
472     regionValue1 = region[iRow1];
473     region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
474     region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
475     iRow0 = thisIndex[2];
476     iRow1 = thisIndex[3];
477     regionValue0 = region[iRow0];
478     regionValue1 = region[iRow1];
479     region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
480     region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
481     iRow0 = thisIndex[4];
482     iRow1 = thisIndex[5];
483     regionValue0 = region[iRow0];
484     regionValue1 = region[iRow1];
485     region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
486     region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
487     iRow0 = thisIndex[6];
488     iRow1 = thisIndex[7];
489     regionValue0 = region[iRow0];
490     regionValue1 = region[iRow1];
491     region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
492     region[iRow1] = regionValue1 - thisElement[7] * pivotValue;
493     break;
494   default:
495     if ((number & 1) != 0) {
496       number--;
497       CoinSimplexInt iRow = thisIndex[number];
498       CoinFactorizationDouble regionValue = region[iRow];
499       CoinFactorizationDouble value = thisElement[number];
500       region[iRow] = regionValue - value * pivotValue;
501     }
502     for (CoinBigIndex j = number - 1; j >= 0; j -= 2) {
503       CoinSimplexInt iRow0 = thisIndex[j];
504       CoinSimplexInt iRow1 = thisIndex[j - 1];
505       CoinFactorizationDouble regionValue0 = region[iRow0];
506       CoinFactorizationDouble regionValue1 = region[iRow1];
507       region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
508       region[iRow1] = regionValue1 - thisElement[j - 1] * pivotValue;
509     }
510     break;
511   }
512 #endif
513 }
514 #endif
515 //#define COIN_PREFETCH
516 #ifdef COIN_PREFETCH
517 #if 1
518 #define coin_prefetch(mem)              \
519   __asm__ __volatile__("prefetchnta %0" \
520                        :                \
521                        : "m"(*(reinterpret_cast< char * >(mem))))
522 #define coin_prefetch_const(mem)        \
523   __asm__ __volatile__("prefetchnta %0" \
524                        :                \
525                        : "m"(*(reinterpret_cast< const char * >(mem))))
526 #else
527 #define coin_prefetch(mem)           \
528   __asm__ __volatile__("prefetch %0" \
529                        :             \
530                        : "m"(*(reinterpret_cast< char * >(mem))))
531 #define coin_prefetch_const(mem)     \
532   __asm__ __volatile__("prefetch %0" \
533                        :             \
534                        : "m"(*(reinterpret_cast< const char * >(mem))))
535 #endif
536 #else
537 // dummy
538 #define coin_prefetch(mem)
539 #define coin_prefetch_const(mem)
540 #endif
541 #define NEW_CHUNK_SIZE 4
542 #define NEW_CHUNK_SIZE_INCREMENT (NEW_CHUNK_SIZE + NEW_CHUNK_SIZE / 2);
543 #define NEW_CHUNK_SIZE_OFFSET (NEW_CHUNK_SIZE / 2)
544 // leaf, pure, nothrow and hot give warnings
545 // fastcall and sseregparm give wrong results
546 //#define SCATTER_ATTRIBUTE __attribute__ ((leaf,fastcall,pure,sseregparm,nothrow,hot))
547 #define SCATTER_ATTRIBUTE
548 typedef void (*scatterUpdate)(int, CoinFactorizationDouble, const CoinFactorizationDouble *, double *) SCATTER_ATTRIBUTE;
549 typedef struct {
550   scatterUpdate functionPointer;
551   CoinBigIndex offset;
552   int number;
553 } scatterStruct;
554 void CoinAbcScatterUpdate0(int numberIn, CoinFactorizationDouble multiplier,
555   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
556   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
557 void CoinAbcScatterUpdate1(int numberIn, CoinFactorizationDouble multiplier,
558   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
559   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
560 void CoinAbcScatterUpdate2(int numberIn, CoinFactorizationDouble multiplier,
561   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
562   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
563 void CoinAbcScatterUpdate3(int numberIn, CoinFactorizationDouble multiplier,
564   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
565   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
566 void CoinAbcScatterUpdate4(int numberIn, CoinFactorizationDouble multiplier,
567   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
568   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
569 void CoinAbcScatterUpdate5(int numberIn, CoinFactorizationDouble multiplier,
570   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
571   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
572 void CoinAbcScatterUpdate6(int numberIn, CoinFactorizationDouble multiplier,
573   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
574   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
575 void CoinAbcScatterUpdate7(int numberIn, CoinFactorizationDouble multiplier,
576   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
577   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
578 void CoinAbcScatterUpdate8(int numberIn, CoinFactorizationDouble multiplier,
579   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
580   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
581 void CoinAbcScatterUpdate4N(int numberIn, CoinFactorizationDouble multiplier,
582   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
583   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
584 void CoinAbcScatterUpdate4NPlus1(int numberIn, CoinFactorizationDouble multiplier,
585   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
586   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
587 void CoinAbcScatterUpdate4NPlus2(int numberIn, CoinFactorizationDouble multiplier,
588   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
589   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
590 void CoinAbcScatterUpdate4NPlus3(int numberIn, CoinFactorizationDouble multiplier,
591   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
592   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
593 void CoinAbcScatterUpdate1Subtract(int numberIn, CoinFactorizationDouble multiplier,
594   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
595   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
596 void CoinAbcScatterUpdate2Subtract(int numberIn, CoinFactorizationDouble multiplier,
597   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
598   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
599 void CoinAbcScatterUpdate3Subtract(int numberIn, CoinFactorizationDouble multiplier,
600   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
601   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
602 void CoinAbcScatterUpdate4Subtract(int numberIn, CoinFactorizationDouble multiplier,
603   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
604   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
605 void CoinAbcScatterUpdate5Subtract(int numberIn, CoinFactorizationDouble multiplier,
606   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
607   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
608 void CoinAbcScatterUpdate6Subtract(int numberIn, CoinFactorizationDouble multiplier,
609   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
610   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
611 void CoinAbcScatterUpdate7Subtract(int numberIn, CoinFactorizationDouble multiplier,
612   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
613   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
614 void CoinAbcScatterUpdate8Subtract(int numberIn, CoinFactorizationDouble multiplier,
615   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
616   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
617 void CoinAbcScatterUpdate4NSubtract(int numberIn, CoinFactorizationDouble multiplier,
618   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
619   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
620 void CoinAbcScatterUpdate4NPlus1Subtract(int numberIn, CoinFactorizationDouble multiplier,
621   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
622   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
623 void CoinAbcScatterUpdate4NPlus2Subtract(int numberIn, CoinFactorizationDouble multiplier,
624   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
625   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
626 void CoinAbcScatterUpdate4NPlus3Subtract(int numberIn, CoinFactorizationDouble multiplier,
627   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
628   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
629 void CoinAbcScatterUpdate1Add(int numberIn, CoinFactorizationDouble multiplier,
630   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
631   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
632 void CoinAbcScatterUpdate2Add(int numberIn, CoinFactorizationDouble multiplier,
633   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
634   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
635 void CoinAbcScatterUpdate3Add(int numberIn, CoinFactorizationDouble multiplier,
636   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
637   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
638 void CoinAbcScatterUpdate4Add(int numberIn, CoinFactorizationDouble multiplier,
639   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
640   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
641 void CoinAbcScatterUpdate5Add(int numberIn, CoinFactorizationDouble multiplier,
642   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
643   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
644 void CoinAbcScatterUpdate6Add(int numberIn, CoinFactorizationDouble multiplier,
645   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
646   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
647 void CoinAbcScatterUpdate7Add(int numberIn, CoinFactorizationDouble multiplier,
648   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
649   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
650 void CoinAbcScatterUpdate8Add(int numberIn, CoinFactorizationDouble multiplier,
651   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
652   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
653 void CoinAbcScatterUpdate4NAdd(int numberIn, CoinFactorizationDouble multiplier,
654   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
655   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
656 void CoinAbcScatterUpdate4NPlus1Add(int numberIn, CoinFactorizationDouble multiplier,
657   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
658   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
659 void CoinAbcScatterUpdate4NPlus2Add(int numberIn, CoinFactorizationDouble multiplier,
660   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
661   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
662 void CoinAbcScatterUpdate4NPlus3Add(int numberIn, CoinFactorizationDouble multiplier,
663   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
664   CoinFactorizationDouble *COIN_RESTRICT region) SCATTER_ATTRIBUTE;
665 #if INLINE_SCATTER == 0
666 void CoinAbcScatterUpdate(int number, CoinFactorizationDouble pivotValue,
667   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
668   const int *COIN_RESTRICT thisIndex,
669   CoinFactorizationDouble *COIN_RESTRICT region,
670   double *COIN_RESTRICT work);
671 #else
672 #if 0
673 void ABC_INLINE inline CoinAbcScatterUpdate(int number,CoinFactorizationDouble pivotValue,
674 					    const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
675 					    const int *  COIN_RESTRICT thisIndex,
676 					    CoinFactorizationDouble * COIN_RESTRICT region,
677 					    double * COIN_RESTRICT /*work*/)
678 {
679 #if UNROLL_SCATTER == 0
680   for (CoinBigIndex j=number-1 ; j >=0; j-- ) {
681     CoinSimplexInt iRow = thisIndex[j];
682     CoinFactorizationDouble regionValue = region[iRow];
683     CoinFactorizationDouble value = thisElement[j];
684     assert (value);
685     region[iRow] = regionValue - value * pivotValue;
686   }
687 #elif UNROLL_SCATTER == 1
688   if ((number&1)!=0) {
689     CoinSimplexInt iRow = thisIndex[0];
690     thisIndex++;
691     CoinFactorizationDouble regionValue = region[iRow];
692     CoinFactorizationDouble value = thisElement[0];
693     thisElement++;
694     region[iRow] = regionValue - value * pivotValue;
695   }
696   number = number>>1;
697   CoinFactorizationDouble work2[4];
698   for ( ; number !=0; number-- ) {
699     CoinSimplexInt iRow0 = thisIndex[0];
700     CoinSimplexInt iRow1 = thisIndex[1];
701     work2[0] = region[iRow0];
702     work2[1] = region[iRow1];
703 #if 0
704     work2[2] = region[iRow0];
705     work2[3] = region[iRow1];
706     //__v4df b = __builtin_ia32_maskloadpd256(work2);
707     __v4df b = __builtin_ia32_loadupd256(work2);
708     //__v4df b = _mm256_load_pd(work2);
709 #endif
710     work2[0] -= thisElement[0] * pivotValue;
711     work2[1] -= thisElement[1] * pivotValue;
712     region[iRow0] = work2[0];
713     region[iRow1] = work2[1];
714     thisIndex+=2;
715     thisElement+=2;
716   }
717 #endif
718 }
719 #endif
720 #endif
721 #define UNROLL_GATHER 0
722 #define INLINE_GATHER 1
723 #if INLINE_GATHER == 0
724 CoinFactorizationDouble CoinAbcGatherUpdate(CoinSimplexInt number,
725   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
726   const int *COIN_RESTRICT thisIndex,
727   CoinFactorizationDouble *COIN_RESTRICT region);
728 #else
CoinAbcGatherUpdate(CoinSimplexInt number,const CoinFactorizationDouble * COIN_RESTRICT thisElement,const int * COIN_RESTRICT thisIndex,CoinFactorizationDouble * COIN_RESTRICT region)729 CoinFactorizationDouble ABC_INLINE inline CoinAbcGatherUpdate(CoinSimplexInt number,
730   const CoinFactorizationDouble *COIN_RESTRICT thisElement,
731   const int *COIN_RESTRICT thisIndex,
732   CoinFactorizationDouble *COIN_RESTRICT region)
733 {
734 #if UNROLL_GATHER == 0
735   CoinFactorizationDouble pivotValue = 0.0;
736   for (CoinBigIndex j = 0; j < number; j++) {
737     CoinFactorizationDouble value = thisElement[j];
738     CoinSimplexInt jRow = thisIndex[j];
739     value *= region[jRow];
740     pivotValue -= value;
741   }
742   return pivotValue;
743 #else
744 #error code
745 #endif
746 }
747 #endif
748 #define UNROLL_MULTIPLY_INDEXED 0
749 #define INLINE_MULTIPLY_INDEXED 0
750 #if INLINE_MULTIPLY_INDEXED == 0
751 void CoinAbcMultiplyIndexed(int number,
752   const double *COIN_RESTRICT multiplier,
753   const int *COIN_RESTRICT thisIndex,
754   CoinFactorizationDouble *COIN_RESTRICT region);
755 void CoinAbcMultiplyIndexed(int number,
756   const long double *COIN_RESTRICT multiplier,
757   const int *COIN_RESTRICT thisIndex,
758   long double *COIN_RESTRICT region);
759 #else
CoinAbcMultiplyIndexed(int number,const double * COIN_RESTRICT multiplier,const int * COIN_RESTRICT thisIndex,CoinFactorizationDouble * COIN_RESTRICT region)760 void ABC_INLINE inline CoinAbcMultiplyIndexed(int number,
761   const double *COIN_RESTRICT multiplier,
762   const int *COIN_RESTRICT thisIndex,
763   CoinFactorizationDouble *COIN_RESTRICT region)
764 {
765 }
766 #endif
767 double CoinAbcMaximumAbsElement(const double *region, int size);
768 void CoinAbcMinMaxAbsElement(const double *region, int size, double &minimum, double &maximum);
769 void CoinAbcMinMaxAbsNormalValues(const double *region, int size, double &minimum, double &maximum);
770 void CoinAbcScale(double *region, double multiplier, int size);
771 void CoinAbcScaleNormalValues(double *region, double multiplier, double killIfLessThanThis, int size);
772 /// maximum fabs(region[i]) and then region[i]*=multiplier
773 double CoinAbcMaximumAbsElementAndScale(double *region, double multiplier, int size);
774 void CoinAbcSetElements(double *region, int size, double value);
775 void CoinAbcMultiplyAdd(const double *region1, int size, double multiplier1,
776   double *regionChanged, double multiplier2);
777 double CoinAbcInnerProduct(const double *region1, int size, const double *region2);
778 void CoinAbcGetNorms(const double *region, int size, double &norm1, double &norm2);
779 /// regionTo[index[i]]=regionFrom[i]
780 void CoinAbcScatterTo(const double *regionFrom, double *regionTo, const int *index, int number);
781 /// regionTo[i]=regionFrom[index[i]]
782 void CoinAbcGatherFrom(const double *regionFrom, double *regionTo, const int *index, int number);
783 /// regionTo[index[i]]=0.0
784 void CoinAbcScatterZeroTo(double *regionTo, const int *index, int number);
785 /// regionTo[indexScatter[indexList[i]]]=regionFrom[indexList[i]]
786 void CoinAbcScatterToList(const double *regionFrom, double *regionTo,
787   const int *indexList, const int *indexScatter, int number);
788 /// array[i]=1.0/sqrt(array[i])
789 void CoinAbcInverseSqrts(double *array, int n);
790 void CoinAbcReciprocal(double *array, int n, const double *input);
791 void CoinAbcMemcpyLong(double *array, const double *arrayFrom, int size);
792 void CoinAbcMemcpyLong(int *array, const int *arrayFrom, int size);
793 void CoinAbcMemcpyLong(unsigned char *array, const unsigned char *arrayFrom, int size);
794 void CoinAbcMemset0Long(double *array, int size);
795 void CoinAbcMemset0Long(int *array, int size);
796 void CoinAbcMemset0Long(unsigned char *array, int size);
797 void CoinAbcMemmove(double *array, const double *arrayFrom, int size);
798 void CoinAbcMemmove(int *array, const int *arrayFrom, int size);
799 void CoinAbcMemmove(unsigned char *array, const unsigned char *arrayFrom, int size);
800 /// This moves down and zeroes out end
801 void CoinAbcMemmoveAndZero(double *array, double *arrayFrom, int size);
802 /// This compacts several sections and zeroes out end (returns number)
803 int CoinAbcCompact(int numberSections, int alreadyDone, double *array, const int *starts, const int *lengths);
804 /// This compacts several sections (returns number)
805 int CoinAbcCompact(int numberSections, int alreadyDone, int *array, const int *starts, const int *lengths);
806 #endif
807 #if ABC_CREATE_SCATTER_FUNCTION
functionName(ScatterUpdate1)808 SCATTER_ATTRIBUTE void functionName(ScatterUpdate1)(int numberIn, CoinFactorizationDouble multiplier,
809   const CoinFactorizationDouble *COIN_RESTRICT element,
810   CoinFactorizationDouble *COIN_RESTRICT region)
811 {
812 #ifndef NDEBUG
813   assert(numberIn == 1);
814 #endif
815   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 1);
816   int iColumn0 = thisColumn[0];
817   double value0 = region[iColumn0];
818   value0 OPERATION multiplier *element[0];
819   region[iColumn0] = value0;
820 }
functionName(ScatterUpdate2)821 SCATTER_ATTRIBUTE void functionName(ScatterUpdate2)(int numberIn, CoinFactorizationDouble multiplier,
822   const CoinFactorizationDouble *COIN_RESTRICT element,
823   CoinFactorizationDouble *COIN_RESTRICT region)
824 {
825 #ifndef NDEBUG
826   assert(numberIn == 2);
827 #endif
828   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 2);
829 #if NEW_CHUNK_SIZE == 2
830   int nFull = 2 & (~(NEW_CHUNK_SIZE - 1));
831   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
832     coin_prefetch(element + NEW_CHUNK_SIZE_INCREMENT);
833     int iColumn0 = thisColumn[0];
834     int iColumn1 = thisColumn[1];
835     CoinFactorizationDouble value0 = region[iColumn0];
836     CoinFactorizationDouble value1 = region[iColumn1];
837     value0 OPERATION multiplier *element[0 + NEW_CHUNK_SIZE_OFFSET];
838     value1 OPERATION multiplier *element[1 + NEW_CHUNK_SIZE_OFFSET];
839     region[iColumn0] = value0;
840     region[iColumn1] = value1;
841     element += NEW_CHUNK_SIZE_INCREMENT;
842     thisColumn = reinterpret_cast< const int * >(element);
843   }
844 #endif
845 #if NEW_CHUNK_SIZE == 4
846   int iColumn0 = thisColumn[0];
847   int iColumn1 = thisColumn[1];
848   CoinFactorizationDouble value0 = region[iColumn0];
849   CoinFactorizationDouble value1 = region[iColumn1];
850   value0 OPERATION multiplier *element[0];
851   value1 OPERATION multiplier *element[1];
852   region[iColumn0] = value0;
853   region[iColumn1] = value1;
854 #endif
855 }
functionName(ScatterUpdate3)856 SCATTER_ATTRIBUTE void functionName(ScatterUpdate3)(int numberIn, CoinFactorizationDouble multiplier,
857   const CoinFactorizationDouble *COIN_RESTRICT element,
858   CoinFactorizationDouble *COIN_RESTRICT region)
859 {
860 #ifndef NDEBUG
861   assert(numberIn == 3);
862 #endif
863   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 3);
864 #if AVX2 == 1
865   double temp[2];
866 #endif
867 #if NEW_CHUNK_SIZE == 2
868   int nFull = 3 & (~(NEW_CHUNK_SIZE - 1));
869   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
870     //coin_prefetch_const(element+NEW_CHUNK_SIZE_INCREMENT);
871     int iColumn0 = thisColumn[0];
872     int iColumn1 = thisColumn[1];
873     CoinFactorizationDouble value0 = region[iColumn0];
874     CoinFactorizationDouble value1 = region[iColumn1];
875     value0 OPERATION multiplier *element[0];
876     value1 OPERATION multiplier *element[1];
877     region[iColumn0] = value0;
878     region[iColumn1] = value1;
879     element += NEW_CHUNK_SIZE;
880     thisColumn + = NEW_CHUNK_SIZE;
881   }
882 #endif
883 #if NEW_CHUNK_SIZE == 2
884   int iColumn0 = thisColumn[0];
885   double value0 = region[iColumn0];
886   value0 OPERATION multiplier *element[0];
887   region[iColumn0] = value0;
888 #else
889   int iColumn0 = thisColumn[0];
890   int iColumn1 = thisColumn[1];
891   int iColumn2 = thisColumn[2];
892 #if AVX2 == 1
893   __v2df bb;
894   double value2 = region[iColumn2];
895   value2 OPERATION multiplier *element[2];
896   set_const_v2df(bb, multiplier);
897   temp[0] = region[iColumn0];
898   temp[1] = region[iColumn1];
899   region[iColumn2] = value2;
900   __v2df v0 = __builtin_ia32_loadupd(temp);
901   __v2df a = __builtin_ia32_loadupd(element);
902   a *= bb;
903   v0 OPERATION a;
904   __builtin_ia32_storeupd(temp, v0);
905   region[iColumn0] = temp[0];
906   region[iColumn1] = temp[1];
907 #else
908   double value0 = region[iColumn0];
909   double value1 = region[iColumn1];
910   double value2 = region[iColumn2];
911   value0 OPERATION multiplier *element[0];
912   value1 OPERATION multiplier *element[1];
913   value2 OPERATION multiplier *element[2];
914   region[iColumn0] = value0;
915   region[iColumn1] = value1;
916   region[iColumn2] = value2;
917 #endif
918 #endif
919 }
functionName(ScatterUpdate4)920 SCATTER_ATTRIBUTE void functionName(ScatterUpdate4)(int numberIn, CoinFactorizationDouble multiplier,
921   const CoinFactorizationDouble *COIN_RESTRICT element,
922   CoinFactorizationDouble *COIN_RESTRICT region)
923 {
924 #ifndef NDEBUG
925   assert(numberIn == 4);
926 #endif
927   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 4);
928   int nFull = 4 & (~(NEW_CHUNK_SIZE - 1));
929 #if AVX2 == 1
930   double temp[4];
931 #endif
932   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
933     //coin_prefetch_const(element+NEW_CHUNK_SIZE_INCREMENT);
934 #if NEW_CHUNK_SIZE == 2
935     int iColumn0 = thisColumn[0];
936     int iColumn1 = thisColumn[1];
937     double value0 = region[iColumn0];
938     double value1 = region[iColumn1];
939     value0 OPERATION multiplier *element[0];
940     value1 OPERATION multiplier *element[1];
941     region[iColumn0] = value0;
942     region[iColumn1] = value1;
943 #elif NEW_CHUNK_SIZE == 4
944     int iColumn0 = thisColumn[0];
945     int iColumn1 = thisColumn[1];
946     int iColumn2 = thisColumn[2];
947     int iColumn3 = thisColumn[3];
948 #if AVX2 == 1
949     __v2df bb;
950     set_const_v2df(bb, multiplier);
951     temp[0] = region[iColumn0];
952     temp[1] = region[iColumn1];
953     temp[2] = region[iColumn2];
954     temp[3] = region[iColumn3];
955     __v2df v0 = __builtin_ia32_loadupd(temp);
956     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
957     __v2df a = __builtin_ia32_loadupd(element);
958     a *= bb;
959     v0 OPERATION a;
960     a = __builtin_ia32_loadupd(element + 2);
961     a *= bb;
962     v1 OPERATION a;
963     __builtin_ia32_storeupd(temp, v0);
964     __builtin_ia32_storeupd(temp + 2, v1);
965     region[iColumn0] = temp[0];
966     region[iColumn1] = temp[1];
967     region[iColumn2] = temp[2];
968     region[iColumn3] = temp[3];
969 #else
970     double value0 = region[iColumn0];
971     double value1 = region[iColumn1];
972     double value2 = region[iColumn2];
973     double value3 = region[iColumn3];
974     value0 OPERATION multiplier *element[0];
975     value1 OPERATION multiplier *element[1];
976     value2 OPERATION multiplier *element[2];
977     value3 OPERATION multiplier *element[3];
978     region[iColumn0] = value0;
979     region[iColumn1] = value1;
980     region[iColumn2] = value2;
981     region[iColumn3] = value3;
982 #endif
983 #else
984     abort();
985 #endif
986     element += NEW_CHUNK_SIZE;
987     thisColumn += NEW_CHUNK_SIZE;
988   }
989 }
functionName(ScatterUpdate5)990 SCATTER_ATTRIBUTE void functionName(ScatterUpdate5)(int numberIn, CoinFactorizationDouble multiplier,
991   const CoinFactorizationDouble *COIN_RESTRICT element,
992   CoinFactorizationDouble *COIN_RESTRICT region)
993 {
994 #ifndef NDEBUG
995   assert(numberIn == 5);
996 #endif
997   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 5);
998   int nFull = 5 & (~(NEW_CHUNK_SIZE - 1));
999 #if AVX2 == 1
1000   double temp[4];
1001 #endif
1002   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1003     //coin_prefetch_const(element+NEW_CHUNK_SIZE_INCREMENT);
1004 #if NEW_CHUNK_SIZE == 2
1005     int iColumn0 = thisColumn[0];
1006     int iColumn1 = thisColumn[1];
1007     double value0 = region[iColumn0];
1008     double value1 = region[iColumn1];
1009     value0 OPERATION multiplier *element[0];
1010     value1 OPERATION multiplier *element[1];
1011     region[iColumn0] = value0;
1012     region[iColumn1] = value1;
1013 #elif NEW_CHUNK_SIZE == 4
1014     int iColumn0 = thisColumn[0];
1015     int iColumn1 = thisColumn[1];
1016     int iColumn2 = thisColumn[2];
1017     int iColumn3 = thisColumn[3];
1018 #if AVX2 == 1
1019     __v2df bb;
1020     set_const_v2df(bb, multiplier);
1021     temp[0] = region[iColumn0];
1022     temp[1] = region[iColumn1];
1023     temp[2] = region[iColumn2];
1024     temp[3] = region[iColumn3];
1025     __v2df v0 = __builtin_ia32_loadupd(temp);
1026     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1027     __v2df a = __builtin_ia32_loadupd(element);
1028     a *= bb;
1029     v0 OPERATION a;
1030     a = __builtin_ia32_loadupd(element + 2);
1031     a *= bb;
1032     v1 OPERATION a;
1033     __builtin_ia32_storeupd(temp, v0);
1034     __builtin_ia32_storeupd(temp + 2, v1);
1035     region[iColumn0] = temp[0];
1036     region[iColumn1] = temp[1];
1037     region[iColumn2] = temp[2];
1038     region[iColumn3] = temp[3];
1039 #else
1040     double value0 = region[iColumn0];
1041     double value1 = region[iColumn1];
1042     double value2 = region[iColumn2];
1043     double value3 = region[iColumn3];
1044     value0 OPERATION multiplier *element[0];
1045     value1 OPERATION multiplier *element[1];
1046     value2 OPERATION multiplier *element[2];
1047     value3 OPERATION multiplier *element[3];
1048     region[iColumn0] = value0;
1049     region[iColumn1] = value1;
1050     region[iColumn2] = value2;
1051     region[iColumn3] = value3;
1052 #endif
1053 #else
1054     abort();
1055 #endif
1056     element += NEW_CHUNK_SIZE;
1057     thisColumn += NEW_CHUNK_SIZE;
1058   }
1059   int iColumn0 = thisColumn[0];
1060   double value0 = region[iColumn0];
1061   value0 OPERATION multiplier *element[0];
1062   region[iColumn0] = value0;
1063 }
functionName(ScatterUpdate6)1064 SCATTER_ATTRIBUTE void functionName(ScatterUpdate6)(int numberIn, CoinFactorizationDouble multiplier,
1065   const CoinFactorizationDouble *COIN_RESTRICT element,
1066   CoinFactorizationDouble *COIN_RESTRICT region)
1067 {
1068 #ifndef NDEBUG
1069   assert(numberIn == 6);
1070 #endif
1071   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 6);
1072   int nFull = 6 & (~(NEW_CHUNK_SIZE - 1));
1073 #if AVX2 == 1
1074   double temp[4];
1075 #endif
1076   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1077     coin_prefetch_const(element + 6);
1078 #if NEW_CHUNK_SIZE == 2
1079     int iColumn0 = thisColumn[0];
1080     int iColumn1 = thisColumn[1];
1081     double value0 = region[iColumn0];
1082     double value1 = region[iColumn1];
1083     value0 OPERATION multiplier *element[0];
1084     value1 OPERATION multiplier *element[1];
1085     region[iColumn0] = value0;
1086     region[iColumn1] = value1;
1087 #elif NEW_CHUNK_SIZE == 4
1088     int iColumn0 = thisColumn[0];
1089     int iColumn1 = thisColumn[1];
1090     int iColumn2 = thisColumn[2];
1091     int iColumn3 = thisColumn[3];
1092 #if AVX2 == 1
1093     __v2df bb;
1094     set_const_v2df(bb, multiplier);
1095     temp[0] = region[iColumn0];
1096     temp[1] = region[iColumn1];
1097     temp[2] = region[iColumn2];
1098     temp[3] = region[iColumn3];
1099     __v2df v0 = __builtin_ia32_loadupd(temp);
1100     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1101     __v2df a = __builtin_ia32_loadupd(element);
1102     a *= bb;
1103     v0 OPERATION a;
1104     a = __builtin_ia32_loadupd(element + 2);
1105     a *= bb;
1106     v1 OPERATION a;
1107     __builtin_ia32_storeupd(temp, v0);
1108     __builtin_ia32_storeupd(temp + 2, v1);
1109     region[iColumn0] = temp[0];
1110     region[iColumn1] = temp[1];
1111     region[iColumn2] = temp[2];
1112     region[iColumn3] = temp[3];
1113 #else
1114     double value0 = region[iColumn0];
1115     double value1 = region[iColumn1];
1116     double value2 = region[iColumn2];
1117     double value3 = region[iColumn3];
1118     value0 OPERATION multiplier *element[0];
1119     value1 OPERATION multiplier *element[1];
1120     value2 OPERATION multiplier *element[2];
1121     value3 OPERATION multiplier *element[3];
1122     region[iColumn0] = value0;
1123     region[iColumn1] = value1;
1124     region[iColumn2] = value2;
1125     region[iColumn3] = value3;
1126 #endif
1127 #else
1128     abort();
1129 #endif
1130     element += NEW_CHUNK_SIZE;
1131     thisColumn += NEW_CHUNK_SIZE;
1132   }
1133 #if NEW_CHUNK_SIZE == 4
1134   int iColumn0 = thisColumn[0];
1135   int iColumn1 = thisColumn[1];
1136   double value0 = region[iColumn0];
1137   double value1 = region[iColumn1];
1138   value0 OPERATION multiplier *element[0];
1139   value1 OPERATION multiplier *element[1];
1140   region[iColumn0] = value0;
1141   region[iColumn1] = value1;
1142 #endif
1143 }
functionName(ScatterUpdate7)1144 SCATTER_ATTRIBUTE void functionName(ScatterUpdate7)(int numberIn, CoinFactorizationDouble multiplier,
1145   const CoinFactorizationDouble *COIN_RESTRICT element,
1146   CoinFactorizationDouble *COIN_RESTRICT region)
1147 {
1148 #ifndef NDEBUG
1149   assert(numberIn == 7);
1150 #endif
1151   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 7);
1152   int nFull = 7 & (~(NEW_CHUNK_SIZE - 1));
1153 #if AVX2 == 1
1154   double temp[4];
1155 #endif
1156   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1157     coin_prefetch_const(element + 6);
1158 #if NEW_CHUNK_SIZE == 2
1159     int iColumn0 = thisColumn[0];
1160     int iColumn1 = thisColumn[1];
1161     double value0 = region[iColumn0];
1162     double value1 = region[iColumn1];
1163     value0 OPERATION multiplier *element[0];
1164     value1 OPERATION multiplier *element[1];
1165     region[iColumn0] = value0;
1166     region[iColumn1] = value1;
1167 #elif NEW_CHUNK_SIZE == 4
1168     int iColumn0 = thisColumn[0];
1169     int iColumn1 = thisColumn[1];
1170     int iColumn2 = thisColumn[2];
1171     int iColumn3 = thisColumn[3];
1172 #if AVX2 == 1
1173     __v2df bb;
1174     set_const_v2df(bb, multiplier);
1175     temp[0] = region[iColumn0];
1176     temp[1] = region[iColumn1];
1177     temp[2] = region[iColumn2];
1178     temp[3] = region[iColumn3];
1179     __v2df v0 = __builtin_ia32_loadupd(temp);
1180     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1181     __v2df a = __builtin_ia32_loadupd(element);
1182     a *= bb;
1183     v0 OPERATION a;
1184     a = __builtin_ia32_loadupd(element + 2);
1185     a *= bb;
1186     v1 OPERATION a;
1187     __builtin_ia32_storeupd(temp, v0);
1188     __builtin_ia32_storeupd(temp + 2, v1);
1189     region[iColumn0] = temp[0];
1190     region[iColumn1] = temp[1];
1191     region[iColumn2] = temp[2];
1192     region[iColumn3] = temp[3];
1193 #else
1194     double value0 = region[iColumn0];
1195     double value1 = region[iColumn1];
1196     double value2 = region[iColumn2];
1197     double value3 = region[iColumn3];
1198     value0 OPERATION multiplier *element[0];
1199     value1 OPERATION multiplier *element[1];
1200     value2 OPERATION multiplier *element[2];
1201     value3 OPERATION multiplier *element[3];
1202     region[iColumn0] = value0;
1203     region[iColumn1] = value1;
1204     region[iColumn2] = value2;
1205     region[iColumn3] = value3;
1206 #endif
1207 #else
1208     abort();
1209 #endif
1210     element += NEW_CHUNK_SIZE;
1211     thisColumn += NEW_CHUNK_SIZE;
1212   }
1213 #if NEW_CHUNK_SIZE == 2
1214   int iColumn0 = thisColumn[0];
1215   double value0 = region[iColumn0];
1216   value0 OPERATION multiplier *element[0];
1217   region[iColumn0] = value0;
1218 #else
1219   int iColumn0 = thisColumn[0];
1220   int iColumn1 = thisColumn[1];
1221   int iColumn2 = thisColumn[2];
1222   double value0 = region[iColumn0];
1223   double value1 = region[iColumn1];
1224   double value2 = region[iColumn2];
1225   value0 OPERATION multiplier *element[0];
1226   value1 OPERATION multiplier *element[1];
1227   value2 OPERATION multiplier *element[2];
1228   region[iColumn0] = value0;
1229   region[iColumn1] = value1;
1230   region[iColumn2] = value2;
1231 #endif
1232 }
functionName(ScatterUpdate8)1233 SCATTER_ATTRIBUTE void functionName(ScatterUpdate8)(int numberIn, CoinFactorizationDouble multiplier,
1234   const CoinFactorizationDouble *COIN_RESTRICT element,
1235   CoinFactorizationDouble *COIN_RESTRICT region)
1236 {
1237 #ifndef NDEBUG
1238   assert(numberIn == 8);
1239 #endif
1240   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + 8);
1241   int nFull = 8 & (~(NEW_CHUNK_SIZE - 1));
1242 #if AVX2 == 1
1243   double temp[4];
1244 #endif
1245   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1246     coin_prefetch_const(element + 6);
1247 #if NEW_CHUNK_SIZE == 2
1248     int iColumn0 = thisColumn[0];
1249     int iColumn1 = thisColumn[1];
1250     double value0 = region[iColumn0];
1251     double value1 = region[iColumn1];
1252     value0 OPERATION multiplier *element[0];
1253     value1 OPERATION multiplier *element[1];
1254     region[iColumn0] = value0;
1255     region[iColumn1] = value1;
1256 #elif NEW_CHUNK_SIZE == 4
1257     int iColumn0 = thisColumn[0];
1258     int iColumn1 = thisColumn[1];
1259     int iColumn2 = thisColumn[2];
1260     int iColumn3 = thisColumn[3];
1261 #if AVX2 == 1
1262     __v2df bb;
1263     set_const_v2df(bb, multiplier);
1264     temp[0] = region[iColumn0];
1265     temp[1] = region[iColumn1];
1266     temp[2] = region[iColumn2];
1267     temp[3] = region[iColumn3];
1268     __v2df v0 = __builtin_ia32_loadupd(temp);
1269     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1270     __v2df a = __builtin_ia32_loadupd(element);
1271     a *= bb;
1272     v0 OPERATION a;
1273     a = __builtin_ia32_loadupd(element + 2);
1274     a *= bb;
1275     v1 OPERATION a;
1276     __builtin_ia32_storeupd(temp, v0);
1277     __builtin_ia32_storeupd(temp + 2, v1);
1278     region[iColumn0] = temp[0];
1279     region[iColumn1] = temp[1];
1280     region[iColumn2] = temp[2];
1281     region[iColumn3] = temp[3];
1282 #else
1283     double value0 = region[iColumn0];
1284     double value1 = region[iColumn1];
1285     double value2 = region[iColumn2];
1286     double value3 = region[iColumn3];
1287     value0 OPERATION multiplier *element[0];
1288     value1 OPERATION multiplier *element[1];
1289     value2 OPERATION multiplier *element[2];
1290     value3 OPERATION multiplier *element[3];
1291     region[iColumn0] = value0;
1292     region[iColumn1] = value1;
1293     region[iColumn2] = value2;
1294     region[iColumn3] = value3;
1295 #endif
1296 #else
1297     abort();
1298 #endif
1299     element += NEW_CHUNK_SIZE;
1300     thisColumn += NEW_CHUNK_SIZE;
1301   }
1302 }
functionName(ScatterUpdate4N)1303 SCATTER_ATTRIBUTE void functionName(ScatterUpdate4N)(int numberIn, CoinFactorizationDouble multiplier,
1304   const CoinFactorizationDouble *COIN_RESTRICT element,
1305   CoinFactorizationDouble *COIN_RESTRICT region)
1306 {
1307   assert((numberIn & 3) == 0);
1308   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + numberIn);
1309   int nFull = numberIn & (~(NEW_CHUNK_SIZE - 1));
1310 #if AVX2 == 1
1311   double temp[4];
1312 #endif
1313   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1314     coin_prefetch_const(element + 16);
1315     coin_prefetch_const(thisColumn + 32);
1316 #if NEW_CHUNK_SIZE == 2
1317     int iColumn0 = thisColumn[0];
1318     int iColumn1 = thisColumn[1];
1319     double value0 = region[iColumn0];
1320     double value1 = region[iColumn1];
1321     value0 OPERATION multiplier *element[0];
1322     value1 OPERATION multiplier *element[1];
1323     region[iColumn0] = value0;
1324     region[iColumn1] = value1;
1325 #elif NEW_CHUNK_SIZE == 4
1326     int iColumn0 = thisColumn[0];
1327     int iColumn1 = thisColumn[1];
1328     int iColumn2 = thisColumn[2];
1329     int iColumn3 = thisColumn[3];
1330 #if AVX2 == 1
1331     __v2df bb;
1332     set_const_v2df(bb, multiplier);
1333     temp[0] = region[iColumn0];
1334     temp[1] = region[iColumn1];
1335     temp[2] = region[iColumn2];
1336     temp[3] = region[iColumn3];
1337     __v2df v0 = __builtin_ia32_loadupd(temp);
1338     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1339     __v2df a = __builtin_ia32_loadupd(element);
1340     a *= bb;
1341     v0 OPERATION a;
1342     a = __builtin_ia32_loadupd(element + 2);
1343     a *= bb;
1344     v1 OPERATION a;
1345     __builtin_ia32_storeupd(temp, v0);
1346     __builtin_ia32_storeupd(temp + 2, v1);
1347     region[iColumn0] = temp[0];
1348     region[iColumn1] = temp[1];
1349     region[iColumn2] = temp[2];
1350     region[iColumn3] = temp[3];
1351 #else
1352     double value0 = region[iColumn0];
1353     double value1 = region[iColumn1];
1354     double value2 = region[iColumn2];
1355     double value3 = region[iColumn3];
1356     value0 OPERATION multiplier *element[0];
1357     value1 OPERATION multiplier *element[1];
1358     value2 OPERATION multiplier *element[2];
1359     value3 OPERATION multiplier *element[3];
1360     region[iColumn0] = value0;
1361     region[iColumn1] = value1;
1362     region[iColumn2] = value2;
1363     region[iColumn3] = value3;
1364 #endif
1365 #else
1366     abort();
1367 #endif
1368     element += NEW_CHUNK_SIZE;
1369     thisColumn += NEW_CHUNK_SIZE;
1370   }
1371 }
functionName(ScatterUpdate4NPlus1)1372 SCATTER_ATTRIBUTE void functionName(ScatterUpdate4NPlus1)(int numberIn, CoinFactorizationDouble multiplier,
1373   const CoinFactorizationDouble *COIN_RESTRICT element,
1374   CoinFactorizationDouble *COIN_RESTRICT region)
1375 {
1376   assert((numberIn & 3) == 1);
1377   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + numberIn);
1378   int nFull = numberIn & (~(NEW_CHUNK_SIZE - 1));
1379 #if AVX2 == 1
1380   double temp[4];
1381 #endif
1382   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1383     coin_prefetch_const(element + 16);
1384     coin_prefetch_const(thisColumn + 32);
1385 #if NEW_CHUNK_SIZE == 2
1386     int iColumn0 = thisColumn[0];
1387     int iColumn1 = thisColumn[1];
1388     double value0 = region[iColumn0];
1389     double value1 = region[iColumn1];
1390     value0 OPERATION multiplier *element[0];
1391     value1 OPERATION multiplier *element[1];
1392     region[iColumn0] = value0;
1393     region[iColumn1] = value1;
1394 #elif NEW_CHUNK_SIZE == 4
1395     int iColumn0 = thisColumn[0];
1396     int iColumn1 = thisColumn[1];
1397     int iColumn2 = thisColumn[2];
1398     int iColumn3 = thisColumn[3];
1399 #if AVX2 == 1
1400     __v2df bb;
1401     set_const_v2df(bb, multiplier);
1402     temp[0] = region[iColumn0];
1403     temp[1] = region[iColumn1];
1404     temp[2] = region[iColumn2];
1405     temp[3] = region[iColumn3];
1406     __v2df v0 = __builtin_ia32_loadupd(temp);
1407     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1408     __v2df a = __builtin_ia32_loadupd(element);
1409     a *= bb;
1410     v0 OPERATION a;
1411     a = __builtin_ia32_loadupd(element + 2);
1412     a *= bb;
1413     v1 OPERATION a;
1414     __builtin_ia32_storeupd(temp, v0);
1415     __builtin_ia32_storeupd(temp + 2, v1);
1416     region[iColumn0] = temp[0];
1417     region[iColumn1] = temp[1];
1418     region[iColumn2] = temp[2];
1419     region[iColumn3] = temp[3];
1420 #else
1421     double value0 = region[iColumn0];
1422     double value1 = region[iColumn1];
1423     double value2 = region[iColumn2];
1424     double value3 = region[iColumn3];
1425     value0 OPERATION multiplier *element[0];
1426     value1 OPERATION multiplier *element[1];
1427     value2 OPERATION multiplier *element[2];
1428     value3 OPERATION multiplier *element[3];
1429     region[iColumn0] = value0;
1430     region[iColumn1] = value1;
1431     region[iColumn2] = value2;
1432     region[iColumn3] = value3;
1433 #endif
1434 #else
1435     abort();
1436 #endif
1437     element += NEW_CHUNK_SIZE;
1438     thisColumn += NEW_CHUNK_SIZE;
1439   }
1440   int iColumn0 = thisColumn[0];
1441   double value0 = region[iColumn0];
1442   value0 OPERATION multiplier *element[0];
1443   region[iColumn0] = value0;
1444 }
functionName(ScatterUpdate4NPlus2)1445 SCATTER_ATTRIBUTE void functionName(ScatterUpdate4NPlus2)(int numberIn, CoinFactorizationDouble multiplier,
1446   const CoinFactorizationDouble *COIN_RESTRICT element,
1447   CoinFactorizationDouble *COIN_RESTRICT region)
1448 {
1449   assert((numberIn & 3) == 2);
1450   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + numberIn);
1451   int nFull = numberIn & (~(NEW_CHUNK_SIZE - 1));
1452 #if AVX2 == 1
1453   double temp[4];
1454 #endif
1455   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1456     coin_prefetch_const(element + 16);
1457     coin_prefetch_const(thisColumn + 32);
1458 #if NEW_CHUNK_SIZE == 2
1459     int iColumn0 = thisColumn[0];
1460     int iColumn1 = thisColumn[1];
1461     double value0 = region[iColumn0];
1462     double value1 = region[iColumn1];
1463     value0 OPERATION multiplier *element[0];
1464     value1 OPERATION multiplier *element[1];
1465     region[iColumn0] = value0;
1466     region[iColumn1] = value1;
1467 #elif NEW_CHUNK_SIZE == 4
1468     int iColumn0 = thisColumn[0];
1469     int iColumn1 = thisColumn[1];
1470     int iColumn2 = thisColumn[2];
1471     int iColumn3 = thisColumn[3];
1472 #if AVX2 == 1
1473     __v2df bb;
1474     set_const_v2df(bb, multiplier);
1475     temp[0] = region[iColumn0];
1476     temp[1] = region[iColumn1];
1477     temp[2] = region[iColumn2];
1478     temp[3] = region[iColumn3];
1479     __v2df v0 = __builtin_ia32_loadupd(temp);
1480     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1481     __v2df a = __builtin_ia32_loadupd(element);
1482     a *= bb;
1483     v0 OPERATION a;
1484     a = __builtin_ia32_loadupd(element + 2);
1485     a *= bb;
1486     v1 OPERATION a;
1487     __builtin_ia32_storeupd(temp, v0);
1488     __builtin_ia32_storeupd(temp + 2, v1);
1489     region[iColumn0] = temp[0];
1490     region[iColumn1] = temp[1];
1491     region[iColumn2] = temp[2];
1492     region[iColumn3] = temp[3];
1493 #else
1494     double value0 = region[iColumn0];
1495     double value1 = region[iColumn1];
1496     double value2 = region[iColumn2];
1497     double value3 = region[iColumn3];
1498     value0 OPERATION multiplier *element[0];
1499     value1 OPERATION multiplier *element[1];
1500     value2 OPERATION multiplier *element[2];
1501     value3 OPERATION multiplier *element[3];
1502     region[iColumn0] = value0;
1503     region[iColumn1] = value1;
1504     region[iColumn2] = value2;
1505     region[iColumn3] = value3;
1506 #endif
1507 #else
1508     abort();
1509 #endif
1510     element += NEW_CHUNK_SIZE;
1511     thisColumn += NEW_CHUNK_SIZE;
1512   }
1513 #if NEW_CHUNK_SIZE == 4
1514   int iColumn0 = thisColumn[0];
1515   int iColumn1 = thisColumn[1];
1516   double value0 = region[iColumn0];
1517   double value1 = region[iColumn1];
1518   value0 OPERATION multiplier *element[0];
1519   value1 OPERATION multiplier *element[1];
1520   region[iColumn0] = value0;
1521   region[iColumn1] = value1;
1522 #endif
1523 }
functionName(ScatterUpdate4NPlus3)1524 SCATTER_ATTRIBUTE void functionName(ScatterUpdate4NPlus3)(int numberIn, CoinFactorizationDouble multiplier,
1525   const CoinFactorizationDouble *COIN_RESTRICT element,
1526   CoinFactorizationDouble *COIN_RESTRICT region)
1527 {
1528   assert((numberIn & 3) == 3);
1529   const int *COIN_RESTRICT thisColumn = reinterpret_cast< const int * >(element + numberIn);
1530   int nFull = numberIn & (~(NEW_CHUNK_SIZE - 1));
1531 #if AVX2 == 1
1532   double temp[4];
1533 #endif
1534   for (int j = 0; j < nFull; j += NEW_CHUNK_SIZE) {
1535     coin_prefetch_const(element + 16);
1536     coin_prefetch_const(thisColumn + 32);
1537 #if NEW_CHUNK_SIZE == 2
1538     int iColumn0 = thisColumn[0];
1539     int iColumn1 = thisColumn[1];
1540     double value0 = region[iColumn0];
1541     double value1 = region[iColumn1];
1542     value0 OPERATION multiplier *element[0];
1543     value1 OPERATION multiplier *element[1];
1544     region[iColumn0] = value0;
1545     region[iColumn1] = value1;
1546 #elif NEW_CHUNK_SIZE == 4
1547     int iColumn0 = thisColumn[0];
1548     int iColumn1 = thisColumn[1];
1549     int iColumn2 = thisColumn[2];
1550     int iColumn3 = thisColumn[3];
1551 #if AVX2 == 1
1552     __v2df bb;
1553     set_const_v2df(bb, multiplier);
1554     temp[0] = region[iColumn0];
1555     temp[1] = region[iColumn1];
1556     temp[2] = region[iColumn2];
1557     temp[3] = region[iColumn3];
1558     __v2df v0 = __builtin_ia32_loadupd(temp);
1559     __v2df v1 = __builtin_ia32_loadupd(temp + 2);
1560     __v2df a = __builtin_ia32_loadupd(element);
1561     a *= bb;
1562     v0 OPERATION a;
1563     a = __builtin_ia32_loadupd(element + 2);
1564     a *= bb;
1565     v1 OPERATION a;
1566     __builtin_ia32_storeupd(temp, v0);
1567     __builtin_ia32_storeupd(temp + 2, v1);
1568     region[iColumn0] = temp[0];
1569     region[iColumn1] = temp[1];
1570     region[iColumn2] = temp[2];
1571     region[iColumn3] = temp[3];
1572 #else
1573     double value0 = region[iColumn0];
1574     double value1 = region[iColumn1];
1575     double value2 = region[iColumn2];
1576     double value3 = region[iColumn3];
1577     value0 OPERATION multiplier *element[0];
1578     value1 OPERATION multiplier *element[1];
1579     value2 OPERATION multiplier *element[2];
1580     value3 OPERATION multiplier *element[3];
1581     region[iColumn0] = value0;
1582     region[iColumn1] = value1;
1583     region[iColumn2] = value2;
1584     region[iColumn3] = value3;
1585 #endif
1586 #else
1587     abort();
1588 #endif
1589     element += NEW_CHUNK_SIZE;
1590     thisColumn += NEW_CHUNK_SIZE;
1591   }
1592 #if NEW_CHUNK_SIZE == 2
1593   int iColumn0 = thisColumn[0];
1594   double value0 = region[iColumn0];
1595   value0 OPERATION multiplier *element[0];
1596   region[iColumn0] = value0;
1597 #else
1598   int iColumn0 = thisColumn[0];
1599   int iColumn1 = thisColumn[1];
1600   int iColumn2 = thisColumn[2];
1601   double value0 = region[iColumn0];
1602   double value1 = region[iColumn1];
1603   double value2 = region[iColumn2];
1604   value0 OPERATION multiplier *element[0];
1605   value1 OPERATION multiplier *element[1];
1606   value2 OPERATION multiplier *element[2];
1607   region[iColumn0] = value0;
1608   region[iColumn1] = value1;
1609   region[iColumn2] = value2;
1610 #endif
1611 }
1612 #endif
1613 
1614 /* vi: softtabstop=2 shiftwidth=2 expandtab tabstop=2
1615 */
1616