1 /* $Id: AbcSimplexParallel.cpp 2385 2019-01-06 19:43:06Z unxusr $ */
2 // Copyright (C) 2002, International Business Machines
3 // Corporation and others, Copyright (C) 2012, FasterCoin.  All Rights Reserved.
4 // This code is licensed under the terms of the Eclipse Public License (EPL).
5 #include "CoinPragma.hpp"
6 
7 #include <math.h>
8 #include "CoinHelperFunctions.hpp"
9 #include "CoinAbcHelperFunctions.hpp"
10 #include "AbcSimplexDual.hpp"
11 #include "ClpEventHandler.hpp"
12 #include "AbcSimplexFactorization.hpp"
13 #include "CoinPackedMatrix.hpp"
14 #include "CoinIndexedVector.hpp"
15 #include "CoinFloatEqual.hpp"
16 #include "AbcDualRowDantzig.hpp"
17 #include "ClpMessage.hpp"
18 #include "ClpLinearObjective.hpp"
19 #include <cfloat>
20 #include <cassert>
21 #include <string>
22 #include <stdio.h>
23 #include <iostream>
24 //#undef AVX2
25 #define _mm256_broadcast_sd(x) static_cast< __m256d >(__builtin_ia32_vbroadcastsd256(x))
26 #define _mm256_load_pd(x) *(__m256d *)(x)
27 #define _mm256_store_pd (s, x) * ((__m256d *)s) = x
28 //#define ABC_DEBUG 2
29 /* Reasons to come out:
30    -1 iterations etc
31    -2 inaccuracy
32    -3 slight inaccuracy (and done iterations)
33    +0 looks optimal (might be unbounded - but we will investigate)
34    +1 looks infeasible
35    +3 max iterations
36 */
whileIteratingSerial()37 int AbcSimplexDual::whileIteratingSerial()
38 {
39   /* arrays
40      0 - to get tableau row and then for weights update
41      1 - tableau column
42      2 - for flip
43      3 - actual tableau row
44   */
45 #ifdef CLP_DEBUG
46   int debugIteration = -1;
47 #endif
48   // if can't trust much and long way from optimal then relax
49   //if (largestPrimalError_ > 10.0)
50   //abcFactorization_->relaxAccuracyCheck(CoinMin(1.0e2, largestPrimalError_ / 10.0));
51   //else
52   //abcFactorization_->relaxAccuracyCheck(1.0);
53   // status stays at -1 while iterating, >=0 finished, -2 to invert
54   // status -3 to go to top without an invert
55   int returnCode = -1;
56 #define DELAYED_UPDATE
57   arrayForBtran_ = 0;
58   setUsedArray(arrayForBtran_);
59   arrayForFtran_ = 1;
60   setUsedArray(arrayForFtran_);
61   arrayForFlipBounds_ = 2;
62   setUsedArray(arrayForFlipBounds_);
63   arrayForTableauRow_ = 3;
64   setUsedArray(arrayForTableauRow_);
65   arrayForDualColumn_ = 4;
66   setUsedArray(arrayForDualColumn_);
67   arrayForReplaceColumn_ = 4; //4;
68   arrayForFlipRhs_ = 5;
69   setUsedArray(arrayForFlipRhs_);
70   dualPivotRow();
71   lastPivotRow_ = pivotRow_;
72   if (pivotRow_ >= 0) {
73     // we found a pivot row
74     createDualPricingVectorSerial();
75     do {
76 #if ABC_DEBUG
77       checkArrays();
78 #endif
79       /*
80 	-1 - just move dual in values pass
81 	0 - take iteration
82 	1 - don't take but continue
83 	2 - don't take and break
84       */
85       stateOfIteration_ = 0;
86       returnCode = -1;
87       // put row of tableau in usefulArray[arrayForTableauRow_]
88       /*
89 	Could
90 	a) copy [arrayForBtran_] and start off updateWeights earlier
91 	b) break transposeTimes into two and do after slack part
92 	c) also think if cleaner to go all way with update but just don't do final part
93       */
94       //upperTheta=COIN_DBL_MAX;
95       double saveAcceptable = acceptablePivot_;
96       if (largestPrimalError_ > 1.0e-5 || largestDualError_ > 1.0e-5) {
97         if (!abcFactorization_->pivots())
98           acceptablePivot_ *= 1.0e2;
99         else if (abcFactorization_->pivots() < 5)
100           acceptablePivot_ *= 1.0e1;
101       }
102       dualColumn1();
103       acceptablePivot_ = saveAcceptable;
104       if ((stateOfProblem_ & VALUES_PASS) != 0) {
105         // see if can just move dual
106         if (fabs(upperTheta_ - fabs(abcDj_[sequenceOut_])) < dualTolerance_) {
107           stateOfIteration_ = -1;
108         }
109       }
110       //usefulArray_[arrayForTableauRow_].sortPacked();
111       //usefulArray_[arrayForTableauRow_].print();
112       //usefulArray_[arrayForDualColumn_].setPackedMode(true);
113       //usefulArray_[arrayForDualColumn].sortPacked();
114       //usefulArray_[arrayForDualColumn].print();
115       if (!stateOfIteration_) {
116         // get sequenceIn_
117         dualPivotColumn();
118         if (sequenceIn_ >= 0) {
119           // normal iteration
120           // update the incoming column (and do weights)
121           getTableauColumnFlipAndStartReplaceSerial();
122         } else if (stateOfIteration_ != -1) {
123           stateOfIteration_ = 2;
124         }
125       }
126       if (!stateOfIteration_) {
127         //	assert (stateOfIteration_!=1);
128         int whatNext = 0;
129         whatNext = housekeeping();
130         if (whatNext == 1) {
131           problemStatus_ = -2; // refactorize
132         } else if (whatNext == 2) {
133           // maximum iterations or equivalent
134           problemStatus_ = 3;
135           returnCode = 3;
136           stateOfIteration_ = 2;
137         }
138         if (problemStatus_ == -1) {
139           replaceColumnPart3();
140           //clearArrays(arrayForReplaceColumn_);
141 #if ABC_DEBUG
142           checkArrays();
143 #endif
144           updateDualsInDual();
145           abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
146             movement_);
147 #if ABC_DEBUG
148           checkArrays();
149 #endif
150         } else {
151           abcDualRowPivot_->updateWeights2(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
152           //clearArrays(arrayForBtran_);
153           //clearArrays(arrayForFtran_);
154         }
155       } else {
156         if (stateOfIteration_ < 0) {
157           // move dual in dual values pass
158           theta_ = abcDj_[sequenceOut_];
159           updateDualsInDual();
160           abcDj_[sequenceOut_] = 0.0;
161           sequenceOut_ = -1;
162         }
163         // clear all arrays
164         clearArrays(-1);
165         //sequenceIn_=-1;
166         //sequenceOut_=-1;
167       }
168       // Check event
169       {
170         int status = eventHandler_->event(ClpEventHandler::endOfIteration);
171         if (status >= 0) {
172           problemStatus_ = 5;
173           secondaryStatus_ = ClpEventHandler::endOfIteration;
174           returnCode = 4;
175           stateOfIteration_ = 2;
176         }
177       }
178       // at this stage sequenceIn_ may be <0
179       if (sequenceIn_ < 0 && sequenceOut_ >= 0) {
180         // no incoming column is valid
181         returnCode = noPivotColumn();
182       }
183       if (stateOfIteration_ == 2) {
184         sequenceIn_ = -1;
185         break;
186       }
187       swapPrimalStuff();
188       if (problemStatus_ != -1) {
189         break;
190       }
191       // dualRow will go to virtual row pivot choice algorithm
192       // make sure values pass off if it should be
193       // use Btran array and clear inside dualPivotRow (if used)
194       int lastSequenceOut = sequenceOut_;
195       int lastDirectionOut = directionOut_;
196       dualPivotRow();
197       lastPivotRow_ = pivotRow_;
198       if (pivotRow_ >= 0) {
199         // we found a pivot row
200         // create as packed
201         createDualPricingVectorSerial();
202         swapDualStuff(lastSequenceOut, lastDirectionOut);
203         // next pivot
204       } else {
205         // no pivot row
206         clearArrays(-1);
207         returnCode = noPivotRow();
208         break;
209       }
210     } while (problemStatus_ == -1);
211     usefulArray_[arrayForTableauRow_].compact();
212 #if ABC_DEBUG
213     checkArrays();
214 #endif
215   } else {
216     // no pivot row on first try
217     clearArrays(-1);
218     returnCode = noPivotRow();
219   }
220   //printStuff();
221   clearArrays(-1);
222   //if (pivotRow_==-100)
223   //returnCode=-100; // end of values pass
224   return returnCode;
225 }
226 // Create dual pricing vector
createDualPricingVectorSerial()227 void AbcSimplexDual::createDualPricingVectorSerial()
228 {
229 #ifndef NDEBUG
230 #if ABC_NORMAL_DEBUG > 3
231   checkArrays();
232 #endif
233 #endif
234   // we found a pivot row
235   if (handler_->detail(CLP_SIMPLEX_PIVOTROW, messages_) < 100) {
236     handler_->message(CLP_SIMPLEX_PIVOTROW, messages_)
237       << pivotRow_
238       << CoinMessageEol;
239   }
240   // check accuracy of weights
241   abcDualRowPivot_->checkAccuracy();
242   // get sign for finding row of tableau
243   // create as packed
244   usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
245   abcFactorization_->updateColumnTranspose(usefulArray_[arrayForBtran_]);
246   sequenceIn_ = -1;
247 }
getTableauColumnPart1Serial()248 void AbcSimplexDual::getTableauColumnPart1Serial()
249 {
250 #if ABC_DEBUG
251   {
252     const double *work = usefulArray_[arrayForTableauRow_].denseVector();
253     int number = usefulArray_[arrayForTableauRow_].getNumElements();
254     const int *which = usefulArray_[arrayForTableauRow_].getIndices();
255     for (int i = 0; i < number; i++) {
256       if (which[i] == sequenceIn_) {
257         assert(alpha_ == work[i]);
258         break;
259       }
260     }
261   }
262 #endif
263   //int returnCode=0;
264   // update the incoming column
265   unpack(usefulArray_[arrayForFtran_]);
266   // Array[0] may be needed until updateWeights2
267   // and update dual weights (can do in parallel - with extra array)
268   alpha_ = abcDualRowPivot_->updateWeights1(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
269 }
getTableauColumnFlipAndStartReplaceSerial()270 int AbcSimplexDual::getTableauColumnFlipAndStartReplaceSerial()
271 {
272   // move checking stuff down into called functions
273   int numberFlipped;
274   numberFlipped = flipBounds();
275   // update the incoming column
276   getTableauColumnPart1Serial();
277   checkReplacePart1();
278   //checkReplacePart1a();
279   //checkReplacePart1b();
280   getTableauColumnPart2();
281   //usefulArray_[arrayForTableauRow_].compact();
282   // returns 3 if skip this iteration and re-factorize
283   /*
284   return code
285   0 - OK
286   2 - flag something and skip
287   3 - break and re-factorize
288   4 - break and say infeasible
289  */
290   int returnCode = 0;
291   // amount primal will move
292   movement_ = -dualOut_ * directionOut_ / alpha_;
293   // see if update stable
294 #if ABC_NORMAL_DEBUG > 3
295   if ((handler_->logLevel() & 32)) {
296     double ft = ftAlpha_ * abcFactorization_->pivotRegion()[pivotRow_];
297     double diff1 = fabs(alpha_ - btranAlpha_);
298     double diff2 = fabs(fabs(alpha_) - fabs(ft));
299     double diff3 = fabs(fabs(ft) - fabs(btranAlpha_));
300     double largest = CoinMax(CoinMax(diff1, diff2), diff3);
301     printf("btran alpha %g, ftran alpha %g ftAlpha %g largest diff %g\n",
302       btranAlpha_, alpha_, ft, largest);
303     if (largest > 0.001 * fabs(alpha_)) {
304       printf("bad\n");
305     }
306   }
307 #endif
308   int numberPivots = abcFactorization_->pivots();
309   //double checkValue = 1.0e-7; // numberPivots<5 ? 1.0e-7 : 1.0e-6;
310   double checkValue = numberPivots ? 1.0e-7 : 1.0e-5;
311   // if can't trust much and long way from optimal then relax
312   if (largestPrimalError_ > 10.0)
313     checkValue = CoinMin(1.0e-4, 1.0e-8 * largestPrimalError_);
314   if (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > checkValue * (1.0 + fabs(alpha_))) {
315     handler_->message(CLP_DUAL_CHECK, messages_)
316       << btranAlpha_
317       << alpha_
318       << CoinMessageEol;
319     // see with more relaxed criterion
320     double test;
321     if (fabs(btranAlpha_) < 1.0e-8 || fabs(alpha_) < 1.0e-8)
322       test = 1.0e-1 * fabs(alpha_);
323     else
324       test = 10.0 * checkValue; //1.0e-4 * (1.0 + fabs(alpha_));
325     bool needFlag = (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > test);
326     double derror = CoinMin(fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), 1.0) * 0.9999e7;
327     int error = 0;
328     while (derror > 1.0) {
329       error++;
330       derror *= 0.1;
331     }
332     int frequency[8] = { 99999999, 100, 10, 2, 1, 1, 1, 1 };
333     int newFactorFrequency = CoinMin(forceFactorization_, frequency[error]);
334 #if ABC_NORMAL_DEBUG > 0
335     if (newFactorFrequency < forceFactorization_)
336       printf("Error of %g after %d pivots old forceFact %d now %d\n",
337         fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), numberPivots,
338         forceFactorization_, newFactorFrequency);
339 #endif
340     if (!numberPivots && fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)) > 0.5e-6
341       && abcFactorization_->pivotTolerance() < 0.5)
342       abcFactorization_->saferTolerances(1.0, -1.03);
343     forceFactorization_ = newFactorFrequency;
344 
345 #if ABC_NORMAL_DEBUG > 0
346     if (fabs(btranAlpha_ + alpha_) < 1.0e-5 * (1.0 + fabs(alpha_))) {
347       printf("diff (%g,%g) %g check %g\n", btranAlpha_, alpha_, fabs(btranAlpha_ - alpha_), checkValue * (1.0 + fabs(alpha_)));
348     }
349 #endif
350     if (numberPivots) {
351       if (needFlag && numberPivots < 10) {
352         // need to reject something
353         assert(sequenceOut_ >= 0);
354         char x = isColumn(sequenceOut_) ? 'C' : 'R';
355         handler_->message(CLP_SIMPLEX_FLAG, messages_)
356           << x << sequenceWithin(sequenceOut_)
357           << CoinMessageEol;
358 #if ABC_NORMAL_DEBUG > 0
359         printf("flag %d as alpha  %g %g - after %d pivots\n", sequenceOut_,
360           btranAlpha_, alpha_, numberPivots);
361 #endif
362         // Make safer?
363         abcFactorization_->saferTolerances(1.0, -1.03);
364         setFlagged(sequenceOut_);
365         // so can't happen immediately again
366         sequenceOut_ = -1;
367         lastBadIteration_ = numberIterations_; // say be more cautious
368       }
369       // Make safer?
370       //if (numberPivots<5)
371       //abcFactorization_->saferTolerances (-0.99, -1.01);
372       problemStatus_ = -2; // factorize now
373       returnCode = -2;
374       stateOfIteration_ = 2;
375     } else {
376       if (needFlag) {
377         assert(sequenceOut_ >= 0);
378         // need to reject something
379         char x = isColumn(sequenceOut_) ? 'C' : 'R';
380         handler_->message(CLP_SIMPLEX_FLAG, messages_)
381           << x << sequenceWithin(sequenceOut_)
382           << CoinMessageEol;
383 #if ABC_NORMAL_DEBUG > 3
384         printf("flag a %g %g\n", btranAlpha_, alpha_);
385 #endif
386         setFlagged(sequenceOut_);
387         // so can't happen immediately again
388         sequenceOut_ = -1;
389         //abcProgress_.clearBadTimes();
390         lastBadIteration_ = numberIterations_; // say be more cautious
391         if (fabs(alpha_) < 1.0e-10 && fabs(btranAlpha_) < 1.0e-8 && numberIterations_ > 100) {
392           //printf("I think should declare infeasible\n");
393           problemStatus_ = 1;
394           returnCode = 1;
395           stateOfIteration_ = 2;
396         } else {
397           stateOfIteration_ = 1;
398         }
399         // Make safer?
400         if (abcFactorization_->pivotTolerance() < 0.999 && stateOfIteration_ == 1) {
401           // change tolerance and re-invert
402           abcFactorization_->saferTolerances(1.0, -1.03);
403           problemStatus_ = -6; // factorize now
404           returnCode = -2;
405           stateOfIteration_ = 2;
406         }
407       }
408     }
409   }
410   if (!stateOfIteration_) {
411     // check update
412     //int updateStatus =
413     /*
414   returns
415   0 - OK
416   1 - take iteration then re-factorize
417   2 - flag something and skip
418   3 - break and re-factorize
419   5 - take iteration then re-factorize because of memory
420  */
421     int status = checkReplace();
422     if (status && !returnCode)
423       returnCode = status;
424   }
425   //clearArrays(arrayForFlipRhs_);
426   if (stateOfIteration_ && numberFlipped) {
427     //usefulArray_[arrayForTableauRow_].compact();
428     // move variables back
429     flipBack(numberFlipped);
430   }
431   // could choose average of all three
432   return returnCode;
433 }
434 #if ABC_PARALLEL == 1
435 /* Reasons to come out:
436    -1 iterations etc
437    -2 inaccuracy
438    -3 slight inaccuracy (and done iterations)
439    +0 looks optimal (might be unbounded - but we will investigate)
440    +1 looks infeasible
441    +3 max iterations
442 */
whileIteratingThread()443 int AbcSimplexDual::whileIteratingThread()
444 {
445   /* arrays
446      0 - to get tableau row and then for weights update
447      1 - tableau column
448      2 - for flip
449      3 - actual tableau row
450   */
451 #ifdef CLP_DEBUG
452   int debugIteration = -1;
453 #endif
454   // if can't trust much and long way from optimal then relax
455   //if (largestPrimalError_ > 10.0)
456   //abcFactorization_->relaxAccuracyCheck(CoinMin(1.0e2, largestPrimalError_ / 10.0));
457   //else
458   //abcFactorization_->relaxAccuracyCheck(1.0);
459   // status stays at -1 while iterating, >=0 finished, -2 to invert
460   // status -3 to go to top without an invert
461   int returnCode = -1;
462 #define DELAYED_UPDATE
463   arrayForBtran_ = 0;
464   setUsedArray(arrayForBtran_);
465   arrayForFtran_ = 1;
466   setUsedArray(arrayForFtran_);
467   arrayForFlipBounds_ = 2;
468   setUsedArray(arrayForFlipBounds_);
469   arrayForTableauRow_ = 3;
470   setUsedArray(arrayForTableauRow_);
471   arrayForDualColumn_ = 4;
472   setUsedArray(arrayForDualColumn_);
473   arrayForReplaceColumn_ = 4; //4;
474   arrayForFlipRhs_ = 5;
475   setUsedArray(arrayForFlipRhs_);
476   dualPivotRow();
477   lastPivotRow_ = pivotRow_;
478   if (pivotRow_ >= 0) {
479     // we found a pivot row
480     createDualPricingVectorThread();
481     do {
482 #if ABC_DEBUG
483       checkArrays();
484 #endif
485       /*
486 	-1 - just move dual in values pass
487 	0 - take iteration
488 	1 - don't take but continue
489 	2 - don't take and break
490       */
491       stateOfIteration_ = 0;
492       returnCode = -1;
493       // put row of tableau in usefulArray[arrayForTableauRow_]
494       /*
495 	Could
496 	a) copy [arrayForBtran_] and start off updateWeights earlier
497 	b) break transposeTimes into two and do after slack part
498 	c) also think if cleaner to go all way with update but just don't do final part
499       */
500       //upperTheta=COIN_DBL_MAX;
501       double saveAcceptable = acceptablePivot_;
502       if (largestPrimalError_ > 1.0e-5 || largestDualError_ > 1.0e-5) {
503         if (!abcFactorization_->pivots())
504           acceptablePivot_ *= 1.0e2;
505         else if (abcFactorization_->pivots() < 5)
506           acceptablePivot_ *= 1.0e1;
507       }
508       dualColumn1();
509       acceptablePivot_ = saveAcceptable;
510       if ((stateOfProblem_ & VALUES_PASS) != 0) {
511         // see if can just move dual
512         if (fabs(upperTheta_ - fabs(abcDj_[sequenceOut_])) < dualTolerance_) {
513           stateOfIteration_ = -1;
514         }
515       }
516       //usefulArray_[arrayForTableauRow_].sortPacked();
517       //usefulArray_[arrayForTableauRow_].print();
518       //usefulArray_[arrayForDualColumn_].setPackedMode(true);
519       //usefulArray_[arrayForDualColumn].sortPacked();
520       //usefulArray_[arrayForDualColumn].print();
521       if (parallelMode_ != 0) {
522         stopStart_ = 1 + 32 * 1; // Just first thread for updateweights
523         startParallelStuff(2);
524       }
525       if (!stateOfIteration_) {
526         // get sequenceIn_
527         dualPivotColumn();
528         if (sequenceIn_ >= 0) {
529           // normal iteration
530           // update the incoming column (and do weights if serial)
531           getTableauColumnFlipAndStartReplaceThread();
532           //usleep(1000);
533         } else if (stateOfIteration_ != -1) {
534           stateOfIteration_ = 2;
535         }
536       }
537       if (parallelMode_ != 0) {
538         // do sync here
539         stopParallelStuff(2);
540       }
541       if (!stateOfIteration_) {
542         //	assert (stateOfIteration_!=1);
543         int whatNext = 0;
544         whatNext = housekeeping();
545         if (whatNext == 1) {
546           problemStatus_ = -2; // refactorize
547         } else if (whatNext == 2) {
548           // maximum iterations or equivalent
549           problemStatus_ = 3;
550           returnCode = 3;
551           stateOfIteration_ = 2;
552         }
553       } else {
554         if (stateOfIteration_ < 0) {
555           // move dual in dual values pass
556           theta_ = abcDj_[sequenceOut_];
557           updateDualsInDual();
558           abcDj_[sequenceOut_] = 0.0;
559           sequenceOut_ = -1;
560         }
561         // clear all arrays
562         clearArrays(-1);
563       }
564       // at this stage sequenceIn_ may be <0
565       if (sequenceIn_ < 0 && sequenceOut_ >= 0) {
566         // no incoming column is valid
567         returnCode = noPivotColumn();
568       }
569       if (stateOfIteration_ == 2) {
570         sequenceIn_ = -1;
571         break;
572       }
573       if (problemStatus_ != -1) {
574         abcDualRowPivot_->updateWeights2(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
575         swapPrimalStuff();
576         break;
577       }
578 #if ABC_DEBUG
579       checkArrays();
580 #endif
581       if (stateOfIteration_ != -1) {
582         assert(stateOfIteration_ == 0); // if 1 why are we here
583         // can do these in parallel
584         if (parallelMode_ == 0) {
585           replaceColumnPart3();
586           updateDualsInDual();
587           abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
588             movement_);
589         } else {
590           stopStart_ = 1 + 32 * 1;
591           startParallelStuff(3);
592           if (parallelMode_ > 1) {
593             stopStart_ = 2 + 32 * 2;
594             startParallelStuff(9);
595           } else {
596             replaceColumnPart3();
597           }
598           abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
599             movement_);
600         }
601       }
602 #if ABC_DEBUG
603       checkArrays();
604 #endif
605       swapPrimalStuff();
606       // dualRow will go to virtual row pivot choice algorithm
607       // make sure values pass off if it should be
608       // use Btran array and clear inside dualPivotRow (if used)
609       int lastSequenceOut = sequenceOut_;
610       int lastDirectionOut = directionOut_;
611       // redo to test on parallelMode_
612       if (parallelMode_ > 1) {
613         stopStart_ = 2 + 32 * 2; // stop factorization update
614         //stopStart_=3+32*3; // stop all
615         stopParallelStuff(9);
616       }
617       dualPivotRow();
618       lastPivotRow_ = pivotRow_;
619       if (pivotRow_ >= 0) {
620         // we found a pivot row
621         // create as packed
622         createDualPricingVectorThread();
623         swapDualStuff(lastSequenceOut, lastDirectionOut);
624         // next pivot
625         // redo to test on parallelMode_
626         if (parallelMode_ != 0) {
627           stopStart_ = 1 + 32 * 1; // stop dual update
628           stopParallelStuff(3);
629         }
630       } else {
631         // no pivot row
632         // redo to test on parallelMode_
633         if (parallelMode_ != 0) {
634           stopStart_ = 1 + 32 * 1; // stop dual update
635           stopParallelStuff(3);
636         }
637         clearArrays(-1);
638         returnCode = noPivotRow();
639         break;
640       }
641     } while (problemStatus_ == -1);
642     usefulArray_[arrayForTableauRow_].compact();
643 #if ABC_DEBUG
644     checkArrays();
645 #endif
646   } else {
647     // no pivot row on first try
648     clearArrays(-1);
649     returnCode = noPivotRow();
650   }
651   //printStuff();
652   clearArrays(-1);
653   //if (pivotRow_==-100)
654   //returnCode=-100; // end of values pass
655   return returnCode;
656 }
657 // Create dual pricing vector
createDualPricingVectorThread()658 void AbcSimplexDual::createDualPricingVectorThread()
659 {
660 #ifndef NDEBUG
661 #if ABC_NORMAL_DEBUG > 3
662   checkArrays();
663 #endif
664 #endif
665   // we found a pivot row
666   if (handler_->detail(CLP_SIMPLEX_PIVOTROW, messages_) < 100) {
667     handler_->message(CLP_SIMPLEX_PIVOTROW, messages_)
668       << pivotRow_
669       << CoinMessageEol;
670   }
671   // check accuracy of weights
672   abcDualRowPivot_->checkAccuracy();
673   // get sign for finding row of tableau
674   // create as packed
675   usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
676   abcFactorization_->updateColumnTranspose(usefulArray_[arrayForBtran_]);
677   sequenceIn_ = -1;
678 }
getTableauColumnPart1Thread()679 void AbcSimplexDual::getTableauColumnPart1Thread()
680 {
681 #if ABC_DEBUG
682   {
683     const double *work = usefulArray_[arrayForTableauRow_].denseVector();
684     int number = usefulArray_[arrayForTableauRow_].getNumElements();
685     const int *which = usefulArray_[arrayForTableauRow_].getIndices();
686     for (int i = 0; i < number; i++) {
687       if (which[i] == sequenceIn_) {
688         assert(alpha_ == work[i]);
689         break;
690       }
691     }
692   }
693 #endif
694   //int returnCode=0;
695   // update the incoming column
696   unpack(usefulArray_[arrayForFtran_]);
697   // Array[0] may be needed until updateWeights2
698   // and update dual weights (can do in parallel - with extra array)
699   if (parallelMode_ != 0) {
700     abcFactorization_->updateColumnFTPart1(usefulArray_[arrayForFtran_]);
701     // pivot element
702     //alpha_ = usefulArray_[arrayForFtran_].denseVector()[pivotRow_];
703   } else {
704     alpha_ = abcDualRowPivot_->updateWeights1(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
705   }
706 }
getTableauColumnFlipAndStartReplaceThread()707 int AbcSimplexDual::getTableauColumnFlipAndStartReplaceThread()
708 {
709   // move checking stuff down into called functions
710   // threads 2 and 3 are available
711   int numberFlipped;
712 
713   if (parallelMode_ == 0) {
714     numberFlipped = flipBounds();
715     // update the incoming column
716     getTableauColumnPart1Thread();
717     checkReplacePart1();
718     //checkReplacePart1a();
719     //checkReplacePart1b();
720     getTableauColumnPart2();
721   } else {
722     // save stopStart
723     int saveStopStart = stopStart_;
724     if (parallelMode_ > 1) {
725       // we can flip immediately
726       stopStart_ = 2 + 32 * 2; // just thread 1
727       startParallelStuff(8);
728       // update the incoming column
729       getTableauColumnPart1Thread();
730       stopStart_ = 4 + 32 * 4; // just thread 2
731       startParallelStuff(7);
732       getTableauColumnPart2();
733       stopStart_ = 6 + 32 * 6; // just threads 1 and 2
734       stopParallelStuff(8);
735       //ftAlpha_=threadInfo_[2].result;
736     } else {
737       // just one extra thread - do replace
738       // update the incoming column
739       getTableauColumnPart1Thread();
740       stopStart_ = 2 + 32 * 2; // just thread 1
741       startParallelStuff(7);
742       getTableauColumnPart2();
743       numberFlipped = flipBounds();
744       stopParallelStuff(8);
745       //ftAlpha_=threadInfo_[1].result;
746     }
747     stopStart_ = saveStopStart;
748   }
749   //usefulArray_[arrayForTableauRow_].compact();
750   // returns 3 if skip this iteration and re-factorize
751   /*
752   return code
753   0 - OK
754   2 - flag something and skip
755   3 - break and re-factorize
756   4 - break and say infeasible
757  */
758   int returnCode = 0;
759   // amount primal will move
760   movement_ = -dualOut_ * directionOut_ / alpha_;
761   // see if update stable
762 #if ABC_NORMAL_DEBUG > 3
763   if ((handler_->logLevel() & 32)) {
764     double ft = ftAlpha_ * abcFactorization_->pivotRegion()[pivotRow_];
765     double diff1 = fabs(alpha_ - btranAlpha_);
766     double diff2 = fabs(fabs(alpha_) - fabs(ft));
767     double diff3 = fabs(fabs(ft) - fabs(btranAlpha_));
768     double largest = CoinMax(CoinMax(diff1, diff2), diff3);
769     printf("btran alpha %g, ftran alpha %g ftAlpha %g largest diff %g\n",
770       btranAlpha_, alpha_, ft, largest);
771     if (largest > 0.001 * fabs(alpha_)) {
772       printf("bad\n");
773     }
774   }
775 #endif
776   int numberPivots = abcFactorization_->pivots();
777   //double checkValue = 1.0e-7; // numberPivots<5 ? 1.0e-7 : 1.0e-6;
778   double checkValue = numberPivots ? 1.0e-7 : 1.0e-5;
779   // if can't trust much and long way from optimal then relax
780   if (largestPrimalError_ > 10.0)
781     checkValue = CoinMin(1.0e-4, 1.0e-8 * largestPrimalError_);
782   if (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > checkValue * (1.0 + fabs(alpha_))) {
783     handler_->message(CLP_DUAL_CHECK, messages_)
784       << btranAlpha_
785       << alpha_
786       << CoinMessageEol;
787     // see with more relaxed criterion
788     double test;
789     if (fabs(btranAlpha_) < 1.0e-8 || fabs(alpha_) < 1.0e-8)
790       test = 1.0e-1 * fabs(alpha_);
791     else
792       test = 10.0 * checkValue; //1.0e-4 * (1.0 + fabs(alpha_));
793     bool needFlag = (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > test);
794     double derror = CoinMin(fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), 1.0) * 0.9999e7;
795     int error = 0;
796     while (derror > 1.0) {
797       error++;
798       derror *= 0.1;
799     }
800     int frequency[8] = { 99999999, 100, 10, 2, 1, 1, 1, 1 };
801     int newFactorFrequency = CoinMin(forceFactorization_, frequency[error]);
802 #if ABC_NORMAL_DEBUG > 0
803     if (newFactorFrequency < forceFactorization_)
804       printf("Error of %g after %d pivots old forceFact %d now %d\n",
805         fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), numberPivots,
806         forceFactorization_, newFactorFrequency);
807 #endif
808     if (!numberPivots && fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)) > 0.5e-6
809       && abcFactorization_->pivotTolerance() < 0.5)
810       abcFactorization_->saferTolerances(1.0, -1.03);
811     forceFactorization_ = newFactorFrequency;
812 
813 #if ABC_NORMAL_DEBUG > 0
814     if (fabs(btranAlpha_ + alpha_) < 1.0e-5 * (1.0 + fabs(alpha_))) {
815       printf("diff (%g,%g) %g check %g\n", btranAlpha_, alpha_, fabs(btranAlpha_ - alpha_), checkValue * (1.0 + fabs(alpha_)));
816     }
817 #endif
818     if (numberPivots) {
819       if (needFlag && numberPivots < 10) {
820         // need to reject something
821         assert(sequenceOut_ >= 0);
822         char x = isColumn(sequenceOut_) ? 'C' : 'R';
823         handler_->message(CLP_SIMPLEX_FLAG, messages_)
824           << x << sequenceWithin(sequenceOut_)
825           << CoinMessageEol;
826 #if ABC_NORMAL_DEBUG > 0
827         printf("flag %d as alpha  %g %g - after %d pivots\n", sequenceOut_,
828           btranAlpha_, alpha_, numberPivots);
829 #endif
830         // Make safer?
831         abcFactorization_->saferTolerances(1.0, -1.03);
832         setFlagged(sequenceOut_);
833         // so can't happen immediately again
834         sequenceOut_ = -1;
835         lastBadIteration_ = numberIterations_; // say be more cautious
836       }
837       // Make safer?
838       //if (numberPivots<5)
839       //abcFactorization_->saferTolerances (-0.99, -1.01);
840       problemStatus_ = -2; // factorize now
841       returnCode = -2;
842       stateOfIteration_ = 2;
843     } else {
844       if (needFlag) {
845         assert(sequenceOut_ >= 0);
846         // need to reject something
847         char x = isColumn(sequenceOut_) ? 'C' : 'R';
848         handler_->message(CLP_SIMPLEX_FLAG, messages_)
849           << x << sequenceWithin(sequenceOut_)
850           << CoinMessageEol;
851 #if ABC_NORMAL_DEBUG > 3
852         printf("flag a %g %g\n", btranAlpha_, alpha_);
853 #endif
854         setFlagged(sequenceOut_);
855         // so can't happen immediately again
856         sequenceOut_ = -1;
857         //abcProgress_.clearBadTimes();
858         lastBadIteration_ = numberIterations_; // say be more cautious
859         if (fabs(alpha_) < 1.0e-10 && fabs(btranAlpha_) < 1.0e-8 && numberIterations_ > 100) {
860           //printf("I think should declare infeasible\n");
861           problemStatus_ = 1;
862           returnCode = 1;
863           stateOfIteration_ = 2;
864         } else {
865           stateOfIteration_ = 1;
866         }
867         // Make safer?
868         if (abcFactorization_->pivotTolerance() < 0.999 && stateOfIteration_ == 1) {
869           // change tolerance and re-invert
870           abcFactorization_->saferTolerances(1.0, -1.03);
871           problemStatus_ = -6; // factorize now
872           returnCode = -2;
873           stateOfIteration_ = 2;
874         }
875       }
876     }
877   }
878   if (!stateOfIteration_) {
879     // check update
880     //int updateStatus =
881     /*
882   returns
883   0 - OK
884   1 - take iteration then re-factorize
885   2 - flag something and skip
886   3 - break and re-factorize
887   5 - take iteration then re-factorize because of memory
888  */
889     int status = checkReplace();
890     if (status && !returnCode)
891       returnCode = status;
892   }
893   //clearArrays(arrayForFlipRhs_);
894   if (stateOfIteration_ && numberFlipped) {
895     //usefulArray_[arrayForTableauRow_].compact();
896     // move variables back
897     flipBack(numberFlipped);
898   }
899   // could choose average of all three
900   return returnCode;
901 }
902 #endif
903 #if ABC_PARALLEL == 2
904 #if ABC_NORMAL_DEBUG > 0
905 // for conflicts
906 int cilk_conflict = 0;
907 #endif
908 #ifdef EARLY_FACTORIZE
doEarlyFactorization(AbcSimplexDual * dual)909 static int doEarlyFactorization(AbcSimplexDual *dual)
910 {
911   int returnCode = cilk_spawn dual->whileIteratingParallel(123456789);
912   CoinIndexedVector &vector = *dual->usefulArray(ABC_NUMBER_USEFUL - 1);
913   int status = dual->earlyFactorization()->factorize(dual, vector);
914 #if 0
915   // Is this safe
916   if (!status&&true) {
917     // do pivots if there are any
918     int capacity = vector.capacity();
919     capacity--;
920     int numberPivotsStored = vector.getIndices()[capacity];
921     status =
922       dual->earlyFactorization()->replaceColumns(dual,vector,
923 						    0,numberPivotsStored,false);
924   }
925 #endif
926   if (status) {
927     printf("bad early factorization in doEarly - switch off\n");
928     vector.setNumElements(-1);
929   }
930   cilk_sync;
931   return returnCode;
932 }
933 #endif
934 #define MOVE_UPDATE_WEIGHTS
935 /* Reasons to come out:
936    -1 iterations etc
937    -2 inaccuracy
938    -3 slight inaccuracy (and done iterations)
939    +0 looks optimal (might be unbounded - but we will investigate)
940    +1 looks infeasible
941    +3 max iterations
942 */
whileIteratingCilk()943 int AbcSimplexDual::whileIteratingCilk()
944 {
945   /* arrays
946      0 - to get tableau row and then for weights update
947      1 - tableau column
948      2 - for flip
949      3 - actual tableau row
950   */
951 #ifdef CLP_DEBUG
952   int debugIteration = -1;
953 #endif
954   // if can't trust much and long way from optimal then relax
955   //if (largestPrimalError_ > 10.0)
956   //abcFactorization_->relaxAccuracyCheck(CoinMin(1.0e2, largestPrimalError_ / 10.0));
957   //else
958   //abcFactorization_->relaxAccuracyCheck(1.0);
959   // status stays at -1 while iterating, >=0 finished, -2 to invert
960   // status -3 to go to top without an invert
961   int returnCode = -1;
962 #define DELAYED_UPDATE
963   arrayForBtran_ = 0;
964   setUsedArray(arrayForBtran_);
965   arrayForFtran_ = 1;
966   setUsedArray(arrayForFtran_);
967   arrayForFlipBounds_ = 2;
968   setUsedArray(arrayForFlipBounds_);
969   arrayForTableauRow_ = 3;
970   setUsedArray(arrayForTableauRow_);
971   arrayForDualColumn_ = 4;
972   setUsedArray(arrayForDualColumn_);
973   arrayForReplaceColumn_ = 6; //4;
974   setUsedArray(arrayForReplaceColumn_);
975 #ifndef MOVE_UPDATE_WEIGHTS
976   arrayForFlipRhs_ = 5;
977   setUsedArray(arrayForFlipRhs_);
978 #else
979   arrayForFlipRhs_ = 0;
980   // use for weights
981   setUsedArray(5);
982 #endif
983   dualPivotRow();
984   lastPivotRow_ = pivotRow_;
985   if (pivotRow_ >= 0) {
986     // we found a pivot row
987     createDualPricingVectorCilk();
988     // ABC_PARALLEL 2
989 #if 0
990     {
991       for (int i=0;i<numberRows_;i++) {
992 	int iSequence=abcPivotVariable_[i];
993 	if (lowerBasic_[i]!=lowerSaved_[iSequence]||
994 	    upperBasic_[i]!=upperSaved_[iSequence])
995 	  printf("basic l %g,%g,%g u %g,%g\n",
996 		 abcLower_[iSequence],lowerSaved_[iSequence],lowerBasic_[i],
997 		 abcUpper_[iSequence],upperSaved_[iSequence],upperBasic_[i]);
998       }
999     }
1000 #endif
1001 #if 0
1002     for (int iSequence = 0; iSequence < numberTotal_; iSequence++) {
1003       int bad=-1;
1004       if (getFakeBound(iSequence)==noFake) {
1005 	if (abcLower_[iSequence]!=lowerSaved_[iSequence]||
1006 	    abcUpper_[iSequence]!=upperSaved_[iSequence])
1007 	  bad=0;
1008       } else if (getFakeBound(iSequence)==lowerFake) {
1009 	if (abcLower_[iSequence]==lowerSaved_[iSequence]||
1010 	    abcUpper_[iSequence]!=upperSaved_[iSequence])
1011 	  bad=1;
1012       } else if (getFakeBound(iSequence)==upperFake) {
1013 	if (abcLower_[iSequence]!=lowerSaved_[iSequence]||
1014 	    abcUpper_[iSequence]==upperSaved_[iSequence])
1015 	  bad=2;
1016       } else {
1017 	if (abcLower_[iSequence]==lowerSaved_[iSequence]||
1018 	    abcUpper_[iSequence]==upperSaved_[iSequence])
1019 	  bad=3;
1020       }
1021       if (bad>=0)
1022 	printf("%d status %d l %g,%g u %g,%g\n",iSequence,internalStatus_[iSequence],
1023 	       abcLower_[iSequence],lowerSaved_[iSequence],
1024 	       abcUpper_[iSequence],upperSaved_[iSequence]);
1025     }
1026 #endif
1027     int numberPivots = abcFactorization_->maximumPivots();
1028 #ifdef EARLY_FACTORIZE
1029     int numberEarly = 0;
1030     if (numberPivots > 20 && (numberEarly_ & 0xffff) > 5) {
1031       numberEarly = numberEarly_ & 0xffff;
1032       numberPivots = CoinMax(numberPivots - numberEarly - abcFactorization_->pivots(), numberPivots / 2);
1033     }
1034     returnCode = whileIteratingParallel(numberPivots);
1035     if (returnCode == -99) {
1036       if (numberEarly) {
1037         if (!abcEarlyFactorization_)
1038           abcEarlyFactorization_ = new AbcSimplexFactorization(*abcFactorization_);
1039         // create pivot list
1040         CoinIndexedVector &vector = usefulArray_[ABC_NUMBER_USEFUL - 1];
1041         vector.checkClear();
1042         int *indices = vector.getIndices();
1043         int capacity = vector.capacity();
1044         int numberNeeded = numberRows_ + 2 * numberEarly + 1;
1045         if (capacity < numberNeeded) {
1046           vector.reserve(numberNeeded);
1047           capacity = vector.capacity();
1048         }
1049         int numberBasic = 0;
1050         for (int i = 0; i < numberRows_; i++) {
1051           int iSequence = abcPivotVariable_[i];
1052           if (iSequence < numberRows_)
1053             indices[numberBasic++] = iSequence;
1054         }
1055         vector.setNumElements(numberRows_ - numberBasic);
1056         for (int i = 0; i < numberRows_; i++) {
1057           int iSequence = abcPivotVariable_[i];
1058           if (iSequence >= numberRows_)
1059             indices[numberBasic++] = iSequence;
1060         }
1061         assert(numberBasic == numberRows_);
1062         indices[capacity - 1] = 0;
1063         // could set iterations to -1 for safety
1064 #if 0
1065 	cilk_spawn doEarlyFactorization(this);
1066 	returnCode=whileIteratingParallel(123456789);
1067 	cilk_sync;
1068 #else
1069         returnCode = doEarlyFactorization(this);
1070 #endif
1071       } else {
1072         returnCode = whileIteratingParallel(123456789);
1073       }
1074     }
1075 #else
1076     returnCode = whileIteratingParallel(numberPivots);
1077 #endif
1078     usefulArray_[arrayForTableauRow_].compact();
1079 #if ABC_DEBUG
1080     checkArrays();
1081 #endif
1082   } else {
1083     // no pivot row on first try
1084     clearArrays(-1);
1085     returnCode = noPivotRow();
1086   }
1087   //printStuff();
1088   clearArrays(-1);
1089   //if (pivotRow_==-100)
1090   //returnCode=-100; // end of values pass
1091   return returnCode;
1092 }
1093 // Create dual pricing vector
createDualPricingVectorCilk()1094 void AbcSimplexDual::createDualPricingVectorCilk()
1095 {
1096 #if CILK_CONFLICT > 0
1097   if (cilk_conflict & 15 != 0) {
1098     printf("cilk_conflict %d!\n", cilk_conflict);
1099     cilk_conflict = 0;
1100   }
1101 #endif
1102 #ifndef NDEBUG
1103 #if ABC_NORMAL_DEBUG > 3
1104   checkArrays();
1105 #endif
1106 #endif
1107   // we found a pivot row
1108   if (handler_->detail(CLP_SIMPLEX_PIVOTROW, messages_) < 100) {
1109     handler_->message(CLP_SIMPLEX_PIVOTROW, messages_)
1110       << pivotRow_
1111       << CoinMessageEol;
1112   }
1113   // check accuracy of weights
1114   abcDualRowPivot_->checkAccuracy();
1115   // get sign for finding row of tableau
1116   // create as packed
1117 #if MOVE_REPLACE_PART1A > 0
1118   if (!abcFactorization_->usingFT()) {
1119 #endif
1120     usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
1121     abcFactorization_->updateColumnTranspose(usefulArray_[arrayForBtran_]);
1122 #if MOVE_REPLACE_PART1A > 0
1123   } else {
1124     cilk_spawn abcFactorization_->checkReplacePart1a(&usefulArray_[arrayForReplaceColumn_], pivotRow_);
1125     usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
1126     abcFactorization_->updateColumnTransposeCpu(usefulArray_[arrayForBtran_], 1);
1127     cilk_sync;
1128   }
1129 #endif
1130   sequenceIn_ = -1;
1131 }
getTableauColumnPart1Cilk()1132 void AbcSimplexDual::getTableauColumnPart1Cilk()
1133 {
1134 #if ABC_DEBUG
1135   {
1136     const double *work = usefulArray_[arrayForTableauRow_].denseVector();
1137     int number = usefulArray_[arrayForTableauRow_].getNumElements();
1138     const int *which = usefulArray_[arrayForTableauRow_].getIndices();
1139     for (int i = 0; i < number; i++) {
1140       if (which[i] == sequenceIn_) {
1141         assert(alpha_ == work[i]);
1142         break;
1143       }
1144     }
1145   }
1146 #endif
1147   //int returnCode=0;
1148   // update the incoming column
1149   unpack(usefulArray_[arrayForFtran_]);
1150   // Array[0] may be needed until updateWeights2
1151   // and update dual weights (can do in parallel - with extra array)
1152   abcFactorization_->updateColumnFTPart1(usefulArray_[arrayForFtran_]);
1153 }
getTableauColumnFlipAndStartReplaceCilk()1154 int AbcSimplexDual::getTableauColumnFlipAndStartReplaceCilk()
1155 {
1156   // move checking stuff down into called functions
1157   // threads 2 and 3 are available
1158   int numberFlipped;
1159   //cilk
1160   getTableauColumnPart1Cilk();
1161 #if MOVE_REPLACE_PART1A <= 0
1162   cilk_spawn getTableauColumnPart2();
1163 #if MOVE_REPLACE_PART1A == 0
1164   cilk_spawn checkReplacePart1();
1165 #endif
1166   numberFlipped = flipBounds();
1167   cilk_sync;
1168 #else
1169   if (abcFactorization_->usingFT()) {
1170     cilk_spawn getTableauColumnPart2();
1171     ftAlpha_ = cilk_spawn abcFactorization_->checkReplacePart1b(&usefulArray_[arrayForReplaceColumn_], pivotRow_);
1172     numberFlipped = flipBounds();
1173     cilk_sync;
1174   } else {
1175     cilk_spawn getTableauColumnPart2();
1176     numberFlipped = flipBounds();
1177     cilk_sync;
1178   }
1179 #endif
1180   //usefulArray_[arrayForTableauRow_].compact();
1181   // returns 3 if skip this iteration and re-factorize
1182   /*
1183     return code
1184     0 - OK
1185     2 - flag something and skip
1186     3 - break and re-factorize
1187     4 - break and say infeasible
1188  */
1189   int returnCode = 0;
1190   // amount primal will move
1191   movement_ = -dualOut_ * directionOut_ / alpha_;
1192   // see if update stable
1193 #if ABC_NORMAL_DEBUG > 3
1194   if ((handler_->logLevel() & 32)) {
1195     double ft = ftAlpha_ * abcFactorization_->pivotRegion()[pivotRow_];
1196     double diff1 = fabs(alpha_ - btranAlpha_);
1197     double diff2 = fabs(fabs(alpha_) - fabs(ft));
1198     double diff3 = fabs(fabs(ft) - fabs(btranAlpha_));
1199     double largest = CoinMax(CoinMax(diff1, diff2), diff3);
1200     printf("btran alpha %g, ftran alpha %g ftAlpha %g largest diff %g\n",
1201       btranAlpha_, alpha_, ft, largest);
1202     if (largest > 0.001 * fabs(alpha_)) {
1203       printf("bad\n");
1204     }
1205   }
1206 #endif
1207   int numberPivots = abcFactorization_->pivots();
1208   //double checkValue = 1.0e-7; // numberPivots<5 ? 1.0e-7 : 1.0e-6;
1209   double checkValue = numberPivots ? 1.0e-7 : 1.0e-5;
1210   // if can't trust much and long way from optimal then relax
1211   if (largestPrimalError_ > 10.0)
1212     checkValue = CoinMin(1.0e-4, 1.0e-8 * largestPrimalError_);
1213   if (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > checkValue * (1.0 + fabs(alpha_))) {
1214     handler_->message(CLP_DUAL_CHECK, messages_)
1215       << btranAlpha_
1216       << alpha_
1217       << CoinMessageEol;
1218     // see with more relaxed criterion
1219     double test;
1220     if (fabs(btranAlpha_) < 1.0e-8 || fabs(alpha_) < 1.0e-8)
1221       test = 1.0e-1 * fabs(alpha_);
1222     else
1223       test = CoinMin(10.0 * checkValue, 1.0e-4 * (1.0 + fabs(alpha_)));
1224     bool needFlag = (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > test);
1225     double derror = CoinMin(fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), 1.0) * 0.9999e7;
1226     int error = 0;
1227     while (derror > 1.0) {
1228       error++;
1229       derror *= 0.1;
1230     }
1231     int frequency[8] = { 99999999, 100, 10, 2, 1, 1, 1, 1 };
1232     int newFactorFrequency = CoinMin(forceFactorization_, frequency[error]);
1233 #if ABC_NORMAL_DEBUG > 0
1234     if (newFactorFrequency < forceFactorization_)
1235       printf("Error of %g after %d pivots old forceFact %d now %d\n",
1236         fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), numberPivots,
1237         forceFactorization_, newFactorFrequency);
1238 #endif
1239     if (!numberPivots && fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)) > 0.5e-6
1240       && abcFactorization_->pivotTolerance() < 0.5)
1241       abcFactorization_->saferTolerances(1.0, -1.03);
1242     forceFactorization_ = newFactorFrequency;
1243 
1244 #if ABC_NORMAL_DEBUG > 0
1245     if (fabs(btranAlpha_ + alpha_) < 1.0e-5 * (1.0 + fabs(alpha_))) {
1246       printf("diff (%g,%g) %g check %g\n", btranAlpha_, alpha_, fabs(btranAlpha_ - alpha_), checkValue * (1.0 + fabs(alpha_)));
1247     }
1248 #endif
1249     if (numberPivots) {
1250       if (needFlag && numberPivots < 10) {
1251         // need to reject something
1252         assert(sequenceOut_ >= 0);
1253         char x = isColumn(sequenceOut_) ? 'C' : 'R';
1254         handler_->message(CLP_SIMPLEX_FLAG, messages_)
1255           << x << sequenceWithin(sequenceOut_)
1256           << CoinMessageEol;
1257 #if ABC_NORMAL_DEBUG > 0
1258         printf("flag %d as alpha  %g %g - after %d pivots\n", sequenceOut_,
1259           btranAlpha_, alpha_, numberPivots);
1260 #endif
1261         // Make safer?
1262         abcFactorization_->saferTolerances(1.0, -1.03);
1263         setFlagged(sequenceOut_);
1264         // so can't happen immediately again
1265         sequenceOut_ = -1;
1266         lastBadIteration_ = numberIterations_; // say be more cautious
1267       }
1268       // Make safer?
1269       //if (numberPivots<5)
1270       //abcFactorization_->saferTolerances (-0.99, -1.01);
1271       problemStatus_ = -2; // factorize now
1272       returnCode = -2;
1273       stateOfIteration_ = 2;
1274     } else {
1275       if (needFlag) {
1276         assert(sequenceOut_ >= 0);
1277         // need to reject something
1278         char x = isColumn(sequenceOut_) ? 'C' : 'R';
1279         handler_->message(CLP_SIMPLEX_FLAG, messages_)
1280           << x << sequenceWithin(sequenceOut_)
1281           << CoinMessageEol;
1282 #if ABC_NORMAL_DEBUG > 3
1283         printf("flag a %g %g\n", btranAlpha_, alpha_);
1284 #endif
1285         setFlagged(sequenceOut_);
1286         // so can't happen immediately again
1287         sequenceOut_ = -1;
1288         //abcProgress_.clearBadTimes();
1289         lastBadIteration_ = numberIterations_; // say be more cautious
1290         if (fabs(alpha_) < 1.0e-10 && fabs(btranAlpha_) < 1.0e-8 && numberIterations_ > 100) {
1291           //printf("I think should declare infeasible\n");
1292           problemStatus_ = 1;
1293           returnCode = 1;
1294           stateOfIteration_ = 2;
1295         } else {
1296           stateOfIteration_ = 1;
1297         }
1298         // Make safer?
1299         if (abcFactorization_->pivotTolerance() < 0.999 && stateOfIteration_ == 1) {
1300           // change tolerance and re-invert
1301           abcFactorization_->saferTolerances(1.0, -1.03);
1302           problemStatus_ = -6; // factorize now
1303           returnCode = -2;
1304           stateOfIteration_ = 2;
1305         }
1306       }
1307     }
1308   }
1309   if (!stateOfIteration_) {
1310     // check update
1311     //int updateStatus =
1312     /*
1313   returns
1314   0 - OK
1315   1 - take iteration then re-factorize
1316   2 - flag something and skip
1317   3 - break and re-factorize
1318   5 - take iteration then re-factorize because of memory
1319  */
1320     int status = checkReplace();
1321     if (status && !returnCode)
1322       returnCode = status;
1323   }
1324   //clearArrays(arrayForFlipRhs_);
1325   if (stateOfIteration_ && numberFlipped) {
1326     //usefulArray_[arrayForTableauRow_].compact();
1327     // move variables back
1328     flipBack(numberFlipped);
1329   }
1330   // could choose average of all three
1331   return returnCode;
1332 }
whileIteratingParallel(int numberIterations)1333 int AbcSimplexDual::whileIteratingParallel(int numberIterations)
1334 {
1335   int returnCode = -1;
1336 #ifdef EARLY_FACTORIZE
1337   int savePivot = -1;
1338   CoinIndexedVector &early = usefulArray_[ABC_NUMBER_USEFUL - 1];
1339   int *pivotIndices = early.getIndices();
1340   double *pivotValues = early.denseVector();
1341   int pivotCountPosition = early.capacity() - 1;
1342   int numberSaved = 0;
1343   if (numberIterations == 123456789)
1344     savePivot = pivotCountPosition;
1345   ;
1346 #endif
1347   numberIterations += numberIterations_;
1348   do {
1349 #if ABC_DEBUG
1350     checkArrays();
1351 #endif
1352     /*
1353       -1 - just move dual in values pass
1354       0 - take iteration
1355       1 - don't take but continue
1356       2 - don't take and break
1357     */
1358     stateOfIteration_ = 0;
1359     returnCode = -1;
1360     // put row of tableau in usefulArray[arrayForTableauRow_]
1361     /*
1362       Could
1363       a) copy [arrayForBtran_] and start off updateWeights earlier
1364       b) break transposeTimes into two and do after slack part
1365       c) also think if cleaner to go all way with update but just don't do final part
1366     */
1367     //upperTheta=COIN_DBL_MAX;
1368     double saveAcceptable = acceptablePivot_;
1369     if (largestPrimalError_ > 1.0e-5 || largestDualError_ > 1.0e-5) {
1370       //if ((largestPrimalError_>1.0e-5||largestDualError_>1.0e-5)&&false) {
1371       if (!abcFactorization_->pivots())
1372         acceptablePivot_ *= 1.0e2;
1373       else if (abcFactorization_->pivots() < 5)
1374         acceptablePivot_ *= 1.0e1;
1375     }
1376 #ifdef MOVE_UPDATE_WEIGHTS
1377     // copy btran across
1378     usefulArray_[5].copy(usefulArray_[arrayForBtran_]);
1379     cilk_spawn abcDualRowPivot_->updateWeightsOnly(usefulArray_[5]);
1380     ;
1381 #endif
1382     dualColumn1();
1383     acceptablePivot_ = saveAcceptable;
1384     if ((stateOfProblem_ & VALUES_PASS) != 0) {
1385       // see if can just move dual
1386       if (fabs(upperTheta_ - fabs(abcDj_[sequenceOut_])) < dualTolerance_) {
1387         stateOfIteration_ = -1;
1388       }
1389     }
1390     if (!stateOfIteration_) {
1391 #ifndef MOVE_UPDATE_WEIGHTS
1392       cilk_spawn abcDualRowPivot_->updateWeightsOnly(usefulArray_[arrayForBtran_]);
1393       ;
1394 #endif
1395       // get sequenceIn_
1396       dualPivotColumn();
1397       //stateOfIteration_=0;
1398       if (sequenceIn_ >= 0) {
1399         // normal iteration
1400         // update the incoming column
1401         //arrayForReplaceColumn_=getAvailableArray();
1402         getTableauColumnFlipAndStartReplaceCilk();
1403         //usleep(1000);
1404       } else if (stateOfIteration_ != -1) {
1405         stateOfIteration_ = 2;
1406       }
1407     }
1408     cilk_sync;
1409     // Check event
1410     {
1411       int status = eventHandler_->event(ClpEventHandler::endOfIteration);
1412       if (status >= 0) {
1413         problemStatus_ = 5;
1414         secondaryStatus_ = ClpEventHandler::endOfIteration;
1415         returnCode = 4;
1416         stateOfIteration_ = 2;
1417       }
1418     }
1419     if (!stateOfIteration_) {
1420       //	assert (stateOfIteration_!=1);
1421       int whatNext = 0;
1422       whatNext = housekeeping();
1423 #ifdef EARLY_FACTORIZE
1424       if (savePivot >= 0) {
1425         // save pivot
1426         pivotIndices[--savePivot] = sequenceOut_;
1427         pivotValues[savePivot] = alpha_;
1428         pivotIndices[--savePivot] = sequenceIn_;
1429         pivotValues[savePivot] = btranAlpha_;
1430         numberSaved++;
1431       }
1432 #endif
1433       if (whatNext == 1) {
1434         problemStatus_ = -2; // refactorize
1435         usefulArray_[arrayForTableauRow_].compact();
1436       } else if (whatNext == 2) {
1437         // maximum iterations or equivalent
1438         problemStatus_ = 3;
1439         returnCode = 3;
1440         stateOfIteration_ = 2;
1441       }
1442 #ifdef EARLY_FACTORIZE
1443       if (savePivot >= 0) {
1444         // tell worker can update this
1445         pivotIndices[pivotCountPosition] = numberSaved;
1446       }
1447 #endif
1448     } else {
1449       usefulArray_[arrayForTableauRow_].compact();
1450       if (stateOfIteration_ < 0) {
1451         // move dual in dual values pass
1452         theta_ = abcDj_[sequenceOut_];
1453         updateDualsInDual();
1454         abcDj_[sequenceOut_] = 0.0;
1455         sequenceOut_ = -1;
1456       }
1457       // clear all arrays
1458       clearArrays(-1);
1459     }
1460     // at this stage sequenceIn_ may be <0
1461     if (sequenceIn_ < 0 && sequenceOut_ >= 0) {
1462       usefulArray_[arrayForTableauRow_].compact();
1463       // no incoming column is valid
1464       returnCode = noPivotColumn();
1465     }
1466     if (stateOfIteration_ == 2) {
1467       usefulArray_[arrayForTableauRow_].compact();
1468       sequenceIn_ = -1;
1469 #ifdef ABC_LONG_FACTORIZATION
1470       abcFactorization_->clearHiddenArrays();
1471 #endif
1472       break;
1473     }
1474     if (problemStatus_ != -1) {
1475 #ifndef MOVE_UPDATE_WEIGHTS
1476       abcDualRowPivot_->updateWeights2(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
1477 #else
1478       abcDualRowPivot_->updateWeights2(usefulArray_[5], usefulArray_[arrayForFtran_]);
1479 #endif
1480 #ifdef ABC_LONG_FACTORIZATION
1481       abcFactorization_->clearHiddenArrays();
1482 #endif
1483       swapPrimalStuff();
1484       break;
1485     }
1486     if (stateOfIteration_ == 1) {
1487       // clear all arrays
1488       clearArrays(-2);
1489 #ifdef ABC_LONG_FACTORIZATION
1490       abcFactorization_->clearHiddenArrays();
1491 #endif
1492     }
1493     if (stateOfIteration_ == 0) {
1494       // can do these in parallel
1495       // No idea why I need this - but otherwise runs not repeatable (try again??)
1496       //usefulArray_[3].compact();
1497       cilk_spawn updateDualsInDual();
1498       int lastSequenceOut;
1499       int lastDirectionOut;
1500       if (firstFree_ < 0) {
1501         // can do in parallel
1502         cilk_spawn replaceColumnPart3();
1503         updatePrimalSolution();
1504         swapPrimalStuff();
1505         // dualRow will go to virtual row pivot choice algorithm
1506         // make sure values pass off if it should be
1507         // use Btran array and clear inside dualPivotRow (if used)
1508         lastSequenceOut = sequenceOut_;
1509         lastDirectionOut = directionOut_;
1510         dualPivotRow();
1511         cilk_sync;
1512       } else {
1513         // be more careful as dualPivotRow may do update
1514         cilk_spawn replaceColumnPart3();
1515         updatePrimalSolution();
1516         swapPrimalStuff();
1517         // dualRow will go to virtual row pivot choice algorithm
1518         // make sure values pass off if it should be
1519         // use Btran array and clear inside dualPivotRow (if used)
1520         lastSequenceOut = sequenceOut_;
1521         lastDirectionOut = directionOut_;
1522         cilk_sync;
1523         dualPivotRow();
1524       }
1525       lastPivotRow_ = pivotRow_;
1526       if (pivotRow_ >= 0) {
1527         // we found a pivot row
1528         // create as packed
1529         // dual->checkReplacePart1a();
1530         createDualPricingVectorCilk();
1531         swapDualStuff(lastSequenceOut, lastDirectionOut);
1532       }
1533       cilk_sync;
1534     } else {
1535       // after moving dual in values pass
1536       dualPivotRow();
1537       lastPivotRow_ = pivotRow_;
1538       if (pivotRow_ >= 0) {
1539         // we found a pivot row
1540         // create as packed
1541         createDualPricingVectorCilk();
1542       }
1543     }
1544     if (pivotRow_ < 0) {
1545       // no pivot row
1546       clearArrays(-1);
1547       returnCode = noPivotRow();
1548       break;
1549     }
1550     if (numberIterations == numberIterations_ && problemStatus_ == -1) {
1551       returnCode = -99;
1552       break;
1553     }
1554   } while (problemStatus_ == -1);
1555   return returnCode;
1556 }
1557 #if 0
1558 void
1559 AbcSimplexDual::whileIterating2()
1560 {
1561   // get sequenceIn_
1562   dualPivotColumn();
1563   //stateOfIteration_=0;
1564   if (sequenceIn_ >= 0) {
1565     // normal iteration
1566     // update the incoming column
1567     //arrayForReplaceColumn_=getAvailableArray();
1568     getTableauColumnFlipAndStartReplaceCilk();
1569     //usleep(1000);
1570   } else if (stateOfIteration_!=-1) {
1571     stateOfIteration_=2;
1572   }
1573 }
1574 #endif
1575 #endif
updatePrimalSolution()1576 void AbcSimplexDual::updatePrimalSolution()
1577 {
1578   // finish doing weights
1579 #ifndef MOVE_UPDATE_WEIGHTS
1580   abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
1581     movement_);
1582 #else
1583   abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[5], usefulArray_[arrayForFtran_],
1584     movement_);
1585 #endif
1586 }
1587 int xxInfo[6][8];
parallelDual4(AbcSimplexDual * dual)1588 double parallelDual4(AbcSimplexDual *dual)
1589 {
1590   int maximumRows = dual->maximumAbcNumberRows();
1591   //int numberTotal=dual->numberTotal();
1592   CoinIndexedVector &update = *dual->usefulArray(dual->arrayForBtran());
1593   CoinPartitionedVector &tableauRow = *dual->usefulArray(dual->arrayForTableauRow());
1594   CoinPartitionedVector &candidateList = *dual->usefulArray(dual->arrayForDualColumn());
1595   AbcMatrix *matrix = dual->abcMatrix();
1596   // do slacks first (move before sync)
1597   double upperTheta = dual->upperTheta();
1598   //assert (upperTheta>-100*dual->dualTolerance()||dual->sequenceIn()>=0||dual->lastFirstFree()>=0);
1599 #if ABC_PARALLEL
1600 #define NUMBER_BLOCKS NUMBER_ROW_BLOCKS
1601   int numberBlocks = CoinMin(NUMBER_BLOCKS, dual->numberCpus());
1602 #else
1603 #define NUMBER_BLOCKS 1
1604   int numberBlocks = NUMBER_BLOCKS;
1605 #endif
1606 #if ABC_PARALLEL
1607   if (dual->parallelMode() == 0)
1608     numberBlocks = 1;
1609 #endif
1610   // see if worth using row copy
1611   bool gotRowCopy = matrix->gotRowCopy();
1612   int number = update.getNumElements();
1613   assert(number);
1614   bool useRowCopy = (gotRowCopy && (number << 2) < maximumRows);
1615   assert(numberBlocks == matrix->numberRowBlocks());
1616 #if ABC_PARALLEL == 1
1617   // redo so can pass info in with void *
1618   CoinThreadInfo *infoP = dual->threadInfoPointer();
1619   int cpuMask = ((1 << dual->parallelMode()) - 1);
1620   cpuMask += cpuMask << 5;
1621   dual->setStopStart(cpuMask);
1622 #endif
1623   CoinThreadInfo info[NUMBER_BLOCKS];
1624   const int *starts;
1625   if (useRowCopy)
1626     starts = matrix->blockStart();
1627   else
1628     starts = matrix->startColumnBlock();
1629   tableauRow.setPartitions(numberBlocks, starts);
1630   candidateList.setPartitions(numberBlocks, starts);
1631   //printf("free sequence in %d\n",dual->freeSequenceIn());
1632   if (useRowCopy) {
1633     // using row copy
1634 #if ABC_PARALLEL
1635     if (numberBlocks > 1) {
1636 #if ABC_PARALLEL == 2
1637       for (int i = 0; i < numberBlocks; i++) {
1638         info[i].stuff[1] = i;
1639         info[i].stuff[2] = -1;
1640         info[i].result = upperTheta;
1641         info[i].result = cilk_spawn
1642                            matrix->dualColumn1Row(info[i].stuff[1], COIN_DBL_MAX, info[i].stuff[2],
1643                              update, tableauRow, candidateList);
1644       }
1645       cilk_sync;
1646 #else
1647       // parallel 1
1648       for (int i = 0; i < numberBlocks; i++) {
1649         info[i].status = 5;
1650         info[i].stuff[1] = i;
1651         info[i].result = upperTheta;
1652         if (i < numberBlocks - 1) {
1653           infoP[i] = info[i];
1654           if (i == numberBlocks - 2)
1655             dual->startParallelStuff(5);
1656         }
1657       }
1658       info[numberBlocks - 1].stuff[1] = numberBlocks - 1;
1659       info[numberBlocks - 1].stuff[2] = -1;
1660       //double freeAlpha;
1661       info[numberBlocks - 1].result = matrix->dualColumn1Row(info[numberBlocks - 1].stuff[1],
1662         upperTheta, info[numberBlocks - 1].stuff[2],
1663         update, tableauRow, candidateList);
1664       dual->stopParallelStuff(5);
1665       for (int i = 0; i < numberBlocks - 1; i++)
1666         info[i] = infoP[i];
1667 #endif
1668     } else {
1669 #endif
1670       info[0].status = 5;
1671       info[0].stuff[1] = 0;
1672       info[0].result = upperTheta;
1673       info[0].stuff[1] = 0;
1674       info[0].stuff[2] = -1;
1675       // worth using row copy
1676       //assert (number>2);
1677       info[0].result = matrix->dualColumn1Row(info[0].stuff[1],
1678         upperTheta, info[0].stuff[2],
1679         update, tableauRow, candidateList);
1680 #if ABC_PARALLEL
1681     }
1682 #endif
1683   } else { // end of useRowCopy
1684 #if ABC_PARALLEL
1685     if (numberBlocks > 1) {
1686 #if ABC_PARALLEL == 2
1687       // do by column
1688       for (int i = 0; i < numberBlocks; i++) {
1689         info[i].stuff[1] = i;
1690         info[i].result = upperTheta;
1691         cilk_spawn
1692           matrix->dualColumn1Part(info[i].stuff[1], info[i].stuff[2],
1693             info[i].result,
1694             update, tableauRow, candidateList);
1695       }
1696       cilk_sync;
1697 #else
1698       // parallel 1
1699       // do by column
1700       for (int i = 0; i < numberBlocks; i++) {
1701         info[i].status = 4;
1702         info[i].stuff[1] = i;
1703         info[i].result = upperTheta;
1704         if (i < numberBlocks - 1) {
1705           infoP[i] = info[i];
1706           if (i == numberBlocks - 2)
1707             dual->startParallelStuff(4);
1708         }
1709       }
1710       matrix->dualColumn1Part(info[numberBlocks - 1].stuff[1], info[numberBlocks - 1].stuff[2],
1711         info[numberBlocks - 1].result,
1712         update, tableauRow, candidateList);
1713       dual->stopParallelStuff(4);
1714       for (int i = 0; i < numberBlocks - 1; i++)
1715         info[i] = infoP[i];
1716 #endif
1717     } else {
1718 #endif
1719       info[0].status = 4;
1720       info[0].stuff[1] = 0;
1721       info[0].result = upperTheta;
1722       info[0].stuff[1] = 0;
1723       matrix->dualColumn1Part(info[0].stuff[1], info[0].stuff[2],
1724         info[0].result,
1725         update, tableauRow, candidateList);
1726 #if ABC_PARALLEL
1727     }
1728 #endif
1729   }
1730   int sequenceIn[NUMBER_BLOCKS];
1731   bool anyFree = false;
1732 #if 0
1733   if (useRowCopy) {
1734     printf("Result at iteration %d slack seqIn %d upperTheta %g\n",
1735 	   dual->numberIterations(),dual->freeSequenceIn(),upperTheta);
1736     double * arrayT = tableauRow.denseVector();
1737     int * indexT = tableauRow.getIndices();
1738     double * arrayC = candidateList.denseVector();
1739     int * indexC = candidateList.getIndices();
1740     for (int i=0;i<numberBlocks;i++) {
1741       printf("Block %d seqIn %d upperTheta %g first %d last %d firstIn %d nnz %d numrem %d\n",
1742 	     i,info[i].stuff[2],info[i].result,
1743 	     xxInfo[0][i],xxInfo[1][i],xxInfo[2][i],xxInfo[3][i],xxInfo[4][i]);
1744       if (xxInfo[3][i]<-35) {
1745 	assert (xxInfo[3][i]==tableauRow.getNumElements(i));
1746 	assert (xxInfo[4][i]==candidateList.getNumElements(i));
1747 	for (int k=0;k<xxInfo[3][i];k++)
1748 	  printf("T %d %d %g\n",k,indexT[k+xxInfo[2][i]],arrayT[k+xxInfo[2][i]]);
1749 	for (int k=0;k<xxInfo[4][i];k++)
1750 	  printf("C %d %d %g\n",k,indexC[k+xxInfo[2][i]],arrayC[k+xxInfo[2][i]]);
1751       }
1752     }
1753   }
1754 #endif
1755   for (int i = 0; i < numberBlocks; i++) {
1756     sequenceIn[i] = info[i].stuff[2];
1757     if (sequenceIn[i] >= 0)
1758       anyFree = true;
1759     upperTheta = CoinMin(info[i].result, upperTheta);
1760     //assert (info[i].result>-100*dual->dualTolerance()||sequenceIn[i]>=0||dual->lastFirstFree()>=0);
1761   }
1762   if (anyFree) {
1763     int *COIN_RESTRICT index = tableauRow.getIndices();
1764     double *COIN_RESTRICT array = tableauRow.denseVector();
1765     // search for free coming in
1766     double bestFree = 0.0;
1767     int bestSequence = dual->sequenceIn();
1768     if (bestSequence >= 0)
1769       bestFree = dual->alpha();
1770     for (int i = 0; i < numberBlocks; i++) {
1771       int iLook = sequenceIn[i];
1772       if (iLook >= 0) {
1773         // free variable - search
1774         int start = starts[i];
1775         int end = start + tableauRow.getNumElements(i);
1776         for (int k = start; k < end; k++) {
1777           if (iLook == index[k]) {
1778             if (fabs(bestFree) < fabs(array[k])) {
1779               bestFree = array[k];
1780               bestSequence = iLook;
1781             }
1782             break;
1783           }
1784         }
1785       }
1786     }
1787     if (bestSequence >= 0) {
1788       double oldValue = dual->djRegion()[bestSequence];
1789       dual->setSequenceIn(bestSequence);
1790       dual->setAlpha(bestFree);
1791       dual->setTheta(oldValue / bestFree);
1792     }
1793   }
1794   //tableauRow.compact();
1795   //candidateList.compact();
1796 #if 0 //ndef NDEBUG
1797   tableauRow.setPackedMode(true);
1798   tableauRow.sortPacked();
1799   candidateList.setPackedMode(true);
1800   candidateList.sortPacked();
1801 #endif
1802   return upperTheta;
1803 }
1804 #if ABC_PARALLEL == 2
1805 static void
parallelDual5a(AbcSimplexFactorization * factorization,CoinIndexedVector * whichVector,int numberCpu,int whichCpu,double * weights)1806 parallelDual5a(AbcSimplexFactorization *factorization,
1807   CoinIndexedVector *whichVector,
1808   int numberCpu,
1809   int whichCpu,
1810   double *weights)
1811 {
1812   double *COIN_RESTRICT array = whichVector->denseVector();
1813   int *COIN_RESTRICT which = whichVector->getIndices();
1814   int numberRows = factorization->numberRows();
1815   for (int i = whichCpu; i < numberRows; i += numberCpu) {
1816     double value = 0.0;
1817     array[i] = 1.0;
1818     which[0] = i;
1819     whichVector->setNumElements(1);
1820     factorization->updateColumnTransposeCpu(*whichVector, whichCpu);
1821     int number = whichVector->getNumElements();
1822     for (int j = 0; j < number; j++) {
1823       int k = which[j];
1824       value += array[k] * array[k];
1825       array[k] = 0.0;
1826     }
1827     weights[i] = value;
1828   }
1829   whichVector->setNumElements(0);
1830 }
1831 #endif
1832 #if ABC_PARALLEL == 2
parallelDual5(AbcSimplexFactorization * factorization,CoinIndexedVector ** whichVector,int numberCpu,int whichCpu,double * weights)1833 void parallelDual5(AbcSimplexFactorization *factorization,
1834   CoinIndexedVector **whichVector,
1835   int numberCpu,
1836   int whichCpu,
1837   double *weights)
1838 {
1839   if (whichCpu) {
1840     cilk_spawn parallelDual5(factorization, whichVector, numberCpu, whichCpu - 1, weights);
1841     parallelDual5a(factorization, whichVector[whichCpu], numberCpu, whichCpu, weights);
1842     cilk_sync;
1843   } else {
1844     parallelDual5a(factorization, whichVector[whichCpu], numberCpu, whichCpu, weights);
1845   }
1846 }
1847 #endif
1848 // cilk seems a bit fragile
1849 #define CILK_FRAGILE 1
1850 #if CILK_FRAGILE > 1
1851 #undef cilk_spawn
1852 #undef cilk_sync
1853 #define cilk_spawn
1854 #define cilk_sync
1855 #define ONWARD 0
1856 #elif CILK_FRAGILE == 1
1857 #define ONWARD 0
1858 #else
1859 #define ONWARD 1
1860 #endif
1861 // Code below is just a translation of LAPACK
1862 #define BLOCKING1 8 // factorization strip
1863 #define BLOCKING2 8 // dgemm recursive
1864 #define BLOCKING3 16 // dgemm parallel
1865 /* type
1866    0 Left Lower NoTranspose Unit
1867    1 Left Upper NoTranspose NonUnit
1868    2 Left Lower Transpose Unit
1869    3 Left Upper Transpose NonUnit
1870 */
CoinAbcDtrsmFactor(int m,int n,double * COIN_RESTRICT a,int lda)1871 static void CoinAbcDtrsmFactor(int m, int n, double *COIN_RESTRICT a, int lda)
1872 {
1873   assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
1874   assert(m == BLOCKING8);
1875   // 0 Left Lower NoTranspose Unit
1876   /* entry for column j and row i (when multiple of BLOCKING8)
1877      is at aBlocked+j*m+i*BLOCKING8
1878   */
1879   double *COIN_RESTRICT aBase2 = a;
1880   double *COIN_RESTRICT bBase2 = aBase2 + lda * BLOCKING8;
1881   for (int jj = 0; jj < n; jj += BLOCKING8) {
1882     double *COIN_RESTRICT bBase = bBase2;
1883     for (int j = jj; j < jj + BLOCKING8; j++) {
1884 #if 0
1885       double * COIN_RESTRICT aBase = aBase2;
1886       for (int k=0;k<BLOCKING8;k++) {
1887 	double bValue = bBase[k];
1888 	if (bValue) {
1889 	  for (int i=k+1;i<BLOCKING8;i++) {
1890 	    bBase[i]-=bValue*aBase[i];
1891 	  }
1892 	}
1893 	aBase+=BLOCKING8;
1894       }
1895 #else
1896       // unroll - stay in registers - don't test for zeros
1897       assert(BLOCKING8 == 8);
1898       double bValue0 = bBase[0];
1899       double bValue1 = bBase[1];
1900       double bValue2 = bBase[2];
1901       double bValue3 = bBase[3];
1902       double bValue4 = bBase[4];
1903       double bValue5 = bBase[5];
1904       double bValue6 = bBase[6];
1905       double bValue7 = bBase[7];
1906       bValue1 -= bValue0 * a[1 + 0 * BLOCKING8];
1907       bBase[1] = bValue1;
1908       bValue2 -= bValue0 * a[2 + 0 * BLOCKING8];
1909       bValue3 -= bValue0 * a[3 + 0 * BLOCKING8];
1910       bValue4 -= bValue0 * a[4 + 0 * BLOCKING8];
1911       bValue5 -= bValue0 * a[5 + 0 * BLOCKING8];
1912       bValue6 -= bValue0 * a[6 + 0 * BLOCKING8];
1913       bValue7 -= bValue0 * a[7 + 0 * BLOCKING8];
1914       bValue2 -= bValue1 * a[2 + 1 * BLOCKING8];
1915       bBase[2] = bValue2;
1916       bValue3 -= bValue1 * a[3 + 1 * BLOCKING8];
1917       bValue4 -= bValue1 * a[4 + 1 * BLOCKING8];
1918       bValue5 -= bValue1 * a[5 + 1 * BLOCKING8];
1919       bValue6 -= bValue1 * a[6 + 1 * BLOCKING8];
1920       bValue7 -= bValue1 * a[7 + 1 * BLOCKING8];
1921       bValue3 -= bValue2 * a[3 + 2 * BLOCKING8];
1922       bBase[3] = bValue3;
1923       bValue4 -= bValue2 * a[4 + 2 * BLOCKING8];
1924       bValue5 -= bValue2 * a[5 + 2 * BLOCKING8];
1925       bValue6 -= bValue2 * a[6 + 2 * BLOCKING8];
1926       bValue7 -= bValue2 * a[7 + 2 * BLOCKING8];
1927       bValue4 -= bValue3 * a[4 + 3 * BLOCKING8];
1928       bBase[4] = bValue4;
1929       bValue5 -= bValue3 * a[5 + 3 * BLOCKING8];
1930       bValue6 -= bValue3 * a[6 + 3 * BLOCKING8];
1931       bValue7 -= bValue3 * a[7 + 3 * BLOCKING8];
1932       bValue5 -= bValue4 * a[5 + 4 * BLOCKING8];
1933       bBase[5] = bValue5;
1934       bValue6 -= bValue4 * a[6 + 4 * BLOCKING8];
1935       bValue7 -= bValue4 * a[7 + 4 * BLOCKING8];
1936       bValue6 -= bValue5 * a[6 + 5 * BLOCKING8];
1937       bBase[6] = bValue6;
1938       bValue7 -= bValue5 * a[7 + 5 * BLOCKING8];
1939       bValue7 -= bValue6 * a[7 + 6 * BLOCKING8];
1940       bBase[7] = bValue7;
1941 #endif
1942       bBase += BLOCKING8;
1943     }
1944     bBase2 += lda * BLOCKING8;
1945   }
1946 }
1947 #define UNROLL_DTRSM 16
1948 #define CILK_DTRSM 32
dtrsm0(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)1949 static void dtrsm0(int kkk, int first, int last,
1950   int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
1951 {
1952   int mm = CoinMin(kkk + UNROLL_DTRSM * BLOCKING8, m);
1953   assert((last - first) % BLOCKING8 == 0);
1954   if (last - first > CILK_DTRSM) {
1955     int mid = ((first + last) >> 4) << 3;
1956     cilk_spawn dtrsm0(kkk, first, mid, m, a, b);
1957     dtrsm0(kkk, mid, last, m, a, b);
1958     cilk_sync;
1959   } else {
1960     const double *COIN_RESTRICT aBaseA = a + UNROLL_DTRSM * BLOCKING8X8 + kkk * BLOCKING8;
1961     aBaseA += (first - mm) * BLOCKING8 - BLOCKING8X8;
1962     aBaseA += m * kkk;
1963     for (int ii = first; ii < last; ii += BLOCKING8) {
1964       aBaseA += BLOCKING8X8;
1965       const double *COIN_RESTRICT aBaseB = aBaseA;
1966       double *COIN_RESTRICT bBaseI = b + ii;
1967       for (int kk = kkk; kk < mm; kk += BLOCKING8) {
1968         double *COIN_RESTRICT bBase = b + kk;
1969         const double *COIN_RESTRICT aBase2 = aBaseB; //a+UNROLL_DTRSM*BLOCKING8X8+m*kk+kkk*BLOCKING8;
1970         //aBase2 += (ii-mm)*BLOCKING8;
1971         //assert (aBase2==aBaseB);
1972         aBaseB += m * BLOCKING8;
1973 #if AVX2 != 2
1974 #define ALTERNATE_INNER
1975 #ifndef ALTERNATE_INNER
1976         for (int k = 0; k < BLOCKING8; k++) {
1977           //coin_prefetch_const(aBase2+BLOCKING8);
1978           for (int i = 0; i < BLOCKING8; i++) {
1979             bBaseI[i] -= bBase[k] * aBase2[i];
1980           }
1981           aBase2 += BLOCKING8;
1982         }
1983 #else
1984         double b0 = bBaseI[0];
1985         double b1 = bBaseI[1];
1986         double b2 = bBaseI[2];
1987         double b3 = bBaseI[3];
1988         double b4 = bBaseI[4];
1989         double b5 = bBaseI[5];
1990         double b6 = bBaseI[6];
1991         double b7 = bBaseI[7];
1992         for (int k = 0; k < BLOCKING8; k++) {
1993           //coin_prefetch_const(aBase2+BLOCKING8);
1994           double bValue = bBase[k];
1995           b0 -= bValue * aBase2[0];
1996           b1 -= bValue * aBase2[1];
1997           b2 -= bValue * aBase2[2];
1998           b3 -= bValue * aBase2[3];
1999           b4 -= bValue * aBase2[4];
2000           b5 -= bValue * aBase2[5];
2001           b6 -= bValue * aBase2[6];
2002           b7 -= bValue * aBase2[7];
2003           aBase2 += BLOCKING8;
2004         }
2005         bBaseI[0] = b0;
2006         bBaseI[1] = b1;
2007         bBaseI[2] = b2;
2008         bBaseI[3] = b3;
2009         bBaseI[4] = b4;
2010         bBaseI[5] = b5;
2011         bBaseI[6] = b6;
2012         bBaseI[7] = b7;
2013 #endif
2014 #else
2015         __m256d b0 = _mm256_load_pd(bBaseI);
2016         __m256d b1 = _mm256_load_pd(bBaseI + 4);
2017         for (int k = 0; k < BLOCKING8; k++) {
2018           //coin_prefetch_const(aBase2+BLOCKING8);
2019           __m256d bb = _mm256_broadcast_sd(bBase + k);
2020           __m256d a0 = _mm256_load_pd(aBase2);
2021           b0 -= a0 * bb;
2022           __m256d a1 = _mm256_load_pd(aBase2 + 4);
2023           b1 -= a1 * bb;
2024           aBase2 += BLOCKING8;
2025         }
2026         //_mm256_store_pd ((bBaseI), (b0));
2027         *reinterpret_cast< __m256d * >(bBaseI) = b0;
2028         //_mm256_store_pd (bBaseI+4, b1);
2029         *reinterpret_cast< __m256d * >(bBaseI + 4) = b1;
2030 #endif
2031       }
2032     }
2033   }
2034 }
CoinAbcDtrsm0(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2035 void CoinAbcDtrsm0(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2036 {
2037   assert((m & (BLOCKING8 - 1)) == 0);
2038   // 0 Left Lower NoTranspose Unit
2039   for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM * BLOCKING8) {
2040     int mm = CoinMin(m, kkk + UNROLL_DTRSM * BLOCKING8);
2041     for (int kk = kkk; kk < mm; kk += BLOCKING8) {
2042       const double *COIN_RESTRICT aBase2 = a + kk * (m + BLOCKING8);
2043       double *COIN_RESTRICT bBase = b + kk;
2044       for (int k = 0; k < BLOCKING8; k++) {
2045         for (int i = k + 1; i < BLOCKING8; i++) {
2046           bBase[i] -= bBase[k] * aBase2[i];
2047         }
2048         aBase2 += BLOCKING8;
2049       }
2050       for (int ii = kk + BLOCKING8; ii < mm; ii += BLOCKING8) {
2051         double *COIN_RESTRICT bBaseI = b + ii;
2052         for (int k = 0; k < BLOCKING8; k++) {
2053           //coin_prefetch_const(aBase2+BLOCKING8);
2054           for (int i = 0; i < BLOCKING8; i++) {
2055             bBaseI[i] -= bBase[k] * aBase2[i];
2056           }
2057           aBase2 += BLOCKING8;
2058         }
2059       }
2060     }
2061     dtrsm0(kkk, mm, m, m, a, b);
2062   }
2063 }
dtrsm1(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2064 static void dtrsm1(int kkk, int first, int last,
2065   int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2066 {
2067   int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2068   assert((last - first) % BLOCKING8 == 0);
2069   if (last - first > CILK_DTRSM) {
2070     int mid = ((first + last) >> 4) << 3;
2071     cilk_spawn dtrsm1(kkk, first, mid, m, a, b);
2072     dtrsm1(kkk, mid, last, m, a, b);
2073     cilk_sync;
2074   } else {
2075     for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2076       double *COIN_RESTRICT bBase2 = b + iii;
2077       const double *COIN_RESTRICT aBaseA = a + BLOCKING8X8 + BLOCKING8 * iii;
2078       for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2079         double *COIN_RESTRICT bBase = b + ii;
2080         const double *COIN_RESTRICT aBase = aBaseA + m * ii;
2081 #if AVX2 != 2
2082 #ifndef ALTERNATE_INNER
2083         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2084           aBase -= BLOCKING8;
2085           //coin_prefetch_const(aBase-BLOCKING8);
2086           for (int k = BLOCKING8 - 1; k >= 0; k--) {
2087             bBase2[k] -= bBase[i] * aBase[k];
2088           }
2089         }
2090 #else
2091         double b0 = bBase2[0];
2092         double b1 = bBase2[1];
2093         double b2 = bBase2[2];
2094         double b3 = bBase2[3];
2095         double b4 = bBase2[4];
2096         double b5 = bBase2[5];
2097         double b6 = bBase2[6];
2098         double b7 = bBase2[7];
2099         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2100           aBase -= BLOCKING8;
2101           //coin_prefetch_const(aBase-BLOCKING8);
2102           double bValue = bBase[i];
2103           b0 -= bValue * aBase[0];
2104           b1 -= bValue * aBase[1];
2105           b2 -= bValue * aBase[2];
2106           b3 -= bValue * aBase[3];
2107           b4 -= bValue * aBase[4];
2108           b5 -= bValue * aBase[5];
2109           b6 -= bValue * aBase[6];
2110           b7 -= bValue * aBase[7];
2111         }
2112         bBase2[0] = b0;
2113         bBase2[1] = b1;
2114         bBase2[2] = b2;
2115         bBase2[3] = b3;
2116         bBase2[4] = b4;
2117         bBase2[5] = b5;
2118         bBase2[6] = b6;
2119         bBase2[7] = b7;
2120 #endif
2121 #else
2122         __m256d b0 = _mm256_load_pd(bBase2);
2123         __m256d b1 = _mm256_load_pd(bBase2 + 4);
2124         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2125           aBase -= BLOCKING8;
2126           //coin_prefetch_const(aBase5-BLOCKING8);
2127           __m256d bb = _mm256_broadcast_sd(bBase + i);
2128           __m256d a0 = _mm256_load_pd(aBase);
2129           b0 -= a0 * bb;
2130           __m256d a1 = _mm256_load_pd(aBase + 4);
2131           b1 -= a1 * bb;
2132         }
2133         //_mm256_store_pd (bBase2, b0);
2134         *reinterpret_cast< __m256d * >(bBase2) = b0;
2135         //_mm256_store_pd (bBase2+4, b1);
2136         *reinterpret_cast< __m256d * >(bBase2 + 4) = b1;
2137 #endif
2138       }
2139     }
2140   }
2141 }
CoinAbcDtrsm1(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2142 void CoinAbcDtrsm1(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2143 {
2144   assert((m & (BLOCKING8 - 1)) == 0);
2145   // 1 Left Upper NoTranspose NonUnit
2146   for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2147     int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2148     for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2149       const double *COIN_RESTRICT aBase = a + m * ii + ii * BLOCKING8;
2150       double *COIN_RESTRICT bBase = b + ii;
2151       for (int i = BLOCKING8 - 1; i >= 0; i--) {
2152         bBase[i] *= aBase[i * (BLOCKING8 + 1)];
2153         for (int k = i - 1; k >= 0; k--) {
2154           bBase[k] -= bBase[i] * aBase[k + i * BLOCKING8];
2155         }
2156       }
2157       double *COIN_RESTRICT bBase2 = bBase;
2158       for (int iii = ii; iii > mm; iii -= BLOCKING8) {
2159         bBase2 -= BLOCKING8;
2160         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2161           aBase -= BLOCKING8;
2162           coin_prefetch_const(aBase - BLOCKING8);
2163           for (int k = BLOCKING8 - 1; k >= 0; k--) {
2164             bBase2[k] -= bBase[i] * aBase[k];
2165           }
2166         }
2167       }
2168     }
2169     dtrsm1(kkk, 0, mm, m, a, b);
2170   }
2171 }
dtrsm2(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2172 static void dtrsm2(int kkk, int first, int last,
2173   int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2174 {
2175   int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2176   assert((last - first) % BLOCKING8 == 0);
2177   if (last - first > CILK_DTRSM) {
2178     int mid = ((first + last) >> 4) << 3;
2179     cilk_spawn dtrsm2(kkk, first, mid, m, a, b);
2180     dtrsm2(kkk, mid, last, m, a, b);
2181     cilk_sync;
2182   } else {
2183     for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2184       for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2185         const double *COIN_RESTRICT bBase = b + ii;
2186         double *COIN_RESTRICT bBase2 = b + iii;
2187         const double *COIN_RESTRICT aBase = a + ii * BLOCKING8 + m * iii + BLOCKING8X8;
2188         for (int j = BLOCKING8 - 1; j >= 0; j -= 4) {
2189           double bValue0 = bBase2[j];
2190           double bValue1 = bBase2[j - 1];
2191           double bValue2 = bBase2[j - 2];
2192           double bValue3 = bBase2[j - 3];
2193           bValue0 -= aBase[-1 * BLOCKING8 + 7] * bBase[7];
2194           bValue1 -= aBase[-2 * BLOCKING8 + 7] * bBase[7];
2195           bValue2 -= aBase[-3 * BLOCKING8 + 7] * bBase[7];
2196           bValue3 -= aBase[-4 * BLOCKING8 + 7] * bBase[7];
2197           bValue0 -= aBase[-1 * BLOCKING8 + 6] * bBase[6];
2198           bValue1 -= aBase[-2 * BLOCKING8 + 6] * bBase[6];
2199           bValue2 -= aBase[-3 * BLOCKING8 + 6] * bBase[6];
2200           bValue3 -= aBase[-4 * BLOCKING8 + 6] * bBase[6];
2201           bValue0 -= aBase[-1 * BLOCKING8 + 5] * bBase[5];
2202           bValue1 -= aBase[-2 * BLOCKING8 + 5] * bBase[5];
2203           bValue2 -= aBase[-3 * BLOCKING8 + 5] * bBase[5];
2204           bValue3 -= aBase[-4 * BLOCKING8 + 5] * bBase[5];
2205           bValue0 -= aBase[-1 * BLOCKING8 + 4] * bBase[4];
2206           bValue1 -= aBase[-2 * BLOCKING8 + 4] * bBase[4];
2207           bValue2 -= aBase[-3 * BLOCKING8 + 4] * bBase[4];
2208           bValue3 -= aBase[-4 * BLOCKING8 + 4] * bBase[4];
2209           bValue0 -= aBase[-1 * BLOCKING8 + 3] * bBase[3];
2210           bValue1 -= aBase[-2 * BLOCKING8 + 3] * bBase[3];
2211           bValue2 -= aBase[-3 * BLOCKING8 + 3] * bBase[3];
2212           bValue3 -= aBase[-4 * BLOCKING8 + 3] * bBase[3];
2213           bValue0 -= aBase[-1 * BLOCKING8 + 2] * bBase[2];
2214           bValue1 -= aBase[-2 * BLOCKING8 + 2] * bBase[2];
2215           bValue2 -= aBase[-3 * BLOCKING8 + 2] * bBase[2];
2216           bValue3 -= aBase[-4 * BLOCKING8 + 2] * bBase[2];
2217           bValue0 -= aBase[-1 * BLOCKING8 + 1] * bBase[1];
2218           bValue1 -= aBase[-2 * BLOCKING8 + 1] * bBase[1];
2219           bValue2 -= aBase[-3 * BLOCKING8 + 1] * bBase[1];
2220           bValue3 -= aBase[-4 * BLOCKING8 + 1] * bBase[1];
2221           bValue0 -= aBase[-1 * BLOCKING8 + 0] * bBase[0];
2222           bValue1 -= aBase[-2 * BLOCKING8 + 0] * bBase[0];
2223           bValue2 -= aBase[-3 * BLOCKING8 + 0] * bBase[0];
2224           bValue3 -= aBase[-4 * BLOCKING8 + 0] * bBase[0];
2225           bBase2[j] = bValue0;
2226           bBase2[j - 1] = bValue1;
2227           bBase2[j - 2] = bValue2;
2228           bBase2[j - 3] = bValue3;
2229           aBase -= 4 * BLOCKING8;
2230         }
2231       }
2232     }
2233   }
2234 }
CoinAbcDtrsm2(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2235 void CoinAbcDtrsm2(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2236 {
2237   assert((m & (BLOCKING8 - 1)) == 0);
2238   // 2 Left Lower Transpose Unit
2239   for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2240     int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2241     for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2242       double *COIN_RESTRICT bBase = b + ii;
2243       double *COIN_RESTRICT bBase2 = bBase;
2244       const double *COIN_RESTRICT aBase = a + m * ii + (ii + BLOCKING8) * BLOCKING8;
2245       for (int i = BLOCKING8 - 1; i >= 0; i--) {
2246         aBase -= BLOCKING8;
2247         for (int k = i + 1; k < BLOCKING8; k++) {
2248           bBase2[i] -= aBase[k] * bBase2[k];
2249         }
2250       }
2251       for (int iii = ii - BLOCKING8; iii >= mm; iii -= BLOCKING8) {
2252         bBase2 -= BLOCKING8;
2253         assert(bBase2 == b + iii);
2254         aBase -= m * BLOCKING8;
2255         const double *COIN_RESTRICT aBase2 = aBase + BLOCKING8X8;
2256         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2257           aBase2 -= BLOCKING8;
2258           for (int k = BLOCKING8 - 1; k >= 0; k--) {
2259             bBase2[i] -= aBase2[k] * bBase[k];
2260           }
2261         }
2262       }
2263     }
2264     dtrsm2(kkk, 0, mm, m, a, b);
2265   }
2266 }
2267 #define UNROLL_DTRSM3 16
2268 #define CILK_DTRSM3 32
dtrsm3(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2269 static void dtrsm3(int kkk, int first, int last,
2270   int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2271 {
2272   //int mm=CoinMin(kkk+UNROLL_DTRSM3*BLOCKING8,m);
2273   assert((last - first) % BLOCKING8 == 0);
2274   if (last - first > CILK_DTRSM3) {
2275     int mid = ((first + last) >> 4) << 3;
2276     cilk_spawn dtrsm3(kkk, first, mid, m, a, b);
2277     dtrsm3(kkk, mid, last, m, a, b);
2278     cilk_sync;
2279   } else {
2280     for (int kk = 0; kk < kkk; kk += BLOCKING8) {
2281       for (int ii = first; ii < last; ii += BLOCKING8) {
2282         double *COIN_RESTRICT bBase = b + ii;
2283         double *COIN_RESTRICT bBase2 = b + kk;
2284         const double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
2285         for (int j = 0; j < BLOCKING8; j += 4) {
2286           double bValue0 = bBase[j];
2287           double bValue1 = bBase[j + 1];
2288           double bValue2 = bBase[j + 2];
2289           double bValue3 = bBase[j + 3];
2290           bValue0 -= aBase[0] * bBase2[0];
2291           bValue1 -= aBase[1 * BLOCKING8 + 0] * bBase2[0];
2292           bValue2 -= aBase[2 * BLOCKING8 + 0] * bBase2[0];
2293           bValue3 -= aBase[3 * BLOCKING8 + 0] * bBase2[0];
2294           bValue0 -= aBase[1] * bBase2[1];
2295           bValue1 -= aBase[1 * BLOCKING8 + 1] * bBase2[1];
2296           bValue2 -= aBase[2 * BLOCKING8 + 1] * bBase2[1];
2297           bValue3 -= aBase[3 * BLOCKING8 + 1] * bBase2[1];
2298           bValue0 -= aBase[2] * bBase2[2];
2299           bValue1 -= aBase[1 * BLOCKING8 + 2] * bBase2[2];
2300           bValue2 -= aBase[2 * BLOCKING8 + 2] * bBase2[2];
2301           bValue3 -= aBase[3 * BLOCKING8 + 2] * bBase2[2];
2302           bValue0 -= aBase[3] * bBase2[3];
2303           bValue1 -= aBase[1 * BLOCKING8 + 3] * bBase2[3];
2304           bValue2 -= aBase[2 * BLOCKING8 + 3] * bBase2[3];
2305           bValue3 -= aBase[3 * BLOCKING8 + 3] * bBase2[3];
2306           bValue0 -= aBase[4] * bBase2[4];
2307           bValue1 -= aBase[1 * BLOCKING8 + 4] * bBase2[4];
2308           bValue2 -= aBase[2 * BLOCKING8 + 4] * bBase2[4];
2309           bValue3 -= aBase[3 * BLOCKING8 + 4] * bBase2[4];
2310           bValue0 -= aBase[5] * bBase2[5];
2311           bValue1 -= aBase[1 * BLOCKING8 + 5] * bBase2[5];
2312           bValue2 -= aBase[2 * BLOCKING8 + 5] * bBase2[5];
2313           bValue3 -= aBase[3 * BLOCKING8 + 5] * bBase2[5];
2314           bValue0 -= aBase[6] * bBase2[6];
2315           bValue1 -= aBase[1 * BLOCKING8 + 6] * bBase2[6];
2316           bValue2 -= aBase[2 * BLOCKING8 + 7] * bBase2[7];
2317           bValue3 -= aBase[3 * BLOCKING8 + 6] * bBase2[6];
2318           bValue0 -= aBase[7] * bBase2[7];
2319           bValue1 -= aBase[1 * BLOCKING8 + 7] * bBase2[7];
2320           bValue2 -= aBase[2 * BLOCKING8 + 6] * bBase2[6];
2321           bValue3 -= aBase[3 * BLOCKING8 + 7] * bBase2[7];
2322           bBase[j] = bValue0;
2323           bBase[j + 1] = bValue1;
2324           bBase[j + 2] = bValue2;
2325           bBase[j + 3] = bValue3;
2326           aBase += 4 * BLOCKING8;
2327         }
2328       }
2329     }
2330   }
2331 }
CoinAbcDtrsm3(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2332 void CoinAbcDtrsm3(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2333 {
2334   assert((m & (BLOCKING8 - 1)) == 0);
2335   // 3 Left Upper Transpose NonUnit
2336   for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM3 * BLOCKING8) {
2337     int mm = CoinMin(m, kkk + UNROLL_DTRSM3 * BLOCKING8);
2338     dtrsm3(kkk, kkk, mm, m, a, b);
2339     for (int ii = kkk; ii < mm; ii += BLOCKING8) {
2340       double *COIN_RESTRICT bBase = b + ii;
2341       for (int kk = kkk; kk < ii; kk += BLOCKING8) {
2342         double *COIN_RESTRICT bBase2 = b + kk;
2343         const double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
2344         for (int i = 0; i < BLOCKING8; i++) {
2345           for (int k = 0; k < BLOCKING8; k++) {
2346             bBase[i] -= aBase[k] * bBase2[k];
2347           }
2348           aBase += BLOCKING8;
2349         }
2350       }
2351       const double *COIN_RESTRICT aBase = a + ii * m + ii * BLOCKING8;
2352       for (int i = 0; i < BLOCKING8; i++) {
2353         for (int k = 0; k < i; k++) {
2354           bBase[i] -= aBase[k] * bBase[k];
2355         }
2356         bBase[i] = bBase[i] * aBase[i];
2357         aBase += BLOCKING8;
2358       }
2359     }
2360   }
2361 }
2362 static void
CoinAbcDlaswp(int n,double * COIN_RESTRICT a,int lda,int start,int end,int * ipiv)2363 CoinAbcDlaswp(int n, double *COIN_RESTRICT a, int lda, int start, int end, int *ipiv)
2364 {
2365   assert((n & (BLOCKING8 - 1)) == 0);
2366   double *COIN_RESTRICT aThis3 = a;
2367   for (int j = 0; j < n; j += BLOCKING8) {
2368     for (int i = start; i < end; i++) {
2369       int ip = ipiv[i];
2370       if (ip != i) {
2371         double *COIN_RESTRICT aTop = aThis3 + j * lda;
2372         int iBias = ip / BLOCKING8;
2373         ip -= iBias * BLOCKING8;
2374         double *COIN_RESTRICT aNotTop = aTop + iBias * BLOCKING8X8;
2375         iBias = i / BLOCKING8;
2376         int i2 = i - iBias * BLOCKING8;
2377         aTop += iBias * BLOCKING8X8;
2378         for (int k = 0; k < BLOCKING8; k++) {
2379           double temp = aTop[i2];
2380           aTop[i2] = aNotTop[ip];
2381           aNotTop[ip] = temp;
2382           aTop += BLOCKING8;
2383           aNotTop += BLOCKING8;
2384         }
2385       }
2386     }
2387   }
2388 }
2389 extern void CoinAbcDgemm(int m, int n, int k, double *COIN_RESTRICT a, int lda,
2390   double *COIN_RESTRICT b, double *COIN_RESTRICT c
2391 #if ABC_PARALLEL == 2
2392   ,
2393   int parallelMode
2394 #endif
2395 );
CoinAbcDgetrf2(int m,int n,double * COIN_RESTRICT a,int * ipiv)2396 static int CoinAbcDgetrf2(int m, int n, double *COIN_RESTRICT a, int *ipiv)
2397 {
2398   assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
2399   assert(n == BLOCKING8);
2400   int dimension = CoinMin(m, n);
2401   /* entry for column j and row i (when multiple of BLOCKING8)
2402      is at aBlocked+j*m+i*BLOCKING8
2403   */
2404   assert(dimension == BLOCKING8);
2405   //printf("Dgetrf2 m=%d n=%d\n",m,n);
2406   double *COIN_RESTRICT aThis3 = a;
2407   for (int j = 0; j < dimension; j++) {
2408     int pivotRow = -1;
2409     double largest = 0.0;
2410     double *COIN_RESTRICT aThis2 = aThis3 + j * BLOCKING8;
2411     // this block
2412     int pivotRow2 = -1;
2413     for (int i = j; i < BLOCKING8; i++) {
2414       double value = fabs(aThis2[i]);
2415       if (value > largest) {
2416         largest = value;
2417         pivotRow2 = i;
2418       }
2419     }
2420     // other blocks
2421     int iBias = 0;
2422     for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
2423       aThis2 += BLOCKING8 * BLOCKING8;
2424       for (int i = 0; i < BLOCKING8; i++) {
2425         double value = fabs(aThis2[i]);
2426         if (value > largest) {
2427           largest = value;
2428           pivotRow2 = i;
2429           iBias = ii;
2430         }
2431       }
2432     }
2433     pivotRow = pivotRow2 + iBias;
2434     ipiv[j] = pivotRow;
2435     if (largest) {
2436       if (j != pivotRow) {
2437         double *COIN_RESTRICT aTop = aThis3;
2438         double *COIN_RESTRICT aNotTop = aThis3 + iBias * BLOCKING8;
2439         for (int i = 0; i < n; i++) {
2440           double value = aTop[j];
2441           aTop[j] = aNotTop[pivotRow2];
2442           aNotTop[pivotRow2] = value;
2443           aTop += BLOCKING8;
2444           aNotTop += BLOCKING8;
2445         }
2446       }
2447       aThis2 = aThis3 + j * BLOCKING8;
2448       double pivotMultiplier = 1.0 / aThis2[j];
2449       aThis2[j] = pivotMultiplier;
2450       // Do L
2451       // this block
2452       for (int i = j + 1; i < BLOCKING8; i++)
2453         aThis2[i] *= pivotMultiplier;
2454       // other blocks
2455       for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
2456         aThis2 += BLOCKING8 * BLOCKING8;
2457         for (int i = 0; i < BLOCKING8; i++)
2458           aThis2[i] *= pivotMultiplier;
2459       }
2460       // update rest
2461       double *COIN_RESTRICT aPut;
2462       aThis2 = aThis3 + j * BLOCKING8;
2463       aPut = aThis2 + BLOCKING8;
2464       double *COIN_RESTRICT aPut2 = aPut;
2465       // this block
2466       for (int i = j + 1; i < BLOCKING8; i++) {
2467         double value = aPut[j];
2468         for (int k = j + 1; k < BLOCKING8; k++)
2469           aPut[k] -= value * aThis2[k];
2470         aPut += BLOCKING8;
2471       }
2472       // other blocks
2473       for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
2474         aThis2 += BLOCKING8 * BLOCKING8;
2475         aPut = aThis2 + BLOCKING8;
2476         double *COIN_RESTRICT aPut2a = aPut2;
2477         for (int i = j + 1; i < BLOCKING8; i++) {
2478           double value = aPut2a[j];
2479           for (int k = 0; k < BLOCKING8; k++)
2480             aPut[k] -= value * aThis2[k];
2481           aPut += BLOCKING8;
2482           aPut2a += BLOCKING8;
2483         }
2484       }
2485     } else {
2486       return 1;
2487     }
2488   }
2489   return 0;
2490 }
2491 
CoinAbcDgetrf(int m,int n,double * COIN_RESTRICT a,int lda,int * ipiv,int parallelMode)2492 int CoinAbcDgetrf(int m, int n, double *COIN_RESTRICT a, int lda, int *ipiv
2493 #if ABC_PARALLEL == 2
2494   ,
2495   int parallelMode
2496 #endif
2497 )
2498 {
2499   assert(m == n);
2500   assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
2501   if (m < BLOCKING8) {
2502     return CoinAbcDgetrf2(m, n, a, ipiv);
2503   } else {
2504     for (int j = 0; j < n; j += BLOCKING8) {
2505       int start = j;
2506       int newSize = CoinMin(BLOCKING8, n - j);
2507       int end = j + newSize;
2508       int returnCode = CoinAbcDgetrf2(m - start, newSize, a + (start * lda + start * BLOCKING8),
2509         ipiv + start);
2510       if (!returnCode) {
2511         // adjust
2512         for (int k = start; k < end; k++)
2513           ipiv[k] += start;
2514         // swap 0<start
2515         CoinAbcDlaswp(start, a, lda, start, end, ipiv);
2516         if (end < n) {
2517           // swap >=end
2518           CoinAbcDlaswp(n - end, a + end * lda, lda, start, end, ipiv);
2519           CoinAbcDtrsmFactor(newSize, n - end, a + (start * lda + start * BLOCKING8), lda);
2520           CoinAbcDgemm(n - end, n - end, newSize,
2521             a + start * lda + end * BLOCKING8, lda,
2522             a + end * lda + start * BLOCKING8, a + end * lda + end * BLOCKING8
2523 #if ABC_PARALLEL == 2
2524             ,
2525             parallelMode
2526 #endif
2527           );
2528         }
2529       } else {
2530         return returnCode;
2531       }
2532     }
2533   }
2534   return 0;
2535 }
CoinAbcDgetrs(char trans,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT work)2536 void CoinAbcDgetrs(char trans, int m, double *COIN_RESTRICT a, double *COIN_RESTRICT work)
2537 {
2538   assert((m & (BLOCKING8 - 1)) == 0);
2539   if (trans == 'N') {
2540     //CoinAbcDlaswp1(work,m,ipiv);
2541     CoinAbcDtrsm0(m, a, work);
2542     CoinAbcDtrsm1(m, a, work);
2543   } else {
2544     assert(trans == 'T');
2545     CoinAbcDtrsm3(m, a, work);
2546     CoinAbcDtrsm2(m, a, work);
2547     //CoinAbcDlaswp1Backwards(work,m,ipiv);
2548   }
2549 }
2550 #ifdef ABC_LONG_FACTORIZATION
2551 /// ****** Start long double version
2552 /* type
2553    0 Left Lower NoTranspose Unit
2554    1 Left Upper NoTranspose NonUnit
2555    2 Left Lower Transpose Unit
2556    3 Left Upper Transpose NonUnit
2557 */
CoinAbcDtrsmFactor(int m,int n,long double * COIN_RESTRICT a,int lda)2558 static void CoinAbcDtrsmFactor(int m, int n, long double *COIN_RESTRICT a, int lda)
2559 {
2560   assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
2561   assert(m == BLOCKING8);
2562   // 0 Left Lower NoTranspose Unit
2563   /* entry for column j and row i (when multiple of BLOCKING8)
2564      is at aBlocked+j*m+i*BLOCKING8
2565   */
2566   long double *COIN_RESTRICT aBase2 = a;
2567   long double *COIN_RESTRICT bBase2 = aBase2 + lda * BLOCKING8;
2568   for (int jj = 0; jj < n; jj += BLOCKING8) {
2569     long double *COIN_RESTRICT bBase = bBase2;
2570     for (int j = jj; j < jj + BLOCKING8; j++) {
2571 #if 0
2572       long double * COIN_RESTRICT aBase = aBase2;
2573       for (int k=0;k<BLOCKING8;k++) {
2574 	long double bValue = bBase[k];
2575 	if (bValue) {
2576 	  for (int i=k+1;i<BLOCKING8;i++) {
2577 	    bBase[i]-=bValue*aBase[i];
2578 	  }
2579 	}
2580 	aBase+=BLOCKING8;
2581       }
2582 #else
2583       // unroll - stay in registers - don't test for zeros
2584       assert(BLOCKING8 == 8);
2585       long double bValue0 = bBase[0];
2586       long double bValue1 = bBase[1];
2587       long double bValue2 = bBase[2];
2588       long double bValue3 = bBase[3];
2589       long double bValue4 = bBase[4];
2590       long double bValue5 = bBase[5];
2591       long double bValue6 = bBase[6];
2592       long double bValue7 = bBase[7];
2593       bValue1 -= bValue0 * a[1 + 0 * BLOCKING8];
2594       bBase[1] = bValue1;
2595       bValue2 -= bValue0 * a[2 + 0 * BLOCKING8];
2596       bValue3 -= bValue0 * a[3 + 0 * BLOCKING8];
2597       bValue4 -= bValue0 * a[4 + 0 * BLOCKING8];
2598       bValue5 -= bValue0 * a[5 + 0 * BLOCKING8];
2599       bValue6 -= bValue0 * a[6 + 0 * BLOCKING8];
2600       bValue7 -= bValue0 * a[7 + 0 * BLOCKING8];
2601       bValue2 -= bValue1 * a[2 + 1 * BLOCKING8];
2602       bBase[2] = bValue2;
2603       bValue3 -= bValue1 * a[3 + 1 * BLOCKING8];
2604       bValue4 -= bValue1 * a[4 + 1 * BLOCKING8];
2605       bValue5 -= bValue1 * a[5 + 1 * BLOCKING8];
2606       bValue6 -= bValue1 * a[6 + 1 * BLOCKING8];
2607       bValue7 -= bValue1 * a[7 + 1 * BLOCKING8];
2608       bValue3 -= bValue2 * a[3 + 2 * BLOCKING8];
2609       bBase[3] = bValue3;
2610       bValue4 -= bValue2 * a[4 + 2 * BLOCKING8];
2611       bValue5 -= bValue2 * a[5 + 2 * BLOCKING8];
2612       bValue6 -= bValue2 * a[6 + 2 * BLOCKING8];
2613       bValue7 -= bValue2 * a[7 + 2 * BLOCKING8];
2614       bValue4 -= bValue3 * a[4 + 3 * BLOCKING8];
2615       bBase[4] = bValue4;
2616       bValue5 -= bValue3 * a[5 + 3 * BLOCKING8];
2617       bValue6 -= bValue3 * a[6 + 3 * BLOCKING8];
2618       bValue7 -= bValue3 * a[7 + 3 * BLOCKING8];
2619       bValue5 -= bValue4 * a[5 + 4 * BLOCKING8];
2620       bBase[5] = bValue5;
2621       bValue6 -= bValue4 * a[6 + 4 * BLOCKING8];
2622       bValue7 -= bValue4 * a[7 + 4 * BLOCKING8];
2623       bValue6 -= bValue5 * a[6 + 5 * BLOCKING8];
2624       bBase[6] = bValue6;
2625       bValue7 -= bValue5 * a[7 + 5 * BLOCKING8];
2626       bValue7 -= bValue6 * a[7 + 6 * BLOCKING8];
2627       bBase[7] = bValue7;
2628 #endif
2629       bBase += BLOCKING8;
2630     }
2631     bBase2 += lda * BLOCKING8;
2632   }
2633 }
2634 #define UNROLL_DTRSM 16
2635 #define CILK_DTRSM 32
dtrsm0(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2636 static void dtrsm0(int kkk, int first, int last,
2637   int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2638 {
2639   int mm = CoinMin(kkk + UNROLL_DTRSM * BLOCKING8, m);
2640   assert((last - first) % BLOCKING8 == 0);
2641   if (last - first > CILK_DTRSM) {
2642     int mid = ((first + last) >> 4) << 3;
2643     cilk_spawn dtrsm0(kkk, first, mid, m, a, b);
2644     dtrsm0(kkk, mid, last, m, a, b);
2645     cilk_sync;
2646   } else {
2647     const long double *COIN_RESTRICT aBaseA = a + UNROLL_DTRSM * BLOCKING8X8 + kkk * BLOCKING8;
2648     aBaseA += (first - mm) * BLOCKING8 - BLOCKING8X8;
2649     aBaseA += m * kkk;
2650     for (int ii = first; ii < last; ii += BLOCKING8) {
2651       aBaseA += BLOCKING8X8;
2652       const long double *COIN_RESTRICT aBaseB = aBaseA;
2653       long double *COIN_RESTRICT bBaseI = b + ii;
2654       for (int kk = kkk; kk < mm; kk += BLOCKING8) {
2655         long double *COIN_RESTRICT bBase = b + kk;
2656         const long double *COIN_RESTRICT aBase2 = aBaseB; //a+UNROLL_DTRSM*BLOCKING8X8+m*kk+kkk*BLOCKING8;
2657         //aBase2 += (ii-mm)*BLOCKING8;
2658         //assert (aBase2==aBaseB);
2659         aBaseB += m * BLOCKING8;
2660 #if AVX2 != 2
2661 #define ALTERNATE_INNER
2662 #ifndef ALTERNATE_INNER
2663         for (int k = 0; k < BLOCKING8; k++) {
2664           //coin_prefetch_const(aBase2+BLOCKING8);
2665           for (int i = 0; i < BLOCKING8; i++) {
2666             bBaseI[i] -= bBase[k] * aBase2[i];
2667           }
2668           aBase2 += BLOCKING8;
2669         }
2670 #else
2671         long double b0 = bBaseI[0];
2672         long double b1 = bBaseI[1];
2673         long double b2 = bBaseI[2];
2674         long double b3 = bBaseI[3];
2675         long double b4 = bBaseI[4];
2676         long double b5 = bBaseI[5];
2677         long double b6 = bBaseI[6];
2678         long double b7 = bBaseI[7];
2679         for (int k = 0; k < BLOCKING8; k++) {
2680           //coin_prefetch_const(aBase2+BLOCKING8);
2681           long double bValue = bBase[k];
2682           b0 -= bValue * aBase2[0];
2683           b1 -= bValue * aBase2[1];
2684           b2 -= bValue * aBase2[2];
2685           b3 -= bValue * aBase2[3];
2686           b4 -= bValue * aBase2[4];
2687           b5 -= bValue * aBase2[5];
2688           b6 -= bValue * aBase2[6];
2689           b7 -= bValue * aBase2[7];
2690           aBase2 += BLOCKING8;
2691         }
2692         bBaseI[0] = b0;
2693         bBaseI[1] = b1;
2694         bBaseI[2] = b2;
2695         bBaseI[3] = b3;
2696         bBaseI[4] = b4;
2697         bBaseI[5] = b5;
2698         bBaseI[6] = b6;
2699         bBaseI[7] = b7;
2700 #endif
2701 #else
2702         __m256d b0 = _mm256_load_pd(bBaseI);
2703         __m256d b1 = _mm256_load_pd(bBaseI + 4);
2704         for (int k = 0; k < BLOCKING8; k++) {
2705           //coin_prefetch_const(aBase2+BLOCKING8);
2706           __m256d bb = _mm256_broadcast_sd(bBase + k);
2707           __m256d a0 = _mm256_load_pd(aBase2);
2708           b0 -= a0 * bb;
2709           __m256d a1 = _mm256_load_pd(aBase2 + 4);
2710           b1 -= a1 * bb;
2711           aBase2 += BLOCKING8;
2712         }
2713         //_mm256_store_pd ((bBaseI), (b0));
2714         *reinterpret_cast< __m256d * >(bBaseI) = b0;
2715         //_mm256_store_pd (bBaseI+4, b1);
2716         *reinterpret_cast< __m256d * >(bBaseI + 4) = b1;
2717 #endif
2718       }
2719     }
2720   }
2721 }
CoinAbcDtrsm0(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2722 void CoinAbcDtrsm0(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2723 {
2724   assert((m & (BLOCKING8 - 1)) == 0);
2725   // 0 Left Lower NoTranspose Unit
2726   for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM * BLOCKING8) {
2727     int mm = CoinMin(m, kkk + UNROLL_DTRSM * BLOCKING8);
2728     for (int kk = kkk; kk < mm; kk += BLOCKING8) {
2729       const long double *COIN_RESTRICT aBase2 = a + kk * (m + BLOCKING8);
2730       long double *COIN_RESTRICT bBase = b + kk;
2731       for (int k = 0; k < BLOCKING8; k++) {
2732         for (int i = k + 1; i < BLOCKING8; i++) {
2733           bBase[i] -= bBase[k] * aBase2[i];
2734         }
2735         aBase2 += BLOCKING8;
2736       }
2737       for (int ii = kk + BLOCKING8; ii < mm; ii += BLOCKING8) {
2738         long double *COIN_RESTRICT bBaseI = b + ii;
2739         for (int k = 0; k < BLOCKING8; k++) {
2740           //coin_prefetch_const(aBase2+BLOCKING8);
2741           for (int i = 0; i < BLOCKING8; i++) {
2742             bBaseI[i] -= bBase[k] * aBase2[i];
2743           }
2744           aBase2 += BLOCKING8;
2745         }
2746       }
2747     }
2748     dtrsm0(kkk, mm, m, m, a, b);
2749   }
2750 }
dtrsm1(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2751 static void dtrsm1(int kkk, int first, int last,
2752   int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2753 {
2754   int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2755   assert((last - first) % BLOCKING8 == 0);
2756   if (last - first > CILK_DTRSM) {
2757     int mid = ((first + last) >> 4) << 3;
2758     cilk_spawn dtrsm1(kkk, first, mid, m, a, b);
2759     dtrsm1(kkk, mid, last, m, a, b);
2760     cilk_sync;
2761   } else {
2762     for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2763       long double *COIN_RESTRICT bBase2 = b + iii;
2764       const long double *COIN_RESTRICT aBaseA = a + BLOCKING8X8 + BLOCKING8 * iii;
2765       for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2766         long double *COIN_RESTRICT bBase = b + ii;
2767         const long double *COIN_RESTRICT aBase = aBaseA + m * ii;
2768 #if AVX2 != 2
2769 #ifndef ALTERNATE_INNER
2770         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2771           aBase -= BLOCKING8;
2772           //coin_prefetch_const(aBase-BLOCKING8);
2773           for (int k = BLOCKING8 - 1; k >= 0; k--) {
2774             bBase2[k] -= bBase[i] * aBase[k];
2775           }
2776         }
2777 #else
2778         long double b0 = bBase2[0];
2779         long double b1 = bBase2[1];
2780         long double b2 = bBase2[2];
2781         long double b3 = bBase2[3];
2782         long double b4 = bBase2[4];
2783         long double b5 = bBase2[5];
2784         long double b6 = bBase2[6];
2785         long double b7 = bBase2[7];
2786         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2787           aBase -= BLOCKING8;
2788           //coin_prefetch_const(aBase-BLOCKING8);
2789           long double bValue = bBase[i];
2790           b0 -= bValue * aBase[0];
2791           b1 -= bValue * aBase[1];
2792           b2 -= bValue * aBase[2];
2793           b3 -= bValue * aBase[3];
2794           b4 -= bValue * aBase[4];
2795           b5 -= bValue * aBase[5];
2796           b6 -= bValue * aBase[6];
2797           b7 -= bValue * aBase[7];
2798         }
2799         bBase2[0] = b0;
2800         bBase2[1] = b1;
2801         bBase2[2] = b2;
2802         bBase2[3] = b3;
2803         bBase2[4] = b4;
2804         bBase2[5] = b5;
2805         bBase2[6] = b6;
2806         bBase2[7] = b7;
2807 #endif
2808 #else
2809         __m256d b0 = _mm256_load_pd(bBase2);
2810         __m256d b1 = _mm256_load_pd(bBase2 + 4);
2811         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2812           aBase -= BLOCKING8;
2813           //coin_prefetch_const(aBase5-BLOCKING8);
2814           __m256d bb = _mm256_broadcast_sd(bBase + i);
2815           __m256d a0 = _mm256_load_pd(aBase);
2816           b0 -= a0 * bb;
2817           __m256d a1 = _mm256_load_pd(aBase + 4);
2818           b1 -= a1 * bb;
2819         }
2820         //_mm256_store_pd (bBase2, b0);
2821         *reinterpret_cast< __m256d * >(bBase2) = b0;
2822         //_mm256_store_pd (bBase2+4, b1);
2823         *reinterpret_cast< __m256d * >(bBase2 + 4) = b1;
2824 #endif
2825       }
2826     }
2827   }
2828 }
CoinAbcDtrsm1(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2829 void CoinAbcDtrsm1(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2830 {
2831   assert((m & (BLOCKING8 - 1)) == 0);
2832   // 1 Left Upper NoTranspose NonUnit
2833   for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2834     int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2835     for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2836       const long double *COIN_RESTRICT aBase = a + m * ii + ii * BLOCKING8;
2837       long double *COIN_RESTRICT bBase = b + ii;
2838       for (int i = BLOCKING8 - 1; i >= 0; i--) {
2839         bBase[i] *= aBase[i * (BLOCKING8 + 1)];
2840         for (int k = i - 1; k >= 0; k--) {
2841           bBase[k] -= bBase[i] * aBase[k + i * BLOCKING8];
2842         }
2843       }
2844       long double *COIN_RESTRICT bBase2 = bBase;
2845       for (int iii = ii; iii > mm; iii -= BLOCKING8) {
2846         bBase2 -= BLOCKING8;
2847         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2848           aBase -= BLOCKING8;
2849           coin_prefetch_const(aBase - BLOCKING8);
2850           for (int k = BLOCKING8 - 1; k >= 0; k--) {
2851             bBase2[k] -= bBase[i] * aBase[k];
2852           }
2853         }
2854       }
2855     }
2856     dtrsm1(kkk, 0, mm, m, a, b);
2857   }
2858 }
dtrsm2(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2859 static void dtrsm2(int kkk, int first, int last,
2860   int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2861 {
2862   int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2863   assert((last - first) % BLOCKING8 == 0);
2864   if (last - first > CILK_DTRSM) {
2865     int mid = ((first + last) >> 4) << 3;
2866     cilk_spawn dtrsm2(kkk, first, mid, m, a, b);
2867     dtrsm2(kkk, mid, last, m, a, b);
2868     cilk_sync;
2869   } else {
2870     for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2871       for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2872         const long double *COIN_RESTRICT bBase = b + ii;
2873         long double *COIN_RESTRICT bBase2 = b + iii;
2874         const long double *COIN_RESTRICT aBase = a + ii * BLOCKING8 + m * iii + BLOCKING8X8;
2875         for (int j = BLOCKING8 - 1; j >= 0; j -= 4) {
2876           long double bValue0 = bBase2[j];
2877           long double bValue1 = bBase2[j - 1];
2878           long double bValue2 = bBase2[j - 2];
2879           long double bValue3 = bBase2[j - 3];
2880           bValue0 -= aBase[-1 * BLOCKING8 + 7] * bBase[7];
2881           bValue1 -= aBase[-2 * BLOCKING8 + 7] * bBase[7];
2882           bValue2 -= aBase[-3 * BLOCKING8 + 7] * bBase[7];
2883           bValue3 -= aBase[-4 * BLOCKING8 + 7] * bBase[7];
2884           bValue0 -= aBase[-1 * BLOCKING8 + 6] * bBase[6];
2885           bValue1 -= aBase[-2 * BLOCKING8 + 6] * bBase[6];
2886           bValue2 -= aBase[-3 * BLOCKING8 + 6] * bBase[6];
2887           bValue3 -= aBase[-4 * BLOCKING8 + 6] * bBase[6];
2888           bValue0 -= aBase[-1 * BLOCKING8 + 5] * bBase[5];
2889           bValue1 -= aBase[-2 * BLOCKING8 + 5] * bBase[5];
2890           bValue2 -= aBase[-3 * BLOCKING8 + 5] * bBase[5];
2891           bValue3 -= aBase[-4 * BLOCKING8 + 5] * bBase[5];
2892           bValue0 -= aBase[-1 * BLOCKING8 + 4] * bBase[4];
2893           bValue1 -= aBase[-2 * BLOCKING8 + 4] * bBase[4];
2894           bValue2 -= aBase[-3 * BLOCKING8 + 4] * bBase[4];
2895           bValue3 -= aBase[-4 * BLOCKING8 + 4] * bBase[4];
2896           bValue0 -= aBase[-1 * BLOCKING8 + 3] * bBase[3];
2897           bValue1 -= aBase[-2 * BLOCKING8 + 3] * bBase[3];
2898           bValue2 -= aBase[-3 * BLOCKING8 + 3] * bBase[3];
2899           bValue3 -= aBase[-4 * BLOCKING8 + 3] * bBase[3];
2900           bValue0 -= aBase[-1 * BLOCKING8 + 2] * bBase[2];
2901           bValue1 -= aBase[-2 * BLOCKING8 + 2] * bBase[2];
2902           bValue2 -= aBase[-3 * BLOCKING8 + 2] * bBase[2];
2903           bValue3 -= aBase[-4 * BLOCKING8 + 2] * bBase[2];
2904           bValue0 -= aBase[-1 * BLOCKING8 + 1] * bBase[1];
2905           bValue1 -= aBase[-2 * BLOCKING8 + 1] * bBase[1];
2906           bValue2 -= aBase[-3 * BLOCKING8 + 1] * bBase[1];
2907           bValue3 -= aBase[-4 * BLOCKING8 + 1] * bBase[1];
2908           bValue0 -= aBase[-1 * BLOCKING8 + 0] * bBase[0];
2909           bValue1 -= aBase[-2 * BLOCKING8 + 0] * bBase[0];
2910           bValue2 -= aBase[-3 * BLOCKING8 + 0] * bBase[0];
2911           bValue3 -= aBase[-4 * BLOCKING8 + 0] * bBase[0];
2912           bBase2[j] = bValue0;
2913           bBase2[j - 1] = bValue1;
2914           bBase2[j - 2] = bValue2;
2915           bBase2[j - 3] = bValue3;
2916           aBase -= 4 * BLOCKING8;
2917         }
2918       }
2919     }
2920   }
2921 }
CoinAbcDtrsm2(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2922 void CoinAbcDtrsm2(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2923 {
2924   assert((m & (BLOCKING8 - 1)) == 0);
2925   // 2 Left Lower Transpose Unit
2926   for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2927     int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2928     for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2929       long double *COIN_RESTRICT bBase = b + ii;
2930       long double *COIN_RESTRICT bBase2 = bBase;
2931       const long double *COIN_RESTRICT aBase = a + m * ii + (ii + BLOCKING8) * BLOCKING8;
2932       for (int i = BLOCKING8 - 1; i >= 0; i--) {
2933         aBase -= BLOCKING8;
2934         for (int k = i + 1; k < BLOCKING8; k++) {
2935           bBase2[i] -= aBase[k] * bBase2[k];
2936         }
2937       }
2938       for (int iii = ii - BLOCKING8; iii >= mm; iii -= BLOCKING8) {
2939         bBase2 -= BLOCKING8;
2940         assert(bBase2 == b + iii);
2941         aBase -= m * BLOCKING8;
2942         const long double *COIN_RESTRICT aBase2 = aBase + BLOCKING8X8;
2943         for (int i = BLOCKING8 - 1; i >= 0; i--) {
2944           aBase2 -= BLOCKING8;
2945           for (int k = BLOCKING8 - 1; k >= 0; k--) {
2946             bBase2[i] -= aBase2[k] * bBase[k];
2947           }
2948         }
2949       }
2950     }
2951     dtrsm2(kkk, 0, mm, m, a, b);
2952   }
2953 }
2954 #define UNROLL_DTRSM3 16
2955 #define CILK_DTRSM3 32
dtrsm3(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2956 static void dtrsm3(int kkk, int first, int last,
2957   int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2958 {
2959   //int mm=CoinMin(kkk+UNROLL_DTRSM3*BLOCKING8,m);
2960   assert((last - first) % BLOCKING8 == 0);
2961   if (last - first > CILK_DTRSM3) {
2962     int mid = ((first + last) >> 4) << 3;
2963     cilk_spawn dtrsm3(kkk, first, mid, m, a, b);
2964     dtrsm3(kkk, mid, last, m, a, b);
2965     cilk_sync;
2966   } else {
2967     for (int kk = 0; kk < kkk; kk += BLOCKING8) {
2968       for (int ii = first; ii < last; ii += BLOCKING8) {
2969         long double *COIN_RESTRICT bBase = b + ii;
2970         long double *COIN_RESTRICT bBase2 = b + kk;
2971         const long double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
2972         for (int j = 0; j < BLOCKING8; j += 4) {
2973           long double bValue0 = bBase[j];
2974           long double bValue1 = bBase[j + 1];
2975           long double bValue2 = bBase[j + 2];
2976           long double bValue3 = bBase[j + 3];
2977           bValue0 -= aBase[0] * bBase2[0];
2978           bValue1 -= aBase[1 * BLOCKING8 + 0] * bBase2[0];
2979           bValue2 -= aBase[2 * BLOCKING8 + 0] * bBase2[0];
2980           bValue3 -= aBase[3 * BLOCKING8 + 0] * bBase2[0];
2981           bValue0 -= aBase[1] * bBase2[1];
2982           bValue1 -= aBase[1 * BLOCKING8 + 1] * bBase2[1];
2983           bValue2 -= aBase[2 * BLOCKING8 + 1] * bBase2[1];
2984           bValue3 -= aBase[3 * BLOCKING8 + 1] * bBase2[1];
2985           bValue0 -= aBase[2] * bBase2[2];
2986           bValue1 -= aBase[1 * BLOCKING8 + 2] * bBase2[2];
2987           bValue2 -= aBase[2 * BLOCKING8 + 2] * bBase2[2];
2988           bValue3 -= aBase[3 * BLOCKING8 + 2] * bBase2[2];
2989           bValue0 -= aBase[3] * bBase2[3];
2990           bValue1 -= aBase[1 * BLOCKING8 + 3] * bBase2[3];
2991           bValue2 -= aBase[2 * BLOCKING8 + 3] * bBase2[3];
2992           bValue3 -= aBase[3 * BLOCKING8 + 3] * bBase2[3];
2993           bValue0 -= aBase[4] * bBase2[4];
2994           bValue1 -= aBase[1 * BLOCKING8 + 4] * bBase2[4];
2995           bValue2 -= aBase[2 * BLOCKING8 + 4] * bBase2[4];
2996           bValue3 -= aBase[3 * BLOCKING8 + 4] * bBase2[4];
2997           bValue0 -= aBase[5] * bBase2[5];
2998           bValue1 -= aBase[1 * BLOCKING8 + 5] * bBase2[5];
2999           bValue2 -= aBase[2 * BLOCKING8 + 5] * bBase2[5];
3000           bValue3 -= aBase[3 * BLOCKING8 + 5] * bBase2[5];
3001           bValue0 -= aBase[6] * bBase2[6];
3002           bValue1 -= aBase[1 * BLOCKING8 + 6] * bBase2[6];
3003           bValue2 -= aBase[2 * BLOCKING8 + 7] * bBase2[7];
3004           bValue3 -= aBase[3 * BLOCKING8 + 6] * bBase2[6];
3005           bValue0 -= aBase[7] * bBase2[7];
3006           bValue1 -= aBase[1 * BLOCKING8 + 7] * bBase2[7];
3007           bValue2 -= aBase[2 * BLOCKING8 + 6] * bBase2[6];
3008           bValue3 -= aBase[3 * BLOCKING8 + 7] * bBase2[7];
3009           bBase[j] = bValue0;
3010           bBase[j + 1] = bValue1;
3011           bBase[j + 2] = bValue2;
3012           bBase[j + 3] = bValue3;
3013           aBase += 4 * BLOCKING8;
3014         }
3015       }
3016     }
3017   }
3018 }
CoinAbcDtrsm3(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)3019 void CoinAbcDtrsm3(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
3020 {
3021   assert((m & (BLOCKING8 - 1)) == 0);
3022   // 3 Left Upper Transpose NonUnit
3023   for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM3 * BLOCKING8) {
3024     int mm = CoinMin(m, kkk + UNROLL_DTRSM3 * BLOCKING8);
3025     dtrsm3(kkk, kkk, mm, m, a, b);
3026     for (int ii = kkk; ii < mm; ii += BLOCKING8) {
3027       long double *COIN_RESTRICT bBase = b + ii;
3028       for (int kk = kkk; kk < ii; kk += BLOCKING8) {
3029         long double *COIN_RESTRICT bBase2 = b + kk;
3030         const long double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
3031         for (int i = 0; i < BLOCKING8; i++) {
3032           for (int k = 0; k < BLOCKING8; k++) {
3033             bBase[i] -= aBase[k] * bBase2[k];
3034           }
3035           aBase += BLOCKING8;
3036         }
3037       }
3038       const long double *COIN_RESTRICT aBase = a + ii * m + ii * BLOCKING8;
3039       for (int i = 0; i < BLOCKING8; i++) {
3040         for (int k = 0; k < i; k++) {
3041           bBase[i] -= aBase[k] * bBase[k];
3042         }
3043         bBase[i] = bBase[i] * aBase[i];
3044         aBase += BLOCKING8;
3045       }
3046     }
3047   }
3048 }
3049 static void
CoinAbcDlaswp(int n,long double * COIN_RESTRICT a,int lda,int start,int end,int * ipiv)3050 CoinAbcDlaswp(int n, long double *COIN_RESTRICT a, int lda, int start, int end, int *ipiv)
3051 {
3052   assert((n & (BLOCKING8 - 1)) == 0);
3053   long double *COIN_RESTRICT aThis3 = a;
3054   for (int j = 0; j < n; j += BLOCKING8) {
3055     for (int i = start; i < end; i++) {
3056       int ip = ipiv[i];
3057       if (ip != i) {
3058         long double *COIN_RESTRICT aTop = aThis3 + j * lda;
3059         int iBias = ip / BLOCKING8;
3060         ip -= iBias * BLOCKING8;
3061         long double *COIN_RESTRICT aNotTop = aTop + iBias * BLOCKING8X8;
3062         iBias = i / BLOCKING8;
3063         int i2 = i - iBias * BLOCKING8;
3064         aTop += iBias * BLOCKING8X8;
3065         for (int k = 0; k < BLOCKING8; k++) {
3066           long double temp = aTop[i2];
3067           aTop[i2] = aNotTop[ip];
3068           aNotTop[ip] = temp;
3069           aTop += BLOCKING8;
3070           aNotTop += BLOCKING8;
3071         }
3072       }
3073     }
3074   }
3075 }
3076 extern void CoinAbcDgemm(int m, int n, int k, long double *COIN_RESTRICT a, int lda,
3077   long double *COIN_RESTRICT b, long double *COIN_RESTRICT c
3078 #if ABC_PARALLEL == 2
3079   ,
3080   int parallelMode
3081 #endif
3082 );
CoinAbcDgetrf2(int m,int n,long double * COIN_RESTRICT a,int * ipiv)3083 static int CoinAbcDgetrf2(int m, int n, long double *COIN_RESTRICT a, int *ipiv)
3084 {
3085   assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
3086   assert(n == BLOCKING8);
3087   int dimension = CoinMin(m, n);
3088   /* entry for column j and row i (when multiple of BLOCKING8)
3089      is at aBlocked+j*m+i*BLOCKING8
3090   */
3091   assert(dimension == BLOCKING8);
3092   //printf("Dgetrf2 m=%d n=%d\n",m,n);
3093   long double *COIN_RESTRICT aThis3 = a;
3094   for (int j = 0; j < dimension; j++) {
3095     int pivotRow = -1;
3096     long double largest = 0.0;
3097     long double *COIN_RESTRICT aThis2 = aThis3 + j * BLOCKING8;
3098     // this block
3099     int pivotRow2 = -1;
3100     for (int i = j; i < BLOCKING8; i++) {
3101       long double value = fabsl(aThis2[i]);
3102       if (value > largest) {
3103         largest = value;
3104         pivotRow2 = i;
3105       }
3106     }
3107     // other blocks
3108     int iBias = 0;
3109     for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
3110       aThis2 += BLOCKING8 * BLOCKING8;
3111       for (int i = 0; i < BLOCKING8; i++) {
3112         long double value = fabsl(aThis2[i]);
3113         if (value > largest) {
3114           largest = value;
3115           pivotRow2 = i;
3116           iBias = ii;
3117         }
3118       }
3119     }
3120     pivotRow = pivotRow2 + iBias;
3121     ipiv[j] = pivotRow;
3122     if (largest) {
3123       if (j != pivotRow) {
3124         long double *COIN_RESTRICT aTop = aThis3;
3125         long double *COIN_RESTRICT aNotTop = aThis3 + iBias * BLOCKING8;
3126         for (int i = 0; i < n; i++) {
3127           long double value = aTop[j];
3128           aTop[j] = aNotTop[pivotRow2];
3129           aNotTop[pivotRow2] = value;
3130           aTop += BLOCKING8;
3131           aNotTop += BLOCKING8;
3132         }
3133       }
3134       aThis2 = aThis3 + j * BLOCKING8;
3135       long double pivotMultiplier = 1.0 / aThis2[j];
3136       aThis2[j] = pivotMultiplier;
3137       // Do L
3138       // this block
3139       for (int i = j + 1; i < BLOCKING8; i++)
3140         aThis2[i] *= pivotMultiplier;
3141       // other blocks
3142       for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
3143         aThis2 += BLOCKING8 * BLOCKING8;
3144         for (int i = 0; i < BLOCKING8; i++)
3145           aThis2[i] *= pivotMultiplier;
3146       }
3147       // update rest
3148       long double *COIN_RESTRICT aPut;
3149       aThis2 = aThis3 + j * BLOCKING8;
3150       aPut = aThis2 + BLOCKING8;
3151       long double *COIN_RESTRICT aPut2 = aPut;
3152       // this block
3153       for (int i = j + 1; i < BLOCKING8; i++) {
3154         long double value = aPut[j];
3155         for (int k = j + 1; k < BLOCKING8; k++)
3156           aPut[k] -= value * aThis2[k];
3157         aPut += BLOCKING8;
3158       }
3159       // other blocks
3160       for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
3161         aThis2 += BLOCKING8 * BLOCKING8;
3162         aPut = aThis2 + BLOCKING8;
3163         long double *COIN_RESTRICT aPut2a = aPut2;
3164         for (int i = j + 1; i < BLOCKING8; i++) {
3165           long double value = aPut2a[j];
3166           for (int k = 0; k < BLOCKING8; k++)
3167             aPut[k] -= value * aThis2[k];
3168           aPut += BLOCKING8;
3169           aPut2a += BLOCKING8;
3170         }
3171       }
3172     } else {
3173       return 1;
3174     }
3175   }
3176   return 0;
3177 }
3178 
CoinAbcDgetrf(int m,int n,long double * COIN_RESTRICT a,int lda,int * ipiv,int parallelMode)3179 int CoinAbcDgetrf(int m, int n, long double *COIN_RESTRICT a, int lda, int *ipiv
3180 #if ABC_PARALLEL == 2
3181   ,
3182   int parallelMode
3183 #endif
3184 )
3185 {
3186   assert(m == n);
3187   assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
3188   if (m < BLOCKING8) {
3189     return CoinAbcDgetrf2(m, n, a, ipiv);
3190   } else {
3191     for (int j = 0; j < n; j += BLOCKING8) {
3192       int start = j;
3193       int newSize = CoinMin(BLOCKING8, n - j);
3194       int end = j + newSize;
3195       int returnCode = CoinAbcDgetrf2(m - start, newSize, a + (start * lda + start * BLOCKING8),
3196         ipiv + start);
3197       if (!returnCode) {
3198         // adjust
3199         for (int k = start; k < end; k++)
3200           ipiv[k] += start;
3201         // swap 0<start
3202         CoinAbcDlaswp(start, a, lda, start, end, ipiv);
3203         if (end < n) {
3204           // swap >=end
3205           CoinAbcDlaswp(n - end, a + end * lda, lda, start, end, ipiv);
3206           CoinAbcDtrsmFactor(newSize, n - end, a + (start * lda + start * BLOCKING8), lda);
3207           CoinAbcDgemm(n - end, n - end, newSize,
3208             a + start * lda + end * BLOCKING8, lda,
3209             a + end * lda + start * BLOCKING8, a + end * lda + end * BLOCKING8
3210 #if ABC_PARALLEL == 2
3211             ,
3212             parallelMode
3213 #endif
3214           );
3215         }
3216       } else {
3217         return returnCode;
3218       }
3219     }
3220   }
3221   return 0;
3222 }
CoinAbcDgetrs(char trans,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT work)3223 void CoinAbcDgetrs(char trans, int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT work)
3224 {
3225   assert((m & (BLOCKING8 - 1)) == 0);
3226   if (trans == 'N') {
3227     //CoinAbcDlaswp1(work,m,ipiv);
3228     CoinAbcDtrsm0(m, a, work);
3229     CoinAbcDtrsm1(m, a, work);
3230   } else {
3231     assert(trans == 'T');
3232     CoinAbcDtrsm3(m, a, work);
3233     CoinAbcDtrsm2(m, a, work);
3234     //CoinAbcDlaswp1Backwards(work,m,ipiv);
3235   }
3236 }
3237 #endif
3238 
3239 /* vi: softtabstop=2 shiftwidth=2 expandtab tabstop=2
3240 */
3241