1 /* $Id: AbcSimplexParallel.cpp 2385 2019-01-06 19:43:06Z unxusr $ */
2 // Copyright (C) 2002, International Business Machines
3 // Corporation and others, Copyright (C) 2012, FasterCoin. All Rights Reserved.
4 // This code is licensed under the terms of the Eclipse Public License (EPL).
5 #include "CoinPragma.hpp"
6
7 #include <math.h>
8 #include "CoinHelperFunctions.hpp"
9 #include "CoinAbcHelperFunctions.hpp"
10 #include "AbcSimplexDual.hpp"
11 #include "ClpEventHandler.hpp"
12 #include "AbcSimplexFactorization.hpp"
13 #include "CoinPackedMatrix.hpp"
14 #include "CoinIndexedVector.hpp"
15 #include "CoinFloatEqual.hpp"
16 #include "AbcDualRowDantzig.hpp"
17 #include "ClpMessage.hpp"
18 #include "ClpLinearObjective.hpp"
19 #include <cfloat>
20 #include <cassert>
21 #include <string>
22 #include <stdio.h>
23 #include <iostream>
24 //#undef AVX2
25 #define _mm256_broadcast_sd(x) static_cast< __m256d >(__builtin_ia32_vbroadcastsd256(x))
26 #define _mm256_load_pd(x) *(__m256d *)(x)
27 #define _mm256_store_pd (s, x) * ((__m256d *)s) = x
28 //#define ABC_DEBUG 2
29 /* Reasons to come out:
30 -1 iterations etc
31 -2 inaccuracy
32 -3 slight inaccuracy (and done iterations)
33 +0 looks optimal (might be unbounded - but we will investigate)
34 +1 looks infeasible
35 +3 max iterations
36 */
whileIteratingSerial()37 int AbcSimplexDual::whileIteratingSerial()
38 {
39 /* arrays
40 0 - to get tableau row and then for weights update
41 1 - tableau column
42 2 - for flip
43 3 - actual tableau row
44 */
45 #ifdef CLP_DEBUG
46 int debugIteration = -1;
47 #endif
48 // if can't trust much and long way from optimal then relax
49 //if (largestPrimalError_ > 10.0)
50 //abcFactorization_->relaxAccuracyCheck(CoinMin(1.0e2, largestPrimalError_ / 10.0));
51 //else
52 //abcFactorization_->relaxAccuracyCheck(1.0);
53 // status stays at -1 while iterating, >=0 finished, -2 to invert
54 // status -3 to go to top without an invert
55 int returnCode = -1;
56 #define DELAYED_UPDATE
57 arrayForBtran_ = 0;
58 setUsedArray(arrayForBtran_);
59 arrayForFtran_ = 1;
60 setUsedArray(arrayForFtran_);
61 arrayForFlipBounds_ = 2;
62 setUsedArray(arrayForFlipBounds_);
63 arrayForTableauRow_ = 3;
64 setUsedArray(arrayForTableauRow_);
65 arrayForDualColumn_ = 4;
66 setUsedArray(arrayForDualColumn_);
67 arrayForReplaceColumn_ = 4; //4;
68 arrayForFlipRhs_ = 5;
69 setUsedArray(arrayForFlipRhs_);
70 dualPivotRow();
71 lastPivotRow_ = pivotRow_;
72 if (pivotRow_ >= 0) {
73 // we found a pivot row
74 createDualPricingVectorSerial();
75 do {
76 #if ABC_DEBUG
77 checkArrays();
78 #endif
79 /*
80 -1 - just move dual in values pass
81 0 - take iteration
82 1 - don't take but continue
83 2 - don't take and break
84 */
85 stateOfIteration_ = 0;
86 returnCode = -1;
87 // put row of tableau in usefulArray[arrayForTableauRow_]
88 /*
89 Could
90 a) copy [arrayForBtran_] and start off updateWeights earlier
91 b) break transposeTimes into two and do after slack part
92 c) also think if cleaner to go all way with update but just don't do final part
93 */
94 //upperTheta=COIN_DBL_MAX;
95 double saveAcceptable = acceptablePivot_;
96 if (largestPrimalError_ > 1.0e-5 || largestDualError_ > 1.0e-5) {
97 if (!abcFactorization_->pivots())
98 acceptablePivot_ *= 1.0e2;
99 else if (abcFactorization_->pivots() < 5)
100 acceptablePivot_ *= 1.0e1;
101 }
102 dualColumn1();
103 acceptablePivot_ = saveAcceptable;
104 if ((stateOfProblem_ & VALUES_PASS) != 0) {
105 // see if can just move dual
106 if (fabs(upperTheta_ - fabs(abcDj_[sequenceOut_])) < dualTolerance_) {
107 stateOfIteration_ = -1;
108 }
109 }
110 //usefulArray_[arrayForTableauRow_].sortPacked();
111 //usefulArray_[arrayForTableauRow_].print();
112 //usefulArray_[arrayForDualColumn_].setPackedMode(true);
113 //usefulArray_[arrayForDualColumn].sortPacked();
114 //usefulArray_[arrayForDualColumn].print();
115 if (!stateOfIteration_) {
116 // get sequenceIn_
117 dualPivotColumn();
118 if (sequenceIn_ >= 0) {
119 // normal iteration
120 // update the incoming column (and do weights)
121 getTableauColumnFlipAndStartReplaceSerial();
122 } else if (stateOfIteration_ != -1) {
123 stateOfIteration_ = 2;
124 }
125 }
126 if (!stateOfIteration_) {
127 // assert (stateOfIteration_!=1);
128 int whatNext = 0;
129 whatNext = housekeeping();
130 if (whatNext == 1) {
131 problemStatus_ = -2; // refactorize
132 } else if (whatNext == 2) {
133 // maximum iterations or equivalent
134 problemStatus_ = 3;
135 returnCode = 3;
136 stateOfIteration_ = 2;
137 }
138 if (problemStatus_ == -1) {
139 replaceColumnPart3();
140 //clearArrays(arrayForReplaceColumn_);
141 #if ABC_DEBUG
142 checkArrays();
143 #endif
144 updateDualsInDual();
145 abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
146 movement_);
147 #if ABC_DEBUG
148 checkArrays();
149 #endif
150 } else {
151 abcDualRowPivot_->updateWeights2(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
152 //clearArrays(arrayForBtran_);
153 //clearArrays(arrayForFtran_);
154 }
155 } else {
156 if (stateOfIteration_ < 0) {
157 // move dual in dual values pass
158 theta_ = abcDj_[sequenceOut_];
159 updateDualsInDual();
160 abcDj_[sequenceOut_] = 0.0;
161 sequenceOut_ = -1;
162 }
163 // clear all arrays
164 clearArrays(-1);
165 //sequenceIn_=-1;
166 //sequenceOut_=-1;
167 }
168 // Check event
169 {
170 int status = eventHandler_->event(ClpEventHandler::endOfIteration);
171 if (status >= 0) {
172 problemStatus_ = 5;
173 secondaryStatus_ = ClpEventHandler::endOfIteration;
174 returnCode = 4;
175 stateOfIteration_ = 2;
176 }
177 }
178 // at this stage sequenceIn_ may be <0
179 if (sequenceIn_ < 0 && sequenceOut_ >= 0) {
180 // no incoming column is valid
181 returnCode = noPivotColumn();
182 }
183 if (stateOfIteration_ == 2) {
184 sequenceIn_ = -1;
185 break;
186 }
187 swapPrimalStuff();
188 if (problemStatus_ != -1) {
189 break;
190 }
191 // dualRow will go to virtual row pivot choice algorithm
192 // make sure values pass off if it should be
193 // use Btran array and clear inside dualPivotRow (if used)
194 int lastSequenceOut = sequenceOut_;
195 int lastDirectionOut = directionOut_;
196 dualPivotRow();
197 lastPivotRow_ = pivotRow_;
198 if (pivotRow_ >= 0) {
199 // we found a pivot row
200 // create as packed
201 createDualPricingVectorSerial();
202 swapDualStuff(lastSequenceOut, lastDirectionOut);
203 // next pivot
204 } else {
205 // no pivot row
206 clearArrays(-1);
207 returnCode = noPivotRow();
208 break;
209 }
210 } while (problemStatus_ == -1);
211 usefulArray_[arrayForTableauRow_].compact();
212 #if ABC_DEBUG
213 checkArrays();
214 #endif
215 } else {
216 // no pivot row on first try
217 clearArrays(-1);
218 returnCode = noPivotRow();
219 }
220 //printStuff();
221 clearArrays(-1);
222 //if (pivotRow_==-100)
223 //returnCode=-100; // end of values pass
224 return returnCode;
225 }
226 // Create dual pricing vector
createDualPricingVectorSerial()227 void AbcSimplexDual::createDualPricingVectorSerial()
228 {
229 #ifndef NDEBUG
230 #if ABC_NORMAL_DEBUG > 3
231 checkArrays();
232 #endif
233 #endif
234 // we found a pivot row
235 if (handler_->detail(CLP_SIMPLEX_PIVOTROW, messages_) < 100) {
236 handler_->message(CLP_SIMPLEX_PIVOTROW, messages_)
237 << pivotRow_
238 << CoinMessageEol;
239 }
240 // check accuracy of weights
241 abcDualRowPivot_->checkAccuracy();
242 // get sign for finding row of tableau
243 // create as packed
244 usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
245 abcFactorization_->updateColumnTranspose(usefulArray_[arrayForBtran_]);
246 sequenceIn_ = -1;
247 }
getTableauColumnPart1Serial()248 void AbcSimplexDual::getTableauColumnPart1Serial()
249 {
250 #if ABC_DEBUG
251 {
252 const double *work = usefulArray_[arrayForTableauRow_].denseVector();
253 int number = usefulArray_[arrayForTableauRow_].getNumElements();
254 const int *which = usefulArray_[arrayForTableauRow_].getIndices();
255 for (int i = 0; i < number; i++) {
256 if (which[i] == sequenceIn_) {
257 assert(alpha_ == work[i]);
258 break;
259 }
260 }
261 }
262 #endif
263 //int returnCode=0;
264 // update the incoming column
265 unpack(usefulArray_[arrayForFtran_]);
266 // Array[0] may be needed until updateWeights2
267 // and update dual weights (can do in parallel - with extra array)
268 alpha_ = abcDualRowPivot_->updateWeights1(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
269 }
getTableauColumnFlipAndStartReplaceSerial()270 int AbcSimplexDual::getTableauColumnFlipAndStartReplaceSerial()
271 {
272 // move checking stuff down into called functions
273 int numberFlipped;
274 numberFlipped = flipBounds();
275 // update the incoming column
276 getTableauColumnPart1Serial();
277 checkReplacePart1();
278 //checkReplacePart1a();
279 //checkReplacePart1b();
280 getTableauColumnPart2();
281 //usefulArray_[arrayForTableauRow_].compact();
282 // returns 3 if skip this iteration and re-factorize
283 /*
284 return code
285 0 - OK
286 2 - flag something and skip
287 3 - break and re-factorize
288 4 - break and say infeasible
289 */
290 int returnCode = 0;
291 // amount primal will move
292 movement_ = -dualOut_ * directionOut_ / alpha_;
293 // see if update stable
294 #if ABC_NORMAL_DEBUG > 3
295 if ((handler_->logLevel() & 32)) {
296 double ft = ftAlpha_ * abcFactorization_->pivotRegion()[pivotRow_];
297 double diff1 = fabs(alpha_ - btranAlpha_);
298 double diff2 = fabs(fabs(alpha_) - fabs(ft));
299 double diff3 = fabs(fabs(ft) - fabs(btranAlpha_));
300 double largest = CoinMax(CoinMax(diff1, diff2), diff3);
301 printf("btran alpha %g, ftran alpha %g ftAlpha %g largest diff %g\n",
302 btranAlpha_, alpha_, ft, largest);
303 if (largest > 0.001 * fabs(alpha_)) {
304 printf("bad\n");
305 }
306 }
307 #endif
308 int numberPivots = abcFactorization_->pivots();
309 //double checkValue = 1.0e-7; // numberPivots<5 ? 1.0e-7 : 1.0e-6;
310 double checkValue = numberPivots ? 1.0e-7 : 1.0e-5;
311 // if can't trust much and long way from optimal then relax
312 if (largestPrimalError_ > 10.0)
313 checkValue = CoinMin(1.0e-4, 1.0e-8 * largestPrimalError_);
314 if (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > checkValue * (1.0 + fabs(alpha_))) {
315 handler_->message(CLP_DUAL_CHECK, messages_)
316 << btranAlpha_
317 << alpha_
318 << CoinMessageEol;
319 // see with more relaxed criterion
320 double test;
321 if (fabs(btranAlpha_) < 1.0e-8 || fabs(alpha_) < 1.0e-8)
322 test = 1.0e-1 * fabs(alpha_);
323 else
324 test = 10.0 * checkValue; //1.0e-4 * (1.0 + fabs(alpha_));
325 bool needFlag = (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > test);
326 double derror = CoinMin(fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), 1.0) * 0.9999e7;
327 int error = 0;
328 while (derror > 1.0) {
329 error++;
330 derror *= 0.1;
331 }
332 int frequency[8] = { 99999999, 100, 10, 2, 1, 1, 1, 1 };
333 int newFactorFrequency = CoinMin(forceFactorization_, frequency[error]);
334 #if ABC_NORMAL_DEBUG > 0
335 if (newFactorFrequency < forceFactorization_)
336 printf("Error of %g after %d pivots old forceFact %d now %d\n",
337 fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), numberPivots,
338 forceFactorization_, newFactorFrequency);
339 #endif
340 if (!numberPivots && fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)) > 0.5e-6
341 && abcFactorization_->pivotTolerance() < 0.5)
342 abcFactorization_->saferTolerances(1.0, -1.03);
343 forceFactorization_ = newFactorFrequency;
344
345 #if ABC_NORMAL_DEBUG > 0
346 if (fabs(btranAlpha_ + alpha_) < 1.0e-5 * (1.0 + fabs(alpha_))) {
347 printf("diff (%g,%g) %g check %g\n", btranAlpha_, alpha_, fabs(btranAlpha_ - alpha_), checkValue * (1.0 + fabs(alpha_)));
348 }
349 #endif
350 if (numberPivots) {
351 if (needFlag && numberPivots < 10) {
352 // need to reject something
353 assert(sequenceOut_ >= 0);
354 char x = isColumn(sequenceOut_) ? 'C' : 'R';
355 handler_->message(CLP_SIMPLEX_FLAG, messages_)
356 << x << sequenceWithin(sequenceOut_)
357 << CoinMessageEol;
358 #if ABC_NORMAL_DEBUG > 0
359 printf("flag %d as alpha %g %g - after %d pivots\n", sequenceOut_,
360 btranAlpha_, alpha_, numberPivots);
361 #endif
362 // Make safer?
363 abcFactorization_->saferTolerances(1.0, -1.03);
364 setFlagged(sequenceOut_);
365 // so can't happen immediately again
366 sequenceOut_ = -1;
367 lastBadIteration_ = numberIterations_; // say be more cautious
368 }
369 // Make safer?
370 //if (numberPivots<5)
371 //abcFactorization_->saferTolerances (-0.99, -1.01);
372 problemStatus_ = -2; // factorize now
373 returnCode = -2;
374 stateOfIteration_ = 2;
375 } else {
376 if (needFlag) {
377 assert(sequenceOut_ >= 0);
378 // need to reject something
379 char x = isColumn(sequenceOut_) ? 'C' : 'R';
380 handler_->message(CLP_SIMPLEX_FLAG, messages_)
381 << x << sequenceWithin(sequenceOut_)
382 << CoinMessageEol;
383 #if ABC_NORMAL_DEBUG > 3
384 printf("flag a %g %g\n", btranAlpha_, alpha_);
385 #endif
386 setFlagged(sequenceOut_);
387 // so can't happen immediately again
388 sequenceOut_ = -1;
389 //abcProgress_.clearBadTimes();
390 lastBadIteration_ = numberIterations_; // say be more cautious
391 if (fabs(alpha_) < 1.0e-10 && fabs(btranAlpha_) < 1.0e-8 && numberIterations_ > 100) {
392 //printf("I think should declare infeasible\n");
393 problemStatus_ = 1;
394 returnCode = 1;
395 stateOfIteration_ = 2;
396 } else {
397 stateOfIteration_ = 1;
398 }
399 // Make safer?
400 if (abcFactorization_->pivotTolerance() < 0.999 && stateOfIteration_ == 1) {
401 // change tolerance and re-invert
402 abcFactorization_->saferTolerances(1.0, -1.03);
403 problemStatus_ = -6; // factorize now
404 returnCode = -2;
405 stateOfIteration_ = 2;
406 }
407 }
408 }
409 }
410 if (!stateOfIteration_) {
411 // check update
412 //int updateStatus =
413 /*
414 returns
415 0 - OK
416 1 - take iteration then re-factorize
417 2 - flag something and skip
418 3 - break and re-factorize
419 5 - take iteration then re-factorize because of memory
420 */
421 int status = checkReplace();
422 if (status && !returnCode)
423 returnCode = status;
424 }
425 //clearArrays(arrayForFlipRhs_);
426 if (stateOfIteration_ && numberFlipped) {
427 //usefulArray_[arrayForTableauRow_].compact();
428 // move variables back
429 flipBack(numberFlipped);
430 }
431 // could choose average of all three
432 return returnCode;
433 }
434 #if ABC_PARALLEL == 1
435 /* Reasons to come out:
436 -1 iterations etc
437 -2 inaccuracy
438 -3 slight inaccuracy (and done iterations)
439 +0 looks optimal (might be unbounded - but we will investigate)
440 +1 looks infeasible
441 +3 max iterations
442 */
whileIteratingThread()443 int AbcSimplexDual::whileIteratingThread()
444 {
445 /* arrays
446 0 - to get tableau row and then for weights update
447 1 - tableau column
448 2 - for flip
449 3 - actual tableau row
450 */
451 #ifdef CLP_DEBUG
452 int debugIteration = -1;
453 #endif
454 // if can't trust much and long way from optimal then relax
455 //if (largestPrimalError_ > 10.0)
456 //abcFactorization_->relaxAccuracyCheck(CoinMin(1.0e2, largestPrimalError_ / 10.0));
457 //else
458 //abcFactorization_->relaxAccuracyCheck(1.0);
459 // status stays at -1 while iterating, >=0 finished, -2 to invert
460 // status -3 to go to top without an invert
461 int returnCode = -1;
462 #define DELAYED_UPDATE
463 arrayForBtran_ = 0;
464 setUsedArray(arrayForBtran_);
465 arrayForFtran_ = 1;
466 setUsedArray(arrayForFtran_);
467 arrayForFlipBounds_ = 2;
468 setUsedArray(arrayForFlipBounds_);
469 arrayForTableauRow_ = 3;
470 setUsedArray(arrayForTableauRow_);
471 arrayForDualColumn_ = 4;
472 setUsedArray(arrayForDualColumn_);
473 arrayForReplaceColumn_ = 4; //4;
474 arrayForFlipRhs_ = 5;
475 setUsedArray(arrayForFlipRhs_);
476 dualPivotRow();
477 lastPivotRow_ = pivotRow_;
478 if (pivotRow_ >= 0) {
479 // we found a pivot row
480 createDualPricingVectorThread();
481 do {
482 #if ABC_DEBUG
483 checkArrays();
484 #endif
485 /*
486 -1 - just move dual in values pass
487 0 - take iteration
488 1 - don't take but continue
489 2 - don't take and break
490 */
491 stateOfIteration_ = 0;
492 returnCode = -1;
493 // put row of tableau in usefulArray[arrayForTableauRow_]
494 /*
495 Could
496 a) copy [arrayForBtran_] and start off updateWeights earlier
497 b) break transposeTimes into two and do after slack part
498 c) also think if cleaner to go all way with update but just don't do final part
499 */
500 //upperTheta=COIN_DBL_MAX;
501 double saveAcceptable = acceptablePivot_;
502 if (largestPrimalError_ > 1.0e-5 || largestDualError_ > 1.0e-5) {
503 if (!abcFactorization_->pivots())
504 acceptablePivot_ *= 1.0e2;
505 else if (abcFactorization_->pivots() < 5)
506 acceptablePivot_ *= 1.0e1;
507 }
508 dualColumn1();
509 acceptablePivot_ = saveAcceptable;
510 if ((stateOfProblem_ & VALUES_PASS) != 0) {
511 // see if can just move dual
512 if (fabs(upperTheta_ - fabs(abcDj_[sequenceOut_])) < dualTolerance_) {
513 stateOfIteration_ = -1;
514 }
515 }
516 //usefulArray_[arrayForTableauRow_].sortPacked();
517 //usefulArray_[arrayForTableauRow_].print();
518 //usefulArray_[arrayForDualColumn_].setPackedMode(true);
519 //usefulArray_[arrayForDualColumn].sortPacked();
520 //usefulArray_[arrayForDualColumn].print();
521 if (parallelMode_ != 0) {
522 stopStart_ = 1 + 32 * 1; // Just first thread for updateweights
523 startParallelStuff(2);
524 }
525 if (!stateOfIteration_) {
526 // get sequenceIn_
527 dualPivotColumn();
528 if (sequenceIn_ >= 0) {
529 // normal iteration
530 // update the incoming column (and do weights if serial)
531 getTableauColumnFlipAndStartReplaceThread();
532 //usleep(1000);
533 } else if (stateOfIteration_ != -1) {
534 stateOfIteration_ = 2;
535 }
536 }
537 if (parallelMode_ != 0) {
538 // do sync here
539 stopParallelStuff(2);
540 }
541 if (!stateOfIteration_) {
542 // assert (stateOfIteration_!=1);
543 int whatNext = 0;
544 whatNext = housekeeping();
545 if (whatNext == 1) {
546 problemStatus_ = -2; // refactorize
547 } else if (whatNext == 2) {
548 // maximum iterations or equivalent
549 problemStatus_ = 3;
550 returnCode = 3;
551 stateOfIteration_ = 2;
552 }
553 } else {
554 if (stateOfIteration_ < 0) {
555 // move dual in dual values pass
556 theta_ = abcDj_[sequenceOut_];
557 updateDualsInDual();
558 abcDj_[sequenceOut_] = 0.0;
559 sequenceOut_ = -1;
560 }
561 // clear all arrays
562 clearArrays(-1);
563 }
564 // at this stage sequenceIn_ may be <0
565 if (sequenceIn_ < 0 && sequenceOut_ >= 0) {
566 // no incoming column is valid
567 returnCode = noPivotColumn();
568 }
569 if (stateOfIteration_ == 2) {
570 sequenceIn_ = -1;
571 break;
572 }
573 if (problemStatus_ != -1) {
574 abcDualRowPivot_->updateWeights2(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
575 swapPrimalStuff();
576 break;
577 }
578 #if ABC_DEBUG
579 checkArrays();
580 #endif
581 if (stateOfIteration_ != -1) {
582 assert(stateOfIteration_ == 0); // if 1 why are we here
583 // can do these in parallel
584 if (parallelMode_ == 0) {
585 replaceColumnPart3();
586 updateDualsInDual();
587 abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
588 movement_);
589 } else {
590 stopStart_ = 1 + 32 * 1;
591 startParallelStuff(3);
592 if (parallelMode_ > 1) {
593 stopStart_ = 2 + 32 * 2;
594 startParallelStuff(9);
595 } else {
596 replaceColumnPart3();
597 }
598 abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
599 movement_);
600 }
601 }
602 #if ABC_DEBUG
603 checkArrays();
604 #endif
605 swapPrimalStuff();
606 // dualRow will go to virtual row pivot choice algorithm
607 // make sure values pass off if it should be
608 // use Btran array and clear inside dualPivotRow (if used)
609 int lastSequenceOut = sequenceOut_;
610 int lastDirectionOut = directionOut_;
611 // redo to test on parallelMode_
612 if (parallelMode_ > 1) {
613 stopStart_ = 2 + 32 * 2; // stop factorization update
614 //stopStart_=3+32*3; // stop all
615 stopParallelStuff(9);
616 }
617 dualPivotRow();
618 lastPivotRow_ = pivotRow_;
619 if (pivotRow_ >= 0) {
620 // we found a pivot row
621 // create as packed
622 createDualPricingVectorThread();
623 swapDualStuff(lastSequenceOut, lastDirectionOut);
624 // next pivot
625 // redo to test on parallelMode_
626 if (parallelMode_ != 0) {
627 stopStart_ = 1 + 32 * 1; // stop dual update
628 stopParallelStuff(3);
629 }
630 } else {
631 // no pivot row
632 // redo to test on parallelMode_
633 if (parallelMode_ != 0) {
634 stopStart_ = 1 + 32 * 1; // stop dual update
635 stopParallelStuff(3);
636 }
637 clearArrays(-1);
638 returnCode = noPivotRow();
639 break;
640 }
641 } while (problemStatus_ == -1);
642 usefulArray_[arrayForTableauRow_].compact();
643 #if ABC_DEBUG
644 checkArrays();
645 #endif
646 } else {
647 // no pivot row on first try
648 clearArrays(-1);
649 returnCode = noPivotRow();
650 }
651 //printStuff();
652 clearArrays(-1);
653 //if (pivotRow_==-100)
654 //returnCode=-100; // end of values pass
655 return returnCode;
656 }
657 // Create dual pricing vector
createDualPricingVectorThread()658 void AbcSimplexDual::createDualPricingVectorThread()
659 {
660 #ifndef NDEBUG
661 #if ABC_NORMAL_DEBUG > 3
662 checkArrays();
663 #endif
664 #endif
665 // we found a pivot row
666 if (handler_->detail(CLP_SIMPLEX_PIVOTROW, messages_) < 100) {
667 handler_->message(CLP_SIMPLEX_PIVOTROW, messages_)
668 << pivotRow_
669 << CoinMessageEol;
670 }
671 // check accuracy of weights
672 abcDualRowPivot_->checkAccuracy();
673 // get sign for finding row of tableau
674 // create as packed
675 usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
676 abcFactorization_->updateColumnTranspose(usefulArray_[arrayForBtran_]);
677 sequenceIn_ = -1;
678 }
getTableauColumnPart1Thread()679 void AbcSimplexDual::getTableauColumnPart1Thread()
680 {
681 #if ABC_DEBUG
682 {
683 const double *work = usefulArray_[arrayForTableauRow_].denseVector();
684 int number = usefulArray_[arrayForTableauRow_].getNumElements();
685 const int *which = usefulArray_[arrayForTableauRow_].getIndices();
686 for (int i = 0; i < number; i++) {
687 if (which[i] == sequenceIn_) {
688 assert(alpha_ == work[i]);
689 break;
690 }
691 }
692 }
693 #endif
694 //int returnCode=0;
695 // update the incoming column
696 unpack(usefulArray_[arrayForFtran_]);
697 // Array[0] may be needed until updateWeights2
698 // and update dual weights (can do in parallel - with extra array)
699 if (parallelMode_ != 0) {
700 abcFactorization_->updateColumnFTPart1(usefulArray_[arrayForFtran_]);
701 // pivot element
702 //alpha_ = usefulArray_[arrayForFtran_].denseVector()[pivotRow_];
703 } else {
704 alpha_ = abcDualRowPivot_->updateWeights1(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
705 }
706 }
getTableauColumnFlipAndStartReplaceThread()707 int AbcSimplexDual::getTableauColumnFlipAndStartReplaceThread()
708 {
709 // move checking stuff down into called functions
710 // threads 2 and 3 are available
711 int numberFlipped;
712
713 if (parallelMode_ == 0) {
714 numberFlipped = flipBounds();
715 // update the incoming column
716 getTableauColumnPart1Thread();
717 checkReplacePart1();
718 //checkReplacePart1a();
719 //checkReplacePart1b();
720 getTableauColumnPart2();
721 } else {
722 // save stopStart
723 int saveStopStart = stopStart_;
724 if (parallelMode_ > 1) {
725 // we can flip immediately
726 stopStart_ = 2 + 32 * 2; // just thread 1
727 startParallelStuff(8);
728 // update the incoming column
729 getTableauColumnPart1Thread();
730 stopStart_ = 4 + 32 * 4; // just thread 2
731 startParallelStuff(7);
732 getTableauColumnPart2();
733 stopStart_ = 6 + 32 * 6; // just threads 1 and 2
734 stopParallelStuff(8);
735 //ftAlpha_=threadInfo_[2].result;
736 } else {
737 // just one extra thread - do replace
738 // update the incoming column
739 getTableauColumnPart1Thread();
740 stopStart_ = 2 + 32 * 2; // just thread 1
741 startParallelStuff(7);
742 getTableauColumnPart2();
743 numberFlipped = flipBounds();
744 stopParallelStuff(8);
745 //ftAlpha_=threadInfo_[1].result;
746 }
747 stopStart_ = saveStopStart;
748 }
749 //usefulArray_[arrayForTableauRow_].compact();
750 // returns 3 if skip this iteration and re-factorize
751 /*
752 return code
753 0 - OK
754 2 - flag something and skip
755 3 - break and re-factorize
756 4 - break and say infeasible
757 */
758 int returnCode = 0;
759 // amount primal will move
760 movement_ = -dualOut_ * directionOut_ / alpha_;
761 // see if update stable
762 #if ABC_NORMAL_DEBUG > 3
763 if ((handler_->logLevel() & 32)) {
764 double ft = ftAlpha_ * abcFactorization_->pivotRegion()[pivotRow_];
765 double diff1 = fabs(alpha_ - btranAlpha_);
766 double diff2 = fabs(fabs(alpha_) - fabs(ft));
767 double diff3 = fabs(fabs(ft) - fabs(btranAlpha_));
768 double largest = CoinMax(CoinMax(diff1, diff2), diff3);
769 printf("btran alpha %g, ftran alpha %g ftAlpha %g largest diff %g\n",
770 btranAlpha_, alpha_, ft, largest);
771 if (largest > 0.001 * fabs(alpha_)) {
772 printf("bad\n");
773 }
774 }
775 #endif
776 int numberPivots = abcFactorization_->pivots();
777 //double checkValue = 1.0e-7; // numberPivots<5 ? 1.0e-7 : 1.0e-6;
778 double checkValue = numberPivots ? 1.0e-7 : 1.0e-5;
779 // if can't trust much and long way from optimal then relax
780 if (largestPrimalError_ > 10.0)
781 checkValue = CoinMin(1.0e-4, 1.0e-8 * largestPrimalError_);
782 if (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > checkValue * (1.0 + fabs(alpha_))) {
783 handler_->message(CLP_DUAL_CHECK, messages_)
784 << btranAlpha_
785 << alpha_
786 << CoinMessageEol;
787 // see with more relaxed criterion
788 double test;
789 if (fabs(btranAlpha_) < 1.0e-8 || fabs(alpha_) < 1.0e-8)
790 test = 1.0e-1 * fabs(alpha_);
791 else
792 test = 10.0 * checkValue; //1.0e-4 * (1.0 + fabs(alpha_));
793 bool needFlag = (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > test);
794 double derror = CoinMin(fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), 1.0) * 0.9999e7;
795 int error = 0;
796 while (derror > 1.0) {
797 error++;
798 derror *= 0.1;
799 }
800 int frequency[8] = { 99999999, 100, 10, 2, 1, 1, 1, 1 };
801 int newFactorFrequency = CoinMin(forceFactorization_, frequency[error]);
802 #if ABC_NORMAL_DEBUG > 0
803 if (newFactorFrequency < forceFactorization_)
804 printf("Error of %g after %d pivots old forceFact %d now %d\n",
805 fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), numberPivots,
806 forceFactorization_, newFactorFrequency);
807 #endif
808 if (!numberPivots && fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)) > 0.5e-6
809 && abcFactorization_->pivotTolerance() < 0.5)
810 abcFactorization_->saferTolerances(1.0, -1.03);
811 forceFactorization_ = newFactorFrequency;
812
813 #if ABC_NORMAL_DEBUG > 0
814 if (fabs(btranAlpha_ + alpha_) < 1.0e-5 * (1.0 + fabs(alpha_))) {
815 printf("diff (%g,%g) %g check %g\n", btranAlpha_, alpha_, fabs(btranAlpha_ - alpha_), checkValue * (1.0 + fabs(alpha_)));
816 }
817 #endif
818 if (numberPivots) {
819 if (needFlag && numberPivots < 10) {
820 // need to reject something
821 assert(sequenceOut_ >= 0);
822 char x = isColumn(sequenceOut_) ? 'C' : 'R';
823 handler_->message(CLP_SIMPLEX_FLAG, messages_)
824 << x << sequenceWithin(sequenceOut_)
825 << CoinMessageEol;
826 #if ABC_NORMAL_DEBUG > 0
827 printf("flag %d as alpha %g %g - after %d pivots\n", sequenceOut_,
828 btranAlpha_, alpha_, numberPivots);
829 #endif
830 // Make safer?
831 abcFactorization_->saferTolerances(1.0, -1.03);
832 setFlagged(sequenceOut_);
833 // so can't happen immediately again
834 sequenceOut_ = -1;
835 lastBadIteration_ = numberIterations_; // say be more cautious
836 }
837 // Make safer?
838 //if (numberPivots<5)
839 //abcFactorization_->saferTolerances (-0.99, -1.01);
840 problemStatus_ = -2; // factorize now
841 returnCode = -2;
842 stateOfIteration_ = 2;
843 } else {
844 if (needFlag) {
845 assert(sequenceOut_ >= 0);
846 // need to reject something
847 char x = isColumn(sequenceOut_) ? 'C' : 'R';
848 handler_->message(CLP_SIMPLEX_FLAG, messages_)
849 << x << sequenceWithin(sequenceOut_)
850 << CoinMessageEol;
851 #if ABC_NORMAL_DEBUG > 3
852 printf("flag a %g %g\n", btranAlpha_, alpha_);
853 #endif
854 setFlagged(sequenceOut_);
855 // so can't happen immediately again
856 sequenceOut_ = -1;
857 //abcProgress_.clearBadTimes();
858 lastBadIteration_ = numberIterations_; // say be more cautious
859 if (fabs(alpha_) < 1.0e-10 && fabs(btranAlpha_) < 1.0e-8 && numberIterations_ > 100) {
860 //printf("I think should declare infeasible\n");
861 problemStatus_ = 1;
862 returnCode = 1;
863 stateOfIteration_ = 2;
864 } else {
865 stateOfIteration_ = 1;
866 }
867 // Make safer?
868 if (abcFactorization_->pivotTolerance() < 0.999 && stateOfIteration_ == 1) {
869 // change tolerance and re-invert
870 abcFactorization_->saferTolerances(1.0, -1.03);
871 problemStatus_ = -6; // factorize now
872 returnCode = -2;
873 stateOfIteration_ = 2;
874 }
875 }
876 }
877 }
878 if (!stateOfIteration_) {
879 // check update
880 //int updateStatus =
881 /*
882 returns
883 0 - OK
884 1 - take iteration then re-factorize
885 2 - flag something and skip
886 3 - break and re-factorize
887 5 - take iteration then re-factorize because of memory
888 */
889 int status = checkReplace();
890 if (status && !returnCode)
891 returnCode = status;
892 }
893 //clearArrays(arrayForFlipRhs_);
894 if (stateOfIteration_ && numberFlipped) {
895 //usefulArray_[arrayForTableauRow_].compact();
896 // move variables back
897 flipBack(numberFlipped);
898 }
899 // could choose average of all three
900 return returnCode;
901 }
902 #endif
903 #if ABC_PARALLEL == 2
904 #if ABC_NORMAL_DEBUG > 0
905 // for conflicts
906 int cilk_conflict = 0;
907 #endif
908 #ifdef EARLY_FACTORIZE
doEarlyFactorization(AbcSimplexDual * dual)909 static int doEarlyFactorization(AbcSimplexDual *dual)
910 {
911 int returnCode = cilk_spawn dual->whileIteratingParallel(123456789);
912 CoinIndexedVector &vector = *dual->usefulArray(ABC_NUMBER_USEFUL - 1);
913 int status = dual->earlyFactorization()->factorize(dual, vector);
914 #if 0
915 // Is this safe
916 if (!status&&true) {
917 // do pivots if there are any
918 int capacity = vector.capacity();
919 capacity--;
920 int numberPivotsStored = vector.getIndices()[capacity];
921 status =
922 dual->earlyFactorization()->replaceColumns(dual,vector,
923 0,numberPivotsStored,false);
924 }
925 #endif
926 if (status) {
927 printf("bad early factorization in doEarly - switch off\n");
928 vector.setNumElements(-1);
929 }
930 cilk_sync;
931 return returnCode;
932 }
933 #endif
934 #define MOVE_UPDATE_WEIGHTS
935 /* Reasons to come out:
936 -1 iterations etc
937 -2 inaccuracy
938 -3 slight inaccuracy (and done iterations)
939 +0 looks optimal (might be unbounded - but we will investigate)
940 +1 looks infeasible
941 +3 max iterations
942 */
whileIteratingCilk()943 int AbcSimplexDual::whileIteratingCilk()
944 {
945 /* arrays
946 0 - to get tableau row and then for weights update
947 1 - tableau column
948 2 - for flip
949 3 - actual tableau row
950 */
951 #ifdef CLP_DEBUG
952 int debugIteration = -1;
953 #endif
954 // if can't trust much and long way from optimal then relax
955 //if (largestPrimalError_ > 10.0)
956 //abcFactorization_->relaxAccuracyCheck(CoinMin(1.0e2, largestPrimalError_ / 10.0));
957 //else
958 //abcFactorization_->relaxAccuracyCheck(1.0);
959 // status stays at -1 while iterating, >=0 finished, -2 to invert
960 // status -3 to go to top without an invert
961 int returnCode = -1;
962 #define DELAYED_UPDATE
963 arrayForBtran_ = 0;
964 setUsedArray(arrayForBtran_);
965 arrayForFtran_ = 1;
966 setUsedArray(arrayForFtran_);
967 arrayForFlipBounds_ = 2;
968 setUsedArray(arrayForFlipBounds_);
969 arrayForTableauRow_ = 3;
970 setUsedArray(arrayForTableauRow_);
971 arrayForDualColumn_ = 4;
972 setUsedArray(arrayForDualColumn_);
973 arrayForReplaceColumn_ = 6; //4;
974 setUsedArray(arrayForReplaceColumn_);
975 #ifndef MOVE_UPDATE_WEIGHTS
976 arrayForFlipRhs_ = 5;
977 setUsedArray(arrayForFlipRhs_);
978 #else
979 arrayForFlipRhs_ = 0;
980 // use for weights
981 setUsedArray(5);
982 #endif
983 dualPivotRow();
984 lastPivotRow_ = pivotRow_;
985 if (pivotRow_ >= 0) {
986 // we found a pivot row
987 createDualPricingVectorCilk();
988 // ABC_PARALLEL 2
989 #if 0
990 {
991 for (int i=0;i<numberRows_;i++) {
992 int iSequence=abcPivotVariable_[i];
993 if (lowerBasic_[i]!=lowerSaved_[iSequence]||
994 upperBasic_[i]!=upperSaved_[iSequence])
995 printf("basic l %g,%g,%g u %g,%g\n",
996 abcLower_[iSequence],lowerSaved_[iSequence],lowerBasic_[i],
997 abcUpper_[iSequence],upperSaved_[iSequence],upperBasic_[i]);
998 }
999 }
1000 #endif
1001 #if 0
1002 for (int iSequence = 0; iSequence < numberTotal_; iSequence++) {
1003 int bad=-1;
1004 if (getFakeBound(iSequence)==noFake) {
1005 if (abcLower_[iSequence]!=lowerSaved_[iSequence]||
1006 abcUpper_[iSequence]!=upperSaved_[iSequence])
1007 bad=0;
1008 } else if (getFakeBound(iSequence)==lowerFake) {
1009 if (abcLower_[iSequence]==lowerSaved_[iSequence]||
1010 abcUpper_[iSequence]!=upperSaved_[iSequence])
1011 bad=1;
1012 } else if (getFakeBound(iSequence)==upperFake) {
1013 if (abcLower_[iSequence]!=lowerSaved_[iSequence]||
1014 abcUpper_[iSequence]==upperSaved_[iSequence])
1015 bad=2;
1016 } else {
1017 if (abcLower_[iSequence]==lowerSaved_[iSequence]||
1018 abcUpper_[iSequence]==upperSaved_[iSequence])
1019 bad=3;
1020 }
1021 if (bad>=0)
1022 printf("%d status %d l %g,%g u %g,%g\n",iSequence,internalStatus_[iSequence],
1023 abcLower_[iSequence],lowerSaved_[iSequence],
1024 abcUpper_[iSequence],upperSaved_[iSequence]);
1025 }
1026 #endif
1027 int numberPivots = abcFactorization_->maximumPivots();
1028 #ifdef EARLY_FACTORIZE
1029 int numberEarly = 0;
1030 if (numberPivots > 20 && (numberEarly_ & 0xffff) > 5) {
1031 numberEarly = numberEarly_ & 0xffff;
1032 numberPivots = CoinMax(numberPivots - numberEarly - abcFactorization_->pivots(), numberPivots / 2);
1033 }
1034 returnCode = whileIteratingParallel(numberPivots);
1035 if (returnCode == -99) {
1036 if (numberEarly) {
1037 if (!abcEarlyFactorization_)
1038 abcEarlyFactorization_ = new AbcSimplexFactorization(*abcFactorization_);
1039 // create pivot list
1040 CoinIndexedVector &vector = usefulArray_[ABC_NUMBER_USEFUL - 1];
1041 vector.checkClear();
1042 int *indices = vector.getIndices();
1043 int capacity = vector.capacity();
1044 int numberNeeded = numberRows_ + 2 * numberEarly + 1;
1045 if (capacity < numberNeeded) {
1046 vector.reserve(numberNeeded);
1047 capacity = vector.capacity();
1048 }
1049 int numberBasic = 0;
1050 for (int i = 0; i < numberRows_; i++) {
1051 int iSequence = abcPivotVariable_[i];
1052 if (iSequence < numberRows_)
1053 indices[numberBasic++] = iSequence;
1054 }
1055 vector.setNumElements(numberRows_ - numberBasic);
1056 for (int i = 0; i < numberRows_; i++) {
1057 int iSequence = abcPivotVariable_[i];
1058 if (iSequence >= numberRows_)
1059 indices[numberBasic++] = iSequence;
1060 }
1061 assert(numberBasic == numberRows_);
1062 indices[capacity - 1] = 0;
1063 // could set iterations to -1 for safety
1064 #if 0
1065 cilk_spawn doEarlyFactorization(this);
1066 returnCode=whileIteratingParallel(123456789);
1067 cilk_sync;
1068 #else
1069 returnCode = doEarlyFactorization(this);
1070 #endif
1071 } else {
1072 returnCode = whileIteratingParallel(123456789);
1073 }
1074 }
1075 #else
1076 returnCode = whileIteratingParallel(numberPivots);
1077 #endif
1078 usefulArray_[arrayForTableauRow_].compact();
1079 #if ABC_DEBUG
1080 checkArrays();
1081 #endif
1082 } else {
1083 // no pivot row on first try
1084 clearArrays(-1);
1085 returnCode = noPivotRow();
1086 }
1087 //printStuff();
1088 clearArrays(-1);
1089 //if (pivotRow_==-100)
1090 //returnCode=-100; // end of values pass
1091 return returnCode;
1092 }
1093 // Create dual pricing vector
createDualPricingVectorCilk()1094 void AbcSimplexDual::createDualPricingVectorCilk()
1095 {
1096 #if CILK_CONFLICT > 0
1097 if (cilk_conflict & 15 != 0) {
1098 printf("cilk_conflict %d!\n", cilk_conflict);
1099 cilk_conflict = 0;
1100 }
1101 #endif
1102 #ifndef NDEBUG
1103 #if ABC_NORMAL_DEBUG > 3
1104 checkArrays();
1105 #endif
1106 #endif
1107 // we found a pivot row
1108 if (handler_->detail(CLP_SIMPLEX_PIVOTROW, messages_) < 100) {
1109 handler_->message(CLP_SIMPLEX_PIVOTROW, messages_)
1110 << pivotRow_
1111 << CoinMessageEol;
1112 }
1113 // check accuracy of weights
1114 abcDualRowPivot_->checkAccuracy();
1115 // get sign for finding row of tableau
1116 // create as packed
1117 #if MOVE_REPLACE_PART1A > 0
1118 if (!abcFactorization_->usingFT()) {
1119 #endif
1120 usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
1121 abcFactorization_->updateColumnTranspose(usefulArray_[arrayForBtran_]);
1122 #if MOVE_REPLACE_PART1A > 0
1123 } else {
1124 cilk_spawn abcFactorization_->checkReplacePart1a(&usefulArray_[arrayForReplaceColumn_], pivotRow_);
1125 usefulArray_[arrayForBtran_].createOneUnpackedElement(pivotRow_, -directionOut_);
1126 abcFactorization_->updateColumnTransposeCpu(usefulArray_[arrayForBtran_], 1);
1127 cilk_sync;
1128 }
1129 #endif
1130 sequenceIn_ = -1;
1131 }
getTableauColumnPart1Cilk()1132 void AbcSimplexDual::getTableauColumnPart1Cilk()
1133 {
1134 #if ABC_DEBUG
1135 {
1136 const double *work = usefulArray_[arrayForTableauRow_].denseVector();
1137 int number = usefulArray_[arrayForTableauRow_].getNumElements();
1138 const int *which = usefulArray_[arrayForTableauRow_].getIndices();
1139 for (int i = 0; i < number; i++) {
1140 if (which[i] == sequenceIn_) {
1141 assert(alpha_ == work[i]);
1142 break;
1143 }
1144 }
1145 }
1146 #endif
1147 //int returnCode=0;
1148 // update the incoming column
1149 unpack(usefulArray_[arrayForFtran_]);
1150 // Array[0] may be needed until updateWeights2
1151 // and update dual weights (can do in parallel - with extra array)
1152 abcFactorization_->updateColumnFTPart1(usefulArray_[arrayForFtran_]);
1153 }
getTableauColumnFlipAndStartReplaceCilk()1154 int AbcSimplexDual::getTableauColumnFlipAndStartReplaceCilk()
1155 {
1156 // move checking stuff down into called functions
1157 // threads 2 and 3 are available
1158 int numberFlipped;
1159 //cilk
1160 getTableauColumnPart1Cilk();
1161 #if MOVE_REPLACE_PART1A <= 0
1162 cilk_spawn getTableauColumnPart2();
1163 #if MOVE_REPLACE_PART1A == 0
1164 cilk_spawn checkReplacePart1();
1165 #endif
1166 numberFlipped = flipBounds();
1167 cilk_sync;
1168 #else
1169 if (abcFactorization_->usingFT()) {
1170 cilk_spawn getTableauColumnPart2();
1171 ftAlpha_ = cilk_spawn abcFactorization_->checkReplacePart1b(&usefulArray_[arrayForReplaceColumn_], pivotRow_);
1172 numberFlipped = flipBounds();
1173 cilk_sync;
1174 } else {
1175 cilk_spawn getTableauColumnPart2();
1176 numberFlipped = flipBounds();
1177 cilk_sync;
1178 }
1179 #endif
1180 //usefulArray_[arrayForTableauRow_].compact();
1181 // returns 3 if skip this iteration and re-factorize
1182 /*
1183 return code
1184 0 - OK
1185 2 - flag something and skip
1186 3 - break and re-factorize
1187 4 - break and say infeasible
1188 */
1189 int returnCode = 0;
1190 // amount primal will move
1191 movement_ = -dualOut_ * directionOut_ / alpha_;
1192 // see if update stable
1193 #if ABC_NORMAL_DEBUG > 3
1194 if ((handler_->logLevel() & 32)) {
1195 double ft = ftAlpha_ * abcFactorization_->pivotRegion()[pivotRow_];
1196 double diff1 = fabs(alpha_ - btranAlpha_);
1197 double diff2 = fabs(fabs(alpha_) - fabs(ft));
1198 double diff3 = fabs(fabs(ft) - fabs(btranAlpha_));
1199 double largest = CoinMax(CoinMax(diff1, diff2), diff3);
1200 printf("btran alpha %g, ftran alpha %g ftAlpha %g largest diff %g\n",
1201 btranAlpha_, alpha_, ft, largest);
1202 if (largest > 0.001 * fabs(alpha_)) {
1203 printf("bad\n");
1204 }
1205 }
1206 #endif
1207 int numberPivots = abcFactorization_->pivots();
1208 //double checkValue = 1.0e-7; // numberPivots<5 ? 1.0e-7 : 1.0e-6;
1209 double checkValue = numberPivots ? 1.0e-7 : 1.0e-5;
1210 // if can't trust much and long way from optimal then relax
1211 if (largestPrimalError_ > 10.0)
1212 checkValue = CoinMin(1.0e-4, 1.0e-8 * largestPrimalError_);
1213 if (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > checkValue * (1.0 + fabs(alpha_))) {
1214 handler_->message(CLP_DUAL_CHECK, messages_)
1215 << btranAlpha_
1216 << alpha_
1217 << CoinMessageEol;
1218 // see with more relaxed criterion
1219 double test;
1220 if (fabs(btranAlpha_) < 1.0e-8 || fabs(alpha_) < 1.0e-8)
1221 test = 1.0e-1 * fabs(alpha_);
1222 else
1223 test = CoinMin(10.0 * checkValue, 1.0e-4 * (1.0 + fabs(alpha_)));
1224 bool needFlag = (fabs(btranAlpha_) < 1.0e-12 || fabs(alpha_) < 1.0e-12 || fabs(btranAlpha_ - alpha_) > test);
1225 double derror = CoinMin(fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), 1.0) * 0.9999e7;
1226 int error = 0;
1227 while (derror > 1.0) {
1228 error++;
1229 derror *= 0.1;
1230 }
1231 int frequency[8] = { 99999999, 100, 10, 2, 1, 1, 1, 1 };
1232 int newFactorFrequency = CoinMin(forceFactorization_, frequency[error]);
1233 #if ABC_NORMAL_DEBUG > 0
1234 if (newFactorFrequency < forceFactorization_)
1235 printf("Error of %g after %d pivots old forceFact %d now %d\n",
1236 fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)), numberPivots,
1237 forceFactorization_, newFactorFrequency);
1238 #endif
1239 if (!numberPivots && fabs(btranAlpha_ - alpha_) / (1.0 + fabs(alpha_)) > 0.5e-6
1240 && abcFactorization_->pivotTolerance() < 0.5)
1241 abcFactorization_->saferTolerances(1.0, -1.03);
1242 forceFactorization_ = newFactorFrequency;
1243
1244 #if ABC_NORMAL_DEBUG > 0
1245 if (fabs(btranAlpha_ + alpha_) < 1.0e-5 * (1.0 + fabs(alpha_))) {
1246 printf("diff (%g,%g) %g check %g\n", btranAlpha_, alpha_, fabs(btranAlpha_ - alpha_), checkValue * (1.0 + fabs(alpha_)));
1247 }
1248 #endif
1249 if (numberPivots) {
1250 if (needFlag && numberPivots < 10) {
1251 // need to reject something
1252 assert(sequenceOut_ >= 0);
1253 char x = isColumn(sequenceOut_) ? 'C' : 'R';
1254 handler_->message(CLP_SIMPLEX_FLAG, messages_)
1255 << x << sequenceWithin(sequenceOut_)
1256 << CoinMessageEol;
1257 #if ABC_NORMAL_DEBUG > 0
1258 printf("flag %d as alpha %g %g - after %d pivots\n", sequenceOut_,
1259 btranAlpha_, alpha_, numberPivots);
1260 #endif
1261 // Make safer?
1262 abcFactorization_->saferTolerances(1.0, -1.03);
1263 setFlagged(sequenceOut_);
1264 // so can't happen immediately again
1265 sequenceOut_ = -1;
1266 lastBadIteration_ = numberIterations_; // say be more cautious
1267 }
1268 // Make safer?
1269 //if (numberPivots<5)
1270 //abcFactorization_->saferTolerances (-0.99, -1.01);
1271 problemStatus_ = -2; // factorize now
1272 returnCode = -2;
1273 stateOfIteration_ = 2;
1274 } else {
1275 if (needFlag) {
1276 assert(sequenceOut_ >= 0);
1277 // need to reject something
1278 char x = isColumn(sequenceOut_) ? 'C' : 'R';
1279 handler_->message(CLP_SIMPLEX_FLAG, messages_)
1280 << x << sequenceWithin(sequenceOut_)
1281 << CoinMessageEol;
1282 #if ABC_NORMAL_DEBUG > 3
1283 printf("flag a %g %g\n", btranAlpha_, alpha_);
1284 #endif
1285 setFlagged(sequenceOut_);
1286 // so can't happen immediately again
1287 sequenceOut_ = -1;
1288 //abcProgress_.clearBadTimes();
1289 lastBadIteration_ = numberIterations_; // say be more cautious
1290 if (fabs(alpha_) < 1.0e-10 && fabs(btranAlpha_) < 1.0e-8 && numberIterations_ > 100) {
1291 //printf("I think should declare infeasible\n");
1292 problemStatus_ = 1;
1293 returnCode = 1;
1294 stateOfIteration_ = 2;
1295 } else {
1296 stateOfIteration_ = 1;
1297 }
1298 // Make safer?
1299 if (abcFactorization_->pivotTolerance() < 0.999 && stateOfIteration_ == 1) {
1300 // change tolerance and re-invert
1301 abcFactorization_->saferTolerances(1.0, -1.03);
1302 problemStatus_ = -6; // factorize now
1303 returnCode = -2;
1304 stateOfIteration_ = 2;
1305 }
1306 }
1307 }
1308 }
1309 if (!stateOfIteration_) {
1310 // check update
1311 //int updateStatus =
1312 /*
1313 returns
1314 0 - OK
1315 1 - take iteration then re-factorize
1316 2 - flag something and skip
1317 3 - break and re-factorize
1318 5 - take iteration then re-factorize because of memory
1319 */
1320 int status = checkReplace();
1321 if (status && !returnCode)
1322 returnCode = status;
1323 }
1324 //clearArrays(arrayForFlipRhs_);
1325 if (stateOfIteration_ && numberFlipped) {
1326 //usefulArray_[arrayForTableauRow_].compact();
1327 // move variables back
1328 flipBack(numberFlipped);
1329 }
1330 // could choose average of all three
1331 return returnCode;
1332 }
whileIteratingParallel(int numberIterations)1333 int AbcSimplexDual::whileIteratingParallel(int numberIterations)
1334 {
1335 int returnCode = -1;
1336 #ifdef EARLY_FACTORIZE
1337 int savePivot = -1;
1338 CoinIndexedVector &early = usefulArray_[ABC_NUMBER_USEFUL - 1];
1339 int *pivotIndices = early.getIndices();
1340 double *pivotValues = early.denseVector();
1341 int pivotCountPosition = early.capacity() - 1;
1342 int numberSaved = 0;
1343 if (numberIterations == 123456789)
1344 savePivot = pivotCountPosition;
1345 ;
1346 #endif
1347 numberIterations += numberIterations_;
1348 do {
1349 #if ABC_DEBUG
1350 checkArrays();
1351 #endif
1352 /*
1353 -1 - just move dual in values pass
1354 0 - take iteration
1355 1 - don't take but continue
1356 2 - don't take and break
1357 */
1358 stateOfIteration_ = 0;
1359 returnCode = -1;
1360 // put row of tableau in usefulArray[arrayForTableauRow_]
1361 /*
1362 Could
1363 a) copy [arrayForBtran_] and start off updateWeights earlier
1364 b) break transposeTimes into two and do after slack part
1365 c) also think if cleaner to go all way with update but just don't do final part
1366 */
1367 //upperTheta=COIN_DBL_MAX;
1368 double saveAcceptable = acceptablePivot_;
1369 if (largestPrimalError_ > 1.0e-5 || largestDualError_ > 1.0e-5) {
1370 //if ((largestPrimalError_>1.0e-5||largestDualError_>1.0e-5)&&false) {
1371 if (!abcFactorization_->pivots())
1372 acceptablePivot_ *= 1.0e2;
1373 else if (abcFactorization_->pivots() < 5)
1374 acceptablePivot_ *= 1.0e1;
1375 }
1376 #ifdef MOVE_UPDATE_WEIGHTS
1377 // copy btran across
1378 usefulArray_[5].copy(usefulArray_[arrayForBtran_]);
1379 cilk_spawn abcDualRowPivot_->updateWeightsOnly(usefulArray_[5]);
1380 ;
1381 #endif
1382 dualColumn1();
1383 acceptablePivot_ = saveAcceptable;
1384 if ((stateOfProblem_ & VALUES_PASS) != 0) {
1385 // see if can just move dual
1386 if (fabs(upperTheta_ - fabs(abcDj_[sequenceOut_])) < dualTolerance_) {
1387 stateOfIteration_ = -1;
1388 }
1389 }
1390 if (!stateOfIteration_) {
1391 #ifndef MOVE_UPDATE_WEIGHTS
1392 cilk_spawn abcDualRowPivot_->updateWeightsOnly(usefulArray_[arrayForBtran_]);
1393 ;
1394 #endif
1395 // get sequenceIn_
1396 dualPivotColumn();
1397 //stateOfIteration_=0;
1398 if (sequenceIn_ >= 0) {
1399 // normal iteration
1400 // update the incoming column
1401 //arrayForReplaceColumn_=getAvailableArray();
1402 getTableauColumnFlipAndStartReplaceCilk();
1403 //usleep(1000);
1404 } else if (stateOfIteration_ != -1) {
1405 stateOfIteration_ = 2;
1406 }
1407 }
1408 cilk_sync;
1409 // Check event
1410 {
1411 int status = eventHandler_->event(ClpEventHandler::endOfIteration);
1412 if (status >= 0) {
1413 problemStatus_ = 5;
1414 secondaryStatus_ = ClpEventHandler::endOfIteration;
1415 returnCode = 4;
1416 stateOfIteration_ = 2;
1417 }
1418 }
1419 if (!stateOfIteration_) {
1420 // assert (stateOfIteration_!=1);
1421 int whatNext = 0;
1422 whatNext = housekeeping();
1423 #ifdef EARLY_FACTORIZE
1424 if (savePivot >= 0) {
1425 // save pivot
1426 pivotIndices[--savePivot] = sequenceOut_;
1427 pivotValues[savePivot] = alpha_;
1428 pivotIndices[--savePivot] = sequenceIn_;
1429 pivotValues[savePivot] = btranAlpha_;
1430 numberSaved++;
1431 }
1432 #endif
1433 if (whatNext == 1) {
1434 problemStatus_ = -2; // refactorize
1435 usefulArray_[arrayForTableauRow_].compact();
1436 } else if (whatNext == 2) {
1437 // maximum iterations or equivalent
1438 problemStatus_ = 3;
1439 returnCode = 3;
1440 stateOfIteration_ = 2;
1441 }
1442 #ifdef EARLY_FACTORIZE
1443 if (savePivot >= 0) {
1444 // tell worker can update this
1445 pivotIndices[pivotCountPosition] = numberSaved;
1446 }
1447 #endif
1448 } else {
1449 usefulArray_[arrayForTableauRow_].compact();
1450 if (stateOfIteration_ < 0) {
1451 // move dual in dual values pass
1452 theta_ = abcDj_[sequenceOut_];
1453 updateDualsInDual();
1454 abcDj_[sequenceOut_] = 0.0;
1455 sequenceOut_ = -1;
1456 }
1457 // clear all arrays
1458 clearArrays(-1);
1459 }
1460 // at this stage sequenceIn_ may be <0
1461 if (sequenceIn_ < 0 && sequenceOut_ >= 0) {
1462 usefulArray_[arrayForTableauRow_].compact();
1463 // no incoming column is valid
1464 returnCode = noPivotColumn();
1465 }
1466 if (stateOfIteration_ == 2) {
1467 usefulArray_[arrayForTableauRow_].compact();
1468 sequenceIn_ = -1;
1469 #ifdef ABC_LONG_FACTORIZATION
1470 abcFactorization_->clearHiddenArrays();
1471 #endif
1472 break;
1473 }
1474 if (problemStatus_ != -1) {
1475 #ifndef MOVE_UPDATE_WEIGHTS
1476 abcDualRowPivot_->updateWeights2(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_]);
1477 #else
1478 abcDualRowPivot_->updateWeights2(usefulArray_[5], usefulArray_[arrayForFtran_]);
1479 #endif
1480 #ifdef ABC_LONG_FACTORIZATION
1481 abcFactorization_->clearHiddenArrays();
1482 #endif
1483 swapPrimalStuff();
1484 break;
1485 }
1486 if (stateOfIteration_ == 1) {
1487 // clear all arrays
1488 clearArrays(-2);
1489 #ifdef ABC_LONG_FACTORIZATION
1490 abcFactorization_->clearHiddenArrays();
1491 #endif
1492 }
1493 if (stateOfIteration_ == 0) {
1494 // can do these in parallel
1495 // No idea why I need this - but otherwise runs not repeatable (try again??)
1496 //usefulArray_[3].compact();
1497 cilk_spawn updateDualsInDual();
1498 int lastSequenceOut;
1499 int lastDirectionOut;
1500 if (firstFree_ < 0) {
1501 // can do in parallel
1502 cilk_spawn replaceColumnPart3();
1503 updatePrimalSolution();
1504 swapPrimalStuff();
1505 // dualRow will go to virtual row pivot choice algorithm
1506 // make sure values pass off if it should be
1507 // use Btran array and clear inside dualPivotRow (if used)
1508 lastSequenceOut = sequenceOut_;
1509 lastDirectionOut = directionOut_;
1510 dualPivotRow();
1511 cilk_sync;
1512 } else {
1513 // be more careful as dualPivotRow may do update
1514 cilk_spawn replaceColumnPart3();
1515 updatePrimalSolution();
1516 swapPrimalStuff();
1517 // dualRow will go to virtual row pivot choice algorithm
1518 // make sure values pass off if it should be
1519 // use Btran array and clear inside dualPivotRow (if used)
1520 lastSequenceOut = sequenceOut_;
1521 lastDirectionOut = directionOut_;
1522 cilk_sync;
1523 dualPivotRow();
1524 }
1525 lastPivotRow_ = pivotRow_;
1526 if (pivotRow_ >= 0) {
1527 // we found a pivot row
1528 // create as packed
1529 // dual->checkReplacePart1a();
1530 createDualPricingVectorCilk();
1531 swapDualStuff(lastSequenceOut, lastDirectionOut);
1532 }
1533 cilk_sync;
1534 } else {
1535 // after moving dual in values pass
1536 dualPivotRow();
1537 lastPivotRow_ = pivotRow_;
1538 if (pivotRow_ >= 0) {
1539 // we found a pivot row
1540 // create as packed
1541 createDualPricingVectorCilk();
1542 }
1543 }
1544 if (pivotRow_ < 0) {
1545 // no pivot row
1546 clearArrays(-1);
1547 returnCode = noPivotRow();
1548 break;
1549 }
1550 if (numberIterations == numberIterations_ && problemStatus_ == -1) {
1551 returnCode = -99;
1552 break;
1553 }
1554 } while (problemStatus_ == -1);
1555 return returnCode;
1556 }
1557 #if 0
1558 void
1559 AbcSimplexDual::whileIterating2()
1560 {
1561 // get sequenceIn_
1562 dualPivotColumn();
1563 //stateOfIteration_=0;
1564 if (sequenceIn_ >= 0) {
1565 // normal iteration
1566 // update the incoming column
1567 //arrayForReplaceColumn_=getAvailableArray();
1568 getTableauColumnFlipAndStartReplaceCilk();
1569 //usleep(1000);
1570 } else if (stateOfIteration_!=-1) {
1571 stateOfIteration_=2;
1572 }
1573 }
1574 #endif
1575 #endif
updatePrimalSolution()1576 void AbcSimplexDual::updatePrimalSolution()
1577 {
1578 // finish doing weights
1579 #ifndef MOVE_UPDATE_WEIGHTS
1580 abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[arrayForBtran_], usefulArray_[arrayForFtran_],
1581 movement_);
1582 #else
1583 abcDualRowPivot_->updatePrimalSolutionAndWeights(usefulArray_[5], usefulArray_[arrayForFtran_],
1584 movement_);
1585 #endif
1586 }
1587 int xxInfo[6][8];
parallelDual4(AbcSimplexDual * dual)1588 double parallelDual4(AbcSimplexDual *dual)
1589 {
1590 int maximumRows = dual->maximumAbcNumberRows();
1591 //int numberTotal=dual->numberTotal();
1592 CoinIndexedVector &update = *dual->usefulArray(dual->arrayForBtran());
1593 CoinPartitionedVector &tableauRow = *dual->usefulArray(dual->arrayForTableauRow());
1594 CoinPartitionedVector &candidateList = *dual->usefulArray(dual->arrayForDualColumn());
1595 AbcMatrix *matrix = dual->abcMatrix();
1596 // do slacks first (move before sync)
1597 double upperTheta = dual->upperTheta();
1598 //assert (upperTheta>-100*dual->dualTolerance()||dual->sequenceIn()>=0||dual->lastFirstFree()>=0);
1599 #if ABC_PARALLEL
1600 #define NUMBER_BLOCKS NUMBER_ROW_BLOCKS
1601 int numberBlocks = CoinMin(NUMBER_BLOCKS, dual->numberCpus());
1602 #else
1603 #define NUMBER_BLOCKS 1
1604 int numberBlocks = NUMBER_BLOCKS;
1605 #endif
1606 #if ABC_PARALLEL
1607 if (dual->parallelMode() == 0)
1608 numberBlocks = 1;
1609 #endif
1610 // see if worth using row copy
1611 bool gotRowCopy = matrix->gotRowCopy();
1612 int number = update.getNumElements();
1613 assert(number);
1614 bool useRowCopy = (gotRowCopy && (number << 2) < maximumRows);
1615 assert(numberBlocks == matrix->numberRowBlocks());
1616 #if ABC_PARALLEL == 1
1617 // redo so can pass info in with void *
1618 CoinThreadInfo *infoP = dual->threadInfoPointer();
1619 int cpuMask = ((1 << dual->parallelMode()) - 1);
1620 cpuMask += cpuMask << 5;
1621 dual->setStopStart(cpuMask);
1622 #endif
1623 CoinThreadInfo info[NUMBER_BLOCKS];
1624 const int *starts;
1625 if (useRowCopy)
1626 starts = matrix->blockStart();
1627 else
1628 starts = matrix->startColumnBlock();
1629 tableauRow.setPartitions(numberBlocks, starts);
1630 candidateList.setPartitions(numberBlocks, starts);
1631 //printf("free sequence in %d\n",dual->freeSequenceIn());
1632 if (useRowCopy) {
1633 // using row copy
1634 #if ABC_PARALLEL
1635 if (numberBlocks > 1) {
1636 #if ABC_PARALLEL == 2
1637 for (int i = 0; i < numberBlocks; i++) {
1638 info[i].stuff[1] = i;
1639 info[i].stuff[2] = -1;
1640 info[i].result = upperTheta;
1641 info[i].result = cilk_spawn
1642 matrix->dualColumn1Row(info[i].stuff[1], COIN_DBL_MAX, info[i].stuff[2],
1643 update, tableauRow, candidateList);
1644 }
1645 cilk_sync;
1646 #else
1647 // parallel 1
1648 for (int i = 0; i < numberBlocks; i++) {
1649 info[i].status = 5;
1650 info[i].stuff[1] = i;
1651 info[i].result = upperTheta;
1652 if (i < numberBlocks - 1) {
1653 infoP[i] = info[i];
1654 if (i == numberBlocks - 2)
1655 dual->startParallelStuff(5);
1656 }
1657 }
1658 info[numberBlocks - 1].stuff[1] = numberBlocks - 1;
1659 info[numberBlocks - 1].stuff[2] = -1;
1660 //double freeAlpha;
1661 info[numberBlocks - 1].result = matrix->dualColumn1Row(info[numberBlocks - 1].stuff[1],
1662 upperTheta, info[numberBlocks - 1].stuff[2],
1663 update, tableauRow, candidateList);
1664 dual->stopParallelStuff(5);
1665 for (int i = 0; i < numberBlocks - 1; i++)
1666 info[i] = infoP[i];
1667 #endif
1668 } else {
1669 #endif
1670 info[0].status = 5;
1671 info[0].stuff[1] = 0;
1672 info[0].result = upperTheta;
1673 info[0].stuff[1] = 0;
1674 info[0].stuff[2] = -1;
1675 // worth using row copy
1676 //assert (number>2);
1677 info[0].result = matrix->dualColumn1Row(info[0].stuff[1],
1678 upperTheta, info[0].stuff[2],
1679 update, tableauRow, candidateList);
1680 #if ABC_PARALLEL
1681 }
1682 #endif
1683 } else { // end of useRowCopy
1684 #if ABC_PARALLEL
1685 if (numberBlocks > 1) {
1686 #if ABC_PARALLEL == 2
1687 // do by column
1688 for (int i = 0; i < numberBlocks; i++) {
1689 info[i].stuff[1] = i;
1690 info[i].result = upperTheta;
1691 cilk_spawn
1692 matrix->dualColumn1Part(info[i].stuff[1], info[i].stuff[2],
1693 info[i].result,
1694 update, tableauRow, candidateList);
1695 }
1696 cilk_sync;
1697 #else
1698 // parallel 1
1699 // do by column
1700 for (int i = 0; i < numberBlocks; i++) {
1701 info[i].status = 4;
1702 info[i].stuff[1] = i;
1703 info[i].result = upperTheta;
1704 if (i < numberBlocks - 1) {
1705 infoP[i] = info[i];
1706 if (i == numberBlocks - 2)
1707 dual->startParallelStuff(4);
1708 }
1709 }
1710 matrix->dualColumn1Part(info[numberBlocks - 1].stuff[1], info[numberBlocks - 1].stuff[2],
1711 info[numberBlocks - 1].result,
1712 update, tableauRow, candidateList);
1713 dual->stopParallelStuff(4);
1714 for (int i = 0; i < numberBlocks - 1; i++)
1715 info[i] = infoP[i];
1716 #endif
1717 } else {
1718 #endif
1719 info[0].status = 4;
1720 info[0].stuff[1] = 0;
1721 info[0].result = upperTheta;
1722 info[0].stuff[1] = 0;
1723 matrix->dualColumn1Part(info[0].stuff[1], info[0].stuff[2],
1724 info[0].result,
1725 update, tableauRow, candidateList);
1726 #if ABC_PARALLEL
1727 }
1728 #endif
1729 }
1730 int sequenceIn[NUMBER_BLOCKS];
1731 bool anyFree = false;
1732 #if 0
1733 if (useRowCopy) {
1734 printf("Result at iteration %d slack seqIn %d upperTheta %g\n",
1735 dual->numberIterations(),dual->freeSequenceIn(),upperTheta);
1736 double * arrayT = tableauRow.denseVector();
1737 int * indexT = tableauRow.getIndices();
1738 double * arrayC = candidateList.denseVector();
1739 int * indexC = candidateList.getIndices();
1740 for (int i=0;i<numberBlocks;i++) {
1741 printf("Block %d seqIn %d upperTheta %g first %d last %d firstIn %d nnz %d numrem %d\n",
1742 i,info[i].stuff[2],info[i].result,
1743 xxInfo[0][i],xxInfo[1][i],xxInfo[2][i],xxInfo[3][i],xxInfo[4][i]);
1744 if (xxInfo[3][i]<-35) {
1745 assert (xxInfo[3][i]==tableauRow.getNumElements(i));
1746 assert (xxInfo[4][i]==candidateList.getNumElements(i));
1747 for (int k=0;k<xxInfo[3][i];k++)
1748 printf("T %d %d %g\n",k,indexT[k+xxInfo[2][i]],arrayT[k+xxInfo[2][i]]);
1749 for (int k=0;k<xxInfo[4][i];k++)
1750 printf("C %d %d %g\n",k,indexC[k+xxInfo[2][i]],arrayC[k+xxInfo[2][i]]);
1751 }
1752 }
1753 }
1754 #endif
1755 for (int i = 0; i < numberBlocks; i++) {
1756 sequenceIn[i] = info[i].stuff[2];
1757 if (sequenceIn[i] >= 0)
1758 anyFree = true;
1759 upperTheta = CoinMin(info[i].result, upperTheta);
1760 //assert (info[i].result>-100*dual->dualTolerance()||sequenceIn[i]>=0||dual->lastFirstFree()>=0);
1761 }
1762 if (anyFree) {
1763 int *COIN_RESTRICT index = tableauRow.getIndices();
1764 double *COIN_RESTRICT array = tableauRow.denseVector();
1765 // search for free coming in
1766 double bestFree = 0.0;
1767 int bestSequence = dual->sequenceIn();
1768 if (bestSequence >= 0)
1769 bestFree = dual->alpha();
1770 for (int i = 0; i < numberBlocks; i++) {
1771 int iLook = sequenceIn[i];
1772 if (iLook >= 0) {
1773 // free variable - search
1774 int start = starts[i];
1775 int end = start + tableauRow.getNumElements(i);
1776 for (int k = start; k < end; k++) {
1777 if (iLook == index[k]) {
1778 if (fabs(bestFree) < fabs(array[k])) {
1779 bestFree = array[k];
1780 bestSequence = iLook;
1781 }
1782 break;
1783 }
1784 }
1785 }
1786 }
1787 if (bestSequence >= 0) {
1788 double oldValue = dual->djRegion()[bestSequence];
1789 dual->setSequenceIn(bestSequence);
1790 dual->setAlpha(bestFree);
1791 dual->setTheta(oldValue / bestFree);
1792 }
1793 }
1794 //tableauRow.compact();
1795 //candidateList.compact();
1796 #if 0 //ndef NDEBUG
1797 tableauRow.setPackedMode(true);
1798 tableauRow.sortPacked();
1799 candidateList.setPackedMode(true);
1800 candidateList.sortPacked();
1801 #endif
1802 return upperTheta;
1803 }
1804 #if ABC_PARALLEL == 2
1805 static void
parallelDual5a(AbcSimplexFactorization * factorization,CoinIndexedVector * whichVector,int numberCpu,int whichCpu,double * weights)1806 parallelDual5a(AbcSimplexFactorization *factorization,
1807 CoinIndexedVector *whichVector,
1808 int numberCpu,
1809 int whichCpu,
1810 double *weights)
1811 {
1812 double *COIN_RESTRICT array = whichVector->denseVector();
1813 int *COIN_RESTRICT which = whichVector->getIndices();
1814 int numberRows = factorization->numberRows();
1815 for (int i = whichCpu; i < numberRows; i += numberCpu) {
1816 double value = 0.0;
1817 array[i] = 1.0;
1818 which[0] = i;
1819 whichVector->setNumElements(1);
1820 factorization->updateColumnTransposeCpu(*whichVector, whichCpu);
1821 int number = whichVector->getNumElements();
1822 for (int j = 0; j < number; j++) {
1823 int k = which[j];
1824 value += array[k] * array[k];
1825 array[k] = 0.0;
1826 }
1827 weights[i] = value;
1828 }
1829 whichVector->setNumElements(0);
1830 }
1831 #endif
1832 #if ABC_PARALLEL == 2
parallelDual5(AbcSimplexFactorization * factorization,CoinIndexedVector ** whichVector,int numberCpu,int whichCpu,double * weights)1833 void parallelDual5(AbcSimplexFactorization *factorization,
1834 CoinIndexedVector **whichVector,
1835 int numberCpu,
1836 int whichCpu,
1837 double *weights)
1838 {
1839 if (whichCpu) {
1840 cilk_spawn parallelDual5(factorization, whichVector, numberCpu, whichCpu - 1, weights);
1841 parallelDual5a(factorization, whichVector[whichCpu], numberCpu, whichCpu, weights);
1842 cilk_sync;
1843 } else {
1844 parallelDual5a(factorization, whichVector[whichCpu], numberCpu, whichCpu, weights);
1845 }
1846 }
1847 #endif
1848 // cilk seems a bit fragile
1849 #define CILK_FRAGILE 1
1850 #if CILK_FRAGILE > 1
1851 #undef cilk_spawn
1852 #undef cilk_sync
1853 #define cilk_spawn
1854 #define cilk_sync
1855 #define ONWARD 0
1856 #elif CILK_FRAGILE == 1
1857 #define ONWARD 0
1858 #else
1859 #define ONWARD 1
1860 #endif
1861 // Code below is just a translation of LAPACK
1862 #define BLOCKING1 8 // factorization strip
1863 #define BLOCKING2 8 // dgemm recursive
1864 #define BLOCKING3 16 // dgemm parallel
1865 /* type
1866 0 Left Lower NoTranspose Unit
1867 1 Left Upper NoTranspose NonUnit
1868 2 Left Lower Transpose Unit
1869 3 Left Upper Transpose NonUnit
1870 */
CoinAbcDtrsmFactor(int m,int n,double * COIN_RESTRICT a,int lda)1871 static void CoinAbcDtrsmFactor(int m, int n, double *COIN_RESTRICT a, int lda)
1872 {
1873 assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
1874 assert(m == BLOCKING8);
1875 // 0 Left Lower NoTranspose Unit
1876 /* entry for column j and row i (when multiple of BLOCKING8)
1877 is at aBlocked+j*m+i*BLOCKING8
1878 */
1879 double *COIN_RESTRICT aBase2 = a;
1880 double *COIN_RESTRICT bBase2 = aBase2 + lda * BLOCKING8;
1881 for (int jj = 0; jj < n; jj += BLOCKING8) {
1882 double *COIN_RESTRICT bBase = bBase2;
1883 for (int j = jj; j < jj + BLOCKING8; j++) {
1884 #if 0
1885 double * COIN_RESTRICT aBase = aBase2;
1886 for (int k=0;k<BLOCKING8;k++) {
1887 double bValue = bBase[k];
1888 if (bValue) {
1889 for (int i=k+1;i<BLOCKING8;i++) {
1890 bBase[i]-=bValue*aBase[i];
1891 }
1892 }
1893 aBase+=BLOCKING8;
1894 }
1895 #else
1896 // unroll - stay in registers - don't test for zeros
1897 assert(BLOCKING8 == 8);
1898 double bValue0 = bBase[0];
1899 double bValue1 = bBase[1];
1900 double bValue2 = bBase[2];
1901 double bValue3 = bBase[3];
1902 double bValue4 = bBase[4];
1903 double bValue5 = bBase[5];
1904 double bValue6 = bBase[6];
1905 double bValue7 = bBase[7];
1906 bValue1 -= bValue0 * a[1 + 0 * BLOCKING8];
1907 bBase[1] = bValue1;
1908 bValue2 -= bValue0 * a[2 + 0 * BLOCKING8];
1909 bValue3 -= bValue0 * a[3 + 0 * BLOCKING8];
1910 bValue4 -= bValue0 * a[4 + 0 * BLOCKING8];
1911 bValue5 -= bValue0 * a[5 + 0 * BLOCKING8];
1912 bValue6 -= bValue0 * a[6 + 0 * BLOCKING8];
1913 bValue7 -= bValue0 * a[7 + 0 * BLOCKING8];
1914 bValue2 -= bValue1 * a[2 + 1 * BLOCKING8];
1915 bBase[2] = bValue2;
1916 bValue3 -= bValue1 * a[3 + 1 * BLOCKING8];
1917 bValue4 -= bValue1 * a[4 + 1 * BLOCKING8];
1918 bValue5 -= bValue1 * a[5 + 1 * BLOCKING8];
1919 bValue6 -= bValue1 * a[6 + 1 * BLOCKING8];
1920 bValue7 -= bValue1 * a[7 + 1 * BLOCKING8];
1921 bValue3 -= bValue2 * a[3 + 2 * BLOCKING8];
1922 bBase[3] = bValue3;
1923 bValue4 -= bValue2 * a[4 + 2 * BLOCKING8];
1924 bValue5 -= bValue2 * a[5 + 2 * BLOCKING8];
1925 bValue6 -= bValue2 * a[6 + 2 * BLOCKING8];
1926 bValue7 -= bValue2 * a[7 + 2 * BLOCKING8];
1927 bValue4 -= bValue3 * a[4 + 3 * BLOCKING8];
1928 bBase[4] = bValue4;
1929 bValue5 -= bValue3 * a[5 + 3 * BLOCKING8];
1930 bValue6 -= bValue3 * a[6 + 3 * BLOCKING8];
1931 bValue7 -= bValue3 * a[7 + 3 * BLOCKING8];
1932 bValue5 -= bValue4 * a[5 + 4 * BLOCKING8];
1933 bBase[5] = bValue5;
1934 bValue6 -= bValue4 * a[6 + 4 * BLOCKING8];
1935 bValue7 -= bValue4 * a[7 + 4 * BLOCKING8];
1936 bValue6 -= bValue5 * a[6 + 5 * BLOCKING8];
1937 bBase[6] = bValue6;
1938 bValue7 -= bValue5 * a[7 + 5 * BLOCKING8];
1939 bValue7 -= bValue6 * a[7 + 6 * BLOCKING8];
1940 bBase[7] = bValue7;
1941 #endif
1942 bBase += BLOCKING8;
1943 }
1944 bBase2 += lda * BLOCKING8;
1945 }
1946 }
1947 #define UNROLL_DTRSM 16
1948 #define CILK_DTRSM 32
dtrsm0(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)1949 static void dtrsm0(int kkk, int first, int last,
1950 int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
1951 {
1952 int mm = CoinMin(kkk + UNROLL_DTRSM * BLOCKING8, m);
1953 assert((last - first) % BLOCKING8 == 0);
1954 if (last - first > CILK_DTRSM) {
1955 int mid = ((first + last) >> 4) << 3;
1956 cilk_spawn dtrsm0(kkk, first, mid, m, a, b);
1957 dtrsm0(kkk, mid, last, m, a, b);
1958 cilk_sync;
1959 } else {
1960 const double *COIN_RESTRICT aBaseA = a + UNROLL_DTRSM * BLOCKING8X8 + kkk * BLOCKING8;
1961 aBaseA += (first - mm) * BLOCKING8 - BLOCKING8X8;
1962 aBaseA += m * kkk;
1963 for (int ii = first; ii < last; ii += BLOCKING8) {
1964 aBaseA += BLOCKING8X8;
1965 const double *COIN_RESTRICT aBaseB = aBaseA;
1966 double *COIN_RESTRICT bBaseI = b + ii;
1967 for (int kk = kkk; kk < mm; kk += BLOCKING8) {
1968 double *COIN_RESTRICT bBase = b + kk;
1969 const double *COIN_RESTRICT aBase2 = aBaseB; //a+UNROLL_DTRSM*BLOCKING8X8+m*kk+kkk*BLOCKING8;
1970 //aBase2 += (ii-mm)*BLOCKING8;
1971 //assert (aBase2==aBaseB);
1972 aBaseB += m * BLOCKING8;
1973 #if AVX2 != 2
1974 #define ALTERNATE_INNER
1975 #ifndef ALTERNATE_INNER
1976 for (int k = 0; k < BLOCKING8; k++) {
1977 //coin_prefetch_const(aBase2+BLOCKING8);
1978 for (int i = 0; i < BLOCKING8; i++) {
1979 bBaseI[i] -= bBase[k] * aBase2[i];
1980 }
1981 aBase2 += BLOCKING8;
1982 }
1983 #else
1984 double b0 = bBaseI[0];
1985 double b1 = bBaseI[1];
1986 double b2 = bBaseI[2];
1987 double b3 = bBaseI[3];
1988 double b4 = bBaseI[4];
1989 double b5 = bBaseI[5];
1990 double b6 = bBaseI[6];
1991 double b7 = bBaseI[7];
1992 for (int k = 0; k < BLOCKING8; k++) {
1993 //coin_prefetch_const(aBase2+BLOCKING8);
1994 double bValue = bBase[k];
1995 b0 -= bValue * aBase2[0];
1996 b1 -= bValue * aBase2[1];
1997 b2 -= bValue * aBase2[2];
1998 b3 -= bValue * aBase2[3];
1999 b4 -= bValue * aBase2[4];
2000 b5 -= bValue * aBase2[5];
2001 b6 -= bValue * aBase2[6];
2002 b7 -= bValue * aBase2[7];
2003 aBase2 += BLOCKING8;
2004 }
2005 bBaseI[0] = b0;
2006 bBaseI[1] = b1;
2007 bBaseI[2] = b2;
2008 bBaseI[3] = b3;
2009 bBaseI[4] = b4;
2010 bBaseI[5] = b5;
2011 bBaseI[6] = b6;
2012 bBaseI[7] = b7;
2013 #endif
2014 #else
2015 __m256d b0 = _mm256_load_pd(bBaseI);
2016 __m256d b1 = _mm256_load_pd(bBaseI + 4);
2017 for (int k = 0; k < BLOCKING8; k++) {
2018 //coin_prefetch_const(aBase2+BLOCKING8);
2019 __m256d bb = _mm256_broadcast_sd(bBase + k);
2020 __m256d a0 = _mm256_load_pd(aBase2);
2021 b0 -= a0 * bb;
2022 __m256d a1 = _mm256_load_pd(aBase2 + 4);
2023 b1 -= a1 * bb;
2024 aBase2 += BLOCKING8;
2025 }
2026 //_mm256_store_pd ((bBaseI), (b0));
2027 *reinterpret_cast< __m256d * >(bBaseI) = b0;
2028 //_mm256_store_pd (bBaseI+4, b1);
2029 *reinterpret_cast< __m256d * >(bBaseI + 4) = b1;
2030 #endif
2031 }
2032 }
2033 }
2034 }
CoinAbcDtrsm0(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2035 void CoinAbcDtrsm0(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2036 {
2037 assert((m & (BLOCKING8 - 1)) == 0);
2038 // 0 Left Lower NoTranspose Unit
2039 for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM * BLOCKING8) {
2040 int mm = CoinMin(m, kkk + UNROLL_DTRSM * BLOCKING8);
2041 for (int kk = kkk; kk < mm; kk += BLOCKING8) {
2042 const double *COIN_RESTRICT aBase2 = a + kk * (m + BLOCKING8);
2043 double *COIN_RESTRICT bBase = b + kk;
2044 for (int k = 0; k < BLOCKING8; k++) {
2045 for (int i = k + 1; i < BLOCKING8; i++) {
2046 bBase[i] -= bBase[k] * aBase2[i];
2047 }
2048 aBase2 += BLOCKING8;
2049 }
2050 for (int ii = kk + BLOCKING8; ii < mm; ii += BLOCKING8) {
2051 double *COIN_RESTRICT bBaseI = b + ii;
2052 for (int k = 0; k < BLOCKING8; k++) {
2053 //coin_prefetch_const(aBase2+BLOCKING8);
2054 for (int i = 0; i < BLOCKING8; i++) {
2055 bBaseI[i] -= bBase[k] * aBase2[i];
2056 }
2057 aBase2 += BLOCKING8;
2058 }
2059 }
2060 }
2061 dtrsm0(kkk, mm, m, m, a, b);
2062 }
2063 }
dtrsm1(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2064 static void dtrsm1(int kkk, int first, int last,
2065 int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2066 {
2067 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2068 assert((last - first) % BLOCKING8 == 0);
2069 if (last - first > CILK_DTRSM) {
2070 int mid = ((first + last) >> 4) << 3;
2071 cilk_spawn dtrsm1(kkk, first, mid, m, a, b);
2072 dtrsm1(kkk, mid, last, m, a, b);
2073 cilk_sync;
2074 } else {
2075 for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2076 double *COIN_RESTRICT bBase2 = b + iii;
2077 const double *COIN_RESTRICT aBaseA = a + BLOCKING8X8 + BLOCKING8 * iii;
2078 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2079 double *COIN_RESTRICT bBase = b + ii;
2080 const double *COIN_RESTRICT aBase = aBaseA + m * ii;
2081 #if AVX2 != 2
2082 #ifndef ALTERNATE_INNER
2083 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2084 aBase -= BLOCKING8;
2085 //coin_prefetch_const(aBase-BLOCKING8);
2086 for (int k = BLOCKING8 - 1; k >= 0; k--) {
2087 bBase2[k] -= bBase[i] * aBase[k];
2088 }
2089 }
2090 #else
2091 double b0 = bBase2[0];
2092 double b1 = bBase2[1];
2093 double b2 = bBase2[2];
2094 double b3 = bBase2[3];
2095 double b4 = bBase2[4];
2096 double b5 = bBase2[5];
2097 double b6 = bBase2[6];
2098 double b7 = bBase2[7];
2099 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2100 aBase -= BLOCKING8;
2101 //coin_prefetch_const(aBase-BLOCKING8);
2102 double bValue = bBase[i];
2103 b0 -= bValue * aBase[0];
2104 b1 -= bValue * aBase[1];
2105 b2 -= bValue * aBase[2];
2106 b3 -= bValue * aBase[3];
2107 b4 -= bValue * aBase[4];
2108 b5 -= bValue * aBase[5];
2109 b6 -= bValue * aBase[6];
2110 b7 -= bValue * aBase[7];
2111 }
2112 bBase2[0] = b0;
2113 bBase2[1] = b1;
2114 bBase2[2] = b2;
2115 bBase2[3] = b3;
2116 bBase2[4] = b4;
2117 bBase2[5] = b5;
2118 bBase2[6] = b6;
2119 bBase2[7] = b7;
2120 #endif
2121 #else
2122 __m256d b0 = _mm256_load_pd(bBase2);
2123 __m256d b1 = _mm256_load_pd(bBase2 + 4);
2124 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2125 aBase -= BLOCKING8;
2126 //coin_prefetch_const(aBase5-BLOCKING8);
2127 __m256d bb = _mm256_broadcast_sd(bBase + i);
2128 __m256d a0 = _mm256_load_pd(aBase);
2129 b0 -= a0 * bb;
2130 __m256d a1 = _mm256_load_pd(aBase + 4);
2131 b1 -= a1 * bb;
2132 }
2133 //_mm256_store_pd (bBase2, b0);
2134 *reinterpret_cast< __m256d * >(bBase2) = b0;
2135 //_mm256_store_pd (bBase2+4, b1);
2136 *reinterpret_cast< __m256d * >(bBase2 + 4) = b1;
2137 #endif
2138 }
2139 }
2140 }
2141 }
CoinAbcDtrsm1(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2142 void CoinAbcDtrsm1(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2143 {
2144 assert((m & (BLOCKING8 - 1)) == 0);
2145 // 1 Left Upper NoTranspose NonUnit
2146 for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2147 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2148 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2149 const double *COIN_RESTRICT aBase = a + m * ii + ii * BLOCKING8;
2150 double *COIN_RESTRICT bBase = b + ii;
2151 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2152 bBase[i] *= aBase[i * (BLOCKING8 + 1)];
2153 for (int k = i - 1; k >= 0; k--) {
2154 bBase[k] -= bBase[i] * aBase[k + i * BLOCKING8];
2155 }
2156 }
2157 double *COIN_RESTRICT bBase2 = bBase;
2158 for (int iii = ii; iii > mm; iii -= BLOCKING8) {
2159 bBase2 -= BLOCKING8;
2160 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2161 aBase -= BLOCKING8;
2162 coin_prefetch_const(aBase - BLOCKING8);
2163 for (int k = BLOCKING8 - 1; k >= 0; k--) {
2164 bBase2[k] -= bBase[i] * aBase[k];
2165 }
2166 }
2167 }
2168 }
2169 dtrsm1(kkk, 0, mm, m, a, b);
2170 }
2171 }
dtrsm2(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2172 static void dtrsm2(int kkk, int first, int last,
2173 int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2174 {
2175 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2176 assert((last - first) % BLOCKING8 == 0);
2177 if (last - first > CILK_DTRSM) {
2178 int mid = ((first + last) >> 4) << 3;
2179 cilk_spawn dtrsm2(kkk, first, mid, m, a, b);
2180 dtrsm2(kkk, mid, last, m, a, b);
2181 cilk_sync;
2182 } else {
2183 for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2184 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2185 const double *COIN_RESTRICT bBase = b + ii;
2186 double *COIN_RESTRICT bBase2 = b + iii;
2187 const double *COIN_RESTRICT aBase = a + ii * BLOCKING8 + m * iii + BLOCKING8X8;
2188 for (int j = BLOCKING8 - 1; j >= 0; j -= 4) {
2189 double bValue0 = bBase2[j];
2190 double bValue1 = bBase2[j - 1];
2191 double bValue2 = bBase2[j - 2];
2192 double bValue3 = bBase2[j - 3];
2193 bValue0 -= aBase[-1 * BLOCKING8 + 7] * bBase[7];
2194 bValue1 -= aBase[-2 * BLOCKING8 + 7] * bBase[7];
2195 bValue2 -= aBase[-3 * BLOCKING8 + 7] * bBase[7];
2196 bValue3 -= aBase[-4 * BLOCKING8 + 7] * bBase[7];
2197 bValue0 -= aBase[-1 * BLOCKING8 + 6] * bBase[6];
2198 bValue1 -= aBase[-2 * BLOCKING8 + 6] * bBase[6];
2199 bValue2 -= aBase[-3 * BLOCKING8 + 6] * bBase[6];
2200 bValue3 -= aBase[-4 * BLOCKING8 + 6] * bBase[6];
2201 bValue0 -= aBase[-1 * BLOCKING8 + 5] * bBase[5];
2202 bValue1 -= aBase[-2 * BLOCKING8 + 5] * bBase[5];
2203 bValue2 -= aBase[-3 * BLOCKING8 + 5] * bBase[5];
2204 bValue3 -= aBase[-4 * BLOCKING8 + 5] * bBase[5];
2205 bValue0 -= aBase[-1 * BLOCKING8 + 4] * bBase[4];
2206 bValue1 -= aBase[-2 * BLOCKING8 + 4] * bBase[4];
2207 bValue2 -= aBase[-3 * BLOCKING8 + 4] * bBase[4];
2208 bValue3 -= aBase[-4 * BLOCKING8 + 4] * bBase[4];
2209 bValue0 -= aBase[-1 * BLOCKING8 + 3] * bBase[3];
2210 bValue1 -= aBase[-2 * BLOCKING8 + 3] * bBase[3];
2211 bValue2 -= aBase[-3 * BLOCKING8 + 3] * bBase[3];
2212 bValue3 -= aBase[-4 * BLOCKING8 + 3] * bBase[3];
2213 bValue0 -= aBase[-1 * BLOCKING8 + 2] * bBase[2];
2214 bValue1 -= aBase[-2 * BLOCKING8 + 2] * bBase[2];
2215 bValue2 -= aBase[-3 * BLOCKING8 + 2] * bBase[2];
2216 bValue3 -= aBase[-4 * BLOCKING8 + 2] * bBase[2];
2217 bValue0 -= aBase[-1 * BLOCKING8 + 1] * bBase[1];
2218 bValue1 -= aBase[-2 * BLOCKING8 + 1] * bBase[1];
2219 bValue2 -= aBase[-3 * BLOCKING8 + 1] * bBase[1];
2220 bValue3 -= aBase[-4 * BLOCKING8 + 1] * bBase[1];
2221 bValue0 -= aBase[-1 * BLOCKING8 + 0] * bBase[0];
2222 bValue1 -= aBase[-2 * BLOCKING8 + 0] * bBase[0];
2223 bValue2 -= aBase[-3 * BLOCKING8 + 0] * bBase[0];
2224 bValue3 -= aBase[-4 * BLOCKING8 + 0] * bBase[0];
2225 bBase2[j] = bValue0;
2226 bBase2[j - 1] = bValue1;
2227 bBase2[j - 2] = bValue2;
2228 bBase2[j - 3] = bValue3;
2229 aBase -= 4 * BLOCKING8;
2230 }
2231 }
2232 }
2233 }
2234 }
CoinAbcDtrsm2(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2235 void CoinAbcDtrsm2(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2236 {
2237 assert((m & (BLOCKING8 - 1)) == 0);
2238 // 2 Left Lower Transpose Unit
2239 for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2240 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2241 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2242 double *COIN_RESTRICT bBase = b + ii;
2243 double *COIN_RESTRICT bBase2 = bBase;
2244 const double *COIN_RESTRICT aBase = a + m * ii + (ii + BLOCKING8) * BLOCKING8;
2245 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2246 aBase -= BLOCKING8;
2247 for (int k = i + 1; k < BLOCKING8; k++) {
2248 bBase2[i] -= aBase[k] * bBase2[k];
2249 }
2250 }
2251 for (int iii = ii - BLOCKING8; iii >= mm; iii -= BLOCKING8) {
2252 bBase2 -= BLOCKING8;
2253 assert(bBase2 == b + iii);
2254 aBase -= m * BLOCKING8;
2255 const double *COIN_RESTRICT aBase2 = aBase + BLOCKING8X8;
2256 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2257 aBase2 -= BLOCKING8;
2258 for (int k = BLOCKING8 - 1; k >= 0; k--) {
2259 bBase2[i] -= aBase2[k] * bBase[k];
2260 }
2261 }
2262 }
2263 }
2264 dtrsm2(kkk, 0, mm, m, a, b);
2265 }
2266 }
2267 #define UNROLL_DTRSM3 16
2268 #define CILK_DTRSM3 32
dtrsm3(int kkk,int first,int last,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2269 static void dtrsm3(int kkk, int first, int last,
2270 int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2271 {
2272 //int mm=CoinMin(kkk+UNROLL_DTRSM3*BLOCKING8,m);
2273 assert((last - first) % BLOCKING8 == 0);
2274 if (last - first > CILK_DTRSM3) {
2275 int mid = ((first + last) >> 4) << 3;
2276 cilk_spawn dtrsm3(kkk, first, mid, m, a, b);
2277 dtrsm3(kkk, mid, last, m, a, b);
2278 cilk_sync;
2279 } else {
2280 for (int kk = 0; kk < kkk; kk += BLOCKING8) {
2281 for (int ii = first; ii < last; ii += BLOCKING8) {
2282 double *COIN_RESTRICT bBase = b + ii;
2283 double *COIN_RESTRICT bBase2 = b + kk;
2284 const double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
2285 for (int j = 0; j < BLOCKING8; j += 4) {
2286 double bValue0 = bBase[j];
2287 double bValue1 = bBase[j + 1];
2288 double bValue2 = bBase[j + 2];
2289 double bValue3 = bBase[j + 3];
2290 bValue0 -= aBase[0] * bBase2[0];
2291 bValue1 -= aBase[1 * BLOCKING8 + 0] * bBase2[0];
2292 bValue2 -= aBase[2 * BLOCKING8 + 0] * bBase2[0];
2293 bValue3 -= aBase[3 * BLOCKING8 + 0] * bBase2[0];
2294 bValue0 -= aBase[1] * bBase2[1];
2295 bValue1 -= aBase[1 * BLOCKING8 + 1] * bBase2[1];
2296 bValue2 -= aBase[2 * BLOCKING8 + 1] * bBase2[1];
2297 bValue3 -= aBase[3 * BLOCKING8 + 1] * bBase2[1];
2298 bValue0 -= aBase[2] * bBase2[2];
2299 bValue1 -= aBase[1 * BLOCKING8 + 2] * bBase2[2];
2300 bValue2 -= aBase[2 * BLOCKING8 + 2] * bBase2[2];
2301 bValue3 -= aBase[3 * BLOCKING8 + 2] * bBase2[2];
2302 bValue0 -= aBase[3] * bBase2[3];
2303 bValue1 -= aBase[1 * BLOCKING8 + 3] * bBase2[3];
2304 bValue2 -= aBase[2 * BLOCKING8 + 3] * bBase2[3];
2305 bValue3 -= aBase[3 * BLOCKING8 + 3] * bBase2[3];
2306 bValue0 -= aBase[4] * bBase2[4];
2307 bValue1 -= aBase[1 * BLOCKING8 + 4] * bBase2[4];
2308 bValue2 -= aBase[2 * BLOCKING8 + 4] * bBase2[4];
2309 bValue3 -= aBase[3 * BLOCKING8 + 4] * bBase2[4];
2310 bValue0 -= aBase[5] * bBase2[5];
2311 bValue1 -= aBase[1 * BLOCKING8 + 5] * bBase2[5];
2312 bValue2 -= aBase[2 * BLOCKING8 + 5] * bBase2[5];
2313 bValue3 -= aBase[3 * BLOCKING8 + 5] * bBase2[5];
2314 bValue0 -= aBase[6] * bBase2[6];
2315 bValue1 -= aBase[1 * BLOCKING8 + 6] * bBase2[6];
2316 bValue2 -= aBase[2 * BLOCKING8 + 7] * bBase2[7];
2317 bValue3 -= aBase[3 * BLOCKING8 + 6] * bBase2[6];
2318 bValue0 -= aBase[7] * bBase2[7];
2319 bValue1 -= aBase[1 * BLOCKING8 + 7] * bBase2[7];
2320 bValue2 -= aBase[2 * BLOCKING8 + 6] * bBase2[6];
2321 bValue3 -= aBase[3 * BLOCKING8 + 7] * bBase2[7];
2322 bBase[j] = bValue0;
2323 bBase[j + 1] = bValue1;
2324 bBase[j + 2] = bValue2;
2325 bBase[j + 3] = bValue3;
2326 aBase += 4 * BLOCKING8;
2327 }
2328 }
2329 }
2330 }
2331 }
CoinAbcDtrsm3(int m,double * COIN_RESTRICT a,double * COIN_RESTRICT b)2332 void CoinAbcDtrsm3(int m, double *COIN_RESTRICT a, double *COIN_RESTRICT b)
2333 {
2334 assert((m & (BLOCKING8 - 1)) == 0);
2335 // 3 Left Upper Transpose NonUnit
2336 for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM3 * BLOCKING8) {
2337 int mm = CoinMin(m, kkk + UNROLL_DTRSM3 * BLOCKING8);
2338 dtrsm3(kkk, kkk, mm, m, a, b);
2339 for (int ii = kkk; ii < mm; ii += BLOCKING8) {
2340 double *COIN_RESTRICT bBase = b + ii;
2341 for (int kk = kkk; kk < ii; kk += BLOCKING8) {
2342 double *COIN_RESTRICT bBase2 = b + kk;
2343 const double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
2344 for (int i = 0; i < BLOCKING8; i++) {
2345 for (int k = 0; k < BLOCKING8; k++) {
2346 bBase[i] -= aBase[k] * bBase2[k];
2347 }
2348 aBase += BLOCKING8;
2349 }
2350 }
2351 const double *COIN_RESTRICT aBase = a + ii * m + ii * BLOCKING8;
2352 for (int i = 0; i < BLOCKING8; i++) {
2353 for (int k = 0; k < i; k++) {
2354 bBase[i] -= aBase[k] * bBase[k];
2355 }
2356 bBase[i] = bBase[i] * aBase[i];
2357 aBase += BLOCKING8;
2358 }
2359 }
2360 }
2361 }
2362 static void
CoinAbcDlaswp(int n,double * COIN_RESTRICT a,int lda,int start,int end,int * ipiv)2363 CoinAbcDlaswp(int n, double *COIN_RESTRICT a, int lda, int start, int end, int *ipiv)
2364 {
2365 assert((n & (BLOCKING8 - 1)) == 0);
2366 double *COIN_RESTRICT aThis3 = a;
2367 for (int j = 0; j < n; j += BLOCKING8) {
2368 for (int i = start; i < end; i++) {
2369 int ip = ipiv[i];
2370 if (ip != i) {
2371 double *COIN_RESTRICT aTop = aThis3 + j * lda;
2372 int iBias = ip / BLOCKING8;
2373 ip -= iBias * BLOCKING8;
2374 double *COIN_RESTRICT aNotTop = aTop + iBias * BLOCKING8X8;
2375 iBias = i / BLOCKING8;
2376 int i2 = i - iBias * BLOCKING8;
2377 aTop += iBias * BLOCKING8X8;
2378 for (int k = 0; k < BLOCKING8; k++) {
2379 double temp = aTop[i2];
2380 aTop[i2] = aNotTop[ip];
2381 aNotTop[ip] = temp;
2382 aTop += BLOCKING8;
2383 aNotTop += BLOCKING8;
2384 }
2385 }
2386 }
2387 }
2388 }
2389 extern void CoinAbcDgemm(int m, int n, int k, double *COIN_RESTRICT a, int lda,
2390 double *COIN_RESTRICT b, double *COIN_RESTRICT c
2391 #if ABC_PARALLEL == 2
2392 ,
2393 int parallelMode
2394 #endif
2395 );
CoinAbcDgetrf2(int m,int n,double * COIN_RESTRICT a,int * ipiv)2396 static int CoinAbcDgetrf2(int m, int n, double *COIN_RESTRICT a, int *ipiv)
2397 {
2398 assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
2399 assert(n == BLOCKING8);
2400 int dimension = CoinMin(m, n);
2401 /* entry for column j and row i (when multiple of BLOCKING8)
2402 is at aBlocked+j*m+i*BLOCKING8
2403 */
2404 assert(dimension == BLOCKING8);
2405 //printf("Dgetrf2 m=%d n=%d\n",m,n);
2406 double *COIN_RESTRICT aThis3 = a;
2407 for (int j = 0; j < dimension; j++) {
2408 int pivotRow = -1;
2409 double largest = 0.0;
2410 double *COIN_RESTRICT aThis2 = aThis3 + j * BLOCKING8;
2411 // this block
2412 int pivotRow2 = -1;
2413 for (int i = j; i < BLOCKING8; i++) {
2414 double value = fabs(aThis2[i]);
2415 if (value > largest) {
2416 largest = value;
2417 pivotRow2 = i;
2418 }
2419 }
2420 // other blocks
2421 int iBias = 0;
2422 for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
2423 aThis2 += BLOCKING8 * BLOCKING8;
2424 for (int i = 0; i < BLOCKING8; i++) {
2425 double value = fabs(aThis2[i]);
2426 if (value > largest) {
2427 largest = value;
2428 pivotRow2 = i;
2429 iBias = ii;
2430 }
2431 }
2432 }
2433 pivotRow = pivotRow2 + iBias;
2434 ipiv[j] = pivotRow;
2435 if (largest) {
2436 if (j != pivotRow) {
2437 double *COIN_RESTRICT aTop = aThis3;
2438 double *COIN_RESTRICT aNotTop = aThis3 + iBias * BLOCKING8;
2439 for (int i = 0; i < n; i++) {
2440 double value = aTop[j];
2441 aTop[j] = aNotTop[pivotRow2];
2442 aNotTop[pivotRow2] = value;
2443 aTop += BLOCKING8;
2444 aNotTop += BLOCKING8;
2445 }
2446 }
2447 aThis2 = aThis3 + j * BLOCKING8;
2448 double pivotMultiplier = 1.0 / aThis2[j];
2449 aThis2[j] = pivotMultiplier;
2450 // Do L
2451 // this block
2452 for (int i = j + 1; i < BLOCKING8; i++)
2453 aThis2[i] *= pivotMultiplier;
2454 // other blocks
2455 for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
2456 aThis2 += BLOCKING8 * BLOCKING8;
2457 for (int i = 0; i < BLOCKING8; i++)
2458 aThis2[i] *= pivotMultiplier;
2459 }
2460 // update rest
2461 double *COIN_RESTRICT aPut;
2462 aThis2 = aThis3 + j * BLOCKING8;
2463 aPut = aThis2 + BLOCKING8;
2464 double *COIN_RESTRICT aPut2 = aPut;
2465 // this block
2466 for (int i = j + 1; i < BLOCKING8; i++) {
2467 double value = aPut[j];
2468 for (int k = j + 1; k < BLOCKING8; k++)
2469 aPut[k] -= value * aThis2[k];
2470 aPut += BLOCKING8;
2471 }
2472 // other blocks
2473 for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
2474 aThis2 += BLOCKING8 * BLOCKING8;
2475 aPut = aThis2 + BLOCKING8;
2476 double *COIN_RESTRICT aPut2a = aPut2;
2477 for (int i = j + 1; i < BLOCKING8; i++) {
2478 double value = aPut2a[j];
2479 for (int k = 0; k < BLOCKING8; k++)
2480 aPut[k] -= value * aThis2[k];
2481 aPut += BLOCKING8;
2482 aPut2a += BLOCKING8;
2483 }
2484 }
2485 } else {
2486 return 1;
2487 }
2488 }
2489 return 0;
2490 }
2491
CoinAbcDgetrf(int m,int n,double * COIN_RESTRICT a,int lda,int * ipiv,int parallelMode)2492 int CoinAbcDgetrf(int m, int n, double *COIN_RESTRICT a, int lda, int *ipiv
2493 #if ABC_PARALLEL == 2
2494 ,
2495 int parallelMode
2496 #endif
2497 )
2498 {
2499 assert(m == n);
2500 assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
2501 if (m < BLOCKING8) {
2502 return CoinAbcDgetrf2(m, n, a, ipiv);
2503 } else {
2504 for (int j = 0; j < n; j += BLOCKING8) {
2505 int start = j;
2506 int newSize = CoinMin(BLOCKING8, n - j);
2507 int end = j + newSize;
2508 int returnCode = CoinAbcDgetrf2(m - start, newSize, a + (start * lda + start * BLOCKING8),
2509 ipiv + start);
2510 if (!returnCode) {
2511 // adjust
2512 for (int k = start; k < end; k++)
2513 ipiv[k] += start;
2514 // swap 0<start
2515 CoinAbcDlaswp(start, a, lda, start, end, ipiv);
2516 if (end < n) {
2517 // swap >=end
2518 CoinAbcDlaswp(n - end, a + end * lda, lda, start, end, ipiv);
2519 CoinAbcDtrsmFactor(newSize, n - end, a + (start * lda + start * BLOCKING8), lda);
2520 CoinAbcDgemm(n - end, n - end, newSize,
2521 a + start * lda + end * BLOCKING8, lda,
2522 a + end * lda + start * BLOCKING8, a + end * lda + end * BLOCKING8
2523 #if ABC_PARALLEL == 2
2524 ,
2525 parallelMode
2526 #endif
2527 );
2528 }
2529 } else {
2530 return returnCode;
2531 }
2532 }
2533 }
2534 return 0;
2535 }
CoinAbcDgetrs(char trans,int m,double * COIN_RESTRICT a,double * COIN_RESTRICT work)2536 void CoinAbcDgetrs(char trans, int m, double *COIN_RESTRICT a, double *COIN_RESTRICT work)
2537 {
2538 assert((m & (BLOCKING8 - 1)) == 0);
2539 if (trans == 'N') {
2540 //CoinAbcDlaswp1(work,m,ipiv);
2541 CoinAbcDtrsm0(m, a, work);
2542 CoinAbcDtrsm1(m, a, work);
2543 } else {
2544 assert(trans == 'T');
2545 CoinAbcDtrsm3(m, a, work);
2546 CoinAbcDtrsm2(m, a, work);
2547 //CoinAbcDlaswp1Backwards(work,m,ipiv);
2548 }
2549 }
2550 #ifdef ABC_LONG_FACTORIZATION
2551 /// ****** Start long double version
2552 /* type
2553 0 Left Lower NoTranspose Unit
2554 1 Left Upper NoTranspose NonUnit
2555 2 Left Lower Transpose Unit
2556 3 Left Upper Transpose NonUnit
2557 */
CoinAbcDtrsmFactor(int m,int n,long double * COIN_RESTRICT a,int lda)2558 static void CoinAbcDtrsmFactor(int m, int n, long double *COIN_RESTRICT a, int lda)
2559 {
2560 assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
2561 assert(m == BLOCKING8);
2562 // 0 Left Lower NoTranspose Unit
2563 /* entry for column j and row i (when multiple of BLOCKING8)
2564 is at aBlocked+j*m+i*BLOCKING8
2565 */
2566 long double *COIN_RESTRICT aBase2 = a;
2567 long double *COIN_RESTRICT bBase2 = aBase2 + lda * BLOCKING8;
2568 for (int jj = 0; jj < n; jj += BLOCKING8) {
2569 long double *COIN_RESTRICT bBase = bBase2;
2570 for (int j = jj; j < jj + BLOCKING8; j++) {
2571 #if 0
2572 long double * COIN_RESTRICT aBase = aBase2;
2573 for (int k=0;k<BLOCKING8;k++) {
2574 long double bValue = bBase[k];
2575 if (bValue) {
2576 for (int i=k+1;i<BLOCKING8;i++) {
2577 bBase[i]-=bValue*aBase[i];
2578 }
2579 }
2580 aBase+=BLOCKING8;
2581 }
2582 #else
2583 // unroll - stay in registers - don't test for zeros
2584 assert(BLOCKING8 == 8);
2585 long double bValue0 = bBase[0];
2586 long double bValue1 = bBase[1];
2587 long double bValue2 = bBase[2];
2588 long double bValue3 = bBase[3];
2589 long double bValue4 = bBase[4];
2590 long double bValue5 = bBase[5];
2591 long double bValue6 = bBase[6];
2592 long double bValue7 = bBase[7];
2593 bValue1 -= bValue0 * a[1 + 0 * BLOCKING8];
2594 bBase[1] = bValue1;
2595 bValue2 -= bValue0 * a[2 + 0 * BLOCKING8];
2596 bValue3 -= bValue0 * a[3 + 0 * BLOCKING8];
2597 bValue4 -= bValue0 * a[4 + 0 * BLOCKING8];
2598 bValue5 -= bValue0 * a[5 + 0 * BLOCKING8];
2599 bValue6 -= bValue0 * a[6 + 0 * BLOCKING8];
2600 bValue7 -= bValue0 * a[7 + 0 * BLOCKING8];
2601 bValue2 -= bValue1 * a[2 + 1 * BLOCKING8];
2602 bBase[2] = bValue2;
2603 bValue3 -= bValue1 * a[3 + 1 * BLOCKING8];
2604 bValue4 -= bValue1 * a[4 + 1 * BLOCKING8];
2605 bValue5 -= bValue1 * a[5 + 1 * BLOCKING8];
2606 bValue6 -= bValue1 * a[6 + 1 * BLOCKING8];
2607 bValue7 -= bValue1 * a[7 + 1 * BLOCKING8];
2608 bValue3 -= bValue2 * a[3 + 2 * BLOCKING8];
2609 bBase[3] = bValue3;
2610 bValue4 -= bValue2 * a[4 + 2 * BLOCKING8];
2611 bValue5 -= bValue2 * a[5 + 2 * BLOCKING8];
2612 bValue6 -= bValue2 * a[6 + 2 * BLOCKING8];
2613 bValue7 -= bValue2 * a[7 + 2 * BLOCKING8];
2614 bValue4 -= bValue3 * a[4 + 3 * BLOCKING8];
2615 bBase[4] = bValue4;
2616 bValue5 -= bValue3 * a[5 + 3 * BLOCKING8];
2617 bValue6 -= bValue3 * a[6 + 3 * BLOCKING8];
2618 bValue7 -= bValue3 * a[7 + 3 * BLOCKING8];
2619 bValue5 -= bValue4 * a[5 + 4 * BLOCKING8];
2620 bBase[5] = bValue5;
2621 bValue6 -= bValue4 * a[6 + 4 * BLOCKING8];
2622 bValue7 -= bValue4 * a[7 + 4 * BLOCKING8];
2623 bValue6 -= bValue5 * a[6 + 5 * BLOCKING8];
2624 bBase[6] = bValue6;
2625 bValue7 -= bValue5 * a[7 + 5 * BLOCKING8];
2626 bValue7 -= bValue6 * a[7 + 6 * BLOCKING8];
2627 bBase[7] = bValue7;
2628 #endif
2629 bBase += BLOCKING8;
2630 }
2631 bBase2 += lda * BLOCKING8;
2632 }
2633 }
2634 #define UNROLL_DTRSM 16
2635 #define CILK_DTRSM 32
dtrsm0(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2636 static void dtrsm0(int kkk, int first, int last,
2637 int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2638 {
2639 int mm = CoinMin(kkk + UNROLL_DTRSM * BLOCKING8, m);
2640 assert((last - first) % BLOCKING8 == 0);
2641 if (last - first > CILK_DTRSM) {
2642 int mid = ((first + last) >> 4) << 3;
2643 cilk_spawn dtrsm0(kkk, first, mid, m, a, b);
2644 dtrsm0(kkk, mid, last, m, a, b);
2645 cilk_sync;
2646 } else {
2647 const long double *COIN_RESTRICT aBaseA = a + UNROLL_DTRSM * BLOCKING8X8 + kkk * BLOCKING8;
2648 aBaseA += (first - mm) * BLOCKING8 - BLOCKING8X8;
2649 aBaseA += m * kkk;
2650 for (int ii = first; ii < last; ii += BLOCKING8) {
2651 aBaseA += BLOCKING8X8;
2652 const long double *COIN_RESTRICT aBaseB = aBaseA;
2653 long double *COIN_RESTRICT bBaseI = b + ii;
2654 for (int kk = kkk; kk < mm; kk += BLOCKING8) {
2655 long double *COIN_RESTRICT bBase = b + kk;
2656 const long double *COIN_RESTRICT aBase2 = aBaseB; //a+UNROLL_DTRSM*BLOCKING8X8+m*kk+kkk*BLOCKING8;
2657 //aBase2 += (ii-mm)*BLOCKING8;
2658 //assert (aBase2==aBaseB);
2659 aBaseB += m * BLOCKING8;
2660 #if AVX2 != 2
2661 #define ALTERNATE_INNER
2662 #ifndef ALTERNATE_INNER
2663 for (int k = 0; k < BLOCKING8; k++) {
2664 //coin_prefetch_const(aBase2+BLOCKING8);
2665 for (int i = 0; i < BLOCKING8; i++) {
2666 bBaseI[i] -= bBase[k] * aBase2[i];
2667 }
2668 aBase2 += BLOCKING8;
2669 }
2670 #else
2671 long double b0 = bBaseI[0];
2672 long double b1 = bBaseI[1];
2673 long double b2 = bBaseI[2];
2674 long double b3 = bBaseI[3];
2675 long double b4 = bBaseI[4];
2676 long double b5 = bBaseI[5];
2677 long double b6 = bBaseI[6];
2678 long double b7 = bBaseI[7];
2679 for (int k = 0; k < BLOCKING8; k++) {
2680 //coin_prefetch_const(aBase2+BLOCKING8);
2681 long double bValue = bBase[k];
2682 b0 -= bValue * aBase2[0];
2683 b1 -= bValue * aBase2[1];
2684 b2 -= bValue * aBase2[2];
2685 b3 -= bValue * aBase2[3];
2686 b4 -= bValue * aBase2[4];
2687 b5 -= bValue * aBase2[5];
2688 b6 -= bValue * aBase2[6];
2689 b7 -= bValue * aBase2[7];
2690 aBase2 += BLOCKING8;
2691 }
2692 bBaseI[0] = b0;
2693 bBaseI[1] = b1;
2694 bBaseI[2] = b2;
2695 bBaseI[3] = b3;
2696 bBaseI[4] = b4;
2697 bBaseI[5] = b5;
2698 bBaseI[6] = b6;
2699 bBaseI[7] = b7;
2700 #endif
2701 #else
2702 __m256d b0 = _mm256_load_pd(bBaseI);
2703 __m256d b1 = _mm256_load_pd(bBaseI + 4);
2704 for (int k = 0; k < BLOCKING8; k++) {
2705 //coin_prefetch_const(aBase2+BLOCKING8);
2706 __m256d bb = _mm256_broadcast_sd(bBase + k);
2707 __m256d a0 = _mm256_load_pd(aBase2);
2708 b0 -= a0 * bb;
2709 __m256d a1 = _mm256_load_pd(aBase2 + 4);
2710 b1 -= a1 * bb;
2711 aBase2 += BLOCKING8;
2712 }
2713 //_mm256_store_pd ((bBaseI), (b0));
2714 *reinterpret_cast< __m256d * >(bBaseI) = b0;
2715 //_mm256_store_pd (bBaseI+4, b1);
2716 *reinterpret_cast< __m256d * >(bBaseI + 4) = b1;
2717 #endif
2718 }
2719 }
2720 }
2721 }
CoinAbcDtrsm0(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2722 void CoinAbcDtrsm0(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2723 {
2724 assert((m & (BLOCKING8 - 1)) == 0);
2725 // 0 Left Lower NoTranspose Unit
2726 for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM * BLOCKING8) {
2727 int mm = CoinMin(m, kkk + UNROLL_DTRSM * BLOCKING8);
2728 for (int kk = kkk; kk < mm; kk += BLOCKING8) {
2729 const long double *COIN_RESTRICT aBase2 = a + kk * (m + BLOCKING8);
2730 long double *COIN_RESTRICT bBase = b + kk;
2731 for (int k = 0; k < BLOCKING8; k++) {
2732 for (int i = k + 1; i < BLOCKING8; i++) {
2733 bBase[i] -= bBase[k] * aBase2[i];
2734 }
2735 aBase2 += BLOCKING8;
2736 }
2737 for (int ii = kk + BLOCKING8; ii < mm; ii += BLOCKING8) {
2738 long double *COIN_RESTRICT bBaseI = b + ii;
2739 for (int k = 0; k < BLOCKING8; k++) {
2740 //coin_prefetch_const(aBase2+BLOCKING8);
2741 for (int i = 0; i < BLOCKING8; i++) {
2742 bBaseI[i] -= bBase[k] * aBase2[i];
2743 }
2744 aBase2 += BLOCKING8;
2745 }
2746 }
2747 }
2748 dtrsm0(kkk, mm, m, m, a, b);
2749 }
2750 }
dtrsm1(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2751 static void dtrsm1(int kkk, int first, int last,
2752 int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2753 {
2754 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2755 assert((last - first) % BLOCKING8 == 0);
2756 if (last - first > CILK_DTRSM) {
2757 int mid = ((first + last) >> 4) << 3;
2758 cilk_spawn dtrsm1(kkk, first, mid, m, a, b);
2759 dtrsm1(kkk, mid, last, m, a, b);
2760 cilk_sync;
2761 } else {
2762 for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2763 long double *COIN_RESTRICT bBase2 = b + iii;
2764 const long double *COIN_RESTRICT aBaseA = a + BLOCKING8X8 + BLOCKING8 * iii;
2765 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2766 long double *COIN_RESTRICT bBase = b + ii;
2767 const long double *COIN_RESTRICT aBase = aBaseA + m * ii;
2768 #if AVX2 != 2
2769 #ifndef ALTERNATE_INNER
2770 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2771 aBase -= BLOCKING8;
2772 //coin_prefetch_const(aBase-BLOCKING8);
2773 for (int k = BLOCKING8 - 1; k >= 0; k--) {
2774 bBase2[k] -= bBase[i] * aBase[k];
2775 }
2776 }
2777 #else
2778 long double b0 = bBase2[0];
2779 long double b1 = bBase2[1];
2780 long double b2 = bBase2[2];
2781 long double b3 = bBase2[3];
2782 long double b4 = bBase2[4];
2783 long double b5 = bBase2[5];
2784 long double b6 = bBase2[6];
2785 long double b7 = bBase2[7];
2786 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2787 aBase -= BLOCKING8;
2788 //coin_prefetch_const(aBase-BLOCKING8);
2789 long double bValue = bBase[i];
2790 b0 -= bValue * aBase[0];
2791 b1 -= bValue * aBase[1];
2792 b2 -= bValue * aBase[2];
2793 b3 -= bValue * aBase[3];
2794 b4 -= bValue * aBase[4];
2795 b5 -= bValue * aBase[5];
2796 b6 -= bValue * aBase[6];
2797 b7 -= bValue * aBase[7];
2798 }
2799 bBase2[0] = b0;
2800 bBase2[1] = b1;
2801 bBase2[2] = b2;
2802 bBase2[3] = b3;
2803 bBase2[4] = b4;
2804 bBase2[5] = b5;
2805 bBase2[6] = b6;
2806 bBase2[7] = b7;
2807 #endif
2808 #else
2809 __m256d b0 = _mm256_load_pd(bBase2);
2810 __m256d b1 = _mm256_load_pd(bBase2 + 4);
2811 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2812 aBase -= BLOCKING8;
2813 //coin_prefetch_const(aBase5-BLOCKING8);
2814 __m256d bb = _mm256_broadcast_sd(bBase + i);
2815 __m256d a0 = _mm256_load_pd(aBase);
2816 b0 -= a0 * bb;
2817 __m256d a1 = _mm256_load_pd(aBase + 4);
2818 b1 -= a1 * bb;
2819 }
2820 //_mm256_store_pd (bBase2, b0);
2821 *reinterpret_cast< __m256d * >(bBase2) = b0;
2822 //_mm256_store_pd (bBase2+4, b1);
2823 *reinterpret_cast< __m256d * >(bBase2 + 4) = b1;
2824 #endif
2825 }
2826 }
2827 }
2828 }
CoinAbcDtrsm1(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2829 void CoinAbcDtrsm1(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2830 {
2831 assert((m & (BLOCKING8 - 1)) == 0);
2832 // 1 Left Upper NoTranspose NonUnit
2833 for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2834 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2835 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2836 const long double *COIN_RESTRICT aBase = a + m * ii + ii * BLOCKING8;
2837 long double *COIN_RESTRICT bBase = b + ii;
2838 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2839 bBase[i] *= aBase[i * (BLOCKING8 + 1)];
2840 for (int k = i - 1; k >= 0; k--) {
2841 bBase[k] -= bBase[i] * aBase[k + i * BLOCKING8];
2842 }
2843 }
2844 long double *COIN_RESTRICT bBase2 = bBase;
2845 for (int iii = ii; iii > mm; iii -= BLOCKING8) {
2846 bBase2 -= BLOCKING8;
2847 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2848 aBase -= BLOCKING8;
2849 coin_prefetch_const(aBase - BLOCKING8);
2850 for (int k = BLOCKING8 - 1; k >= 0; k--) {
2851 bBase2[k] -= bBase[i] * aBase[k];
2852 }
2853 }
2854 }
2855 }
2856 dtrsm1(kkk, 0, mm, m, a, b);
2857 }
2858 }
dtrsm2(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2859 static void dtrsm2(int kkk, int first, int last,
2860 int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2861 {
2862 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2863 assert((last - first) % BLOCKING8 == 0);
2864 if (last - first > CILK_DTRSM) {
2865 int mid = ((first + last) >> 4) << 3;
2866 cilk_spawn dtrsm2(kkk, first, mid, m, a, b);
2867 dtrsm2(kkk, mid, last, m, a, b);
2868 cilk_sync;
2869 } else {
2870 for (int iii = last - BLOCKING8; iii >= first; iii -= BLOCKING8) {
2871 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2872 const long double *COIN_RESTRICT bBase = b + ii;
2873 long double *COIN_RESTRICT bBase2 = b + iii;
2874 const long double *COIN_RESTRICT aBase = a + ii * BLOCKING8 + m * iii + BLOCKING8X8;
2875 for (int j = BLOCKING8 - 1; j >= 0; j -= 4) {
2876 long double bValue0 = bBase2[j];
2877 long double bValue1 = bBase2[j - 1];
2878 long double bValue2 = bBase2[j - 2];
2879 long double bValue3 = bBase2[j - 3];
2880 bValue0 -= aBase[-1 * BLOCKING8 + 7] * bBase[7];
2881 bValue1 -= aBase[-2 * BLOCKING8 + 7] * bBase[7];
2882 bValue2 -= aBase[-3 * BLOCKING8 + 7] * bBase[7];
2883 bValue3 -= aBase[-4 * BLOCKING8 + 7] * bBase[7];
2884 bValue0 -= aBase[-1 * BLOCKING8 + 6] * bBase[6];
2885 bValue1 -= aBase[-2 * BLOCKING8 + 6] * bBase[6];
2886 bValue2 -= aBase[-3 * BLOCKING8 + 6] * bBase[6];
2887 bValue3 -= aBase[-4 * BLOCKING8 + 6] * bBase[6];
2888 bValue0 -= aBase[-1 * BLOCKING8 + 5] * bBase[5];
2889 bValue1 -= aBase[-2 * BLOCKING8 + 5] * bBase[5];
2890 bValue2 -= aBase[-3 * BLOCKING8 + 5] * bBase[5];
2891 bValue3 -= aBase[-4 * BLOCKING8 + 5] * bBase[5];
2892 bValue0 -= aBase[-1 * BLOCKING8 + 4] * bBase[4];
2893 bValue1 -= aBase[-2 * BLOCKING8 + 4] * bBase[4];
2894 bValue2 -= aBase[-3 * BLOCKING8 + 4] * bBase[4];
2895 bValue3 -= aBase[-4 * BLOCKING8 + 4] * bBase[4];
2896 bValue0 -= aBase[-1 * BLOCKING8 + 3] * bBase[3];
2897 bValue1 -= aBase[-2 * BLOCKING8 + 3] * bBase[3];
2898 bValue2 -= aBase[-3 * BLOCKING8 + 3] * bBase[3];
2899 bValue3 -= aBase[-4 * BLOCKING8 + 3] * bBase[3];
2900 bValue0 -= aBase[-1 * BLOCKING8 + 2] * bBase[2];
2901 bValue1 -= aBase[-2 * BLOCKING8 + 2] * bBase[2];
2902 bValue2 -= aBase[-3 * BLOCKING8 + 2] * bBase[2];
2903 bValue3 -= aBase[-4 * BLOCKING8 + 2] * bBase[2];
2904 bValue0 -= aBase[-1 * BLOCKING8 + 1] * bBase[1];
2905 bValue1 -= aBase[-2 * BLOCKING8 + 1] * bBase[1];
2906 bValue2 -= aBase[-3 * BLOCKING8 + 1] * bBase[1];
2907 bValue3 -= aBase[-4 * BLOCKING8 + 1] * bBase[1];
2908 bValue0 -= aBase[-1 * BLOCKING8 + 0] * bBase[0];
2909 bValue1 -= aBase[-2 * BLOCKING8 + 0] * bBase[0];
2910 bValue2 -= aBase[-3 * BLOCKING8 + 0] * bBase[0];
2911 bValue3 -= aBase[-4 * BLOCKING8 + 0] * bBase[0];
2912 bBase2[j] = bValue0;
2913 bBase2[j - 1] = bValue1;
2914 bBase2[j - 2] = bValue2;
2915 bBase2[j - 3] = bValue3;
2916 aBase -= 4 * BLOCKING8;
2917 }
2918 }
2919 }
2920 }
2921 }
CoinAbcDtrsm2(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2922 void CoinAbcDtrsm2(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2923 {
2924 assert((m & (BLOCKING8 - 1)) == 0);
2925 // 2 Left Lower Transpose Unit
2926 for (int kkk = m - BLOCKING8; kkk >= 0; kkk -= UNROLL_DTRSM * BLOCKING8) {
2927 int mm = CoinMax(0, kkk - (UNROLL_DTRSM - 1) * BLOCKING8);
2928 for (int ii = kkk; ii >= mm; ii -= BLOCKING8) {
2929 long double *COIN_RESTRICT bBase = b + ii;
2930 long double *COIN_RESTRICT bBase2 = bBase;
2931 const long double *COIN_RESTRICT aBase = a + m * ii + (ii + BLOCKING8) * BLOCKING8;
2932 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2933 aBase -= BLOCKING8;
2934 for (int k = i + 1; k < BLOCKING8; k++) {
2935 bBase2[i] -= aBase[k] * bBase2[k];
2936 }
2937 }
2938 for (int iii = ii - BLOCKING8; iii >= mm; iii -= BLOCKING8) {
2939 bBase2 -= BLOCKING8;
2940 assert(bBase2 == b + iii);
2941 aBase -= m * BLOCKING8;
2942 const long double *COIN_RESTRICT aBase2 = aBase + BLOCKING8X8;
2943 for (int i = BLOCKING8 - 1; i >= 0; i--) {
2944 aBase2 -= BLOCKING8;
2945 for (int k = BLOCKING8 - 1; k >= 0; k--) {
2946 bBase2[i] -= aBase2[k] * bBase[k];
2947 }
2948 }
2949 }
2950 }
2951 dtrsm2(kkk, 0, mm, m, a, b);
2952 }
2953 }
2954 #define UNROLL_DTRSM3 16
2955 #define CILK_DTRSM3 32
dtrsm3(int kkk,int first,int last,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)2956 static void dtrsm3(int kkk, int first, int last,
2957 int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
2958 {
2959 //int mm=CoinMin(kkk+UNROLL_DTRSM3*BLOCKING8,m);
2960 assert((last - first) % BLOCKING8 == 0);
2961 if (last - first > CILK_DTRSM3) {
2962 int mid = ((first + last) >> 4) << 3;
2963 cilk_spawn dtrsm3(kkk, first, mid, m, a, b);
2964 dtrsm3(kkk, mid, last, m, a, b);
2965 cilk_sync;
2966 } else {
2967 for (int kk = 0; kk < kkk; kk += BLOCKING8) {
2968 for (int ii = first; ii < last; ii += BLOCKING8) {
2969 long double *COIN_RESTRICT bBase = b + ii;
2970 long double *COIN_RESTRICT bBase2 = b + kk;
2971 const long double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
2972 for (int j = 0; j < BLOCKING8; j += 4) {
2973 long double bValue0 = bBase[j];
2974 long double bValue1 = bBase[j + 1];
2975 long double bValue2 = bBase[j + 2];
2976 long double bValue3 = bBase[j + 3];
2977 bValue0 -= aBase[0] * bBase2[0];
2978 bValue1 -= aBase[1 * BLOCKING8 + 0] * bBase2[0];
2979 bValue2 -= aBase[2 * BLOCKING8 + 0] * bBase2[0];
2980 bValue3 -= aBase[3 * BLOCKING8 + 0] * bBase2[0];
2981 bValue0 -= aBase[1] * bBase2[1];
2982 bValue1 -= aBase[1 * BLOCKING8 + 1] * bBase2[1];
2983 bValue2 -= aBase[2 * BLOCKING8 + 1] * bBase2[1];
2984 bValue3 -= aBase[3 * BLOCKING8 + 1] * bBase2[1];
2985 bValue0 -= aBase[2] * bBase2[2];
2986 bValue1 -= aBase[1 * BLOCKING8 + 2] * bBase2[2];
2987 bValue2 -= aBase[2 * BLOCKING8 + 2] * bBase2[2];
2988 bValue3 -= aBase[3 * BLOCKING8 + 2] * bBase2[2];
2989 bValue0 -= aBase[3] * bBase2[3];
2990 bValue1 -= aBase[1 * BLOCKING8 + 3] * bBase2[3];
2991 bValue2 -= aBase[2 * BLOCKING8 + 3] * bBase2[3];
2992 bValue3 -= aBase[3 * BLOCKING8 + 3] * bBase2[3];
2993 bValue0 -= aBase[4] * bBase2[4];
2994 bValue1 -= aBase[1 * BLOCKING8 + 4] * bBase2[4];
2995 bValue2 -= aBase[2 * BLOCKING8 + 4] * bBase2[4];
2996 bValue3 -= aBase[3 * BLOCKING8 + 4] * bBase2[4];
2997 bValue0 -= aBase[5] * bBase2[5];
2998 bValue1 -= aBase[1 * BLOCKING8 + 5] * bBase2[5];
2999 bValue2 -= aBase[2 * BLOCKING8 + 5] * bBase2[5];
3000 bValue3 -= aBase[3 * BLOCKING8 + 5] * bBase2[5];
3001 bValue0 -= aBase[6] * bBase2[6];
3002 bValue1 -= aBase[1 * BLOCKING8 + 6] * bBase2[6];
3003 bValue2 -= aBase[2 * BLOCKING8 + 7] * bBase2[7];
3004 bValue3 -= aBase[3 * BLOCKING8 + 6] * bBase2[6];
3005 bValue0 -= aBase[7] * bBase2[7];
3006 bValue1 -= aBase[1 * BLOCKING8 + 7] * bBase2[7];
3007 bValue2 -= aBase[2 * BLOCKING8 + 6] * bBase2[6];
3008 bValue3 -= aBase[3 * BLOCKING8 + 7] * bBase2[7];
3009 bBase[j] = bValue0;
3010 bBase[j + 1] = bValue1;
3011 bBase[j + 2] = bValue2;
3012 bBase[j + 3] = bValue3;
3013 aBase += 4 * BLOCKING8;
3014 }
3015 }
3016 }
3017 }
3018 }
CoinAbcDtrsm3(int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT b)3019 void CoinAbcDtrsm3(int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT b)
3020 {
3021 assert((m & (BLOCKING8 - 1)) == 0);
3022 // 3 Left Upper Transpose NonUnit
3023 for (int kkk = 0; kkk < m; kkk += UNROLL_DTRSM3 * BLOCKING8) {
3024 int mm = CoinMin(m, kkk + UNROLL_DTRSM3 * BLOCKING8);
3025 dtrsm3(kkk, kkk, mm, m, a, b);
3026 for (int ii = kkk; ii < mm; ii += BLOCKING8) {
3027 long double *COIN_RESTRICT bBase = b + ii;
3028 for (int kk = kkk; kk < ii; kk += BLOCKING8) {
3029 long double *COIN_RESTRICT bBase2 = b + kk;
3030 const long double *COIN_RESTRICT aBase = a + ii * m + kk * BLOCKING8;
3031 for (int i = 0; i < BLOCKING8; i++) {
3032 for (int k = 0; k < BLOCKING8; k++) {
3033 bBase[i] -= aBase[k] * bBase2[k];
3034 }
3035 aBase += BLOCKING8;
3036 }
3037 }
3038 const long double *COIN_RESTRICT aBase = a + ii * m + ii * BLOCKING8;
3039 for (int i = 0; i < BLOCKING8; i++) {
3040 for (int k = 0; k < i; k++) {
3041 bBase[i] -= aBase[k] * bBase[k];
3042 }
3043 bBase[i] = bBase[i] * aBase[i];
3044 aBase += BLOCKING8;
3045 }
3046 }
3047 }
3048 }
3049 static void
CoinAbcDlaswp(int n,long double * COIN_RESTRICT a,int lda,int start,int end,int * ipiv)3050 CoinAbcDlaswp(int n, long double *COIN_RESTRICT a, int lda, int start, int end, int *ipiv)
3051 {
3052 assert((n & (BLOCKING8 - 1)) == 0);
3053 long double *COIN_RESTRICT aThis3 = a;
3054 for (int j = 0; j < n; j += BLOCKING8) {
3055 for (int i = start; i < end; i++) {
3056 int ip = ipiv[i];
3057 if (ip != i) {
3058 long double *COIN_RESTRICT aTop = aThis3 + j * lda;
3059 int iBias = ip / BLOCKING8;
3060 ip -= iBias * BLOCKING8;
3061 long double *COIN_RESTRICT aNotTop = aTop + iBias * BLOCKING8X8;
3062 iBias = i / BLOCKING8;
3063 int i2 = i - iBias * BLOCKING8;
3064 aTop += iBias * BLOCKING8X8;
3065 for (int k = 0; k < BLOCKING8; k++) {
3066 long double temp = aTop[i2];
3067 aTop[i2] = aNotTop[ip];
3068 aNotTop[ip] = temp;
3069 aTop += BLOCKING8;
3070 aNotTop += BLOCKING8;
3071 }
3072 }
3073 }
3074 }
3075 }
3076 extern void CoinAbcDgemm(int m, int n, int k, long double *COIN_RESTRICT a, int lda,
3077 long double *COIN_RESTRICT b, long double *COIN_RESTRICT c
3078 #if ABC_PARALLEL == 2
3079 ,
3080 int parallelMode
3081 #endif
3082 );
CoinAbcDgetrf2(int m,int n,long double * COIN_RESTRICT a,int * ipiv)3083 static int CoinAbcDgetrf2(int m, int n, long double *COIN_RESTRICT a, int *ipiv)
3084 {
3085 assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
3086 assert(n == BLOCKING8);
3087 int dimension = CoinMin(m, n);
3088 /* entry for column j and row i (when multiple of BLOCKING8)
3089 is at aBlocked+j*m+i*BLOCKING8
3090 */
3091 assert(dimension == BLOCKING8);
3092 //printf("Dgetrf2 m=%d n=%d\n",m,n);
3093 long double *COIN_RESTRICT aThis3 = a;
3094 for (int j = 0; j < dimension; j++) {
3095 int pivotRow = -1;
3096 long double largest = 0.0;
3097 long double *COIN_RESTRICT aThis2 = aThis3 + j * BLOCKING8;
3098 // this block
3099 int pivotRow2 = -1;
3100 for (int i = j; i < BLOCKING8; i++) {
3101 long double value = fabsl(aThis2[i]);
3102 if (value > largest) {
3103 largest = value;
3104 pivotRow2 = i;
3105 }
3106 }
3107 // other blocks
3108 int iBias = 0;
3109 for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
3110 aThis2 += BLOCKING8 * BLOCKING8;
3111 for (int i = 0; i < BLOCKING8; i++) {
3112 long double value = fabsl(aThis2[i]);
3113 if (value > largest) {
3114 largest = value;
3115 pivotRow2 = i;
3116 iBias = ii;
3117 }
3118 }
3119 }
3120 pivotRow = pivotRow2 + iBias;
3121 ipiv[j] = pivotRow;
3122 if (largest) {
3123 if (j != pivotRow) {
3124 long double *COIN_RESTRICT aTop = aThis3;
3125 long double *COIN_RESTRICT aNotTop = aThis3 + iBias * BLOCKING8;
3126 for (int i = 0; i < n; i++) {
3127 long double value = aTop[j];
3128 aTop[j] = aNotTop[pivotRow2];
3129 aNotTop[pivotRow2] = value;
3130 aTop += BLOCKING8;
3131 aNotTop += BLOCKING8;
3132 }
3133 }
3134 aThis2 = aThis3 + j * BLOCKING8;
3135 long double pivotMultiplier = 1.0 / aThis2[j];
3136 aThis2[j] = pivotMultiplier;
3137 // Do L
3138 // this block
3139 for (int i = j + 1; i < BLOCKING8; i++)
3140 aThis2[i] *= pivotMultiplier;
3141 // other blocks
3142 for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
3143 aThis2 += BLOCKING8 * BLOCKING8;
3144 for (int i = 0; i < BLOCKING8; i++)
3145 aThis2[i] *= pivotMultiplier;
3146 }
3147 // update rest
3148 long double *COIN_RESTRICT aPut;
3149 aThis2 = aThis3 + j * BLOCKING8;
3150 aPut = aThis2 + BLOCKING8;
3151 long double *COIN_RESTRICT aPut2 = aPut;
3152 // this block
3153 for (int i = j + 1; i < BLOCKING8; i++) {
3154 long double value = aPut[j];
3155 for (int k = j + 1; k < BLOCKING8; k++)
3156 aPut[k] -= value * aThis2[k];
3157 aPut += BLOCKING8;
3158 }
3159 // other blocks
3160 for (int ii = BLOCKING8; ii < m; ii += BLOCKING8) {
3161 aThis2 += BLOCKING8 * BLOCKING8;
3162 aPut = aThis2 + BLOCKING8;
3163 long double *COIN_RESTRICT aPut2a = aPut2;
3164 for (int i = j + 1; i < BLOCKING8; i++) {
3165 long double value = aPut2a[j];
3166 for (int k = 0; k < BLOCKING8; k++)
3167 aPut[k] -= value * aThis2[k];
3168 aPut += BLOCKING8;
3169 aPut2a += BLOCKING8;
3170 }
3171 }
3172 } else {
3173 return 1;
3174 }
3175 }
3176 return 0;
3177 }
3178
CoinAbcDgetrf(int m,int n,long double * COIN_RESTRICT a,int lda,int * ipiv,int parallelMode)3179 int CoinAbcDgetrf(int m, int n, long double *COIN_RESTRICT a, int lda, int *ipiv
3180 #if ABC_PARALLEL == 2
3181 ,
3182 int parallelMode
3183 #endif
3184 )
3185 {
3186 assert(m == n);
3187 assert((m & (BLOCKING8 - 1)) == 0 && (n & (BLOCKING8 - 1)) == 0);
3188 if (m < BLOCKING8) {
3189 return CoinAbcDgetrf2(m, n, a, ipiv);
3190 } else {
3191 for (int j = 0; j < n; j += BLOCKING8) {
3192 int start = j;
3193 int newSize = CoinMin(BLOCKING8, n - j);
3194 int end = j + newSize;
3195 int returnCode = CoinAbcDgetrf2(m - start, newSize, a + (start * lda + start * BLOCKING8),
3196 ipiv + start);
3197 if (!returnCode) {
3198 // adjust
3199 for (int k = start; k < end; k++)
3200 ipiv[k] += start;
3201 // swap 0<start
3202 CoinAbcDlaswp(start, a, lda, start, end, ipiv);
3203 if (end < n) {
3204 // swap >=end
3205 CoinAbcDlaswp(n - end, a + end * lda, lda, start, end, ipiv);
3206 CoinAbcDtrsmFactor(newSize, n - end, a + (start * lda + start * BLOCKING8), lda);
3207 CoinAbcDgemm(n - end, n - end, newSize,
3208 a + start * lda + end * BLOCKING8, lda,
3209 a + end * lda + start * BLOCKING8, a + end * lda + end * BLOCKING8
3210 #if ABC_PARALLEL == 2
3211 ,
3212 parallelMode
3213 #endif
3214 );
3215 }
3216 } else {
3217 return returnCode;
3218 }
3219 }
3220 }
3221 return 0;
3222 }
CoinAbcDgetrs(char trans,int m,long double * COIN_RESTRICT a,long double * COIN_RESTRICT work)3223 void CoinAbcDgetrs(char trans, int m, long double *COIN_RESTRICT a, long double *COIN_RESTRICT work)
3224 {
3225 assert((m & (BLOCKING8 - 1)) == 0);
3226 if (trans == 'N') {
3227 //CoinAbcDlaswp1(work,m,ipiv);
3228 CoinAbcDtrsm0(m, a, work);
3229 CoinAbcDtrsm1(m, a, work);
3230 } else {
3231 assert(trans == 'T');
3232 CoinAbcDtrsm3(m, a, work);
3233 CoinAbcDtrsm2(m, a, work);
3234 //CoinAbcDlaswp1Backwards(work,m,ipiv);
3235 }
3236 }
3237 #endif
3238
3239 /* vi: softtabstop=2 shiftwidth=2 expandtab tabstop=2
3240 */
3241