Clp/src/ClpCholeskyDense.cpp

/* $Id: ClpCholeskyDense.cpp 2385 2019-01-06 19:43:06Z unxusr $ */
/*
  Copyright (C) 2003, International Business Machines Corporation
  and others.  All Rights Reserved.

  This code is licensed under the terms of the Eclipse Public License (EPL).
*/
#include "CoinPragma.hpp"
#include "CoinHelperFunctions.hpp"
#include "ClpConfig.h"
#include "ClpHelperFunctions.hpp"
#include "ClpInterior.hpp"
#include "ClpCholeskyDense.hpp"
#include "ClpMessage.hpp"
#include "ClpQuadraticObjective.hpp"
#if CLP_HAS_ABC
#include "CoinAbcCommon.hpp"
#endif
#if CLP_HAS_ABC < 3
#undef cilk_for
#undef cilk_spawn
#undef cilk_sync
#define cilk_for for
#define cilk_spawn
#define cilk_sync
#endif

/*#############################################################################*/
/* Constructors / Destructor / Assignment*/
/*#############################################################################*/

/*-------------------------------------------------------------------*/
/* Default Constructor */
/*-------------------------------------------------------------------*/
ClpCholeskyDense::ClpCholeskyDense()
  : ClpCholeskyBase()
  , borrowSpace_(false)
{
  type_ = 11;
  ;
}

/*-------------------------------------------------------------------*/
/* Copy constructor */
/*-------------------------------------------------------------------*/
ClpCholeskyDense::ClpCholeskyDense(const ClpCholeskyDense &rhs)
  : ClpCholeskyBase(rhs)
  , borrowSpace_(rhs.borrowSpace_)
{
  assert(!rhs.borrowSpace_ || !rhs.sizeFactor_); /* can't do if borrowing space*/
}

/*-------------------------------------------------------------------*/
/* Destructor */
/*-------------------------------------------------------------------*/
ClpCholeskyDense::~ClpCholeskyDense()
{
  if (borrowSpace_) {
    /* set NULL*/
    sparseFactor_ = NULL;
    workDouble_ = NULL;
    diagonal_ = NULL;
  }
}

/*----------------------------------------------------------------*/
/* Assignment operator */
/*-------------------------------------------------------------------*/
ClpCholeskyDense &
ClpCholeskyDense::operator=(const ClpCholeskyDense &rhs)
{
  if (this != &rhs) {
    assert(!rhs.borrowSpace_ || !rhs.sizeFactor_); /* can't do if borrowing space*/
    ClpCholeskyBase::operator=(rhs);
    borrowSpace_ = rhs.borrowSpace_;
  }
  return *this;
}
/*-------------------------------------------------------------------*/
/* Clone*/
/*-------------------------------------------------------------------*/
ClpCholeskyBase *ClpCholeskyDense::clone() const
{
  return new ClpCholeskyDense(*this);
}
/* If not power of 2 then need to redo a bit*/
#define BLOCK 16
#define BLOCKSHIFT 4
/* Block unroll if power of 2 and at least 8*/
#define BLOCKUNROLL

#define BLOCKSQ (BLOCK * BLOCK)
#define BLOCKSQSHIFT (BLOCKSHIFT + BLOCKSHIFT)
#define number_blocks(x) (((x) + BLOCK - 1) >> BLOCKSHIFT)
#define number_rows(x) ((x) << BLOCKSHIFT)
#define number_entries(x) ((x) << BLOCKSQSHIFT)
/* Gets space */
int ClpCholeskyDense::reserveSpace(const ClpCholeskyBase *factor, int numberRows)
{
  numberRows_ = numberRows;
  int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
  /* allow one stripe extra*/
  numberBlocks = numberBlocks + ((numberBlocks * (numberBlocks + 1)) / 2);
  sizeFactor_ = numberBlocks * BLOCKSQ;
  /*#define CHOL_COMPARE*/
#ifdef CHOL_COMPARE
  sizeFactor_ += 95000;
#endif
  if (!factor) {
    sparseFactor_ = new longDouble[sizeFactor_];
    rowsDropped_ = new char[numberRows_];
    memset(rowsDropped_, 0, numberRows_);
    workDouble_ = new longDouble[numberRows_];
    diagonal_ = new longDouble[numberRows_];
  } else {
    borrowSpace_ = true;
    int numberFull = factor->numberRows();
    sparseFactor_ = factor->sparseFactor() + (factor->size() - sizeFactor_);
    workDouble_ = factor->workDouble() + (numberFull - numberRows_);
    diagonal_ = factor->diagonal() + (numberFull - numberRows_);
  }
  numberRowsDropped_ = 0;
  return 0;
}
/* Returns space needed */
int ClpCholeskyDense::space(int numberRows) const
{
  int numberBlocks = (numberRows + BLOCK - 1) >> BLOCKSHIFT;
  /* allow one stripe extra*/
  numberBlocks = numberBlocks + ((numberBlocks * (numberBlocks + 1)) / 2);
  int sizeFactor = numberBlocks * BLOCKSQ;
#ifdef CHOL_COMPARE
  sizeFactor += 95000;
#endif
  return sizeFactor;
}
/* Orders rows and saves pointer to matrix.and model */
int ClpCholeskyDense::order(ClpInterior *model)
{
  model_ = model;
  int numberRows;
  int numberRowsModel = model_->numberRows();
  int numberColumns = model_->numberColumns();
  if (!doKKT_) {
    numberRows = numberRowsModel;
  } else {
    numberRows = 2 * numberRowsModel + numberColumns;
  }
  reserveSpace(NULL, numberRows);
  rowCopy_ = model->clpMatrix()->reverseOrderedCopy();
  return 0;
}
/* Does Symbolic factorization given permutation.
   This is called immediately after order.  If user provides this then
   user must provide factorize and solve.  Otherwise the default factorization is used
   returns non-zero if not enough memory */
int ClpCholeskyDense::symbolic()
{
  return 0;
}
/* Factorize - filling in rowsDropped and returning number dropped */
int ClpCholeskyDense::factorize(const CoinWorkDouble *diagonal, int *rowsDropped)
{
  assert(!borrowSpace_);
  const CoinBigIndex *columnStart = model_->clpMatrix()->getVectorStarts();
  const int *columnLength = model_->clpMatrix()->getVectorLengths();
  const int *row = model_->clpMatrix()->getIndices();
  const double *element = model_->clpMatrix()->getElements();
  const CoinBigIndex *rowStart = rowCopy_->getVectorStarts();
  const int *rowLength = rowCopy_->getVectorLengths();
  const int *column = rowCopy_->getIndices();
  const double *elementByRow = rowCopy_->getElements();
  int numberColumns = model_->clpMatrix()->getNumCols();
  CoinZeroN(sparseFactor_, sizeFactor_);
  /*perturbation*/
  CoinWorkDouble perturbation = model_->diagonalPerturbation() * model_->diagonalNorm();
  perturbation = perturbation * perturbation;
  if (perturbation > 1.0) {
#ifdef COIN_DEVELOP
    /*if (model_->model()->logLevel()&4) */
    std::cout << "large perturbation " << perturbation << std::endl;
#endif
    perturbation = CoinSqrt(perturbation);
    ;
    perturbation = 1.0;
  }
  int iRow;
  int newDropped = 0;
  CoinWorkDouble largest = 1.0;
  CoinWorkDouble smallest = COIN_DBL_MAX;
  CoinWorkDouble delta2 = model_->delta(); /* add delta*delta to diagonal*/
  delta2 *= delta2;
  if (!doKKT_) {
    longDouble *work = sparseFactor_;
    work--; /* skip diagonal*/
    int addOffset = numberRows_ - 1;
    const CoinWorkDouble *diagonalSlack = diagonal + numberColumns;
    /* largest in initial matrix*/
    CoinWorkDouble largest2 = 1.0e-20;
    for (iRow = 0; iRow < numberRows_; iRow++) {
      if (!rowsDropped_[iRow]) {
        CoinBigIndex startRow = rowStart[iRow];
        CoinBigIndex endRow = rowStart[iRow] + rowLength[iRow];
        CoinWorkDouble diagonalValue = diagonalSlack[iRow] + delta2;
        for (CoinBigIndex k = startRow; k < endRow; k++) {
          int iColumn = column[k];
          CoinBigIndex start = columnStart[iColumn];
          CoinBigIndex end = columnStart[iColumn] + columnLength[iColumn];
          CoinWorkDouble multiplier = diagonal[iColumn] * elementByRow[k];
          for (CoinBigIndex j = start; j < end; j++) {
            int jRow = row[j];
            if (!rowsDropped_[jRow]) {
              if (jRow > iRow) {
                CoinWorkDouble value = element[j] * multiplier;
                work[jRow] += value;
              } else if (jRow == iRow) {
                CoinWorkDouble value = element[j] * multiplier;
                diagonalValue += value;
              }
            }
          }
        }
        for (int j = iRow + 1; j < numberRows_; j++)
          largest2 = CoinMax(largest2, CoinAbs(work[j]));
        diagonal_[iRow] = diagonalValue;
        largest2 = CoinMax(largest2, CoinAbs(diagonalValue));
      } else {
        /* dropped*/
        diagonal_[iRow] = 1.0;
      }
      addOffset--;
      work += addOffset;
    }
    /*check sizes*/
    largest2 *= 1.0e-20;
    largest = CoinMin(largest2, CHOL_SMALL_VALUE);
    int numberDroppedBefore = 0;
    for (iRow = 0; iRow < numberRows_; iRow++) {
      int dropped = rowsDropped_[iRow];
      /* Move to int array*/
      rowsDropped[iRow] = dropped;
      if (!dropped) {
        CoinWorkDouble diagonal = diagonal_[iRow];
        if (diagonal > largest2) {
          diagonal_[iRow] = diagonal + perturbation;
        } else {
          diagonal_[iRow] = diagonal + perturbation;
          rowsDropped[iRow] = 2;
          numberDroppedBefore++;
        }
      }
    }
    doubleParameters_[10] = CoinMax(1.0e-20, largest);
    integerParameters_[20] = 0;
    doubleParameters_[3] = 0.0;
    doubleParameters_[4] = COIN_DBL_MAX;
    integerParameters_[34] = 0; /* say all must be positive*/
#ifdef CHOL_COMPARE
    if (numberRows_ < 200)
      factorizePart3(rowsDropped);
#endif
    factorizePart2(rowsDropped);
    newDropped = integerParameters_[20] + numberDroppedBefore;
    largest = doubleParameters_[3];
    smallest = doubleParameters_[4];
    if (model_->messageHandler()->logLevel() > 1)
      std::cout << "Cholesky - largest " << largest << " smallest " << smallest << std::endl;
    choleskyCondition_ = largest / smallest;
    /*drop fresh makes some formADAT easier*/
    if (newDropped || numberRowsDropped_) {
      newDropped = 0;
      for (int i = 0; i < numberRows_; i++) {
        char dropped = static_cast< char >(rowsDropped[i]);
        rowsDropped_[i] = dropped;
        if (dropped == 2) {
          /*dropped this time*/
          rowsDropped[newDropped++] = i;
          rowsDropped_[i] = 0;
        }
      }
      numberRowsDropped_ = newDropped;
      newDropped = -(2 + newDropped);
    }
  } else {
    /* KKT*/
    CoinPackedMatrix *quadratic = NULL;
    ClpQuadraticObjective *quadraticObj = (dynamic_cast< ClpQuadraticObjective * >(model_->objectiveAsObject()));
    if (quadraticObj)
      quadratic = quadraticObj->quadraticObjective();
    /* matrix*/
    int numberRowsModel = model_->numberRows();
    int numberColumns = model_->numberColumns();
    int numberTotal = numberColumns + numberRowsModel;
    longDouble *work = sparseFactor_;
    work--; /* skip diagonal*/
    int addOffset = numberRows_ - 1;
    int iColumn;
    if (!quadratic) {
      for (iColumn = 0; iColumn < numberColumns; iColumn++) {
        CoinWorkDouble value = diagonal[iColumn];
        if (CoinAbs(value) > 1.0e-100) {
          value = 1.0 / value;
          largest = CoinMax(largest, CoinAbs(value));
          diagonal_[iColumn] = -value;
          CoinBigIndex start = columnStart[iColumn];
          CoinBigIndex end = columnStart[iColumn] + columnLength[iColumn];
          for (CoinBigIndex j = start; j < end; j++) {
            /*choleskyRow_[numberElements]=row[j]+numberTotal;*/
            /*sparseFactor_[numberElements++]=element[j];*/
            work[row[j] + numberTotal] = element[j];
            largest = CoinMax(largest, CoinAbs(element[j]));
          }
        } else {
          diagonal_[iColumn] = -value;
        }
        addOffset--;
        work += addOffset;
      }
    } else {
      /* Quadratic*/
      const int *columnQuadratic = quadratic->getIndices();
      const CoinBigIndex *columnQuadraticStart = quadratic->getVectorStarts();
      const int *columnQuadraticLength = quadratic->getVectorLengths();
      const double *quadraticElement = quadratic->getElements();
      for (iColumn = 0; iColumn < numberColumns; iColumn++) {
        CoinWorkDouble value = diagonal[iColumn];
        CoinBigIndex j;
        if (CoinAbs(value) > 1.0e-100) {
          value = 1.0 / value;
          for (j = columnQuadraticStart[iColumn];
               j < columnQuadraticStart[iColumn] + columnQuadraticLength[iColumn]; j++) {
            int jColumn = columnQuadratic[j];
            if (jColumn > iColumn) {
              work[jColumn] = -quadraticElement[j];
            } else if (iColumn == jColumn) {
              value += quadraticElement[j];
            }
          }
          largest = CoinMax(largest, CoinAbs(value));
          diagonal_[iColumn] = -value;
          CoinBigIndex start = columnStart[iColumn];
          CoinBigIndex end = columnStart[iColumn] + columnLength[iColumn];
          for (j = start; j < end; j++) {
            work[row[j] + numberTotal] = element[j];
            largest = CoinMax(largest, CoinAbs(element[j]));
          }
        } else {
          value = 1.0e100;
          diagonal_[iColumn] = -value;
        }
        addOffset--;
        work += addOffset;
      }
    }
    /* slacks*/
    for (iColumn = numberColumns; iColumn < numberTotal; iColumn++) {
      CoinWorkDouble value = diagonal[iColumn];
      if (CoinAbs(value) > 1.0e-100) {
        value = 1.0 / value;
        largest = CoinMax(largest, CoinAbs(value));
      } else {
        value = 1.0e100;
      }
      diagonal_[iColumn] = -value;
      work[iColumn - numberColumns + numberTotal] = -1.0;
      addOffset--;
      work += addOffset;
    }
    /* Finish diagonal*/
    for (iRow = 0; iRow < numberRowsModel; iRow++) {
      diagonal_[iRow + numberTotal] = delta2;
    }
    /*check sizes*/
    largest *= 1.0e-20;
    largest = CoinMin(largest, CHOL_SMALL_VALUE);
    doubleParameters_[10] = CoinMax(1.0e-20, largest);
    integerParameters_[20] = 0;
    doubleParameters_[3] = 0.0;
    doubleParameters_[4] = COIN_DBL_MAX;
    /* Set up LDL cutoff*/
    integerParameters_[34] = numberTotal;
    /* KKT*/
    int *rowsDropped2 = new int[numberRows_];
    CoinZeroN(rowsDropped2, numberRows_);
#ifdef CHOL_COMPARE
    if (numberRows_ < 200)
      factorizePart3(rowsDropped2);
#endif
    factorizePart2(rowsDropped2);
    newDropped = integerParameters_[20];
    largest = doubleParameters_[3];
    smallest = doubleParameters_[4];
    if (model_->messageHandler()->logLevel() > 1)
      COIN_DETAIL_PRINT(std::cout << "Cholesky - largest " << largest << " smallest " << smallest << std::endl);
    choleskyCondition_ = largest / smallest;
    /* Should save adjustments in ..R_*/
    int n1 = 0, n2 = 0;
    CoinWorkDouble *primalR = model_->primalR();
    CoinWorkDouble *dualR = model_->dualR();
    for (iRow = 0; iRow < numberTotal; iRow++) {
      if (rowsDropped2[iRow]) {
        n1++;
        /*printf("row region1 %d dropped\n",iRow);*/
        /*rowsDropped_[iRow]=1;*/
        rowsDropped_[iRow] = 0;
        primalR[iRow] = doubleParameters_[20];
      } else {
        rowsDropped_[iRow] = 0;
        primalR[iRow] = 0.0;
      }
    }
    for (; iRow < numberRows_; iRow++) {
      if (rowsDropped2[iRow]) {
        n2++;
        /*printf("row region2 %d dropped\n",iRow);*/
        /*rowsDropped_[iRow]=1;*/
        rowsDropped_[iRow] = 0;
        dualR[iRow - numberTotal] = doubleParameters_[34];
      } else {
        rowsDropped_[iRow] = 0;
        dualR[iRow - numberTotal] = 0.0;
      }
    }
  }
  return 0;
}
/* Factorize - filling in rowsDropped and returning number dropped */
void ClpCholeskyDense::factorizePart3(int *rowsDropped)
{
  int iColumn;
  longDouble *xx = sparseFactor_;
  longDouble *yy = diagonal_;
  diagonal_ = sparseFactor_ + 40000;
  sparseFactor_ = diagonal_ + numberRows_;
  /*memcpy(sparseFactor_,xx,sizeFactor_*sizeof(double));*/
  CoinMemcpyN(xx, 40000, sparseFactor_);
  CoinMemcpyN(yy, numberRows_, diagonal_);
  int numberDropped = 0;
  CoinWorkDouble largest = 0.0;
  CoinWorkDouble smallest = COIN_DBL_MAX;
  double dropValue = doubleParameters_[10];
  int firstPositive = integerParameters_[34];
  longDouble *work = sparseFactor_;
  /* Allow for triangular*/
  int addOffset = numberRows_ - 1;
  work--;
  for (iColumn = 0; iColumn < numberRows_; iColumn++) {
    int iRow;
    int addOffsetNow = numberRows_ - 1;
    ;
    longDouble *workNow = sparseFactor_ - 1 + iColumn;
    CoinWorkDouble diagonalValue = diagonal_[iColumn];
    for (iRow = 0; iRow < iColumn; iRow++) {
      double aj = *workNow;
      addOffsetNow--;
      workNow += addOffsetNow;
      diagonalValue -= aj * aj * workDouble_[iRow];
    }
    bool dropColumn = false;
    if (iColumn < firstPositive) {
      /* must be negative*/
      if (diagonalValue <= -dropValue) {
        smallest = CoinMin(smallest, -diagonalValue);
        largest = CoinMax(largest, -diagonalValue);
        workDouble_[iColumn] = diagonalValue;
        diagonalValue = 1.0 / diagonalValue;
      } else {
        dropColumn = true;
        workDouble_[iColumn] = -1.0e100;
        diagonalValue = 0.0;
        integerParameters_[20]++;
      }
    } else {
      /* must be positive*/
      if (diagonalValue >= dropValue) {
        smallest = CoinMin(smallest, diagonalValue);
        largest = CoinMax(largest, diagonalValue);
        workDouble_[iColumn] = diagonalValue;
        diagonalValue = 1.0 / diagonalValue;
      } else {
        dropColumn = true;
        workDouble_[iColumn] = 1.0e100;
        diagonalValue = 0.0;
        integerParameters_[20]++;
      }
    }
    if (!dropColumn) {
      diagonal_[iColumn] = diagonalValue;
      for (iRow = iColumn + 1; iRow < numberRows_; iRow++) {
        double value = work[iRow];
        workNow = sparseFactor_ - 1;
        int addOffsetNow = numberRows_ - 1;
        ;
        for (int jColumn = 0; jColumn < iColumn; jColumn++) {
          double aj = workNow[iColumn];
          double multiplier = workDouble_[jColumn];
          double ai = workNow[iRow];
          addOffsetNow--;
          workNow += addOffsetNow;
          value -= aj * ai * multiplier;
        }
        work[iRow] = value * diagonalValue;
      }
    } else {
      /* drop column*/
      rowsDropped[iColumn] = 2;
      numberDropped++;
      diagonal_[iColumn] = 0.0;
      for (iRow = iColumn + 1; iRow < numberRows_; iRow++) {
        work[iRow] = 0.0;
      }
    }
    addOffset--;
    work += addOffset;
  }
  doubleParameters_[3] = largest;
  doubleParameters_[4] = smallest;
  integerParameters_[20] = numberDropped;
  sparseFactor_ = xx;
  diagonal_ = yy;
}
/*#define POS_DEBUG*/
#ifdef POS_DEBUG
static int counter = 0;
int ClpCholeskyDense::bNumber(const longDouble *array, int &iRow, int &iCol)
{
  int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
  longDouble *a = sparseFactor_ + BLOCKSQ * numberBlocks;
  int k = array - a;
  assert((k % BLOCKSQ) == 0);
  iCol = 0;
  int kk = k >> BLOCKSQSHIFT;
  /*printf("%d %d %d %d\n",k,kk,BLOCKSQ,BLOCKSQSHIFT);*/
  k = kk;
  while (k >= numberBlocks) {
    iCol++;
    k -= numberBlocks;
    numberBlocks--;
  }
  iRow = iCol + k;
  counter++;
  if (counter > 100000)
    exit(77);
  return kk;
}
#endif
/* Factorize - filling in rowsDropped and returning number dropped */
void ClpCholeskyDense::factorizePart2(int *rowsDropped)
{
  int iColumn;
  /*longDouble * xx = sparseFactor_;*/
  /*longDouble * yy = diagonal_;*/
  /*diagonal_ = sparseFactor_ + 40000;*/
  /*sparseFactor_=diagonal_ + numberRows_;*/
  /*memcpy(sparseFactor_,xx,sizeFactor_*sizeof(double));*/
  /*memcpy(sparseFactor_,xx,40000*sizeof(double));*/
  /*memcpy(diagonal_,yy,numberRows_*sizeof(double));*/
  int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
  /* later align on boundary*/
  longDouble *a = sparseFactor_ + BLOCKSQ * numberBlocks;
  int n = numberRows_;
  int nRound = numberRows_ & (~(BLOCK - 1));
  /* adjust if exact*/
  if (nRound == n)
    nRound -= BLOCK;
  int sizeLastBlock = n - nRound;
  int get = n * (n - 1) / 2; /* ? as no diagonal*/
  int block = numberBlocks * (numberBlocks + 1) / 2;
  int ifOdd;
  int rowLast;
  if (sizeLastBlock != BLOCK) {
    longDouble *aa = &a[(block - 1) * BLOCKSQ];
    rowLast = nRound - 1;
    ifOdd = 1;
    int put = BLOCKSQ;
    /* do last separately*/
    put -= (BLOCK - sizeLastBlock) * (BLOCK + 1);
    for (iColumn = numberRows_ - 1; iColumn >= nRound; iColumn--) {
      int put2 = put;
      put -= BLOCK;
      for (int iRow = numberRows_ - 1; iRow > iColumn; iRow--) {
        aa[--put2] = sparseFactor_[--get];
        assert(aa + put2 >= sparseFactor_ + get);
      }
      /* save diagonal as well*/
      aa[--put2] = diagonal_[iColumn];
    }
    n = nRound;
    block--;
  } else {
    /* exact fit*/
    rowLast = numberRows_ - 1;
    ifOdd = 0;
  }
  /* Now main loop*/
  int nBlock = 0;
  for (; n > 0; n -= BLOCK) {
    longDouble *aa = &a[(block - 1) * BLOCKSQ];
    longDouble *aaLast = NULL;
    int put = BLOCKSQ;
    int putLast = 0;
    /* see if we have small block*/
    if (ifOdd) {
      aaLast = &a[(block - 1) * BLOCKSQ];
      aa = aaLast - BLOCKSQ;
      putLast = BLOCKSQ - BLOCK + sizeLastBlock;
    }
    for (iColumn = n - 1; iColumn >= n - BLOCK; iColumn--) {
      if (aaLast) {
        /* last bit*/
        for (int iRow = numberRows_ - 1; iRow > rowLast; iRow--) {
          aaLast[--putLast] = sparseFactor_[--get];
          assert(aaLast + putLast >= sparseFactor_ + get);
        }
        putLast -= BLOCK - sizeLastBlock;
      }
      longDouble *aPut = aa;
      int j = rowLast;
      for (int jBlock = 0; jBlock <= nBlock; jBlock++) {
        int put2 = put;
        int last = CoinMax(j - BLOCK, iColumn);
        for (int iRow = j; iRow > last; iRow--) {
          aPut[--put2] = sparseFactor_[--get];
          assert(aPut + put2 >= sparseFactor_ + get);
        }
        if (j - BLOCK < iColumn) {
          /* save diagonal as well*/
          aPut[--put2] = diagonal_[iColumn];
        }
        j -= BLOCK;
        aPut -= BLOCKSQ;
      }
      put -= BLOCK;
    }
    nBlock++;
    block -= nBlock + ifOdd;
  }
  ClpCholeskyDenseC info;
  info.diagonal_ = diagonal_;
  info.doubleParameters_[0] = doubleParameters_[10];
  info.integerParameters_[0] = integerParameters_[34];
#ifndef CLP_CILK
  ClpCholeskyCfactor(&info, a, numberRows_, numberBlocks,
    diagonal_, workDouble_, rowsDropped);
#else
  info.a = a;
  info.n = numberRows_;
  info.numberBlocks = numberBlocks;
  info.work = workDouble_;
  info.rowsDropped = rowsDropped;
  info.integerParameters_[1] = model_->numberThreads();
  ClpCholeskySpawn(&info);
#endif
  double largest = 0.0;
  double smallest = COIN_DBL_MAX;
  int numberDropped = 0;
  for (int i = 0; i < numberRows_; i++) {
    if (diagonal_[i]) {
      largest = CoinMax(largest, CoinAbs(diagonal_[i]));
      smallest = CoinMin(smallest, CoinAbs(diagonal_[i]));
    } else {
      numberDropped++;
    }
  }
  doubleParameters_[3] = CoinMax(doubleParameters_[3], 1.0 / smallest);
  doubleParameters_[4] = CoinMin(doubleParameters_[4], 1.0 / largest);
  integerParameters_[20] += numberDropped;
}
/* Non leaf recursive factor*/
void ClpCholeskyCfactor(ClpCholeskyDenseC *thisStruct, longDouble *a, int n, int numberBlocks,
  longDouble *diagonal, longDouble *work, int *rowsDropped)
{
  if (n <= BLOCK) {
    ClpCholeskyCfactorLeaf(thisStruct, a, n, diagonal, work, rowsDropped);
  } else {
    int nb = number_blocks((n + 1) >> 1);
    int nThis = number_rows(nb);
    longDouble *aother;
    int nLeft = n - nThis;
    int nintri = (nb * (nb + 1)) >> 1;
    int nbelow = (numberBlocks - nb) * nb;
    ClpCholeskyCfactor(thisStruct, a, nThis, numberBlocks, diagonal, work, rowsDropped);
    ClpCholeskyCtriRec(thisStruct, a, nThis, a + number_entries(nb), diagonal, work, nLeft, nb, 0, numberBlocks);
    aother = a + number_entries(nintri + nbelow);
    ClpCholeskyCrecTri(thisStruct, a + number_entries(nb), nLeft, nThis, nb, 0, aother, diagonal, work, numberBlocks);
    ClpCholeskyCfactor(thisStruct, aother, nLeft,
      numberBlocks - nb, diagonal + nThis, work + nThis, rowsDropped);
  }
}
/* Non leaf recursive triangle rectangle update*/
void ClpCholeskyCtriRec(ClpCholeskyDenseC *thisStruct, longDouble *aTri, int nThis, longDouble *aUnder,
  longDouble *diagonal, longDouble *work,
  int nLeft, int iBlock, int jBlock,
  int numberBlocks)
{
  if (nThis <= BLOCK && nLeft <= BLOCK) {
    ClpCholeskyCtriRecLeaf(/*thisStruct,*/ aTri, aUnder, diagonal, work, nLeft);
  } else if (nThis < nLeft) {
    int nb = number_blocks((nLeft + 1) >> 1);
    int nLeft2 = number_rows(nb);
    cilk_spawn ClpCholeskyCtriRec(thisStruct, aTri, nThis, aUnder, diagonal, work, nLeft2, iBlock, jBlock, numberBlocks);
    ClpCholeskyCtriRec(thisStruct, aTri, nThis, aUnder + number_entries(nb), diagonal, work, nLeft - nLeft2,
      iBlock + nb, jBlock, numberBlocks);
    cilk_sync;
  } else {
    int nb = number_blocks((nThis + 1) >> 1);
    int nThis2 = number_rows(nb);
    longDouble *aother;
    int kBlock = jBlock + nb;
    int i;
    int nintri = (nb * (nb + 1)) >> 1;
    int nbelow = (numberBlocks - nb) * nb;
    ClpCholeskyCtriRec(thisStruct, aTri, nThis2, aUnder, diagonal, work, nLeft, iBlock, jBlock, numberBlocks);
    /* and rectangular update */
    i = ((numberBlocks - jBlock) * (numberBlocks - jBlock - 1) - (numberBlocks - jBlock - nb) * (numberBlocks - jBlock - nb - 1)) >> 1;
    aother = aUnder + number_entries(i);
    ClpCholeskyCrecRec(thisStruct, aTri + number_entries(nb), nThis - nThis2, nLeft, nThis2, aUnder, aother,
      work, kBlock, jBlock, numberBlocks);
    ClpCholeskyCtriRec(thisStruct, aTri + number_entries(nintri + nbelow), nThis - nThis2, aother, diagonal + nThis2,
      work + nThis2, nLeft,
      iBlock - nb, kBlock - nb, numberBlocks - nb);
  }
}
/* Non leaf recursive rectangle triangle update*/
void ClpCholeskyCrecTri(ClpCholeskyDenseC *thisStruct, longDouble *aUnder, int nTri, int nDo,
  int iBlock, int jBlock, longDouble *aTri,
  longDouble *diagonal, longDouble *work,
  int numberBlocks)
{
  if (nTri <= BLOCK && nDo <= BLOCK) {
    ClpCholeskyCrecTriLeaf(/*thisStruct,*/ aUnder, aTri, /*diagonal,*/ work, nTri);
  } else if (nTri < nDo) {
    int nb = number_blocks((nDo + 1) >> 1);
    int nDo2 = number_rows(nb);
    longDouble *aother;
    int i;
    ClpCholeskyCrecTri(thisStruct, aUnder, nTri, nDo2, iBlock, jBlock, aTri, diagonal, work, numberBlocks);
    i = ((numberBlocks - jBlock) * (numberBlocks - jBlock - 1) - (numberBlocks - jBlock - nb) * (numberBlocks - jBlock - nb - 1)) >> 1;
    aother = aUnder + number_entries(i);
    ClpCholeskyCrecTri(thisStruct, aother, nTri, nDo - nDo2, iBlock - nb, jBlock, aTri, diagonal + nDo2,
      work + nDo2, numberBlocks - nb);
  } else {
    int nb = number_blocks((nTri + 1) >> 1);
    int nTri2 = number_rows(nb);
    longDouble *aother;
    int i;
    cilk_spawn ClpCholeskyCrecTri(thisStruct, aUnder, nTri2, nDo, iBlock, jBlock, aTri, diagonal, work, numberBlocks);
    /* and rectangular update */
    i = ((numberBlocks - iBlock) * (numberBlocks - iBlock + 1) - (numberBlocks - iBlock - nb) * (numberBlocks - iBlock - nb + 1)) >> 1;
    aother = aTri + number_entries(nb);
    ClpCholeskyCrecRec(thisStruct, aUnder, nTri2, nTri - nTri2, nDo, aUnder + number_entries(nb), aother,
      work, iBlock, jBlock, numberBlocks);
    ClpCholeskyCrecTri(thisStruct, aUnder + number_entries(nb), nTri - nTri2, nDo, iBlock + nb, jBlock,
      aTri + number_entries(i), diagonal, work, numberBlocks);
    cilk_sync;
  }
}
/* Non leaf recursive rectangle rectangle update,
   nUnder is number of rows in iBlock,
   nUnderK is number of rows in kBlock
*/
void ClpCholeskyCrecRec(ClpCholeskyDenseC *thisStruct, longDouble *above, int nUnder, int nUnderK,
  int nDo, longDouble *aUnder, longDouble *aOther,
  longDouble *work,
  int iBlock, int jBlock,
  int numberBlocks)
{
  if (nDo <= BLOCK && nUnder <= BLOCK && nUnderK <= BLOCK) {
    assert(nDo == BLOCK && nUnder == BLOCK);
    ClpCholeskyCrecRecLeaf(/*thisStruct,*/ above, aUnder, aOther, work, nUnderK);
  } else if (nDo <= nUnderK && nUnder <= nUnderK) {
    int nb = number_blocks((nUnderK + 1) >> 1);
    int nUnder2 = number_rows(nb);
    cilk_spawn ClpCholeskyCrecRec(thisStruct, above, nUnder, nUnder2, nDo, aUnder, aOther, work,
      iBlock, jBlock, numberBlocks);
    ClpCholeskyCrecRec(thisStruct, above, nUnder, nUnderK - nUnder2, nDo, aUnder + number_entries(nb),
      aOther + number_entries(nb), work, iBlock, jBlock, numberBlocks);
    cilk_sync;
  } else if (nUnderK <= nDo && nUnder <= nDo) {
    int nb = number_blocks((nDo + 1) >> 1);
    int nDo2 = number_rows(nb);
    int i;
    ClpCholeskyCrecRec(thisStruct, above, nUnder, nUnderK, nDo2, aUnder, aOther, work,
      iBlock, jBlock, numberBlocks);
    i = ((numberBlocks - jBlock) * (numberBlocks - jBlock - 1) - (numberBlocks - jBlock - nb) * (numberBlocks - jBlock - nb - 1)) >> 1;
    ClpCholeskyCrecRec(thisStruct, above + number_entries(i), nUnder, nUnderK, nDo - nDo2,
      aUnder + number_entries(i),
      aOther, work + nDo2,
      iBlock - nb, jBlock, numberBlocks - nb);
  } else {
    int nb = number_blocks((nUnder + 1) >> 1);
    int nUnder2 = number_rows(nb);
    int i;
    cilk_spawn ClpCholeskyCrecRec(thisStruct, above, nUnder2, nUnderK, nDo, aUnder, aOther, work,
      iBlock, jBlock, numberBlocks);
    i = ((numberBlocks - iBlock) * (numberBlocks - iBlock - 1) - (numberBlocks - iBlock - nb) * (numberBlocks - iBlock - nb - 1)) >> 1;
    ClpCholeskyCrecRec(thisStruct, above + number_entries(nb), nUnder - nUnder2, nUnderK, nDo, aUnder,
      aOther + number_entries(i), work, iBlock + nb, jBlock, numberBlocks);
    cilk_sync;
  }
}
/* Leaf recursive factor*/
void ClpCholeskyCfactorLeaf(ClpCholeskyDenseC *thisStruct, longDouble *a, int n,
  longDouble *diagonal, longDouble *work, int *rowsDropped)
{
  double dropValue = thisStruct->doubleParameters_[0];
  int firstPositive = thisStruct->integerParameters_[0];
  int rowOffset = static_cast< int >(diagonal - thisStruct->diagonal_);
  int i, j, k;
  CoinWorkDouble t00, temp1;
  longDouble *aa;
  aa = a - BLOCK;
  for (j = 0; j < n; j++) {
    bool dropColumn;
    CoinWorkDouble useT00;
    aa += BLOCK;
    t00 = aa[j];
    for (k = 0; k < j; ++k) {
      CoinWorkDouble multiplier = work[k];
      t00 -= a[j + k * BLOCK] * a[j + k * BLOCK] * multiplier;
    }
    dropColumn = false;
    useT00 = t00;
    if (j + rowOffset < firstPositive) {
      /* must be negative*/
      if (t00 <= -dropValue) {
        /*aa[j]=t00;*/
        t00 = 1.0 / t00;
      } else {
        dropColumn = true;
        /*aa[j]=-1.0e100;*/
        useT00 = -1.0e-100;
        t00 = 0.0;
      }
    } else {
      /* must be positive*/
      if (t00 >= dropValue) {
        /*aa[j]=t00;*/
        t00 = 1.0 / t00;
      } else {
        dropColumn = true;
        /*aa[j]=1.0e100;*/
        useT00 = 1.0e-100;
        t00 = 0.0;
      }
    }
    if (!dropColumn) {
      diagonal[j] = t00;
      work[j] = useT00;
      temp1 = t00;
      for (i = j + 1; i < n; i++) {
        t00 = aa[i];
        for (k = 0; k < j; ++k) {
          CoinWorkDouble multiplier = work[k];
          t00 -= a[i + k * BLOCK] * a[j + k * BLOCK] * multiplier;
        }
        aa[i] = t00 * temp1;
      }
    } else {
      /* drop column*/
      rowsDropped[j + rowOffset] = 2;
      diagonal[j] = 0.0;
      /*aa[j]=1.0e100;*/
      work[j] = 1.0e100;
      for (i = j + 1; i < n; i++) {
        aa[i] = 0.0;
      }
    }
  }
}
/* Leaf recursive triangle rectangle update*/
void ClpCholeskyCtriRecLeaf(/*ClpCholeskyDenseC * thisStruct,*/ longDouble *aTri, longDouble *aUnder, longDouble *diagonal, longDouble *work,
  int nUnder)
{
#ifdef POS_DEBUG
  int iru, icu;
  int iu = bNumber(aUnder, iru, icu);
  int irt, ict;
  int it = bNumber(aTri, irt, ict);
  /*printf("%d %d\n",iu,it);*/
  printf("trirecleaf  under (%d,%d), tri (%d,%d)\n",
    iru, icu, irt, ict);
  assert(diagonal == thisStruct->diagonal_ + ict * BLOCK);
#endif
  int j;
  longDouble *aa;
#ifdef BLOCKUNROLL
  if (nUnder == BLOCK) {
    aa = aTri - 2 * BLOCK;
    for (j = 0; j < BLOCK; j += 2) {
      int i;
      CoinWorkDouble temp0 = diagonal[j];
      CoinWorkDouble temp1 = diagonal[j + 1];
      aa += 2 * BLOCK;
      for (i = 0; i < BLOCK; i += 2) {
        CoinWorkDouble at1;
        CoinWorkDouble t00 = aUnder[i + j * BLOCK];
        CoinWorkDouble t10 = aUnder[i + BLOCK + j * BLOCK];
        CoinWorkDouble t01 = aUnder[i + 1 + j * BLOCK];
        CoinWorkDouble t11 = aUnder[i + 1 + BLOCK + j * BLOCK];
        int k;
        for (k = 0; k < j; ++k) {
          CoinWorkDouble multiplier = work[k];
          CoinWorkDouble au0 = aUnder[i + k * BLOCK] * multiplier;
          CoinWorkDouble au1 = aUnder[i + 1 + k * BLOCK] * multiplier;
          CoinWorkDouble at0 = aTri[j + k * BLOCK];
          CoinWorkDouble at1 = aTri[j + 1 + k * BLOCK];
          t00 -= au0 * at0;
          t10 -= au0 * at1;
          t01 -= au1 * at0;
          t11 -= au1 * at1;
        }
        t00 *= temp0;
        at1 = aTri[j + 1 + j * BLOCK] * work[j];
        t10 -= t00 * at1;
        t01 *= temp0;
        t11 -= t01 * at1;
        aUnder[i + j * BLOCK] = t00;
        aUnder[i + 1 + j * BLOCK] = t01;
        aUnder[i + BLOCK + j * BLOCK] = t10 * temp1;
        aUnder[i + 1 + BLOCK + j * BLOCK] = t11 * temp1;
      }
    }
  } else {
#endif
    aa = aTri - BLOCK;
    for (j = 0; j < BLOCK; j++) {
      int i;
      CoinWorkDouble temp1 = diagonal[j];
      aa += BLOCK;
      for (i = 0; i < nUnder; i++) {
        int k;
        CoinWorkDouble t00 = aUnder[i + j * BLOCK];
        for (k = 0; k < j; ++k) {
          CoinWorkDouble multiplier = work[k];
          t00 -= aUnder[i + k * BLOCK] * aTri[j + k * BLOCK] * multiplier;
        }
        aUnder[i + j * BLOCK] = t00 * temp1;
      }
    }
#ifdef BLOCKUNROLL
  }
#endif
}
/* Leaf recursive rectangle triangle update*/
void ClpCholeskyCrecTriLeaf(/*ClpCholeskyDenseC * thisStruct,*/ longDouble *aUnder, longDouble *aTri,
  /*longDouble * diagonal,*/ longDouble *work, int nUnder)
{
#ifdef POS_DEBUG
  int iru, icu;
  int iu = bNumber(aUnder, iru, icu);
  int irt, ict;
  int it = bNumber(aTri, irt, ict);
  /*printf("%d %d\n",iu,it);*/
  printf("rectrileaf  under (%d,%d), tri (%d,%d)\n",
    iru, icu, irt, ict);
  assert(diagonal == thisStruct->diagonal_ + icu * BLOCK);
#endif
  int i, j, k;
  CoinWorkDouble t00;
  longDouble *aa;
#ifdef BLOCKUNROLL
  if (nUnder == BLOCK) {
    longDouble *aUnder2 = aUnder - 2;
    aa = aTri - 2 * BLOCK;
    for (j = 0; j < BLOCK; j += 2) {
      CoinWorkDouble t00, t01, t10, t11;
      aa += 2 * BLOCK;
      aUnder2 += 2;
      t00 = aa[j];
      t01 = aa[j + 1];
      t10 = aa[j + 1 + BLOCK];
      for (k = 0; k < BLOCK; ++k) {
        CoinWorkDouble multiplier = work[k];
        CoinWorkDouble a0 = aUnder2[k * BLOCK];
        CoinWorkDouble a1 = aUnder2[1 + k * BLOCK];
        CoinWorkDouble x0 = a0 * multiplier;
        CoinWorkDouble x1 = a1 * multiplier;
        t00 -= a0 * x0;
        t01 -= a1 * x0;
        t10 -= a1 * x1;
      }
      aa[j] = t00;
      aa[j + 1] = t01;
      aa[j + 1 + BLOCK] = t10;
      for (i = j + 2; i < BLOCK; i += 2) {
        t00 = aa[i];
        t01 = aa[i + BLOCK];
        t10 = aa[i + 1];
        t11 = aa[i + 1 + BLOCK];
        for (k = 0; k < BLOCK; ++k) {
          CoinWorkDouble multiplier = work[k];
          CoinWorkDouble a0 = aUnder2[k * BLOCK] * multiplier;
          CoinWorkDouble a1 = aUnder2[1 + k * BLOCK] * multiplier;
          t00 -= aUnder[i + k * BLOCK] * a0;
          t01 -= aUnder[i + k * BLOCK] * a1;
          t10 -= aUnder[i + 1 + k * BLOCK] * a0;
          t11 -= aUnder[i + 1 + k * BLOCK] * a1;
        }
        aa[i] = t00;
        aa[i + BLOCK] = t01;
        aa[i + 1] = t10;
        aa[i + 1 + BLOCK] = t11;
      }
    }
  } else {
#endif
    aa = aTri - BLOCK;
    for (j = 0; j < nUnder; j++) {
      aa += BLOCK;
      for (i = j; i < nUnder; i++) {
        t00 = aa[i];
        for (k = 0; k < BLOCK; ++k) {
          CoinWorkDouble multiplier = work[k];
          t00 -= aUnder[i + k * BLOCK] * aUnder[j + k * BLOCK] * multiplier;
        }
        aa[i] = t00;
      }
    }
#ifdef BLOCKUNROLL
  }
#endif
}
/* Leaf recursive rectangle rectangle update,
   nUnder is number of rows in iBlock,
   nUnderK is number of rows in kBlock
*/
void ClpCholeskyCrecRecLeaf(/*ClpCholeskyDenseC * thisStruct,*/
  const longDouble *COIN_RESTRICT above,
  const longDouble *COIN_RESTRICT aUnder,
  longDouble *COIN_RESTRICT aOther,
  const longDouble *COIN_RESTRICT work,
  int nUnder)
{
#ifdef POS_DEBUG
  int ira, ica;
  int ia = bNumber(above, ira, ica);
  int iru, icu;
  int iu = bNumber(aUnder, iru, icu);
  int iro, ico;
  int io = bNumber(aOther, iro, ico);
  /*printf("%d %d %d\n",ia,iu,io);*/
  printf("recrecleaf above (%d,%d), under (%d,%d), other (%d,%d)\n",
    ira, ica, iru, icu, iro, ico);
#endif
  int i, j, k;
  longDouble *aa;
#ifdef BLOCKUNROLL
  aa = aOther - 4 * BLOCK;
  if (nUnder == BLOCK) {
    /*#define INTEL*/
#ifdef INTEL
    aa += 2 * BLOCK;
    for (j = 0; j < BLOCK; j += 2) {
      aa += 2 * BLOCK;
      for (i = 0; i < BLOCK; i += 2) {
        CoinWorkDouble t00 = aa[i + 0 * BLOCK];
        CoinWorkDouble t10 = aa[i + 1 * BLOCK];
        CoinWorkDouble t01 = aa[i + 1 + 0 * BLOCK];
        CoinWorkDouble t11 = aa[i + 1 + 1 * BLOCK];
        for (k = 0; k < BLOCK; k++) {
          CoinWorkDouble multiplier = work[k];
          CoinWorkDouble a00 = aUnder[i + k * BLOCK] * multiplier;
          CoinWorkDouble a01 = aUnder[i + 1 + k * BLOCK] * multiplier;
          t00 -= a00 * above[j + 0 + k * BLOCK];
          t10 -= a00 * above[j + 1 + k * BLOCK];
          t01 -= a01 * above[j + 0 + k * BLOCK];
          t11 -= a01 * above[j + 1 + k * BLOCK];
        }
        aa[i + 0 * BLOCK] = t00;
        aa[i + 1 * BLOCK] = t10;
        aa[i + 1 + 0 * BLOCK] = t01;
        aa[i + 1 + 1 * BLOCK] = t11;
      }
    }
#else
    for (j = 0; j < BLOCK; j += 4) {
      aa += 4 * BLOCK;
      for (i = 0; i < BLOCK; i += 4) {
        CoinWorkDouble t00 = aa[i + 0 + 0 * BLOCK];
        CoinWorkDouble t10 = aa[i + 0 + 1 * BLOCK];
        CoinWorkDouble t20 = aa[i + 0 + 2 * BLOCK];
        CoinWorkDouble t30 = aa[i + 0 + 3 * BLOCK];
        CoinWorkDouble t01 = aa[i + 1 + 0 * BLOCK];
        CoinWorkDouble t11 = aa[i + 1 + 1 * BLOCK];
        CoinWorkDouble t21 = aa[i + 1 + 2 * BLOCK];
        CoinWorkDouble t31 = aa[i + 1 + 3 * BLOCK];
        CoinWorkDouble t02 = aa[i + 2 + 0 * BLOCK];
        CoinWorkDouble t12 = aa[i + 2 + 1 * BLOCK];
        CoinWorkDouble t22 = aa[i + 2 + 2 * BLOCK];
        CoinWorkDouble t32 = aa[i + 2 + 3 * BLOCK];
        CoinWorkDouble t03 = aa[i + 3 + 0 * BLOCK];
        CoinWorkDouble t13 = aa[i + 3 + 1 * BLOCK];
        CoinWorkDouble t23 = aa[i + 3 + 2 * BLOCK];
        CoinWorkDouble t33 = aa[i + 3 + 3 * BLOCK];
        const longDouble *COIN_RESTRICT aUnderNow = aUnder + i;
        const longDouble *COIN_RESTRICT aboveNow = above + j;
        for (k = 0; k < BLOCK; k++) {
          CoinWorkDouble multiplier = work[k];
          CoinWorkDouble a00 = aUnderNow[0] * multiplier;
          CoinWorkDouble a01 = aUnderNow[1] * multiplier;
          CoinWorkDouble a02 = aUnderNow[2] * multiplier;
          CoinWorkDouble a03 = aUnderNow[3] * multiplier;
          t00 -= a00 * aboveNow[0];
          t10 -= a00 * aboveNow[1];
          t20 -= a00 * aboveNow[2];
          t30 -= a00 * aboveNow[3];
          t01 -= a01 * aboveNow[0];
          t11 -= a01 * aboveNow[1];
          t21 -= a01 * aboveNow[2];
          t31 -= a01 * aboveNow[3];
          t02 -= a02 * aboveNow[0];
          t12 -= a02 * aboveNow[1];
          t22 -= a02 * aboveNow[2];
          t32 -= a02 * aboveNow[3];
          t03 -= a03 * aboveNow[0];
          t13 -= a03 * aboveNow[1];
          t23 -= a03 * aboveNow[2];
          t33 -= a03 * aboveNow[3];
          aUnderNow += BLOCK;
          aboveNow += BLOCK;
        }
        aa[i + 0 + 0 * BLOCK] = t00;
        aa[i + 0 + 1 * BLOCK] = t10;
        aa[i + 0 + 2 * BLOCK] = t20;
        aa[i + 0 + 3 * BLOCK] = t30;
        aa[i + 1 + 0 * BLOCK] = t01;
        aa[i + 1 + 1 * BLOCK] = t11;
        aa[i + 1 + 2 * BLOCK] = t21;
        aa[i + 1 + 3 * BLOCK] = t31;
        aa[i + 2 + 0 * BLOCK] = t02;
        aa[i + 2 + 1 * BLOCK] = t12;
        aa[i + 2 + 2 * BLOCK] = t22;
        aa[i + 2 + 3 * BLOCK] = t32;
        aa[i + 3 + 0 * BLOCK] = t03;
        aa[i + 3 + 1 * BLOCK] = t13;
        aa[i + 3 + 2 * BLOCK] = t23;
        aa[i + 3 + 3 * BLOCK] = t33;
      }
    }
#endif
  } else {
    int odd = nUnder & 1;
    int n = nUnder - odd;
    for (j = 0; j < BLOCK; j += 4) {
      aa += 4 * BLOCK;
      for (i = 0; i < n; i += 2) {
        CoinWorkDouble t00 = aa[i + 0 * BLOCK];
        CoinWorkDouble t10 = aa[i + 1 * BLOCK];
        CoinWorkDouble t20 = aa[i + 2 * BLOCK];
        CoinWorkDouble t30 = aa[i + 3 * BLOCK];
        CoinWorkDouble t01 = aa[i + 1 + 0 * BLOCK];
        CoinWorkDouble t11 = aa[i + 1 + 1 * BLOCK];
        CoinWorkDouble t21 = aa[i + 1 + 2 * BLOCK];
        CoinWorkDouble t31 = aa[i + 1 + 3 * BLOCK];
        const longDouble *COIN_RESTRICT aUnderNow = aUnder + i;
        const longDouble *COIN_RESTRICT aboveNow = above + j;
        for (k = 0; k < BLOCK; k++) {
          CoinWorkDouble multiplier = work[k];
          CoinWorkDouble a00 = aUnderNow[0] * multiplier;
          CoinWorkDouble a01 = aUnderNow[1] * multiplier;
          t00 -= a00 * aboveNow[0];
          t10 -= a00 * aboveNow[1];
          t20 -= a00 * aboveNow[2];
          t30 -= a00 * aboveNow[3];
          t01 -= a01 * aboveNow[0];
          t11 -= a01 * aboveNow[1];
          t21 -= a01 * aboveNow[2];
          t31 -= a01 * aboveNow[3];
          aUnderNow += BLOCK;
          aboveNow += BLOCK;
        }
        aa[i + 0 * BLOCK] = t00;
        aa[i + 1 * BLOCK] = t10;
        aa[i + 2 * BLOCK] = t20;
        aa[i + 3 * BLOCK] = t30;
        aa[i + 1 + 0 * BLOCK] = t01;
        aa[i + 1 + 1 * BLOCK] = t11;
        aa[i + 1 + 2 * BLOCK] = t21;
        aa[i + 1 + 3 * BLOCK] = t31;
      }
      if (odd) {
        CoinWorkDouble t0 = aa[n + 0 * BLOCK];
        CoinWorkDouble t1 = aa[n + 1 * BLOCK];
        CoinWorkDouble t2 = aa[n + 2 * BLOCK];
        CoinWorkDouble t3 = aa[n + 3 * BLOCK];
        CoinWorkDouble a0;
        for (k = 0; k < BLOCK; k++) {
          a0 = aUnder[n + k * BLOCK] * work[k];
          t0 -= a0 * above[j + 0 + k * BLOCK];
          t1 -= a0 * above[j + 1 + k * BLOCK];
          t2 -= a0 * above[j + 2 + k * BLOCK];
          t3 -= a0 * above[j + 3 + k * BLOCK];
        }
        aa[n + 0 * BLOCK] = t0;
        aa[n + 1 * BLOCK] = t1;
        aa[n + 2 * BLOCK] = t2;
        aa[n + 3 * BLOCK] = t3;
      }
    }
  }
#else
  aa = aOther - BLOCK;
  for (j = 0; j < BLOCK; j++) {
    aa += BLOCK;
    for (i = 0; i < nUnder; i++) {
      CoinWorkDouble t00 = aa[i + 0 * BLOCK];
      for (k = 0; k < BLOCK; k++) {
        CoinWorkDouble a00 = aUnder[i + k * BLOCK] * work[k];
        t00 -= a00 * above[j + k * BLOCK];
      }
      aa[i] = t00;
    }
  }
#endif
}
/* Uses factorization to solve. */
void ClpCholeskyDense::solve(CoinWorkDouble *region)
{
#ifdef CHOL_COMPARE
  double *region2 = NULL;
  if (numberRows_ < 200) {
    longDouble *xx = sparseFactor_;
    longDouble *yy = diagonal_;
    diagonal_ = sparseFactor_ + 40000;
    sparseFactor_ = diagonal_ + numberRows_;
    region2 = ClpCopyOfArray(region, numberRows_);
    int iRow, iColumn;
    int addOffset = numberRows_ - 1;
    longDouble *work = sparseFactor_ - 1;
    for (iColumn = 0; iColumn < numberRows_; iColumn++) {
      double value = region2[iColumn];
      for (iRow = iColumn + 1; iRow < numberRows_; iRow++)
        region2[iRow] -= value * work[iRow];
      addOffset--;
      work += addOffset;
    }
    for (iColumn = numberRows_ - 1; iColumn >= 0; iColumn--) {
      double value = region2[iColumn] * diagonal_[iColumn];
      work -= addOffset;
      addOffset++;
      for (iRow = iColumn + 1; iRow < numberRows_; iRow++)
        value -= region2[iRow] * work[iRow];
      region2[iColumn] = value;
    }
    sparseFactor_ = xx;
    diagonal_ = yy;
  }
#endif
  /*longDouble * xx = sparseFactor_;*/
  /*longDouble * yy = diagonal_;*/
  /*diagonal_ = sparseFactor_ + 40000;*/
  /*sparseFactor_=diagonal_ + numberRows_;*/
  int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
  /* later align on boundary*/
  longDouble *a = sparseFactor_ + BLOCKSQ * numberBlocks;
  int iBlock;
  longDouble *aa = a;
  int iColumn;
  for (iBlock = 0; iBlock < numberBlocks; iBlock++) {
    int nChunk;
    int jBlock;
    int iDo = iBlock * BLOCK;
    int base = iDo;
    if (iDo + BLOCK > numberRows_) {
      nChunk = numberRows_ - iDo;
    } else {
      nChunk = BLOCK;
    }
    solveF1(aa, nChunk, region + iDo);
    for (jBlock = iBlock + 1; jBlock < numberBlocks; jBlock++) {
      base += BLOCK;
      aa += BLOCKSQ;
      if (base + BLOCK > numberRows_) {
        nChunk = numberRows_ - base;
      } else {
        nChunk = BLOCK;
      }
      solveF2(aa, nChunk, region + iDo, region + base);
    }
    aa += BLOCKSQ;
  }
  /* do diagonal outside*/
  for (iColumn = 0; iColumn < numberRows_; iColumn++)
    region[iColumn] *= diagonal_[iColumn];
  int offset = ((numberBlocks * (numberBlocks + 1)) >> 1);
  aa = a + number_entries(offset - 1);
  int lBase = (numberBlocks - 1) * BLOCK;
  for (iBlock = numberBlocks - 1; iBlock >= 0; iBlock--) {
    int nChunk;
    int jBlock;
    int triBase = iBlock * BLOCK;
    int iBase = lBase;
    for (jBlock = iBlock + 1; jBlock < numberBlocks; jBlock++) {
      if (iBase + BLOCK > numberRows_) {
        nChunk = numberRows_ - iBase;
      } else {
        nChunk = BLOCK;
      }
      solveB2(aa, nChunk, region + triBase, region + iBase);
      iBase -= BLOCK;
      aa -= BLOCKSQ;
    }
    if (triBase + BLOCK > numberRows_) {
      nChunk = numberRows_ - triBase;
    } else {
      nChunk = BLOCK;
    }
    solveB1(aa, nChunk, region + triBase);
    aa -= BLOCKSQ;
  }
#ifdef CHOL_COMPARE
  if (numberRows_ < 200) {
    for (int i = 0; i < numberRows_; i++) {
      assert(CoinAbs(region[i] - region2[i]) < 1.0e-3);
    }
    delete[] region2;
  }
#endif
}
/* Forward part of solve 1*/
void ClpCholeskyDense::solveF1(longDouble *a, int n, CoinWorkDouble *region)
{
  int j, k;
  CoinWorkDouble t00;
  for (j = 0; j < n; j++) {
    t00 = region[j];
    for (k = 0; k < j; ++k) {
      t00 -= region[k] * a[j + k * BLOCK];
    }
    /*t00*=a[j + j * BLOCK];*/
    region[j] = t00;
  }
}
/* Forward part of solve 2*/
void ClpCholeskyDense::solveF2(longDouble *a, int n, CoinWorkDouble *region, CoinWorkDouble *region2)
{
  int j, k;
#ifdef BLOCKUNROLL
  if (n == BLOCK) {
    for (k = 0; k < BLOCK; k += 4) {
      CoinWorkDouble t0 = region2[0];
      CoinWorkDouble t1 = region2[1];
      CoinWorkDouble t2 = region2[2];
      CoinWorkDouble t3 = region2[3];
      t0 -= region[0] * a[0 + 0 * BLOCK];
      t1 -= region[0] * a[1 + 0 * BLOCK];
      t2 -= region[0] * a[2 + 0 * BLOCK];
      t3 -= region[0] * a[3 + 0 * BLOCK];

      t0 -= region[1] * a[0 + 1 * BLOCK];
      t1 -= region[1] * a[1 + 1 * BLOCK];
      t2 -= region[1] * a[2 + 1 * BLOCK];
      t3 -= region[1] * a[3 + 1 * BLOCK];

      t0 -= region[2] * a[0 + 2 * BLOCK];
      t1 -= region[2] * a[1 + 2 * BLOCK];
      t2 -= region[2] * a[2 + 2 * BLOCK];
      t3 -= region[2] * a[3 + 2 * BLOCK];

      t0 -= region[3] * a[0 + 3 * BLOCK];
      t1 -= region[3] * a[1 + 3 * BLOCK];
      t2 -= region[3] * a[2 + 3 * BLOCK];
      t3 -= region[3] * a[3 + 3 * BLOCK];

      t0 -= region[4] * a[0 + 4 * BLOCK];
      t1 -= region[4] * a[1 + 4 * BLOCK];
      t2 -= region[4] * a[2 + 4 * BLOCK];
      t3 -= region[4] * a[3 + 4 * BLOCK];

      t0 -= region[5] * a[0 + 5 * BLOCK];
      t1 -= region[5] * a[1 + 5 * BLOCK];
      t2 -= region[5] * a[2 + 5 * BLOCK];
      t3 -= region[5] * a[3 + 5 * BLOCK];

      t0 -= region[6] * a[0 + 6 * BLOCK];
      t1 -= region[6] * a[1 + 6 * BLOCK];
      t2 -= region[6] * a[2 + 6 * BLOCK];
      t3 -= region[6] * a[3 + 6 * BLOCK];

      t0 -= region[7] * a[0 + 7 * BLOCK];
      t1 -= region[7] * a[1 + 7 * BLOCK];
      t2 -= region[7] * a[2 + 7 * BLOCK];
      t3 -= region[7] * a[3 + 7 * BLOCK];
#if BLOCK > 8
      t0 -= region[8] * a[0 + 8 * BLOCK];
      t1 -= region[8] * a[1 + 8 * BLOCK];
      t2 -= region[8] * a[2 + 8 * BLOCK];
      t3 -= region[8] * a[3 + 8 * BLOCK];

      t0 -= region[9] * a[0 + 9 * BLOCK];
      t1 -= region[9] * a[1 + 9 * BLOCK];
      t2 -= region[9] * a[2 + 9 * BLOCK];
      t3 -= region[9] * a[3 + 9 * BLOCK];

      t0 -= region[10] * a[0 + 10 * BLOCK];
      t1 -= region[10] * a[1 + 10 * BLOCK];
      t2 -= region[10] * a[2 + 10 * BLOCK];
      t3 -= region[10] * a[3 + 10 * BLOCK];

      t0 -= region[11] * a[0 + 11 * BLOCK];
      t1 -= region[11] * a[1 + 11 * BLOCK];
      t2 -= region[11] * a[2 + 11 * BLOCK];
      t3 -= region[11] * a[3 + 11 * BLOCK];

      t0 -= region[12] * a[0 + 12 * BLOCK];
      t1 -= region[12] * a[1 + 12 * BLOCK];
      t2 -= region[12] * a[2 + 12 * BLOCK];
      t3 -= region[12] * a[3 + 12 * BLOCK];

      t0 -= region[13] * a[0 + 13 * BLOCK];
      t1 -= region[13] * a[1 + 13 * BLOCK];
      t2 -= region[13] * a[2 + 13 * BLOCK];
      t3 -= region[13] * a[3 + 13 * BLOCK];

      t0 -= region[14] * a[0 + 14 * BLOCK];
      t1 -= region[14] * a[1 + 14 * BLOCK];
      t2 -= region[14] * a[2 + 14 * BLOCK];
      t3 -= region[14] * a[3 + 14 * BLOCK];

      t0 -= region[15] * a[0 + 15 * BLOCK];
      t1 -= region[15] * a[1 + 15 * BLOCK];
      t2 -= region[15] * a[2 + 15 * BLOCK];
      t3 -= region[15] * a[3 + 15 * BLOCK];
#endif
      region2[0] = t0;
      region2[1] = t1;
      region2[2] = t2;
      region2[3] = t3;
      region2 += 4;
      a += 4;
    }
  } else {
#endif
    for (k = 0; k < n; ++k) {
      CoinWorkDouble t00 = region2[k];
      for (j = 0; j < BLOCK; j++) {
        t00 -= region[j] * a[k + j * BLOCK];
      }
      region2[k] = t00;
    }
#ifdef BLOCKUNROLL
  }
#endif
}
/* Backward part of solve 1*/
void ClpCholeskyDense::solveB1(longDouble *a, int n, CoinWorkDouble *region)
{
  int j, k;
  CoinWorkDouble t00;
  for (j = n - 1; j >= 0; j--) {
    t00 = region[j];
    for (k = j + 1; k < n; ++k) {
      t00 -= region[k] * a[k + j * BLOCK];
    }
    /*t00*=a[j + j * BLOCK];*/
    region[j] = t00;
  }
}
/* Backward part of solve 2*/
void ClpCholeskyDense::solveB2(longDouble *a, int n, CoinWorkDouble *region, CoinWorkDouble *region2)
{
  int j, k;
#ifdef BLOCKUNROLL
  if (n == BLOCK) {
    for (j = 0; j < BLOCK; j += 4) {
      CoinWorkDouble t0 = region[0];
      CoinWorkDouble t1 = region[1];
      CoinWorkDouble t2 = region[2];
      CoinWorkDouble t3 = region[3];
      t0 -= region2[0] * a[0 + 0 * BLOCK];
      t1 -= region2[0] * a[0 + 1 * BLOCK];
      t2 -= region2[0] * a[0 + 2 * BLOCK];
      t3 -= region2[0] * a[0 + 3 * BLOCK];

      t0 -= region2[1] * a[1 + 0 * BLOCK];
      t1 -= region2[1] * a[1 + 1 * BLOCK];
      t2 -= region2[1] * a[1 + 2 * BLOCK];
      t3 -= region2[1] * a[1 + 3 * BLOCK];

      t0 -= region2[2] * a[2 + 0 * BLOCK];
      t1 -= region2[2] * a[2 + 1 * BLOCK];
      t2 -= region2[2] * a[2 + 2 * BLOCK];
      t3 -= region2[2] * a[2 + 3 * BLOCK];

      t0 -= region2[3] * a[3 + 0 * BLOCK];
      t1 -= region2[3] * a[3 + 1 * BLOCK];
      t2 -= region2[3] * a[3 + 2 * BLOCK];
      t3 -= region2[3] * a[3 + 3 * BLOCK];

      t0 -= region2[4] * a[4 + 0 * BLOCK];
      t1 -= region2[4] * a[4 + 1 * BLOCK];
      t2 -= region2[4] * a[4 + 2 * BLOCK];
      t3 -= region2[4] * a[4 + 3 * BLOCK];

      t0 -= region2[5] * a[5 + 0 * BLOCK];
      t1 -= region2[5] * a[5 + 1 * BLOCK];
      t2 -= region2[5] * a[5 + 2 * BLOCK];
      t3 -= region2[5] * a[5 + 3 * BLOCK];

      t0 -= region2[6] * a[6 + 0 * BLOCK];
      t1 -= region2[6] * a[6 + 1 * BLOCK];
      t2 -= region2[6] * a[6 + 2 * BLOCK];
      t3 -= region2[6] * a[6 + 3 * BLOCK];

      t0 -= region2[7] * a[7 + 0 * BLOCK];
      t1 -= region2[7] * a[7 + 1 * BLOCK];
      t2 -= region2[7] * a[7 + 2 * BLOCK];
      t3 -= region2[7] * a[7 + 3 * BLOCK];
#if BLOCK > 8

      t0 -= region2[8] * a[8 + 0 * BLOCK];
      t1 -= region2[8] * a[8 + 1 * BLOCK];
      t2 -= region2[8] * a[8 + 2 * BLOCK];
      t3 -= region2[8] * a[8 + 3 * BLOCK];

      t0 -= region2[9] * a[9 + 0 * BLOCK];
      t1 -= region2[9] * a[9 + 1 * BLOCK];
      t2 -= region2[9] * a[9 + 2 * BLOCK];
      t3 -= region2[9] * a[9 + 3 * BLOCK];

      t0 -= region2[10] * a[10 + 0 * BLOCK];
      t1 -= region2[10] * a[10 + 1 * BLOCK];
      t2 -= region2[10] * a[10 + 2 * BLOCK];
      t3 -= region2[10] * a[10 + 3 * BLOCK];

      t0 -= region2[11] * a[11 + 0 * BLOCK];
      t1 -= region2[11] * a[11 + 1 * BLOCK];
      t2 -= region2[11] * a[11 + 2 * BLOCK];
      t3 -= region2[11] * a[11 + 3 * BLOCK];

      t0 -= region2[12] * a[12 + 0 * BLOCK];
      t1 -= region2[12] * a[12 + 1 * BLOCK];
      t2 -= region2[12] * a[12 + 2 * BLOCK];
      t3 -= region2[12] * a[12 + 3 * BLOCK];

      t0 -= region2[13] * a[13 + 0 * BLOCK];
      t1 -= region2[13] * a[13 + 1 * BLOCK];
      t2 -= region2[13] * a[13 + 2 * BLOCK];
      t3 -= region2[13] * a[13 + 3 * BLOCK];

      t0 -= region2[14] * a[14 + 0 * BLOCK];
      t1 -= region2[14] * a[14 + 1 * BLOCK];
      t2 -= region2[14] * a[14 + 2 * BLOCK];
      t3 -= region2[14] * a[14 + 3 * BLOCK];

      t0 -= region2[15] * a[15 + 0 * BLOCK];
      t1 -= region2[15] * a[15 + 1 * BLOCK];
      t2 -= region2[15] * a[15 + 2 * BLOCK];
      t3 -= region2[15] * a[15 + 3 * BLOCK];
#endif
      region[0] = t0;
      region[1] = t1;
      region[2] = t2;
      region[3] = t3;
      a += 4 * BLOCK;
      region += 4;
    }
  } else {
#endif
    for (j = 0; j < BLOCK; j++) {
      CoinWorkDouble t00 = region[j];
      for (k = 0; k < n; ++k) {
        t00 -= region2[k] * a[k + j * BLOCK];
      }
      region[j] = t00;
    }
#ifdef BLOCKUNROLL
  }
#endif
}

/* vi: softtabstop=2 shiftwidth=2 expandtab tabstop=2
*/