1 // Ceres Solver - A fast non-linear least squares minimizer 2 // Copyright 2015 Google Inc. All rights reserved. 3 // http://ceres-solver.org/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are met: 7 // 8 // * Redistributions of source code must retain the above copyright notice, 9 // this list of conditions and the following disclaimer. 10 // * Redistributions in binary form must reproduce the above copyright notice, 11 // this list of conditions and the following disclaimer in the documentation 12 // and/or other materials provided with the distribution. 13 // * Neither the name of Google Inc. nor the names of its contributors may be 14 // used to endorse or promote products derived from this software without 15 // specific prior written permission. 16 // 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 // POSSIBILITY OF SUCH DAMAGE. 28 // 29 // Author: sameeragarwal@google.com (Sameer Agarwal) 30 31 #ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_H_ 32 #define CERES_INTERNAL_SCHUR_ELIMINATOR_H_ 33 34 #include <map> 35 #include <vector> 36 #include "ceres/mutex.h" 37 #include "ceres/block_random_access_matrix.h" 38 #include "ceres/block_sparse_matrix.h" 39 #include "ceres/block_structure.h" 40 #include "ceres/linear_solver.h" 41 #include "ceres/internal/eigen.h" 42 #include "ceres/internal/scoped_ptr.h" 43 44 namespace ceres { 45 namespace internal { 46 47 // Classes implementing the SchurEliminatorBase interface implement 48 // variable elimination for linear least squares problems. Assuming 49 // that the input linear system Ax = b can be partitioned into 50 // 51 // E y + F z = b 52 // 53 // Where x = [y;z] is a partition of the variables. The paritioning 54 // of the variables is such that, E'E is a block diagonal matrix. Or 55 // in other words, the parameter blocks in E form an independent set 56 // of the of the graph implied by the block matrix A'A. Then, this 57 // class provides the functionality to compute the Schur complement 58 // system 59 // 60 // S z = r 61 // 62 // where 63 // 64 // S = F'F - F'E (E'E)^{-1} E'F and r = F'b - F'E(E'E)^(-1) E'b 65 // 66 // This is the Eliminate operation, i.e., construct the linear system 67 // obtained by eliminating the variables in E. 68 // 69 // The eliminator also provides the reverse functionality, i.e. given 70 // values for z it can back substitute for the values of y, by solving the 71 // linear system 72 // 73 // Ey = b - F z 74 // 75 // which is done by observing that 76 // 77 // y = (E'E)^(-1) [E'b - E'F z] 78 // 79 // The eliminator has a number of requirements. 80 // 81 // The rows of A are ordered so that for every variable block in y, 82 // all the rows containing that variable block occur as a vertically 83 // contiguous block. i.e the matrix A looks like 84 // 85 // E F chunk 86 // A = [ y1 0 0 0 | z1 0 0 0 z5] 1 87 // [ y1 0 0 0 | z1 z2 0 0 0] 1 88 // [ 0 y2 0 0 | 0 0 z3 0 0] 2 89 // [ 0 0 y3 0 | z1 z2 z3 z4 z5] 3 90 // [ 0 0 y3 0 | z1 0 0 0 z5] 3 91 // [ 0 0 0 y4 | 0 0 0 0 z5] 4 92 // [ 0 0 0 y4 | 0 z2 0 0 0] 4 93 // [ 0 0 0 y4 | 0 0 0 0 0] 4 94 // [ 0 0 0 0 | z1 0 0 0 0] non chunk blocks 95 // [ 0 0 0 0 | 0 0 z3 z4 z5] non chunk blocks 96 // 97 // This structure should be reflected in the corresponding 98 // CompressedRowBlockStructure object associated with A. The linear 99 // system Ax = b should either be well posed or the array D below 100 // should be non-null and the diagonal matrix corresponding to it 101 // should be non-singular. For simplicity of exposition only the case 102 // with a null D is described. 103 // 104 // The usual way to do the elimination is as follows. Starting with 105 // 106 // E y + F z = b 107 // 108 // we can form the normal equations, 109 // 110 // E'E y + E'F z = E'b 111 // F'E y + F'F z = F'b 112 // 113 // multiplying both sides of the first equation by (E'E)^(-1) and then 114 // by F'E we get 115 // 116 // F'E y + F'E (E'E)^(-1) E'F z = F'E (E'E)^(-1) E'b 117 // F'E y + F'F z = F'b 118 // 119 // now subtracting the two equations we get 120 // 121 // [FF' - F'E (E'E)^(-1) E'F] z = F'b - F'E(E'E)^(-1) E'b 122 // 123 // Instead of forming the normal equations and operating on them as 124 // general sparse matrices, the algorithm here deals with one 125 // parameter block in y at a time. The rows corresponding to a single 126 // parameter block yi are known as a chunk, and the algorithm operates 127 // on one chunk at a time. The mathematics remains the same since the 128 // reduced linear system can be shown to be the sum of the reduced 129 // linear systems for each chunk. This can be seen by observing two 130 // things. 131 // 132 // 1. E'E is a block diagonal matrix. 133 // 134 // 2. When E'F is computed, only the terms within a single chunk 135 // interact, i.e for y1 column blocks when transposed and multiplied 136 // with F, the only non-zero contribution comes from the blocks in 137 // chunk1. 138 // 139 // Thus, the reduced linear system 140 // 141 // FF' - F'E (E'E)^(-1) E'F 142 // 143 // can be re-written as 144 // 145 // sum_k F_k F_k' - F_k'E_k (E_k'E_k)^(-1) E_k' F_k 146 // 147 // Where the sum is over chunks and E_k'E_k is dense matrix of size y1 148 // x y1. 149 // 150 // Advanced usage. Uptil now it has been assumed that the user would 151 // be interested in all of the Schur Complement S. However, it is also 152 // possible to use this eliminator to obtain an arbitrary submatrix of 153 // the full Schur complement. When the eliminator is generating the 154 // blocks of S, it asks the RandomAccessBlockMatrix instance passed to 155 // it if it has storage for that block. If it does, the eliminator 156 // computes/updates it, if not it is skipped. This is useful when one 157 // is interested in constructing a preconditioner based on the Schur 158 // Complement, e.g., computing the block diagonal of S so that it can 159 // be used as a preconditioner for an Iterative Substructuring based 160 // solver [See Agarwal et al, Bundle Adjustment in the Large, ECCV 161 // 2008 for an example of such use]. 162 // 163 // Example usage: Please see schur_complement_solver.cc 164 class SchurEliminatorBase { 165 public: ~SchurEliminatorBase()166 virtual ~SchurEliminatorBase() {} 167 168 // Initialize the eliminator. It is the user's responsibilty to call 169 // this function before calling Eliminate or BackSubstitute. It is 170 // also the caller's responsibilty to ensure that the 171 // CompressedRowBlockStructure object passed to this method is the 172 // same one (or is equivalent to) the one associated with the 173 // BlockSparseMatrix objects below. 174 // 175 // assume_full_rank_ete controls how the eliminator inverts with the 176 // diagonal blocks corresponding to e blocks in A'A. If 177 // assume_full_rank_ete is true, then a Cholesky factorization is 178 // used to compute the inverse, otherwise a singular value 179 // decomposition is used to compute the pseudo inverse. 180 virtual void Init(int num_eliminate_blocks, 181 bool assume_full_rank_ete, 182 const CompressedRowBlockStructure* bs) = 0; 183 184 // Compute the Schur complement system from the augmented linear 185 // least squares problem [A;D] x = [b;0]. The left hand side and the 186 // right hand side of the reduced linear system are returned in lhs 187 // and rhs respectively. 188 // 189 // It is the caller's responsibility to construct and initialize 190 // lhs. Depending upon the structure of the lhs object passed here, 191 // the full or a submatrix of the Schur complement will be computed. 192 // 193 // Since the Schur complement is a symmetric matrix, only the upper 194 // triangular part of the Schur complement is computed. 195 virtual void Eliminate(const BlockSparseMatrix* A, 196 const double* b, 197 const double* D, 198 BlockRandomAccessMatrix* lhs, 199 double* rhs) = 0; 200 201 // Given values for the variables z in the F block of A, solve for 202 // the optimal values of the variables y corresponding to the E 203 // block in A. 204 virtual void BackSubstitute(const BlockSparseMatrix* A, 205 const double* b, 206 const double* D, 207 const double* z, 208 double* y) = 0; 209 // Factory 210 static SchurEliminatorBase* Create(const LinearSolver::Options& options); 211 }; 212 213 // Templated implementation of the SchurEliminatorBase interface. The 214 // templating is on the sizes of the row, e and f blocks sizes in the 215 // input matrix. In many problems, the sizes of one or more of these 216 // blocks are constant, in that case, its worth passing these 217 // parameters as template arguments so that they are visible to the 218 // compiler and can be used for compile time optimization of the low 219 // level linear algebra routines. 220 // 221 // This implementation is mulithreaded using OpenMP. The level of 222 // parallelism is controlled by LinearSolver::Options::num_threads. 223 template <int kRowBlockSize = Eigen::Dynamic, 224 int kEBlockSize = Eigen::Dynamic, 225 int kFBlockSize = Eigen::Dynamic > 226 class SchurEliminator : public SchurEliminatorBase { 227 public: SchurEliminator(const LinearSolver::Options & options)228 explicit SchurEliminator(const LinearSolver::Options& options) 229 : num_threads_(options.num_threads) { 230 } 231 232 // SchurEliminatorBase Interface 233 virtual ~SchurEliminator(); 234 virtual void Init(int num_eliminate_blocks, 235 bool assume_full_rank_ete, 236 const CompressedRowBlockStructure* bs); 237 virtual void Eliminate(const BlockSparseMatrix* A, 238 const double* b, 239 const double* D, 240 BlockRandomAccessMatrix* lhs, 241 double* rhs); 242 virtual void BackSubstitute(const BlockSparseMatrix* A, 243 const double* b, 244 const double* D, 245 const double* z, 246 double* y); 247 248 private: 249 // Chunk objects store combinatorial information needed to 250 // efficiently eliminate a whole chunk out of the least squares 251 // problem. Consider the first chunk in the example matrix above. 252 // 253 // [ y1 0 0 0 | z1 0 0 0 z5] 254 // [ y1 0 0 0 | z1 z2 0 0 0] 255 // 256 // One of the intermediate quantities that needs to be calculated is 257 // for each row the product of the y block transposed with the 258 // non-zero z block, and the sum of these blocks across rows. A 259 // temporary array "buffer_" is used for computing and storing them 260 // and the buffer_layout maps the indices of the z-blocks to 261 // position in the buffer_ array. The size of the chunk is the 262 // number of row blocks/residual blocks for the particular y block 263 // being considered. 264 // 265 // For the example chunk shown above, 266 // 267 // size = 2 268 // 269 // The entries of buffer_layout will be filled in the following order. 270 // 271 // buffer_layout[z1] = 0 272 // buffer_layout[z5] = y1 * z1 273 // buffer_layout[z2] = y1 * z1 + y1 * z5 274 typedef std::map<int, int> BufferLayoutType; 275 struct Chunk { ChunkChunk276 Chunk() : size(0) {} 277 int size; 278 int start; 279 BufferLayoutType buffer_layout; 280 }; 281 282 void ChunkDiagonalBlockAndGradient( 283 const Chunk& chunk, 284 const BlockSparseMatrix* A, 285 const double* b, 286 int row_block_counter, 287 typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet, 288 double* g, 289 double* buffer, 290 BlockRandomAccessMatrix* lhs); 291 292 void UpdateRhs(const Chunk& chunk, 293 const BlockSparseMatrix* A, 294 const double* b, 295 int row_block_counter, 296 const double* inverse_ete_g, 297 double* rhs); 298 299 void ChunkOuterProduct(const CompressedRowBlockStructure* bs, 300 const Matrix& inverse_eet, 301 const double* buffer, 302 const BufferLayoutType& buffer_layout, 303 BlockRandomAccessMatrix* lhs); 304 void EBlockRowOuterProduct(const BlockSparseMatrix* A, 305 int row_block_index, 306 BlockRandomAccessMatrix* lhs); 307 308 309 void NoEBlockRowsUpdate(const BlockSparseMatrix* A, 310 const double* b, 311 int row_block_counter, 312 BlockRandomAccessMatrix* lhs, 313 double* rhs); 314 315 void NoEBlockRowOuterProduct(const BlockSparseMatrix* A, 316 int row_block_index, 317 BlockRandomAccessMatrix* lhs); 318 319 int num_threads_; 320 int num_eliminate_blocks_; 321 bool assume_full_rank_ete_; 322 323 // Block layout of the columns of the reduced linear system. Since 324 // the f blocks can be of varying size, this vector stores the 325 // position of each f block in the row/col of the reduced linear 326 // system. Thus lhs_row_layout_[i] is the row/col position of the 327 // i^th f block. 328 std::vector<int> lhs_row_layout_; 329 330 // Combinatorial structure of the chunks in A. For more information 331 // see the documentation of the Chunk object above. 332 std::vector<Chunk> chunks_; 333 334 // TODO(sameeragarwal): The following two arrays contain per-thread 335 // storage. They should be refactored into a per thread struct. 336 337 // Buffer to store the products of the y and z blocks generated 338 // during the elimination phase. buffer_ is of size num_threads * 339 // buffer_size_. Each thread accesses the chunk 340 // 341 // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_] 342 // 343 scoped_array<double> buffer_; 344 345 // Buffer to store per thread matrix matrix products used by 346 // ChunkOuterProduct. Like buffer_ it is of size num_threads * 347 // buffer_size_. Each thread accesses the chunk 348 // 349 // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1] 350 // 351 scoped_array<double> chunk_outer_product_buffer_; 352 353 int buffer_size_; 354 int uneliminated_row_begins_; 355 356 // Locks for the blocks in the right hand side of the reduced linear 357 // system. 358 std::vector<Mutex*> rhs_locks_; 359 }; 360 361 } // namespace internal 362 } // namespace ceres 363 364 #endif // CERES_INTERNAL_SCHUR_ELIMINATOR_H_ 365