1 /* $Id: BlockBoundaryAlgorithm.cpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Chris Lanczycki
27 *
28 * File Description:
29 * Base class for objects that select new block boundaries for
30 * an AlignmentUtility based on some concrete algorithm. Should
31 * be configured with a ColumnScorer concrete instance.
32 * File also contains virtually inherited subclasses.
33 *
34 * ===========================================================================
35 */
36
37 #include <ncbi_pch.hpp>
38 #include <algo/structure/bma_refine/BlockBoundaryAlgorithm.hpp>
39
40 USING_NCBI_SCOPE;
41
42 BEGIN_SCOPE(align_refine)
43
44 const unsigned int BlockBoundaryAlgorithm::DEFAULT_MIN_BLOCK_SIZE = 1;
45
BlockBoundaryAlgorithm(const ColumnScorer * scorer,double ext,double shrink,BlockBoundaryAlgorithmMethod method)46 BlockBoundaryAlgorithm::BlockBoundaryAlgorithm(const ColumnScorer* scorer, double ext, double shrink, BlockBoundaryAlgorithmMethod method)
47 : m_boundaryMethod(method), m_minBlockSize(DEFAULT_MIN_BLOCK_SIZE), m_canBlocksShrink(false) {
48 AddScorer(scorer, ext, shrink);
49 }
50
~BlockBoundaryAlgorithm()51 BlockBoundaryAlgorithm::~BlockBoundaryAlgorithm() {
52 // for (unsigned int i = 0; i < m_columnScorers.size(); ++i) {
53 // delete m_columnScorers[i];
54 // }
55 // m_columnScorers.clear();
56 }
57
AddScorer(const ColumnScorer * scorer,double ext,double shrink)58 void BlockBoundaryAlgorithm::AddScorer(const ColumnScorer* scorer, double ext, double shrink) {
59 if (scorer) {
60 m_columnScorers.push_back(scorer);
61 m_extensionThresholds.push_back(ext);
62 m_shrinkageThresholds.push_back((shrink == kMax_Double) ? ext : shrink);
63 }
64 }
65
ComputeColumnScores(const BMA & bma,unsigned int minIndex,unsigned int maxIndex,ColScoreMap & scores,unsigned int scorerIndex) const66 void BlockBoundaryAlgorithm::ComputeColumnScores(const BMA& bma, unsigned int minIndex,
67 unsigned int maxIndex, ColScoreMap& scores,
68 unsigned int scorerIndex) const {
69 unsigned int i = minIndex;
70 if (scorerIndex < m_columnScorers.size() && m_columnScorers[scorerIndex]) {
71 for (; i <= maxIndex; ++i) {
72 scores.insert(ColScoreMapValType(i, m_columnScorers[scorerIndex]->ColumnScore(bma, i)));
73 }
74 }
75 }
76
77 /*********************/
78
PassAllTests(const BMA & bma,unsigned int alignmentIndex) const79 bool SimpleBoundaryExtender::PassAllTests(const BMA& bma, unsigned int alignmentIndex) const {
80 bool tryNextScorer;
81 double columnScore;
82 vector<double> rowScoreCache;
83
84 tryNextScorer = true;
85 rowScoreCache.clear();
86 for (unsigned int j = 0; tryNextScorer && j < m_columnScorers.size(); ++j) {
87 columnScore = m_columnScorers[j]->ColumnScore(bma, alignmentIndex, &rowScoreCache);
88 tryNextScorer = (columnScore >= m_extensionThresholds[j]);
89 TRACE_MESSAGE_CL(" (aindex, score, thold, scorer) = (" << alignmentIndex+1 << ", " << columnScore << ", " << m_extensionThresholds[j] << ", " << j << ") PASSED? " << tryNextScorer);
90 }
91 return tryNextScorer;
92 }
93
94
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const95 bool SimpleBoundaryExtender::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
96
97 bool result = false;
98 unsigned int i;
99 unsigned int newFrom = block.from, newTo = block.to;
100 unsigned int minIndex = block.from - block.nExt;
101 unsigned int maxIndex = block.to + block.cExt;
102
103 // Since this class only extends the bounds, do not allow block shrinkage.
104 unsigned int minIndexCore = block.from, maxIndexCore = block.to;
105
106 if (m_columnScorers.size() > 0 && (block.nExt > 0 || block.cExt > 0)) {
107 // extend N-terminal bound until a column score is extension threshold
108 i = minIndexCore-1;
109 while (i >= minIndex) {
110
111 // if all tests are passed, extend
112 TRACE_MESSAGE_CL("GetNewBoundaries: N term check scores for block " << block.blockNum + 1);
113 if (PassAllTests(bma, i)) {
114 newFrom = i;
115 } else {
116 i = minIndex;
117 }
118
119 // If i == 0, --i makes i = kMax_UInt!!
120 if (i == 0) break;
121 --i;
122 }
123
124 // extend C-terminal bound until a column score is extension threshold
125 i = maxIndexCore+1;
126 while (i <= maxIndex) {
127
128 // if all tests are passed, extend
129 TRACE_MESSAGE_CL("GetNewBoundaries: C term check scores for block " << block.blockNum + 1);
130 if (PassAllTests(bma, i)) {
131 newTo = i;
132 } else {
133 i = maxIndex;
134 }
135 ++i;
136 }
137
138 result = (block.from != newFrom || block.to != newTo);
139 if (result) {
140 TRACE_MESSAGE_CL("\nGetNewBoundaries: EXTENSION -- (oldfrom, oldto; newfrom, newto) = (" << block.from+1 << ", " << block.to+1 << "; " << newFrom+1 << ", " << newTo+1 << ")\n");
141 } else {
142 TRACE_MESSAGE_CL("\nGetNewBoundaries: NO EXTENSION -- (oldfrom, oldto) = (" << block.from+1 << ", " << block.to+1 << ")\n");
143 }
144 block.from = newFrom;
145 block.to = newTo;
146 }
147
148 return result;
149 }
150
151
152 /*********************/
153
PassAllTests(const BMA & bma,unsigned int alignmentIndex) const154 bool SimpleBoundaryShrinker::PassAllTests(const BMA& bma, unsigned int alignmentIndex) const {
155 bool tryNextScorer;
156 double columnScore;
157 vector<double> rowScoreCache;
158
159 tryNextScorer = true;
160 rowScoreCache.clear();
161 for (unsigned int j = 0; tryNextScorer && j < m_columnScorers.size(); ++j) {
162 columnScore = m_columnScorers[j]->ColumnScore(bma, alignmentIndex, &rowScoreCache);
163 tryNextScorer = (columnScore < m_shrinkageThresholds[j]);
164 TRACE_MESSAGE_CL(" (aindex, score, thold, scorer) = (" << alignmentIndex+1 << ", " << columnScore << ", " << m_shrinkageThresholds[j] << ", " << j << ") PASSED? " << tryNextScorer);
165 }
166 return tryNextScorer;
167 }
168
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const169 bool SimpleBoundaryShrinker::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
170
171 bool result = false;
172 bool stopShrinking = false;
173 unsigned int newFrom = block.from, newTo = block.to, bwidth = newTo - newFrom + 1;
174
175 // SetDiagStream(&NcbiCout);
176 // EDiagSev oldPostLevel = SetDiagPostLevel(eDiag_Info);
177
178 if (m_columnScorers.size() > 0) {
179
180 // shrink N-terminal bound until a column score exceeds shrinkage threshold,
181 // or block is at the minimum size.
182 while (!stopShrinking && bwidth > m_minBlockSize) {
183 TRACE_MESSAGE_CL("GetNewBoundaries - shrink: N term check scores for block " << block.blockNum + 1);
184 if (PassAllTests(bma, newFrom)) {
185 ++newFrom;
186 --bwidth;
187 TRACE_MESSAGE_CL(" ....n-terminal shrinking " << newFrom << " " << bwidth);
188 } else {
189 stopShrinking = true;
190 }
191 }
192
193 // shrink C-terminal bound until a column score exceeds shrinkage threshold,
194 // or block is at the minimum size.
195 stopShrinking = false;
196 while (!stopShrinking && bwidth > m_minBlockSize) {
197 TRACE_MESSAGE_CL("GetNewBoundaries - shrink: C term check scores for block " << block.blockNum + 1);
198 if (PassAllTests(bma, newTo)) {
199 --newTo;
200 --bwidth;
201 TRACE_MESSAGE_CL(" ....c-terminal shrinking " << newTo << " " << bwidth);
202 } else {
203 stopShrinking = true;
204 }
205 }
206
207
208 result = (block.from != newFrom || block.to != newTo);
209 if (result) {
210 TRACE_MESSAGE_CL("\nGetNewBoundaries: SHRUNK -- (oldfrom, oldto; newfrom, newto) = (" << block.from+1 << ", " << block.to+1 << "; " << newFrom+1 << ", " << newTo+1 << ")\n");
211 //cout << "\nGetNewBoundaries: SHRUNK -- (oldfrom, oldto; newfrom, newto) = (" << block.from << ", " << block.to << "; " << newFrom << ", " << newTo << ")\n";
212 } else {
213 TRACE_MESSAGE_CL("\nGetNewBoundaries: NO SHRINK -- (oldfrom, oldto) = (" << block.from+1 << ", " << block.to+1 << ")\n");
214 //cout << "\nGetNewBoundaries: NO SHRINK -- (oldfrom, oldto) = (" << block.from << ", " << block.to << ")\n";
215 }
216
217 // Note: if the block disappears, newFrom > newTo
218 block.from = newFrom;
219 block.to = newTo;
220 if (newFrom > newTo) TRACE_MESSAGE_CL("********** deletion....\n");
221 }
222
223 // SetDiagPostLevel(oldPostLevel);
224 // SetDiagStream(&NcbiCerr);
225
226 return result;
227 }
228
229 /*********************/
230
SimpleBoundaryExtenderAndShrinker(bool extendFirst,const ColumnScorer * scorer,double ext,double shrink,const ColumnScorer * secondScorer,double ext2,double shrink2)231 SimpleBoundaryExtenderAndShrinker::SimpleBoundaryExtenderAndShrinker(bool extendFirst, const ColumnScorer* scorer, double ext, double shrink, const ColumnScorer* secondScorer, double ext2, double shrink2) : BlockBoundaryAlgorithm(scorer, ext, shrink, eSimpleExtendAndShrink), m_extendFirst(extendFirst) {
232
233 m_canBlocksShrink = true;
234 if (scorer) {
235 m_secondColumnScorers.push_back((secondScorer) ? secondScorer : scorer);
236 m_secondExtensionThresholds.push_back((secondScorer) ? ext2 : ext);
237 m_secondShrinkageThresholds.push_back((secondScorer) ? shrink2 : shrink);
238 }
239 }
240
~SimpleBoundaryExtenderAndShrinker()241 SimpleBoundaryExtenderAndShrinker::~SimpleBoundaryExtenderAndShrinker() {
242 // most often both scorers will be the same. already cleaning up the first scorer
243 // in the parent class; if clean up here too will have double-delete problems there if
244 // I'm not careful...
245 // for (unsigned int i = 0; i < m_columnScorers.size(); ++i) {
246 // delete m_columnScorers[i];
247 // }
248 // m_columnScorers.clear();
249 };
250
AddScorer(const ColumnScorer * scorer,double ext,double shrink)251 void SimpleBoundaryExtenderAndShrinker::AddScorer(const ColumnScorer* scorer, double ext, double shrink) {
252
253 AddFirstScorer(scorer, ext, shrink);
254 AddSecondScorer(scorer, ext, shrink);
255 }
256
AddFirstScorer(const ColumnScorer * scorer,double ext,double shrink)257 void SimpleBoundaryExtenderAndShrinker::AddFirstScorer(const ColumnScorer* scorer, double ext, double shrink) {
258
259 if (scorer) {
260 m_columnScorers.push_back(scorer);
261 m_extensionThresholds.push_back(ext);
262 m_shrinkageThresholds.push_back((shrink == kMax_Double) ? ext : shrink);
263 }
264 }
265
AddSecondScorer(const ColumnScorer * scorer2,double ext2,double shrink2)266 void SimpleBoundaryExtenderAndShrinker::AddSecondScorer(const ColumnScorer* scorer2, double ext2, double shrink2) {
267
268 if (scorer2) {
269 m_secondColumnScorers.push_back(scorer2);
270 m_secondExtensionThresholds.push_back(ext2);
271 m_secondShrinkageThresholds.push_back((shrink2 == kMax_Double) ? ext2 : shrink2);
272 }
273 }
274
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const275 bool SimpleBoundaryExtenderAndShrinker::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
276
277 unsigned int i;
278 ExtendableBlock origBlock = block;
279 BlockBoundaryAlgorithm* firstAlgorithm;
280 BlockBoundaryAlgorithm* secondAlgorithm;
281 // const ColumnScorer* secondScorer = (m_secondColumnScorer) ? m_secondColumnScorer : m_columnScorer;
282
283 // Create with NULL scorer to make this a bit less verbose; nulls are not actually added.
284 if (m_extendFirst) {
285 firstAlgorithm = new SimpleBoundaryExtender(NULL, 0);
286 secondAlgorithm = new SimpleBoundaryShrinker(NULL, 0);
287 } else {
288 firstAlgorithm = new SimpleBoundaryShrinker(NULL, 0);
289 secondAlgorithm = new SimpleBoundaryExtender(NULL, 0);
290 }
291
292 if (!firstAlgorithm || !secondAlgorithm) return false;
293
294 firstAlgorithm->SetMinBlockSize(m_minBlockSize);
295 for (i = 0; i < m_columnScorers.size(); ++i) {
296 firstAlgorithm->AddScorer(m_columnScorers[i], m_extensionThresholds[i], m_shrinkageThresholds[i]);
297 if (firstAlgorithm->CanBlocksShrink()) {
298 TRACE_MESSAGE_CL("GetNewBoundaries - Extend or Shrink algorithm: try to shrink block " << block.blockNum+1 << " before trying to extend.");
299 } else {
300 TRACE_MESSAGE_CL("GetNewBoundaries - Extend or Shrink algorithm: try to extend block " << block.blockNum+1 << " before trying to shrink.");
301 }
302 }
303
304
305 bool result = firstAlgorithm->GetNewBoundaries(block, bma);
306
307 // If the first step performed a shrink, need to adjust the maximal extensions as
308 // the block can now be extended further due to the shrinking.
309 if (firstAlgorithm->CanBlocksShrink() && result) {
310 if (block.from > origBlock.from) block.nExt += (block.from - origBlock.from);
311 if (block.to < origBlock.to) block.cExt += (origBlock.to - block.to);
312 }
313
314 // Do the second algorithm unless a block deletion recommended.
315 if (block.from <= block.to) {
316 secondAlgorithm->SetMinBlockSize(m_minBlockSize);
317 for (i = 0; i < m_secondColumnScorers.size(); ++i) {
318 secondAlgorithm->AddScorer(m_secondColumnScorers[i], m_secondExtensionThresholds[i],
319 m_secondShrinkageThresholds[i]);
320 }
321 result |= secondAlgorithm->GetNewBoundaries(block, bma);
322 }
323
324 if (!result)
325 TRACE_MESSAGE_CL("GetNewBoundaries - Extend or Shrink algorithm: did NOT extend or shrink block " << block.blockNum+1 << ".");
326
327 delete firstAlgorithm;
328 delete secondAlgorithm;
329
330 // Until they reflect the possible extensions for the new block boundaries, don't
331 // surprise the caller w/ meaningless new values for n/cExt.
332 block.nExt = origBlock.nExt;
333 block.cExt = origBlock.cExt;
334
335 return result;
336 }
337
338
339 /*********************/
340
341
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const342 bool GreedyBoundaryExtender::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
343
344 // For each scorer (vector index), map the column scores by alignment index.
345 typedef map< unsigned int, double > TMap;
346 typedef map< unsigned int, double >::value_type TMapVT;
347 vector< TMap > columnScores;
348
349 // When multiple scorers, don't keep getting row scores over and over for each scorer.
350 vector< double > rowScoreCache;
351
352 bool result = false;
353 bool oneScorer = (m_columnScorers.size() == 1);
354 unsigned int i;
355 unsigned int bestNewFrom, bestNewTo;
356 unsigned int newFrom, newTo;
357 unsigned int minIndex = block.from - block.nExt;
358 unsigned int maxIndex = block.to + block.cExt;
359 unsigned int baseBlockSize = block.to - block.from + 1, extendedBlockSize;
360 vector<unsigned int> newFroms, newTos;
361 double baseBlockScore = 0.0, extendedBlockScore, avgExtBlockScore, columnScore, bestScore;
362
363 // Since this class only extends the bounds, do not allow block shrinkage.
364 unsigned int minIndexCore = block.from, maxIndexCore = block.to;
365
366 if (m_columnScorers.size() == 0 || (block.nExt <= 0 && block.cExt <= 0)) {
367 return result;
368 }
369 if (baseBlockSize <= 0) {
370 ERROR_MESSAGE_CL(" Bad block size error for block " << block.blockNum + 1 << ": min = " << minIndexCore << " and max = " << maxIndexCore);
371 return result;
372 }
373
374 // Make sure there's a map for each scorer.
375 columnScores.resize(m_columnScorers.size());
376
377 // collect all individual column scores needed
378 i = minIndex;
379 while (i <= maxIndex) {
380 rowScoreCache.clear();
381 for (unsigned int j = 0; j < m_columnScorers.size(); ++j) {
382 columnScore = m_columnScorers[j]->ColumnScore(bma, i, (oneScorer) ? NULL : &rowScoreCache);
383 columnScores[j].insert(TMapVT(i, columnScore));
384 if (i >= minIndexCore && i <= maxIndexCore) {
385 baseBlockScore += columnScore;
386 TRACE_MESSAGE_CL(" scorer " << j+1 << ": existing column " << setw(4) << i << "; score " << setprecision(4) << columnScore);
387 } else {
388 TRACE_MESSAGE_CL(" scorer " << j+1 << ": column " << setw(4) << i << "; score " << setprecision(4) << columnScore);
389 }
390
391 }
392 ++i;
393 }
394
395 // use column scorer 'IsBetterThan' method to determine the best position among all scores
396 // -- settle ties by taking the larger extension
397
398 bestNewFrom = minIndexCore;
399 bestNewTo = maxIndexCore;
400 TRACE_MESSAGE_CL(" GreedyExtend: look at block at (" << bestNewFrom << ", " << bestNewTo << ")");
401 for (unsigned int j = 0; j < m_columnScorers.size(); ++j) {
402 TMap& scores = columnScores[j];
403
404 // Find best N-terminal score.
405 newFrom = minIndexCore; // original boundary
406 bestScore = baseBlockScore / baseBlockSize;
407 extendedBlockScore = baseBlockScore;
408 extendedBlockSize = baseBlockSize;
409 i = minIndexCore - 1;
410 TRACE_MESSAGE_CL(" scorer " << j+1 << ": N-terminal base score = " << bestScore);
411 while (i >= minIndex) {
412 ++extendedBlockSize;
413 extendedBlockScore += scores[i];
414 avgExtBlockScore = extendedBlockScore/extendedBlockSize;
415 TRACE_MESSAGE_CL(" index " << i << ": extended score = " << avgExtBlockScore);
416 if (m_columnScorers[j]->IsBetterThan(avgExtBlockScore, bestScore) >= 0 && MeetsExtensionThreshold(avgExtBlockScore, j)) {
417 bestScore = avgExtBlockScore;
418 newFrom = i;
419 TRACE_MESSAGE_CL(" ==> new N-terminus chosen at " << i);
420 }
421
422 // If i == 0, --i makes i = kMax_UInt!!
423 if (i == 0) break;
424 --i;
425 }
426 newFroms.push_back(newFrom);
427
428 // Find best C-terminal score.
429 newTo = maxIndexCore; // original boundary
430 bestScore = baseBlockScore / baseBlockSize;
431 extendedBlockScore = baseBlockScore;
432 extendedBlockSize = baseBlockSize;
433 i = maxIndexCore + 1;
434 TRACE_MESSAGE_CL(" scorer " << j+1 << ": C-terminal base score = " << bestScore);
435 while (i <= maxIndex) {
436 ++extendedBlockSize;
437 extendedBlockScore += scores[i];
438 avgExtBlockScore = extendedBlockScore/extendedBlockSize;
439 TRACE_MESSAGE_CL(" index " << i << ": extended score = " << avgExtBlockScore);
440 if (m_columnScorers[j]->IsBetterThan(avgExtBlockScore, bestScore) >= 0 && MeetsExtensionThreshold(avgExtBlockScore, j)) {
441 bestScore = avgExtBlockScore;
442 newTo = i;
443 TRACE_MESSAGE_CL(" ==> new C-terminus chosen at " << i);
444 }
445 ++i;
446 }
447 newTos.push_back(newTo);
448
449 // With multiple scorers, choose the largest extension.
450 if (newFrom < bestNewFrom) {
451 bestNewFrom = newFrom;
452 }
453 if (newTo > bestNewTo) {
454 bestNewTo = newTo;
455 }
456 }
457
458 result = (block.from != bestNewFrom || block.to != bestNewTo);
459 if (result) {
460 TRACE_MESSAGE_CL("\nGetNewBoundaries: EXTENSION -- (oldfrom, oldto; newfrom, newto) = (" << block.from+1 << ", " << block.to+1 << "; " << bestNewFrom+1 << ", " << bestNewTo+1 << ")\n");
461 block.from = bestNewFrom;
462 block.to = bestNewTo;
463 } else {
464 TRACE_MESSAGE_CL("\nGetNewBoundaries: NO EXTENSION -- (oldfrom, oldto) = (" << block.from+1 << ", " << block.to+1 << ")\n");
465 }
466
467 return result;
468 }
469
MeetsExtensionThreshold(double score,unsigned int scorerIndex) const470 bool GreedyBoundaryExtender::MeetsExtensionThreshold(double score, unsigned int scorerIndex) const {
471 bool result = false;
472 if (scorerIndex < m_columnScorers.size()) {
473 if (score != ColumnScorer::SCORE_INVALID_OR_NOT_COMPUTED) {
474 result = (m_columnScorers[scorerIndex]->IsBetterThan(score, m_extensionThresholds[scorerIndex]) >= 0);
475 }
476 }
477 return result;
478 }
479
480 END_SCOPE(align_refine)
481