1 /*  $Id: BlockBoundaryAlgorithm.cpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Chris Lanczycki
27 *
28 * File Description:
29 *           Base class for objects that select new block boundaries for
30 *           an AlignmentUtility based on some concrete algorithm.  Should
31 *           be configured with a ColumnScorer concrete instance.
32 *           File also contains virtually inherited subclasses.
33 *
34 * ===========================================================================
35 */
36 
37 #include <ncbi_pch.hpp>
38 #include <algo/structure/bma_refine/BlockBoundaryAlgorithm.hpp>
39 
40 USING_NCBI_SCOPE;
41 
42 BEGIN_SCOPE(align_refine)
43 
44 const unsigned int BlockBoundaryAlgorithm::DEFAULT_MIN_BLOCK_SIZE = 1;
45 
BlockBoundaryAlgorithm(const ColumnScorer * scorer,double ext,double shrink,BlockBoundaryAlgorithmMethod method)46 BlockBoundaryAlgorithm::BlockBoundaryAlgorithm(const ColumnScorer* scorer, double ext, double shrink, BlockBoundaryAlgorithmMethod method)
47     : m_boundaryMethod(method), m_minBlockSize(DEFAULT_MIN_BLOCK_SIZE), m_canBlocksShrink(false) {
48     AddScorer(scorer, ext, shrink);
49 }
50 
~BlockBoundaryAlgorithm()51 BlockBoundaryAlgorithm::~BlockBoundaryAlgorithm() {
52 //    for (unsigned int i = 0; i < m_columnScorers.size(); ++i) {
53 //        delete m_columnScorers[i];
54 //    }
55 //    m_columnScorers.clear();
56 }
57 
AddScorer(const ColumnScorer * scorer,double ext,double shrink)58 void BlockBoundaryAlgorithm::AddScorer(const ColumnScorer* scorer, double ext, double shrink) {
59     if (scorer) {
60         m_columnScorers.push_back(scorer);
61         m_extensionThresholds.push_back(ext);
62         m_shrinkageThresholds.push_back((shrink == kMax_Double) ? ext : shrink);
63     }
64 }
65 
ComputeColumnScores(const BMA & bma,unsigned int minIndex,unsigned int maxIndex,ColScoreMap & scores,unsigned int scorerIndex) const66 void BlockBoundaryAlgorithm::ComputeColumnScores(const BMA& bma, unsigned int minIndex,
67                                                  unsigned int maxIndex, ColScoreMap& scores,
68                                                  unsigned int scorerIndex) const {
69     unsigned int i = minIndex;
70     if (scorerIndex < m_columnScorers.size() && m_columnScorers[scorerIndex]) {
71         for (; i <= maxIndex; ++i) {
72             scores.insert(ColScoreMapValType(i, m_columnScorers[scorerIndex]->ColumnScore(bma, i)));
73         }
74     }
75 }
76 
77 /*********************/
78 
PassAllTests(const BMA & bma,unsigned int alignmentIndex) const79 bool SimpleBoundaryExtender::PassAllTests(const BMA& bma, unsigned int alignmentIndex) const {
80     bool tryNextScorer;
81     double columnScore;
82     vector<double> rowScoreCache;
83 
84     tryNextScorer = true;
85     rowScoreCache.clear();
86     for (unsigned int j = 0; tryNextScorer && j < m_columnScorers.size(); ++j) {
87         columnScore = m_columnScorers[j]->ColumnScore(bma, alignmentIndex, &rowScoreCache);
88         tryNextScorer = (columnScore >= m_extensionThresholds[j]);
89         TRACE_MESSAGE_CL("          (aindex, score, thold, scorer) = (" << alignmentIndex+1 << ", " << columnScore << ", " << m_extensionThresholds[j] << ", " << j << ") PASSED? " << tryNextScorer);
90     }
91     return tryNextScorer;
92 }
93 
94 
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const95 bool SimpleBoundaryExtender::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
96 
97     bool result = false;
98     unsigned int i;
99     unsigned int newFrom  = block.from, newTo = block.to;
100     unsigned int minIndex = block.from - block.nExt;
101     unsigned int maxIndex = block.to   + block.cExt;
102 
103     //  Since this class only extends the bounds, do not allow block shrinkage.
104     unsigned int minIndexCore = block.from, maxIndexCore = block.to;
105 
106     if (m_columnScorers.size() > 0 && (block.nExt > 0 || block.cExt > 0)) {
107         //  extend N-terminal bound until a column score is extension threshold
108         i = minIndexCore-1;
109         while (i >= minIndex) {
110 
111             // if all tests are passed, extend
112             TRACE_MESSAGE_CL("GetNewBoundaries:  N term check scores for block " << block.blockNum + 1);
113             if (PassAllTests(bma, i)) {
114                 newFrom = i;
115             } else {
116                 i = minIndex;
117             }
118 
119             //  If i == 0, --i makes i = kMax_UInt!!
120             if (i == 0) break;
121             --i;
122         }
123 
124         //  extend C-terminal bound until a column score is extension threshold
125         i = maxIndexCore+1;
126         while (i <= maxIndex) {
127 
128             // if all tests are passed, extend
129             TRACE_MESSAGE_CL("GetNewBoundaries:  C term check scores for block " << block.blockNum + 1);
130             if (PassAllTests(bma, i)) {
131                 newTo = i;
132             } else {
133                 i = maxIndex;
134             }
135             ++i;
136         }
137 
138         result = (block.from != newFrom || block.to != newTo);
139         if (result) {
140             TRACE_MESSAGE_CL("\nGetNewBoundaries:  EXTENSION -- (oldfrom, oldto; newfrom, newto) = (" << block.from+1 << ", "  << block.to+1 << "; " << newFrom+1 << ", " << newTo+1 << ")\n");
141         } else {
142             TRACE_MESSAGE_CL("\nGetNewBoundaries:  NO EXTENSION -- (oldfrom, oldto) = (" << block.from+1 << ", " << block.to+1 << ")\n");
143         }
144         block.from = newFrom;
145         block.to   = newTo;
146     }
147 
148     return result;
149 }
150 
151 
152 /*********************/
153 
PassAllTests(const BMA & bma,unsigned int alignmentIndex) const154 bool SimpleBoundaryShrinker::PassAllTests(const BMA& bma, unsigned int alignmentIndex) const {
155     bool tryNextScorer;
156     double columnScore;
157     vector<double> rowScoreCache;
158 
159     tryNextScorer = true;
160     rowScoreCache.clear();
161     for (unsigned int j = 0; tryNextScorer && j < m_columnScorers.size(); ++j) {
162         columnScore = m_columnScorers[j]->ColumnScore(bma, alignmentIndex, &rowScoreCache);
163         tryNextScorer = (columnScore < m_shrinkageThresholds[j]);
164         TRACE_MESSAGE_CL("          (aindex, score, thold, scorer) = (" << alignmentIndex+1 << ", " << columnScore << ", " << m_shrinkageThresholds[j] << ", " << j << ") PASSED? " << tryNextScorer);
165     }
166     return tryNextScorer;
167 }
168 
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const169 bool SimpleBoundaryShrinker::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
170 
171     bool result = false;
172     bool stopShrinking = false;
173     unsigned int newFrom  = block.from, newTo = block.to, bwidth = newTo - newFrom + 1;
174 
175 //    SetDiagStream(&NcbiCout);
176 //    EDiagSev oldPostLevel = SetDiagPostLevel(eDiag_Info);
177 
178     if (m_columnScorers.size() > 0) {
179 
180         //  shrink N-terminal bound until a column score exceeds shrinkage threshold,
181         //  or block is at the minimum size.
182         while (!stopShrinking && bwidth > m_minBlockSize) {
183             TRACE_MESSAGE_CL("GetNewBoundaries - shrink:  N term check scores for block " << block.blockNum + 1);
184             if (PassAllTests(bma, newFrom)) {
185                 ++newFrom;
186                 --bwidth;
187                 TRACE_MESSAGE_CL("     ....n-terminal shrinking " << newFrom << " " << bwidth);
188             } else {
189                 stopShrinking = true;
190             }
191         }
192 
193         //  shrink C-terminal bound until a column score exceeds shrinkage threshold,
194         //  or block is at the minimum size.
195         stopShrinking = false;
196         while (!stopShrinking && bwidth > m_minBlockSize) {
197             TRACE_MESSAGE_CL("GetNewBoundaries - shrink:  C term check scores for block " << block.blockNum + 1);
198             if (PassAllTests(bma, newTo)) {
199                 --newTo;
200                 --bwidth;
201                 TRACE_MESSAGE_CL("     ....c-terminal shrinking " << newTo << " " << bwidth);
202             } else {
203                 stopShrinking = true;
204             }
205         }
206 
207 
208         result = (block.from != newFrom || block.to != newTo);
209         if (result) {
210             TRACE_MESSAGE_CL("\nGetNewBoundaries:  SHRUNK -- (oldfrom, oldto; newfrom, newto) = (" << block.from+1 << ", "  << block.to+1 << "; " << newFrom+1 << ", " << newTo+1 << ")\n");
211             //cout << "\nGetNewBoundaries:  SHRUNK -- (oldfrom, oldto; newfrom, newto) = (" << block.from << ", "  << block.to << "; " << newFrom << ", " << newTo << ")\n";
212         } else {
213             TRACE_MESSAGE_CL("\nGetNewBoundaries:  NO SHRINK -- (oldfrom, oldto) = (" << block.from+1 << ", " << block.to+1 << ")\n");
214             //cout << "\nGetNewBoundaries:  NO SHRINK -- (oldfrom, oldto) = (" << block.from << ", " << block.to << ")\n";
215         }
216 
217         //  Note:  if the block disappears, newFrom > newTo
218         block.from = newFrom;
219         block.to   = newTo;
220         if (newFrom > newTo) TRACE_MESSAGE_CL("**********  deletion....\n");
221     }
222 
223 //    SetDiagPostLevel(oldPostLevel);
224 //    SetDiagStream(&NcbiCerr);
225 
226     return result;
227 }
228 
229 /*********************/
230 
SimpleBoundaryExtenderAndShrinker(bool extendFirst,const ColumnScorer * scorer,double ext,double shrink,const ColumnScorer * secondScorer,double ext2,double shrink2)231 SimpleBoundaryExtenderAndShrinker::SimpleBoundaryExtenderAndShrinker(bool extendFirst, const ColumnScorer* scorer, double ext, double shrink, const ColumnScorer* secondScorer, double ext2, double shrink2) : BlockBoundaryAlgorithm(scorer, ext, shrink, eSimpleExtendAndShrink), m_extendFirst(extendFirst) {
232 
233     m_canBlocksShrink = true;
234     if (scorer) {
235         m_secondColumnScorers.push_back((secondScorer) ? secondScorer : scorer);
236         m_secondExtensionThresholds.push_back((secondScorer) ? ext2 : ext);
237         m_secondShrinkageThresholds.push_back((secondScorer) ? shrink2 : shrink);
238     }
239 }
240 
~SimpleBoundaryExtenderAndShrinker()241 SimpleBoundaryExtenderAndShrinker::~SimpleBoundaryExtenderAndShrinker() {
242 //  most often both scorers will be the same.  already cleaning up the first scorer
243 //  in the parent class; if clean up here too will have double-delete problems there if
244 //  I'm not careful...
245 //    for (unsigned int i = 0; i < m_columnScorers.size(); ++i) {
246 //        delete m_columnScorers[i];
247 //    }
248 //    m_columnScorers.clear();
249 };
250 
AddScorer(const ColumnScorer * scorer,double ext,double shrink)251 void SimpleBoundaryExtenderAndShrinker::AddScorer(const ColumnScorer* scorer, double ext, double shrink) {
252 
253     AddFirstScorer(scorer, ext, shrink);
254     AddSecondScorer(scorer, ext, shrink);
255 }
256 
AddFirstScorer(const ColumnScorer * scorer,double ext,double shrink)257 void SimpleBoundaryExtenderAndShrinker::AddFirstScorer(const ColumnScorer* scorer, double ext, double shrink) {
258 
259     if (scorer) {
260         m_columnScorers.push_back(scorer);
261         m_extensionThresholds.push_back(ext);
262         m_shrinkageThresholds.push_back((shrink == kMax_Double) ? ext : shrink);
263     }
264 }
265 
AddSecondScorer(const ColumnScorer * scorer2,double ext2,double shrink2)266 void SimpleBoundaryExtenderAndShrinker::AddSecondScorer(const ColumnScorer* scorer2, double ext2, double shrink2) {
267 
268     if (scorer2) {
269         m_secondColumnScorers.push_back(scorer2);
270         m_secondExtensionThresholds.push_back(ext2);
271         m_secondShrinkageThresholds.push_back((shrink2 == kMax_Double) ? ext2 : shrink2);
272     }
273 }
274 
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const275 bool SimpleBoundaryExtenderAndShrinker::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
276 
277     unsigned int i;
278     ExtendableBlock origBlock = block;
279     BlockBoundaryAlgorithm* firstAlgorithm;
280     BlockBoundaryAlgorithm* secondAlgorithm;
281 //    const ColumnScorer* secondScorer = (m_secondColumnScorer) ? m_secondColumnScorer : m_columnScorer;
282 
283     //  Create with NULL scorer to make this a bit less verbose; nulls are not actually added.
284     if (m_extendFirst) {
285         firstAlgorithm  = new SimpleBoundaryExtender(NULL, 0);
286         secondAlgorithm = new SimpleBoundaryShrinker(NULL, 0);
287     } else {
288         firstAlgorithm  = new SimpleBoundaryShrinker(NULL, 0);
289         secondAlgorithm = new SimpleBoundaryExtender(NULL, 0);
290     }
291 
292     if (!firstAlgorithm || !secondAlgorithm) return false;
293 
294     firstAlgorithm->SetMinBlockSize(m_minBlockSize);
295     for (i = 0; i < m_columnScorers.size(); ++i) {
296         firstAlgorithm->AddScorer(m_columnScorers[i], m_extensionThresholds[i], m_shrinkageThresholds[i]);
297         if (firstAlgorithm->CanBlocksShrink()) {
298             TRACE_MESSAGE_CL("GetNewBoundaries - Extend or Shrink algorithm:  try to shrink block " << block.blockNum+1 << " before trying to extend.");
299         } else {
300             TRACE_MESSAGE_CL("GetNewBoundaries - Extend or Shrink algorithm:  try to extend block " << block.blockNum+1 << " before trying to shrink.");
301         }
302     }
303 
304 
305     bool result = firstAlgorithm->GetNewBoundaries(block, bma);
306 
307     //  If the first step performed a shrink, need to adjust the maximal extensions as
308     //  the block can now be extended further due to the shrinking.
309     if (firstAlgorithm->CanBlocksShrink() && result) {
310         if (block.from > origBlock.from) block.nExt += (block.from - origBlock.from);
311         if (block.to < origBlock.to) block.cExt += (origBlock.to - block.to);
312     }
313 
314     //  Do the second algorithm unless a block deletion recommended.
315     if (block.from <= block.to) {
316         secondAlgorithm->SetMinBlockSize(m_minBlockSize);
317         for (i = 0; i < m_secondColumnScorers.size(); ++i) {
318             secondAlgorithm->AddScorer(m_secondColumnScorers[i], m_secondExtensionThresholds[i],
319                                       m_secondShrinkageThresholds[i]);
320         }
321         result |= secondAlgorithm->GetNewBoundaries(block, bma);
322     }
323 
324     if (!result)
325         TRACE_MESSAGE_CL("GetNewBoundaries - Extend or Shrink algorithm:  did NOT extend or shrink block " << block.blockNum+1 << ".");
326 
327     delete firstAlgorithm;
328     delete secondAlgorithm;
329 
330     //  Until they reflect the possible extensions for the new block boundaries, don't
331     //  surprise the caller w/ meaningless new values for n/cExt.
332     block.nExt = origBlock.nExt;
333     block.cExt = origBlock.cExt;
334 
335     return result;
336 }
337 
338 
339 /*********************/
340 
341 
GetNewBoundaries(ExtendableBlock & block,const BMA & bma) const342 bool GreedyBoundaryExtender::GetNewBoundaries(ExtendableBlock& block, const BMA& bma) const {
343 
344     //  For each scorer (vector index), map the column scores by alignment index.
345     typedef map< unsigned int, double > TMap;
346     typedef map< unsigned int, double >::value_type TMapVT;
347     vector< TMap > columnScores;
348 
349     //  When multiple scorers, don't keep getting row scores over and over for each scorer.
350     vector< double > rowScoreCache;
351 
352     bool result = false;
353     bool oneScorer = (m_columnScorers.size() == 1);
354     unsigned int i;
355     unsigned int bestNewFrom, bestNewTo;
356     unsigned int newFrom, newTo;
357     unsigned int minIndex = block.from - block.nExt;
358     unsigned int maxIndex = block.to   + block.cExt;
359     unsigned int baseBlockSize = block.to - block.from + 1, extendedBlockSize;
360     vector<unsigned int> newFroms, newTos;
361     double baseBlockScore = 0.0, extendedBlockScore, avgExtBlockScore, columnScore, bestScore;
362 
363     //  Since this class only extends the bounds, do not allow block shrinkage.
364     unsigned int minIndexCore = block.from, maxIndexCore = block.to;
365 
366     if (m_columnScorers.size() == 0 || (block.nExt <= 0 && block.cExt <= 0)) {
367         return result;
368     }
369     if (baseBlockSize <= 0) {
370         ERROR_MESSAGE_CL("    Bad block size error for block " << block.blockNum + 1 << ":  min = " << minIndexCore << " and  max = " << maxIndexCore);
371         return result;
372     }
373 
374     //  Make sure there's a map for each scorer.
375     columnScores.resize(m_columnScorers.size());
376 
377     //  collect all individual column scores needed
378     i = minIndex;
379     while (i <= maxIndex) {
380         rowScoreCache.clear();
381         for (unsigned int j = 0; j < m_columnScorers.size(); ++j) {
382             columnScore = m_columnScorers[j]->ColumnScore(bma, i, (oneScorer) ? NULL : &rowScoreCache);
383             columnScores[j].insert(TMapVT(i, columnScore));
384             if (i >= minIndexCore && i <= maxIndexCore) {
385                 baseBlockScore += columnScore;
386                 TRACE_MESSAGE_CL("       scorer " << j+1 << ":  existing column " << setw(4) << i << "; score " << setprecision(4) << columnScore);
387             } else {
388                 TRACE_MESSAGE_CL("       scorer " << j+1 << ":  column " << setw(4) << i << "; score " << setprecision(4) << columnScore);
389             }
390 
391         }
392         ++i;
393     }
394 
395 //    use column scorer 'IsBetterThan' method to determine the best position among all scores
396 //        -- settle ties by taking the larger extension
397 
398     bestNewFrom = minIndexCore;
399     bestNewTo   = maxIndexCore;
400     TRACE_MESSAGE_CL("   GreedyExtend:  look at block at (" << bestNewFrom << ", " << bestNewTo << ")");
401     for (unsigned int j = 0; j < m_columnScorers.size(); ++j) {
402         TMap& scores = columnScores[j];
403 
404         //  Find best N-terminal score.
405         newFrom   = minIndexCore;     //  original boundary
406         bestScore = baseBlockScore / baseBlockSize;
407         extendedBlockScore = baseBlockScore;
408         extendedBlockSize  = baseBlockSize;
409         i         = minIndexCore - 1;
410         TRACE_MESSAGE_CL("       scorer " << j+1 << ":  N-terminal base score = " << bestScore);
411         while (i >= minIndex) {
412             ++extendedBlockSize;
413             extendedBlockScore += scores[i];
414             avgExtBlockScore = extendedBlockScore/extendedBlockSize;
415             TRACE_MESSAGE_CL("           index " << i << ":  extended score = " << avgExtBlockScore);
416             if (m_columnScorers[j]->IsBetterThan(avgExtBlockScore, bestScore) >= 0 && MeetsExtensionThreshold(avgExtBlockScore, j)) {
417                 bestScore = avgExtBlockScore;
418                 newFrom = i;
419                 TRACE_MESSAGE_CL("           ==> new N-terminus chosen at " << i);
420             }
421 
422             //  If i == 0, --i makes i = kMax_UInt!!
423             if (i == 0) break;
424             --i;
425         }
426         newFroms.push_back(newFrom);
427 
428         //  Find best C-terminal score.
429         newTo     = maxIndexCore;     //  original boundary
430         bestScore = baseBlockScore / baseBlockSize;
431         extendedBlockScore = baseBlockScore;
432         extendedBlockSize  = baseBlockSize;
433         i         = maxIndexCore + 1;
434         TRACE_MESSAGE_CL("       scorer " << j+1 << ":  C-terminal base score = " << bestScore);
435         while (i <= maxIndex) {
436             ++extendedBlockSize;
437             extendedBlockScore += scores[i];
438             avgExtBlockScore = extendedBlockScore/extendedBlockSize;
439             TRACE_MESSAGE_CL("           index " << i << ":  extended score = " << avgExtBlockScore);
440             if (m_columnScorers[j]->IsBetterThan(avgExtBlockScore, bestScore) >= 0 && MeetsExtensionThreshold(avgExtBlockScore, j)) {
441                 bestScore = avgExtBlockScore;
442                 newTo = i;
443                 TRACE_MESSAGE_CL("           ==> new C-terminus chosen at " << i);
444             }
445             ++i;
446         }
447         newTos.push_back(newTo);
448 
449         //  With multiple scorers, choose the largest extension.
450         if (newFrom < bestNewFrom) {
451             bestNewFrom = newFrom;
452         }
453         if (newTo > bestNewTo) {
454             bestNewTo = newTo;
455         }
456     }
457 
458     result = (block.from != bestNewFrom || block.to != bestNewTo);
459     if (result) {
460         TRACE_MESSAGE_CL("\nGetNewBoundaries:  EXTENSION -- (oldfrom, oldto; newfrom, newto) = (" << block.from+1 << ", "  << block.to+1 << "; " << bestNewFrom+1 << ", " << bestNewTo+1 << ")\n");
461         block.from = bestNewFrom;
462         block.to   = bestNewTo;
463     } else {
464         TRACE_MESSAGE_CL("\nGetNewBoundaries:  NO EXTENSION -- (oldfrom, oldto) = (" << block.from+1 << ", " << block.to+1 << ")\n");
465     }
466 
467     return result;
468 }
469 
MeetsExtensionThreshold(double score,unsigned int scorerIndex) const470 bool GreedyBoundaryExtender::MeetsExtensionThreshold(double score, unsigned int scorerIndex) const {
471     bool result = false;
472     if (scorerIndex < m_columnScorers.size()) {
473         if (score != ColumnScorer::SCORE_INVALID_OR_NOT_COMPUTED) {
474             result = (m_columnScorers[scorerIndex]->IsBetterThan(score, m_extensionThresholds[scorerIndex]) >= 0);
475         }
476     }
477     return result;
478 }
479 
480 END_SCOPE(align_refine)
481