1 /*  $Id: struct_util.cpp 600980 2020-01-30 14:49:42Z lanczyck $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Paul Thiessen
27 *
28 * File Description:
29 *      AlignmentUtility class
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <corelib/ncbistd.hpp>
37 
38 #include <set>
39 
40 #include <algo/structure/struct_util/struct_util.hpp>
41 #include <algo/structure/struct_util/su_sequence_set.hpp>
42 #include <algo/structure/struct_util/su_block_multiple_alignment.hpp>
43 #include <objects/seqloc/PDB_seq_id.hpp>
44 #include "su_private.hpp"
45 #include "su_alignment_set.hpp"
46 #include <algo/structure/struct_util/su_pssm.hpp>
47 
48 #include <algo/structure/struct_dp/struct_dp.h>
49 
50 USING_NCBI_SCOPE;
51 USING_SCOPE(objects);
52 
53 
BEGIN_SCOPE(struct_util)54 BEGIN_SCOPE(struct_util)
55 
56 AlignmentUtility::AlignmentUtility(const CSeq_entry& seqEntry, const SeqAnnotList& seqAnnots)
57 {
58     CRef < CSeq_entry > seqEntryCopy(new CSeq_entry());
59     seqEntryCopy->Assign(seqEntry);
60     m_seqEntries.push_back(seqEntryCopy);
61 
62     SeqAnnotList::const_iterator a, ae = seqAnnots.end();
63     for (a=seqAnnots.begin(); a!=ae; ++a) {
64         CRef < CSeq_annot > seqAnnotCopy(new CSeq_annot());
65         seqAnnotCopy->Assign(**a);
66         m_seqAnnots.push_back(seqAnnotCopy);
67     }
68 
69     Init();
70 }
71 
AlignmentUtility(const SeqEntryList & seqEntries,const SeqAnnotList & seqAnnots)72 AlignmentUtility::AlignmentUtility(const SeqEntryList& seqEntries, const SeqAnnotList& seqAnnots)
73 {
74     SeqEntryList::const_iterator s, se = seqEntries.end();
75     for (s=seqEntries.begin(); s!=se; ++s) {
76         CRef < CSeq_entry > seqEntryCopy(new CSeq_entry());
77         seqEntryCopy->Assign(**s);
78         m_seqEntries.push_back(seqEntryCopy);
79     }
80 
81     SeqAnnotList::const_iterator a, ae = seqAnnots.end();
82     for (a=seqAnnots.begin(); a!=ae; ++a) {
83         CRef < CSeq_annot > seqAnnotCopy(new CSeq_annot());
84         seqAnnotCopy->Assign(**a);
85         m_seqAnnots.push_back(seqAnnotCopy);
86     }
87 
88     Init();
89 }
90 
Init(void)91 void AlignmentUtility::Init(void)
92 {
93     m_sequenceSet = NULL;
94     m_alignmentSet = NULL;
95     m_okay = true;
96     m_currentMultiple = NULL;
97 
98     bool prevState = CException::EnableBackgroundReporting(false);
99 
100     try {
101         m_sequenceSet = new SequenceSet(m_seqEntries);
102         m_alignmentSet = new AlignmentSet(m_seqAnnots, *m_sequenceSet);
103     } catch (CException& e) {
104         ERROR_MESSAGE("exception during AlignmentUtility initialization: " << e.GetMsg());
105         m_okay = false;
106     }
107 
108     CException::EnableBackgroundReporting(prevState);
109 }
110 
~AlignmentUtility()111 AlignmentUtility::~AlignmentUtility()
112 {
113     if (m_sequenceSet)
114         delete m_sequenceSet;
115     if (m_alignmentSet)
116         delete m_alignmentSet;
117     if (m_currentMultiple)
118         delete m_currentMultiple;
119 }
120 
Clone() const121 AlignmentUtility* AlignmentUtility::Clone() const {
122     const SeqAnnotList& seqAnnots = const_cast<AlignmentUtility*>(this)->GetSeqAnnots();
123     return new AlignmentUtility(m_seqEntries, seqAnnots);
124 }
125 
AlignedToAllSlaves(unsigned int masterResidue,const AlignmentSet::AlignmentList & alignments)126 static bool AlignedToAllSlaves(unsigned int masterResidue, const AlignmentSet::AlignmentList& alignments)
127 {
128     AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
129     for (a=alignments.begin(); a!=ae; ++a) {
130         if ((*a)->m_masterToSlave[masterResidue] == MasterSlaveAlignment::eUnaligned)
131             return false;
132     }
133     return true;
134 }
135 
NoSlaveInsertionsBetween(unsigned int masterFrom,unsigned int masterTo,const AlignmentSet::AlignmentList & alignments)136 static bool NoSlaveInsertionsBetween(unsigned int masterFrom, unsigned int masterTo,
137     const AlignmentSet::AlignmentList& alignments)
138 {
139     AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
140     for (a=alignments.begin(); a!=ae; ++a) {
141         if (((*a)->m_masterToSlave[masterTo] - (*a)->m_masterToSlave[masterFrom]) != (masterTo - masterFrom))
142             return false;
143     }
144     return true;
145 }
146 
NoBlockBoundariesBetween(unsigned int masterFrom,unsigned int masterTo,const AlignmentSet::AlignmentList & alignments)147 static bool NoBlockBoundariesBetween(unsigned int masterFrom, unsigned int masterTo,
148     const AlignmentSet::AlignmentList& alignments)
149 {
150     AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
151     for (a=alignments.begin(); a!=ae; ++a) {
152         if ((*a)->m_blockStructure[masterTo] != (*a)->m_blockStructure[masterFrom])
153             return false;
154     }
155     return true;
156 }
157 
DoIBM(void)158 bool AlignmentUtility::DoIBM(void)
159 {
160     if (m_currentMultiple) {
161         WARNING_MESSAGE("DoIBM() - already have a blocked alignment");
162         return true;
163     }
164     if (!m_alignmentSet || m_seqAnnots.size() == 0) {
165         ERROR_MESSAGE("DoIBM() - no alignment data present");
166         return false;
167     }
168 
169     TRACE_MESSAGE("doing IBM");
170     const AlignmentSet::AlignmentList& alignments = m_alignmentSet->m_alignments;
171     AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
172 
173     // create sequence list; fill with sequences of master + slaves
174     BlockMultipleAlignment::SequenceList sequenceList(alignments.size() + 1);
175     BlockMultipleAlignment::SequenceList::iterator s = sequenceList.begin();
176     *(s++) = alignments.front()->m_master;
177     for (a=alignments.begin(); a!=ae; ++a) {
178         *(s++) = (*a)->m_slave;
179         if ((*a)->m_master != sequenceList.front()) {
180             ERROR_MESSAGE("AlignmentUtility::DoIBM() - all pairwise alignments must have the same master sequence");
181             return false;
182         }
183     }
184     CRef < BlockMultipleAlignment > multipleAlignment(new BlockMultipleAlignment(sequenceList));
185 
186     // each block is a continuous region on the master, over which each master
187     // residue is aligned to a residue of each slave, and where there are no
188     // insertions relative to the master in any of the slaves
189     unsigned int masterFrom = 0, masterTo, row;
190     UngappedAlignedBlock *newBlock;
191 
192     while (masterFrom < multipleAlignment->GetMaster()->Length()) {
193 
194         // look for first all-aligned residue
195         if (!AlignedToAllSlaves(masterFrom, alignments)) {
196             ++masterFrom;
197             continue;
198         }
199 
200         // find all next continuous all-aligned residues, but checking for
201         // block boundaries from the original master-slave pairs, so that
202         // blocks don't get merged
203         for (masterTo=masterFrom+1;
204                 masterTo < multipleAlignment->GetMaster()->Length() &&
205                 AlignedToAllSlaves(masterTo, alignments) &&
206                 NoSlaveInsertionsBetween(masterFrom, masterTo, alignments) &&
207                 NoBlockBoundariesBetween(masterFrom, masterTo, alignments);
208              ++masterTo) ;
209         --masterTo; // after loop, masterTo = first residue past block
210 
211         // create new block with ranges from master and all slaves
212         newBlock = new UngappedAlignedBlock(multipleAlignment);
213         newBlock->SetRangeOfRow(0, masterFrom, masterTo);
214         newBlock->m_width = masterTo - masterFrom + 1;
215 
216         for (a=alignments.begin(), row=1; a!=ae; ++a, ++row) {
217             newBlock->SetRangeOfRow(row,
218                 (*a)->m_masterToSlave[masterFrom],
219                 (*a)->m_masterToSlave[masterTo]);
220         }
221 
222         // copy new block into alignment
223         multipleAlignment->AddAlignedBlockAtEnd(newBlock);
224 
225         // start looking for next block
226         masterFrom = masterTo + 1;
227     }
228 
229     if (!multipleAlignment->AddUnalignedBlocks() || !multipleAlignment->UpdateBlockMap()) {
230         ERROR_MESSAGE("AlignmentUtility::DoIBM() - error finalizing alignment");
231         return false;
232     }
233 
234     // switch data to the new multiple alignment
235     m_currentMultiple = multipleAlignment.Release();
236     RemoveAlignAnnot();
237     return true;
238 }
239 
240 // global stuff for DP block aligner score callback
241 DP_BlockInfo *g_dpBlocks = NULL;
242 const BLAST_Matrix *g_dpPSSM = NULL;
243 const Sequence *g_dpQuery = NULL;
244 
245 // sum of scores for residue vs. PSSM
246 extern "C" {
ScoreByPSSM(unsigned int block,unsigned int queryPos)247 int ScoreByPSSM(unsigned int block, unsigned int queryPos)
248 {
249     if (!g_dpBlocks || !g_dpPSSM || !g_dpQuery || block >= g_dpBlocks->nBlocks ||
250         queryPos > g_dpQuery->Length() - g_dpBlocks->blockSizes[block])
251     {
252         ERROR_MESSAGE("ScoreByPSSM() - bad parameters");
253         return DP_NEGATIVE_INFINITY;
254     }
255 
256     int masterPos = g_dpBlocks->blockPositions[block], score = 0;
257     for (unsigned int i=0; i<g_dpBlocks->blockSizes[block]; ++i)
258         score += GetPSSMScoreOfCharWithAverageOfBZ(g_dpPSSM, masterPos + i, g_dpQuery->m_sequenceString[queryPos + i]);
259 
260     return score;
261 }
262 }
263 
DoLeaveOneOut(unsigned int rowToRealign,const std::vector<unsigned int> & blocksToRealign,double percentile,unsigned int extension,unsigned int cutoff,unsigned int queryFrom,unsigned int queryTo)264 bool AlignmentUtility::DoLeaveOneOut(
265     unsigned int rowToRealign, const std::vector < unsigned int >& blocksToRealign, // what to realign
266     double percentile, unsigned int extension, unsigned int cutoff,                 // to calculate max loop lengths
267     unsigned int queryFrom, unsigned int queryTo)                                   // range on realigned row to search
268 {
269     // first we need to do IBM -> BlockMultipleAlignment
270     if (!m_currentMultiple && !DoIBM())
271             return false;
272 
273     BlockMultipleAlignment::UngappedAlignedBlockList blocks;
274     m_currentMultiple->GetUngappedAlignedBlocks(&blocks);
275     if (blocks.size() == 0) {
276         ERROR_MESSAGE("need at least one aligned block for LOO");
277         return false;
278     }
279 
280     BlockMultipleAlignment::AlignmentList toRealign;
281 
282     bool status = false;
283     DP_BlockInfo *dpBlocks = NULL;
284     DP_AlignmentResult *dpResult = NULL;
285 
286     bool prevState = CException::EnableBackgroundReporting(false);
287 
288     try {
289         // first calculate max loop lengths for default square well scoring; include the row to be realigned,
290         // in case it contains an unusually long loop that we don't want to disallow (still depends on loop
291         // calculation params, though)
292         dpBlocks = DP_CreateBlockInfo(blocks.size());
293         unsigned int b, r;
294         const Block::Range *range, *nextRange;
295         unsigned int *loopLengths = new unsigned int[m_currentMultiple->NRows()];
296         for (b=0; b<blocks.size()-1; ++b) {
297             for (r=0; r<m_currentMultiple->NRows(); ++r) {
298                 range = blocks[b]->GetRangeOfRow(r);
299                 nextRange = blocks[b + 1]->GetRangeOfRow(r);
300                 loopLengths[r] = nextRange->from - range->to - 1;
301             }
302             dpBlocks->maxLoops[b] = DP_CalculateMaxLoopLength(
303                 m_currentMultiple->NRows(), loopLengths, percentile, extension, cutoff);
304         }
305         delete[] loopLengths;
306 
307         // now pull out the row we're realigning
308         if (rowToRealign < 1 || rowToRealign >= m_currentMultiple->NRows())
309             THROW_MESSAGE("invalid row number");
310         vector < unsigned int > rowsToRemove(1, rowToRealign);
311         TRACE_MESSAGE("extracting for realignment: "
312             << m_currentMultiple->GetSequenceOfRow(rowToRealign)->IdentifierString());
313         if (!m_currentMultiple->ExtractRows(rowsToRemove, &toRealign))
314             THROW_MESSAGE("ExtractRows() failed");
315 
316         // fill out the rest of the DP_BlockInfo structure
317         for (b=0; b<blocks.size(); ++b) {
318             range = blocks[b]->GetRangeOfRow(0);
319             dpBlocks->blockPositions[b] = range->from;
320             dpBlocks->blockSizes[b] = range->to - range->from + 1;
321         }
322 
323         // if we're not realigning, freeze blocks to original slave position
324         if (blocksToRealign.size() == 0)
325             THROW_MESSAGE("need at least one block to realign");
326         vector < bool > realignBlock(blocks.size(), false);
327         for (r=0; r<blocksToRealign.size(); ++r) {
328             if (blocksToRealign[r] < blocks.size())
329                 realignBlock[blocksToRealign[r]] = true;
330             else
331                 THROW_MESSAGE("block to realign is out of range");
332         }
333         toRealign.front()->GetUngappedAlignedBlocks(&blocks);
334         for (b=0; b<blocks.size(); ++b) {
335             if (!realignBlock[b])
336                 dpBlocks->freezeBlocks[b] = blocks[b]->GetRangeOfRow(1)->from;
337 //            TRACE_MESSAGE("block " << (b+1) << " is " << (realignBlock[b] ? "to be realigned" : "frozen"));
338         }
339 
340         // verify query range
341         r = toRealign.front()->GetSequenceOfRow(1)->Length();
342         if (queryTo == kMax_UInt)
343             queryTo = r - 1;
344         if (queryFrom >= r || queryTo >= r || queryFrom > queryTo)
345             THROW_MESSAGE("bad queryFrom/To range");
346         TRACE_MESSAGE("queryFrom " << queryFrom << ", queryTo " << queryTo);
347 
348         // set up PSSM
349         g_dpPSSM = m_currentMultiple->GetPSSM();
350         g_dpBlocks = dpBlocks;
351         g_dpQuery = toRealign.front()->GetSequenceOfRow(1);
352 
353         // show score before alignment
354         int originalScore = 0;
355         BlockMultipleAlignment::UngappedAlignedBlockList ob;
356         toRealign.front()->GetUngappedAlignedBlocks(&ob);
357         BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator l, o, le = blocks.end();
358         for (l=blocks.begin(), o=ob.begin(); l!=le; ++l, ++o) {
359             const Block::Range
360                 *masterRange = (*l)->GetRangeOfRow(0),
361                 *range = (*o)->GetRangeOfRow(1);
362             for (unsigned int i=0; i<(*l)->m_width; ++i)
363                 originalScore += GetPSSMScoreOfCharWithAverageOfBZ(
364                     m_currentMultiple->GetPSSM(), masterRange->from + i, g_dpQuery->m_sequenceString[range->from + i]);
365         }
366         INFO_MESSAGE("score of extracted row with PSSM(N-1) before realignment: " << originalScore);
367 
368         // call the block aligner
369         if (DP_GlobalBlockAlign(dpBlocks, ScoreByPSSM, queryFrom, queryTo, &dpResult) != STRUCT_DP_FOUND_ALIGNMENT ||
370                 dpResult->nBlocks != blocks.size() || dpResult->firstBlock != 0)
371             THROW_MESSAGE("DP_GlobalBlockAlign() failed");
372         INFO_MESSAGE("score of new alignment of extracted row with PSSM(N-1): " << dpResult->score);
373 
374         // adjust new alignment according to dp result
375         BlockMultipleAlignment::ModifiableUngappedAlignedBlockList modBlocks;
376         toRealign.front()->GetModifiableUngappedAlignedBlocks(&modBlocks);
377         for (b=0; b<modBlocks.size(); ++b) {
378             if (realignBlock[b]) {
379                 modBlocks[b]->SetRangeOfRow(1,
380                     dpResult->blockPositions[b], dpResult->blockPositions[b] + dpBlocks->blockSizes[b] - 1);
381             } else {
382                 if ((int)dpResult->blockPositions[b] != modBlocks[b]->GetRangeOfRow(1)->from)    // just to check...
383                     THROW_MESSAGE("dpResult block doesn't match frozen position on slave");
384             }
385         }
386 
387         // merge realigned row back onto the end of the multiple alignment
388         TRACE_MESSAGE("merging DP results back into multiple alignment");
389         if (!m_currentMultiple->MergeAlignment(toRealign.front()))
390             THROW_MESSAGE("MergeAlignment() failed");
391 
392         // move extracted row back to where it was
393         vector < unsigned int > rowOrder(m_currentMultiple->NRows());
394         for (r=0; r<m_currentMultiple->NRows(); ++r) {
395             if (r < rowToRealign)
396                 rowOrder[r] = r;
397             else if (r == rowToRealign)
398                 rowOrder[r] = m_currentMultiple->NRows() - 1;
399             else
400                 rowOrder[r] = r - 1;
401         }
402         if (!m_currentMultiple->ReorderRows(rowOrder))
403             THROW_MESSAGE("ReorderRows() failed");
404 
405         // remove other alignment data, since it no longer matches the multiple
406         RemoveAlignAnnot();
407 
408         status = true;
409 
410     } catch (CException& e) {
411         ERROR_MESSAGE("DoLeaveOneOut(): exception: " << e.GetMsg());
412     }
413 
414     CException::EnableBackgroundReporting(prevState);
415 
416     DELETE_ALL_AND_CLEAR(toRealign, BlockMultipleAlignment::AlignmentList);
417 
418     DP_DestroyBlockInfo(dpBlocks);
419     DP_DestroyAlignmentResult(dpResult);
420 
421     g_dpPSSM = NULL;
422     g_dpBlocks = NULL;
423     g_dpQuery = NULL;
424 
425     return status;
426 }
427 
428 
DoLeaveNOut(std::vector<unsigned int> & rowsToRealign,const std::vector<unsigned int> & blocksToRealign,double percentile,unsigned int extension,unsigned int cutoff,std::vector<unsigned int> & queryFromVec,std::vector<unsigned int> & queryToVec)429 bool AlignmentUtility::DoLeaveNOut(
430     std::vector<unsigned int>& rowsToRealign, const std::vector < unsigned int >& blocksToRealign, // what to realign
431     double percentile, unsigned int extension, unsigned int cutoff,                 // to calculate max loop lengths
432     std::vector<unsigned int>& queryFromVec, std::vector<unsigned int>& queryToVec)                                   // range on realigned row to search
433 {
434     static const unsigned int LNO_MIN_NROWS_REMAINING = 3;
435 
436     unsigned int nRows, row;
437     unsigned int queryFrom, queryTo;
438     unsigned int nToRealign = rowsToRealign.size(), nFailed = 0;
439     set<unsigned int> failed;
440     string failedMsg, rowStr;
441 
442     // first we need to do IBM -> BlockMultipleAlignment
443     if (!m_currentMultiple && !DoIBM())
444             return false;
445 
446     // Set is used only to remove duplicates; since we don't
447     // recompute PSSM between block aligner calls, the order the rows
448     // are left out is not important when put back into 'sortedRowsToRealign'.
449     unsigned int nDifferentToRealign;
450     vector<unsigned int> sortedRowsToRealign;
451     set<unsigned int> setOfRows;
452     setOfRows.insert(rowsToRealign.begin(), rowsToRealign.end());
453     set<unsigned int>::iterator setIt, setEnd = setOfRows.end();
454 
455     nRows = m_currentMultiple->NRows();
456     nDifferentToRealign = setOfRows.size();
457     if (nToRealign != nDifferentToRealign) {
458         WARNING_MESSAGE("Duplicate rows found among list of rows to leave out skipped\n(repeated applications of LNO on a single row has no affect)");
459 //        return false;
460     }
461 
462     // now pull out the valid rows we can realigning
463     for (setIt = setOfRows.begin(); setIt != setEnd; ++setIt) {
464         if (*setIt < 1 || *setIt >= nRows) {
465             WARNING_MESSAGE("Skipping invalid row number " << *setIt);
466         } else {
467             sortedRowsToRealign.push_back(*setIt);
468             TRACE_MESSAGE("extracting for realignment: "
469                           << m_currentMultiple->GetSequenceOfRow(*setIt)->IdentifierString());
470         }
471 
472     }
473     nDifferentToRealign = sortedRowsToRealign.size();
474 
475 
476     BlockMultipleAlignment::UngappedAlignedBlockList blocks;
477     m_currentMultiple->GetUngappedAlignedBlocks(&blocks);
478 
479 //    if (nToRealign > nRows - LNO_MIN_NROWS_REMAINING) {
480     if (nDifferentToRealign > nRows - LNO_MIN_NROWS_REMAINING) {
481         ERROR_MESSAGE("need at least " << nRows - LNO_MIN_NROWS_REMAINING << " alignment rows to leave out " << nDifferentToRealign << " different rows.");
482         return false;
483     }
484     if (blocks.size() == 0) {
485         ERROR_MESSAGE("need at least one aligned block for LNO");
486         return false;
487     }
488     if (nToRealign != queryFromVec.size() || nToRealign != queryToVec.size()) {
489         ERROR_MESSAGE("inconsistent sizes between row list and query terminii lists for LNO");
490         return false;
491     }
492 
493     BlockMultipleAlignment::AlignmentList toRealign;
494 
495     bool status = false;
496     DP_BlockInfo *dpBlocks = NULL;
497     DP_AlignmentResult *dpResult = NULL;
498 
499     bool prevState = CException::EnableBackgroundReporting(false);
500 
501     try {
502         // first calculate max loop lengths for default square well scoring; include the row to be realigned,
503         // in case it contains an unusually long loop that we don't want to disallow (still depends on loop
504         // calculation params, though)
505         dpBlocks = DP_CreateBlockInfo(blocks.size());
506         unsigned int b, r, i;
507         const Block::Range *range, *nextRange;
508         unsigned int *loopLengths = new unsigned int[nRows];
509         for (b=0; b<blocks.size()-1; ++b) {
510             for (r=0; r<nRows; ++r) {
511                 range = blocks[b]->GetRangeOfRow(r);
512                 nextRange = blocks[b + 1]->GetRangeOfRow(r);
513                 loopLengths[r] = nextRange->from - range->to - 1;
514             }
515             dpBlocks->maxLoops[b] = DP_CalculateMaxLoopLength(
516                 nRows, loopLengths, percentile, extension, cutoff);
517         }
518         delete[] loopLengths;
519 
520         //  Extract rows in decreasing row-number order.
521         BlockMultipleAlignment::AlignmentList::iterator toRealignIt, toRealignEnd;
522         if (!m_currentMultiple->ExtractRows(sortedRowsToRealign, &toRealign))
523             THROW_MESSAGE("ExtractRows() failed");
524         if (nDifferentToRealign != toRealign.size()) {
525             THROW_MESSAGE("Not as many entries in toRealign as number of different rows to realign");
526         }
527 
528         // fill out the rest of the DP_BlockInfo structure
529         for (b=0; b<blocks.size(); ++b) {
530             range = blocks[b]->GetRangeOfRow(0);
531             dpBlocks->blockPositions[b] = range->from;
532             dpBlocks->blockSizes[b] = range->to - range->from + 1;
533         }
534 
535         // if we're not realigning, freeze blocks to original slave position
536         if (blocksToRealign.size() == 0)
537             THROW_MESSAGE("need at least one block to realign");
538 
539         vector < bool > realignBlock(blocks.size(), false);
540         for (r=0; r<blocksToRealign.size(); ++r) {
541             if (blocksToRealign[r] < blocks.size())
542                 realignBlock[blocksToRealign[r]] = true;
543             else
544                 THROW_MESSAGE("block to realign is out of range");
545         }
546 
547         //  Block-align each of the extracted rows.
548         toRealignIt = toRealign.begin();
549         toRealignEnd = toRealign.end();
550         for (i=0; i < nDifferentToRealign; ++i, ++toRealignIt) {
551             row = sortedRowsToRealign[i];
552             rowStr = NStr::UIntToString(row) + " (" + GetSeqIdStringForRow(row) + ")\n";
553 
554             //  find correct from/to in the original unsorted vectors
555             r = 0;
556             while (r < nToRealign && rowsToRealign[r] != row) {
557                 ++r;
558             }
559             if (r < nToRealign) {
560                 queryFrom = queryFromVec[r];
561                 queryTo   = queryToVec[r];
562             } else {  //  should never happen...
563                 ++nFailed;
564                 failed.insert(row);
565                 failedMsg.append("in original sorted list could not find expected row " + rowStr);
566                 continue;
567             }
568 
569             (*toRealignIt)->GetUngappedAlignedBlocks(&blocks);
570             for (b=0; b<blocks.size(); ++b) {
571                 if (!realignBlock[b])
572                     dpBlocks->freezeBlocks[b] = blocks[b]->GetRangeOfRow(1)->from;
573 //                TRACE_MESSAGE("block " << (b+1) << " is " << (realignBlock[b] ? "to be realigned" : "frozen"));
574             }
575 
576             // verify query range
577             r = (*toRealignIt)->GetSequenceOfRow(1)->Length();
578             if (queryTo == kMax_UInt)
579                 queryTo = r - 1;
580             if (queryFrom >= r || queryTo >= r || queryFrom > queryTo) {
581                 ++nFailed;
582                 failed.insert(row);
583                 failedMsg.append("bad queryFrom/To range for row " + rowStr);
584                 continue;
585             }
586             TRACE_MESSAGE("row " << row << " length = " << r << ";  queryFrom " << queryFrom << ", queryTo " << queryTo);
587 
588             // set up PSSM
589             g_dpPSSM = m_currentMultiple->GetPSSM();
590             g_dpBlocks = dpBlocks;
591             g_dpQuery = (*toRealignIt)->GetSequenceOfRow(1);
592 
593             // show score before alignment
594             int originalScore = 0;
595             BlockMultipleAlignment::UngappedAlignedBlockList ob;
596             (*toRealignIt)->GetUngappedAlignedBlocks(&ob);
597             BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator l, o, le = blocks.end();
598             for (l=blocks.begin(), o=ob.begin(); l!=le; ++l, ++o) {
599                 const Block::Range
600                     *masterRange = (*l)->GetRangeOfRow(0),
601                     *range = (*o)->GetRangeOfRow(1);
602                 for (unsigned int i=0; i<(*l)->m_width; ++i)
603                     originalScore += GetPSSMScoreOfCharWithAverageOfBZ(
604                         m_currentMultiple->GetPSSM(), masterRange->from + i, g_dpQuery->m_sequenceString[range->from + i]);
605             }
606             INFO_MESSAGE("score of extracted row with PSSM(N-" << nDifferentToRealign << ") before realignment: " << originalScore);
607 
608             // call the block aligner
609             if (DP_GlobalBlockAlign(dpBlocks, ScoreByPSSM, queryFrom, queryTo, &dpResult) != STRUCT_DP_FOUND_ALIGNMENT ||
610                 dpResult->nBlocks != blocks.size() || dpResult->firstBlock != 0) {
611                 ++nFailed;
612                 failed.insert(row);
613                 failedMsg.append("DP_GlobalBlockAlign() failed for row " + rowStr);
614             } else {
615 
616                 INFO_MESSAGE("score of new alignment of extracted row with PSSM(N-" << nDifferentToRealign << "): " << dpResult->score);
617 
618                 // Adjust new alignment according to dp result; first check if any of the
619                 // blocks will present a problem so we don't modify N-terminal blocks if an
620                 // error is encountered in a more C-terminal block.
621                 BlockMultipleAlignment::ModifiableUngappedAlignedBlockList modBlocks;
622                 (*toRealignIt)->GetModifiableUngappedAlignedBlocks(&modBlocks);
623                 for (b=0; b<modBlocks.size(); ++b) {
624                     if (!realignBlock[b]) {
625                         // just to check...
626                         if ((int)dpResult->blockPositions[b] != modBlocks[b]->GetRangeOfRow(1)->from) {
627                             ++nFailed;
628                             failed.insert(row);
629                             failedMsg.append("dpResult block doesn't match frozen position on slave for row " + rowStr);
630                             break;
631                         }
632                     }
633                 }
634                 if (failed.count(row) == 0) {
635                     for (b=0; b<modBlocks.size(); ++b) {
636                         if (realignBlock[b]) {
637                             modBlocks[b]->SetRangeOfRow(1,
638                                                         dpResult->blockPositions[b], dpResult->blockPositions[b] + dpBlocks->blockSizes[b] - 1);
639 
640                         }
641                     }
642                 }
643             }
644 
645             DP_DestroyAlignmentResult(dpResult);
646             dpResult = NULL;
647         }
648 
649 
650         //  The toRealign container will hold the orignial, unmodified alignment
651         //  obtained by ExtractRows above.
652         if (nFailed > 0) {
653             ERR_POST(ncbi::Error << "Encountered problems realigning some rows; those implicated are unchanged:\n       " + failedMsg + "\n\n");
654         }
655 
656         // merge realigned rows back onto the end of the multiple alignment,
657         // in order of increasing row number
658         TRACE_MESSAGE("merging DP results back into multiple alignment");
659         toRealignIt = toRealign.end();
660         --toRealignIt;
661 
662         //  Failures here are fatal as the re-aligned sequence cannot be replaced
663         //  back in its original location in the alignment.
664         for (int j=nDifferentToRealign-1; j >=0; --j, --toRealignIt) {
665             row = sortedRowsToRealign[j];
666             rowStr = NStr::UIntToString(row) + " (" + GetSeqIdStringForRow(row) + ")";
667             if (!m_currentMultiple->MergeAlignment(*toRealignIt))
668                 THROW_MESSAGE("MergeAlignment() failed for row " + rowStr);
669 
670             // move extracted row back to where it was
671             vector < unsigned int > rowOrder(m_currentMultiple->NRows());
672             for (r=0; r<m_currentMultiple->NRows(); ++r) {
673                 if (r < row)
674                     rowOrder[r] = r;
675                 else if (r == row)
676                     rowOrder[r] = m_currentMultiple->NRows() - 1;
677                 else
678                     rowOrder[r] = r - 1;
679             }
680             if (!m_currentMultiple->ReorderRows(rowOrder))
681                 THROW_MESSAGE("ReorderRows() failed when reinserting row " + rowStr);
682         }
683 
684         // remove other alignment data, since it no longer matches the multiple
685         RemoveAlignAnnot();
686 
687         status = true;
688 
689     } catch (CException& e) {
690         ERROR_MESSAGE("DoLeaveNOut(): exception: " << e.GetMsg());
691     }
692 
693     CException::EnableBackgroundReporting(prevState);
694 
695     DELETE_ALL_AND_CLEAR(toRealign, BlockMultipleAlignment::AlignmentList);
696 
697     DP_DestroyBlockInfo(dpBlocks);
698     DP_DestroyAlignmentResult(dpResult);
699 
700     g_dpPSSM = NULL;
701     g_dpBlocks = NULL;
702     g_dpQuery = NULL;
703 
704     return status;
705 }
706 
RemoveMultiple(void)707 void AlignmentUtility::RemoveMultiple(void)
708 {
709     if (m_currentMultiple) {
710         delete m_currentMultiple;
711         m_currentMultiple = NULL;
712     }
713 }
714 
RemoveAlignAnnot(void)715 void AlignmentUtility::RemoveAlignAnnot(void)
716 {
717     m_seqAnnots.clear();
718     if (m_alignmentSet) {
719         delete m_alignmentSet;
720         m_alignmentSet = NULL;
721     }
722 }
723 
GetSeqAnnots(void)724 const AlignmentUtility::SeqAnnotList& AlignmentUtility::GetSeqAnnots(void)
725 {
726     if (!m_alignmentSet || m_seqAnnots.size() == 0) {
727         if (m_alignmentSet)
728             ERROR_MESSAGE("ack - shouldn't have m_alignmentSet but empty m_seqAnnots");
729         m_alignmentSet = AlignmentSet::CreateFromMultiple(
730             m_currentMultiple, &m_seqAnnots, *m_sequenceSet);
731     }
732     return m_seqAnnots;
733 }
734 
GetPSSM(void)735 const BLAST_Matrix * AlignmentUtility::GetPSSM(void)
736 {
737     // first we need to do IBM -> BlockMultipleAlignment
738     if (!m_currentMultiple && !DoIBM())
739             return NULL;
740 
741 	return m_currentMultiple->GetPSSM();
742 }
743 
GetBlockMultipleAlignment(void)744 const BlockMultipleAlignment * AlignmentUtility::GetBlockMultipleAlignment(void)
745 {
746     // first we need to do IBM -> BlockMultipleAlignment
747     if (!m_currentMultiple && !DoIBM())
748             return NULL;
749 
750     return m_currentMultiple;
751 }
752 
SetBlockMultipleAlignment(const BlockMultipleAlignment * bma)753 void AlignmentUtility::SetBlockMultipleAlignment(const BlockMultipleAlignment* bma) {
754     if (bma) {
755         RemoveMultiple();
756         RemoveAlignAnnot();
757         m_currentMultiple = bma->Clone();
758     }
759 }
760 
ScoreRowByPSSM(unsigned int row)761 int AlignmentUtility::ScoreRowByPSSM(unsigned int row)
762 {
763     // first we need to do IBM -> BlockMultipleAlignment
764     if (!m_currentMultiple && !DoIBM())
765             return kMin_Int;
766 
767     if (row >= m_currentMultiple->NRows()) {
768         ERROR_MESSAGE("AlignmentUtility::ScoreRowByPSSM() - row out of range");
769         return kMin_Int;
770     }
771 
772     BlockMultipleAlignment::UngappedAlignedBlockList blocks;
773     m_currentMultiple->GetUngappedAlignedBlocks(&blocks);
774     if (blocks.size() == 0) {
775         WARNING_MESSAGE("AlignmentUtility::ScoreRowByPSSM() - alignment has no blocks");
776         return kMin_Int;
777     }
778 
779     const Sequence *sequence = m_currentMultiple->GetSequenceOfRow(row);
780 
781     int score = 0;
782     BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator b, be = blocks.end();
783     for (b=blocks.begin(); b!=be; ++b) {
784         const Block::Range
785             *masterRange = (*b)->GetRangeOfRow(0),
786             *range = (*b)->GetRangeOfRow(row);
787         for (unsigned int i=0; i<(*b)->m_width; ++i)
788             score += GetPSSMScoreOfCharWithAverageOfBZ(
789                 m_currentMultiple->GetPSSM(), masterRange->from + i, sequence->m_sequenceString[range->from + i]);
790     }
791 
792     return score;
793 }
794 
GetNRows()795 unsigned int AlignmentUtility::GetNRows() {
796     const BlockMultipleAlignment* bma = GetBlockMultipleAlignment();
797     return (bma) ? bma->NRows() : 0;
798 }
799 
GetNRows() const800 unsigned int AlignmentUtility::GetNRows() const {
801     unsigned int nRows = 0;
802     if (m_currentMultiple) {
803         nRows = m_currentMultiple->NRows();
804     } else {
805         AlignmentUtility* dummy = Clone();
806         nRows = (dummy) ? dummy->GetNRows() : 0;
807         delete dummy;
808     }
809     return nRows;
810 }
811 
IsRowPDB(unsigned int row)812 bool AlignmentUtility::IsRowPDB(unsigned int row) {
813     bool result = false;
814     const BlockMultipleAlignment* bma = GetBlockMultipleAlignment();
815     const Sequence* sequence = (bma) ? bma->GetSequenceOfRow(row) : NULL;
816 
817     if (sequence) {
818         result = sequence->GetPreferredIdentifier().IsPdb();
819     }
820 
821     return result;
822 }
823 
IsRowPDB(unsigned int row) const824 bool AlignmentUtility::IsRowPDB(unsigned int row) const {
825     bool result = false;
826     if (m_currentMultiple) {
827         const Sequence* sequence = m_currentMultiple->GetSequenceOfRow(row);
828         if (sequence) {
829             result = sequence->GetPreferredIdentifier().IsPdb();
830         }
831     } else {
832         AlignmentUtility* dummy = Clone();
833         result = (dummy && dummy->IsRowPDB(row));
834     }
835     return result;
836 }
837 
SequenceIdToString(const Sequence & sequence)838 string SequenceIdToString(const Sequence& sequence) {
839 
840     static const string defStr = "Non-GI/PDB Sequence Type";
841     string s = defStr;
842     const CSeq_id& id = sequence.GetPreferredIdentifier();
843 
844     if (id.IsPdb()) {
845 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
846         s = "PDB " + id.GetPdb().GetMol().Get() + '_' + id.GetPdb().GetEffectiveChain_id();
847 #else
848         char chain = id.GetPdb().GetChain();
849         s = "PDB " + id.GetPdb().GetMol().Get() + '_' + chain;
850 #endif
851     } else if (id.IsGi()) {
852         s = "GI " + NStr::NumericToString(id.GetGi());
853     }
854     return s;
855 }
856 
GetSeqIdStringForRow(unsigned int row)857 string AlignmentUtility::GetSeqIdStringForRow(unsigned int row) {
858 
859     static const string defStr = "<Could not find a sequence for row ";
860     string idStr = defStr + NStr::IntToString(row + 1) + '>';
861     const BlockMultipleAlignment* bma = GetBlockMultipleAlignment();
862     const Sequence* sequence = (bma) ? bma->GetSequenceOfRow(row) : NULL;
863 
864     if (sequence) {
865         idStr = SequenceIdToString(*sequence);
866     }
867     return idStr;
868 }
869 
GetSeqIdStringForRow(unsigned int row) const870 string AlignmentUtility::GetSeqIdStringForRow(unsigned int row) const {
871     static const string defStr = "<Could not find a sequence for row ";
872     string idStr = defStr + NStr::IntToString(row + 1) + '>';
873     if (m_currentMultiple) {
874         const Sequence* sequence = m_currentMultiple->GetSequenceOfRow(row);
875         if (sequence) {
876             idStr = SequenceIdToString(*sequence);
877         }
878     } else {
879         AlignmentUtility* dummy = Clone();
880         if (dummy) {
881             idStr = dummy->GetSeqIdStringForRow(row);
882         }
883     }
884     return idStr;
885 }
886 
887 END_SCOPE(struct_util)
888