1 /* $Id: struct_util.cpp 600980 2020-01-30 14:49:42Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * AlignmentUtility class
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35
36 #include <corelib/ncbistd.hpp>
37
38 #include <set>
39
40 #include <algo/structure/struct_util/struct_util.hpp>
41 #include <algo/structure/struct_util/su_sequence_set.hpp>
42 #include <algo/structure/struct_util/su_block_multiple_alignment.hpp>
43 #include <objects/seqloc/PDB_seq_id.hpp>
44 #include "su_private.hpp"
45 #include "su_alignment_set.hpp"
46 #include <algo/structure/struct_util/su_pssm.hpp>
47
48 #include <algo/structure/struct_dp/struct_dp.h>
49
50 USING_NCBI_SCOPE;
51 USING_SCOPE(objects);
52
53
BEGIN_SCOPE(struct_util)54 BEGIN_SCOPE(struct_util)
55
56 AlignmentUtility::AlignmentUtility(const CSeq_entry& seqEntry, const SeqAnnotList& seqAnnots)
57 {
58 CRef < CSeq_entry > seqEntryCopy(new CSeq_entry());
59 seqEntryCopy->Assign(seqEntry);
60 m_seqEntries.push_back(seqEntryCopy);
61
62 SeqAnnotList::const_iterator a, ae = seqAnnots.end();
63 for (a=seqAnnots.begin(); a!=ae; ++a) {
64 CRef < CSeq_annot > seqAnnotCopy(new CSeq_annot());
65 seqAnnotCopy->Assign(**a);
66 m_seqAnnots.push_back(seqAnnotCopy);
67 }
68
69 Init();
70 }
71
AlignmentUtility(const SeqEntryList & seqEntries,const SeqAnnotList & seqAnnots)72 AlignmentUtility::AlignmentUtility(const SeqEntryList& seqEntries, const SeqAnnotList& seqAnnots)
73 {
74 SeqEntryList::const_iterator s, se = seqEntries.end();
75 for (s=seqEntries.begin(); s!=se; ++s) {
76 CRef < CSeq_entry > seqEntryCopy(new CSeq_entry());
77 seqEntryCopy->Assign(**s);
78 m_seqEntries.push_back(seqEntryCopy);
79 }
80
81 SeqAnnotList::const_iterator a, ae = seqAnnots.end();
82 for (a=seqAnnots.begin(); a!=ae; ++a) {
83 CRef < CSeq_annot > seqAnnotCopy(new CSeq_annot());
84 seqAnnotCopy->Assign(**a);
85 m_seqAnnots.push_back(seqAnnotCopy);
86 }
87
88 Init();
89 }
90
Init(void)91 void AlignmentUtility::Init(void)
92 {
93 m_sequenceSet = NULL;
94 m_alignmentSet = NULL;
95 m_okay = true;
96 m_currentMultiple = NULL;
97
98 bool prevState = CException::EnableBackgroundReporting(false);
99
100 try {
101 m_sequenceSet = new SequenceSet(m_seqEntries);
102 m_alignmentSet = new AlignmentSet(m_seqAnnots, *m_sequenceSet);
103 } catch (CException& e) {
104 ERROR_MESSAGE("exception during AlignmentUtility initialization: " << e.GetMsg());
105 m_okay = false;
106 }
107
108 CException::EnableBackgroundReporting(prevState);
109 }
110
~AlignmentUtility()111 AlignmentUtility::~AlignmentUtility()
112 {
113 if (m_sequenceSet)
114 delete m_sequenceSet;
115 if (m_alignmentSet)
116 delete m_alignmentSet;
117 if (m_currentMultiple)
118 delete m_currentMultiple;
119 }
120
Clone() const121 AlignmentUtility* AlignmentUtility::Clone() const {
122 const SeqAnnotList& seqAnnots = const_cast<AlignmentUtility*>(this)->GetSeqAnnots();
123 return new AlignmentUtility(m_seqEntries, seqAnnots);
124 }
125
AlignedToAllSlaves(unsigned int masterResidue,const AlignmentSet::AlignmentList & alignments)126 static bool AlignedToAllSlaves(unsigned int masterResidue, const AlignmentSet::AlignmentList& alignments)
127 {
128 AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
129 for (a=alignments.begin(); a!=ae; ++a) {
130 if ((*a)->m_masterToSlave[masterResidue] == MasterSlaveAlignment::eUnaligned)
131 return false;
132 }
133 return true;
134 }
135
NoSlaveInsertionsBetween(unsigned int masterFrom,unsigned int masterTo,const AlignmentSet::AlignmentList & alignments)136 static bool NoSlaveInsertionsBetween(unsigned int masterFrom, unsigned int masterTo,
137 const AlignmentSet::AlignmentList& alignments)
138 {
139 AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
140 for (a=alignments.begin(); a!=ae; ++a) {
141 if (((*a)->m_masterToSlave[masterTo] - (*a)->m_masterToSlave[masterFrom]) != (masterTo - masterFrom))
142 return false;
143 }
144 return true;
145 }
146
NoBlockBoundariesBetween(unsigned int masterFrom,unsigned int masterTo,const AlignmentSet::AlignmentList & alignments)147 static bool NoBlockBoundariesBetween(unsigned int masterFrom, unsigned int masterTo,
148 const AlignmentSet::AlignmentList& alignments)
149 {
150 AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
151 for (a=alignments.begin(); a!=ae; ++a) {
152 if ((*a)->m_blockStructure[masterTo] != (*a)->m_blockStructure[masterFrom])
153 return false;
154 }
155 return true;
156 }
157
DoIBM(void)158 bool AlignmentUtility::DoIBM(void)
159 {
160 if (m_currentMultiple) {
161 WARNING_MESSAGE("DoIBM() - already have a blocked alignment");
162 return true;
163 }
164 if (!m_alignmentSet || m_seqAnnots.size() == 0) {
165 ERROR_MESSAGE("DoIBM() - no alignment data present");
166 return false;
167 }
168
169 TRACE_MESSAGE("doing IBM");
170 const AlignmentSet::AlignmentList& alignments = m_alignmentSet->m_alignments;
171 AlignmentSet::AlignmentList::const_iterator a, ae = alignments.end();
172
173 // create sequence list; fill with sequences of master + slaves
174 BlockMultipleAlignment::SequenceList sequenceList(alignments.size() + 1);
175 BlockMultipleAlignment::SequenceList::iterator s = sequenceList.begin();
176 *(s++) = alignments.front()->m_master;
177 for (a=alignments.begin(); a!=ae; ++a) {
178 *(s++) = (*a)->m_slave;
179 if ((*a)->m_master != sequenceList.front()) {
180 ERROR_MESSAGE("AlignmentUtility::DoIBM() - all pairwise alignments must have the same master sequence");
181 return false;
182 }
183 }
184 CRef < BlockMultipleAlignment > multipleAlignment(new BlockMultipleAlignment(sequenceList));
185
186 // each block is a continuous region on the master, over which each master
187 // residue is aligned to a residue of each slave, and where there are no
188 // insertions relative to the master in any of the slaves
189 unsigned int masterFrom = 0, masterTo, row;
190 UngappedAlignedBlock *newBlock;
191
192 while (masterFrom < multipleAlignment->GetMaster()->Length()) {
193
194 // look for first all-aligned residue
195 if (!AlignedToAllSlaves(masterFrom, alignments)) {
196 ++masterFrom;
197 continue;
198 }
199
200 // find all next continuous all-aligned residues, but checking for
201 // block boundaries from the original master-slave pairs, so that
202 // blocks don't get merged
203 for (masterTo=masterFrom+1;
204 masterTo < multipleAlignment->GetMaster()->Length() &&
205 AlignedToAllSlaves(masterTo, alignments) &&
206 NoSlaveInsertionsBetween(masterFrom, masterTo, alignments) &&
207 NoBlockBoundariesBetween(masterFrom, masterTo, alignments);
208 ++masterTo) ;
209 --masterTo; // after loop, masterTo = first residue past block
210
211 // create new block with ranges from master and all slaves
212 newBlock = new UngappedAlignedBlock(multipleAlignment);
213 newBlock->SetRangeOfRow(0, masterFrom, masterTo);
214 newBlock->m_width = masterTo - masterFrom + 1;
215
216 for (a=alignments.begin(), row=1; a!=ae; ++a, ++row) {
217 newBlock->SetRangeOfRow(row,
218 (*a)->m_masterToSlave[masterFrom],
219 (*a)->m_masterToSlave[masterTo]);
220 }
221
222 // copy new block into alignment
223 multipleAlignment->AddAlignedBlockAtEnd(newBlock);
224
225 // start looking for next block
226 masterFrom = masterTo + 1;
227 }
228
229 if (!multipleAlignment->AddUnalignedBlocks() || !multipleAlignment->UpdateBlockMap()) {
230 ERROR_MESSAGE("AlignmentUtility::DoIBM() - error finalizing alignment");
231 return false;
232 }
233
234 // switch data to the new multiple alignment
235 m_currentMultiple = multipleAlignment.Release();
236 RemoveAlignAnnot();
237 return true;
238 }
239
240 // global stuff for DP block aligner score callback
241 DP_BlockInfo *g_dpBlocks = NULL;
242 const BLAST_Matrix *g_dpPSSM = NULL;
243 const Sequence *g_dpQuery = NULL;
244
245 // sum of scores for residue vs. PSSM
246 extern "C" {
ScoreByPSSM(unsigned int block,unsigned int queryPos)247 int ScoreByPSSM(unsigned int block, unsigned int queryPos)
248 {
249 if (!g_dpBlocks || !g_dpPSSM || !g_dpQuery || block >= g_dpBlocks->nBlocks ||
250 queryPos > g_dpQuery->Length() - g_dpBlocks->blockSizes[block])
251 {
252 ERROR_MESSAGE("ScoreByPSSM() - bad parameters");
253 return DP_NEGATIVE_INFINITY;
254 }
255
256 int masterPos = g_dpBlocks->blockPositions[block], score = 0;
257 for (unsigned int i=0; i<g_dpBlocks->blockSizes[block]; ++i)
258 score += GetPSSMScoreOfCharWithAverageOfBZ(g_dpPSSM, masterPos + i, g_dpQuery->m_sequenceString[queryPos + i]);
259
260 return score;
261 }
262 }
263
DoLeaveOneOut(unsigned int rowToRealign,const std::vector<unsigned int> & blocksToRealign,double percentile,unsigned int extension,unsigned int cutoff,unsigned int queryFrom,unsigned int queryTo)264 bool AlignmentUtility::DoLeaveOneOut(
265 unsigned int rowToRealign, const std::vector < unsigned int >& blocksToRealign, // what to realign
266 double percentile, unsigned int extension, unsigned int cutoff, // to calculate max loop lengths
267 unsigned int queryFrom, unsigned int queryTo) // range on realigned row to search
268 {
269 // first we need to do IBM -> BlockMultipleAlignment
270 if (!m_currentMultiple && !DoIBM())
271 return false;
272
273 BlockMultipleAlignment::UngappedAlignedBlockList blocks;
274 m_currentMultiple->GetUngappedAlignedBlocks(&blocks);
275 if (blocks.size() == 0) {
276 ERROR_MESSAGE("need at least one aligned block for LOO");
277 return false;
278 }
279
280 BlockMultipleAlignment::AlignmentList toRealign;
281
282 bool status = false;
283 DP_BlockInfo *dpBlocks = NULL;
284 DP_AlignmentResult *dpResult = NULL;
285
286 bool prevState = CException::EnableBackgroundReporting(false);
287
288 try {
289 // first calculate max loop lengths for default square well scoring; include the row to be realigned,
290 // in case it contains an unusually long loop that we don't want to disallow (still depends on loop
291 // calculation params, though)
292 dpBlocks = DP_CreateBlockInfo(blocks.size());
293 unsigned int b, r;
294 const Block::Range *range, *nextRange;
295 unsigned int *loopLengths = new unsigned int[m_currentMultiple->NRows()];
296 for (b=0; b<blocks.size()-1; ++b) {
297 for (r=0; r<m_currentMultiple->NRows(); ++r) {
298 range = blocks[b]->GetRangeOfRow(r);
299 nextRange = blocks[b + 1]->GetRangeOfRow(r);
300 loopLengths[r] = nextRange->from - range->to - 1;
301 }
302 dpBlocks->maxLoops[b] = DP_CalculateMaxLoopLength(
303 m_currentMultiple->NRows(), loopLengths, percentile, extension, cutoff);
304 }
305 delete[] loopLengths;
306
307 // now pull out the row we're realigning
308 if (rowToRealign < 1 || rowToRealign >= m_currentMultiple->NRows())
309 THROW_MESSAGE("invalid row number");
310 vector < unsigned int > rowsToRemove(1, rowToRealign);
311 TRACE_MESSAGE("extracting for realignment: "
312 << m_currentMultiple->GetSequenceOfRow(rowToRealign)->IdentifierString());
313 if (!m_currentMultiple->ExtractRows(rowsToRemove, &toRealign))
314 THROW_MESSAGE("ExtractRows() failed");
315
316 // fill out the rest of the DP_BlockInfo structure
317 for (b=0; b<blocks.size(); ++b) {
318 range = blocks[b]->GetRangeOfRow(0);
319 dpBlocks->blockPositions[b] = range->from;
320 dpBlocks->blockSizes[b] = range->to - range->from + 1;
321 }
322
323 // if we're not realigning, freeze blocks to original slave position
324 if (blocksToRealign.size() == 0)
325 THROW_MESSAGE("need at least one block to realign");
326 vector < bool > realignBlock(blocks.size(), false);
327 for (r=0; r<blocksToRealign.size(); ++r) {
328 if (blocksToRealign[r] < blocks.size())
329 realignBlock[blocksToRealign[r]] = true;
330 else
331 THROW_MESSAGE("block to realign is out of range");
332 }
333 toRealign.front()->GetUngappedAlignedBlocks(&blocks);
334 for (b=0; b<blocks.size(); ++b) {
335 if (!realignBlock[b])
336 dpBlocks->freezeBlocks[b] = blocks[b]->GetRangeOfRow(1)->from;
337 // TRACE_MESSAGE("block " << (b+1) << " is " << (realignBlock[b] ? "to be realigned" : "frozen"));
338 }
339
340 // verify query range
341 r = toRealign.front()->GetSequenceOfRow(1)->Length();
342 if (queryTo == kMax_UInt)
343 queryTo = r - 1;
344 if (queryFrom >= r || queryTo >= r || queryFrom > queryTo)
345 THROW_MESSAGE("bad queryFrom/To range");
346 TRACE_MESSAGE("queryFrom " << queryFrom << ", queryTo " << queryTo);
347
348 // set up PSSM
349 g_dpPSSM = m_currentMultiple->GetPSSM();
350 g_dpBlocks = dpBlocks;
351 g_dpQuery = toRealign.front()->GetSequenceOfRow(1);
352
353 // show score before alignment
354 int originalScore = 0;
355 BlockMultipleAlignment::UngappedAlignedBlockList ob;
356 toRealign.front()->GetUngappedAlignedBlocks(&ob);
357 BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator l, o, le = blocks.end();
358 for (l=blocks.begin(), o=ob.begin(); l!=le; ++l, ++o) {
359 const Block::Range
360 *masterRange = (*l)->GetRangeOfRow(0),
361 *range = (*o)->GetRangeOfRow(1);
362 for (unsigned int i=0; i<(*l)->m_width; ++i)
363 originalScore += GetPSSMScoreOfCharWithAverageOfBZ(
364 m_currentMultiple->GetPSSM(), masterRange->from + i, g_dpQuery->m_sequenceString[range->from + i]);
365 }
366 INFO_MESSAGE("score of extracted row with PSSM(N-1) before realignment: " << originalScore);
367
368 // call the block aligner
369 if (DP_GlobalBlockAlign(dpBlocks, ScoreByPSSM, queryFrom, queryTo, &dpResult) != STRUCT_DP_FOUND_ALIGNMENT ||
370 dpResult->nBlocks != blocks.size() || dpResult->firstBlock != 0)
371 THROW_MESSAGE("DP_GlobalBlockAlign() failed");
372 INFO_MESSAGE("score of new alignment of extracted row with PSSM(N-1): " << dpResult->score);
373
374 // adjust new alignment according to dp result
375 BlockMultipleAlignment::ModifiableUngappedAlignedBlockList modBlocks;
376 toRealign.front()->GetModifiableUngappedAlignedBlocks(&modBlocks);
377 for (b=0; b<modBlocks.size(); ++b) {
378 if (realignBlock[b]) {
379 modBlocks[b]->SetRangeOfRow(1,
380 dpResult->blockPositions[b], dpResult->blockPositions[b] + dpBlocks->blockSizes[b] - 1);
381 } else {
382 if ((int)dpResult->blockPositions[b] != modBlocks[b]->GetRangeOfRow(1)->from) // just to check...
383 THROW_MESSAGE("dpResult block doesn't match frozen position on slave");
384 }
385 }
386
387 // merge realigned row back onto the end of the multiple alignment
388 TRACE_MESSAGE("merging DP results back into multiple alignment");
389 if (!m_currentMultiple->MergeAlignment(toRealign.front()))
390 THROW_MESSAGE("MergeAlignment() failed");
391
392 // move extracted row back to where it was
393 vector < unsigned int > rowOrder(m_currentMultiple->NRows());
394 for (r=0; r<m_currentMultiple->NRows(); ++r) {
395 if (r < rowToRealign)
396 rowOrder[r] = r;
397 else if (r == rowToRealign)
398 rowOrder[r] = m_currentMultiple->NRows() - 1;
399 else
400 rowOrder[r] = r - 1;
401 }
402 if (!m_currentMultiple->ReorderRows(rowOrder))
403 THROW_MESSAGE("ReorderRows() failed");
404
405 // remove other alignment data, since it no longer matches the multiple
406 RemoveAlignAnnot();
407
408 status = true;
409
410 } catch (CException& e) {
411 ERROR_MESSAGE("DoLeaveOneOut(): exception: " << e.GetMsg());
412 }
413
414 CException::EnableBackgroundReporting(prevState);
415
416 DELETE_ALL_AND_CLEAR(toRealign, BlockMultipleAlignment::AlignmentList);
417
418 DP_DestroyBlockInfo(dpBlocks);
419 DP_DestroyAlignmentResult(dpResult);
420
421 g_dpPSSM = NULL;
422 g_dpBlocks = NULL;
423 g_dpQuery = NULL;
424
425 return status;
426 }
427
428
DoLeaveNOut(std::vector<unsigned int> & rowsToRealign,const std::vector<unsigned int> & blocksToRealign,double percentile,unsigned int extension,unsigned int cutoff,std::vector<unsigned int> & queryFromVec,std::vector<unsigned int> & queryToVec)429 bool AlignmentUtility::DoLeaveNOut(
430 std::vector<unsigned int>& rowsToRealign, const std::vector < unsigned int >& blocksToRealign, // what to realign
431 double percentile, unsigned int extension, unsigned int cutoff, // to calculate max loop lengths
432 std::vector<unsigned int>& queryFromVec, std::vector<unsigned int>& queryToVec) // range on realigned row to search
433 {
434 static const unsigned int LNO_MIN_NROWS_REMAINING = 3;
435
436 unsigned int nRows, row;
437 unsigned int queryFrom, queryTo;
438 unsigned int nToRealign = rowsToRealign.size(), nFailed = 0;
439 set<unsigned int> failed;
440 string failedMsg, rowStr;
441
442 // first we need to do IBM -> BlockMultipleAlignment
443 if (!m_currentMultiple && !DoIBM())
444 return false;
445
446 // Set is used only to remove duplicates; since we don't
447 // recompute PSSM between block aligner calls, the order the rows
448 // are left out is not important when put back into 'sortedRowsToRealign'.
449 unsigned int nDifferentToRealign;
450 vector<unsigned int> sortedRowsToRealign;
451 set<unsigned int> setOfRows;
452 setOfRows.insert(rowsToRealign.begin(), rowsToRealign.end());
453 set<unsigned int>::iterator setIt, setEnd = setOfRows.end();
454
455 nRows = m_currentMultiple->NRows();
456 nDifferentToRealign = setOfRows.size();
457 if (nToRealign != nDifferentToRealign) {
458 WARNING_MESSAGE("Duplicate rows found among list of rows to leave out skipped\n(repeated applications of LNO on a single row has no affect)");
459 // return false;
460 }
461
462 // now pull out the valid rows we can realigning
463 for (setIt = setOfRows.begin(); setIt != setEnd; ++setIt) {
464 if (*setIt < 1 || *setIt >= nRows) {
465 WARNING_MESSAGE("Skipping invalid row number " << *setIt);
466 } else {
467 sortedRowsToRealign.push_back(*setIt);
468 TRACE_MESSAGE("extracting for realignment: "
469 << m_currentMultiple->GetSequenceOfRow(*setIt)->IdentifierString());
470 }
471
472 }
473 nDifferentToRealign = sortedRowsToRealign.size();
474
475
476 BlockMultipleAlignment::UngappedAlignedBlockList blocks;
477 m_currentMultiple->GetUngappedAlignedBlocks(&blocks);
478
479 // if (nToRealign > nRows - LNO_MIN_NROWS_REMAINING) {
480 if (nDifferentToRealign > nRows - LNO_MIN_NROWS_REMAINING) {
481 ERROR_MESSAGE("need at least " << nRows - LNO_MIN_NROWS_REMAINING << " alignment rows to leave out " << nDifferentToRealign << " different rows.");
482 return false;
483 }
484 if (blocks.size() == 0) {
485 ERROR_MESSAGE("need at least one aligned block for LNO");
486 return false;
487 }
488 if (nToRealign != queryFromVec.size() || nToRealign != queryToVec.size()) {
489 ERROR_MESSAGE("inconsistent sizes between row list and query terminii lists for LNO");
490 return false;
491 }
492
493 BlockMultipleAlignment::AlignmentList toRealign;
494
495 bool status = false;
496 DP_BlockInfo *dpBlocks = NULL;
497 DP_AlignmentResult *dpResult = NULL;
498
499 bool prevState = CException::EnableBackgroundReporting(false);
500
501 try {
502 // first calculate max loop lengths for default square well scoring; include the row to be realigned,
503 // in case it contains an unusually long loop that we don't want to disallow (still depends on loop
504 // calculation params, though)
505 dpBlocks = DP_CreateBlockInfo(blocks.size());
506 unsigned int b, r, i;
507 const Block::Range *range, *nextRange;
508 unsigned int *loopLengths = new unsigned int[nRows];
509 for (b=0; b<blocks.size()-1; ++b) {
510 for (r=0; r<nRows; ++r) {
511 range = blocks[b]->GetRangeOfRow(r);
512 nextRange = blocks[b + 1]->GetRangeOfRow(r);
513 loopLengths[r] = nextRange->from - range->to - 1;
514 }
515 dpBlocks->maxLoops[b] = DP_CalculateMaxLoopLength(
516 nRows, loopLengths, percentile, extension, cutoff);
517 }
518 delete[] loopLengths;
519
520 // Extract rows in decreasing row-number order.
521 BlockMultipleAlignment::AlignmentList::iterator toRealignIt, toRealignEnd;
522 if (!m_currentMultiple->ExtractRows(sortedRowsToRealign, &toRealign))
523 THROW_MESSAGE("ExtractRows() failed");
524 if (nDifferentToRealign != toRealign.size()) {
525 THROW_MESSAGE("Not as many entries in toRealign as number of different rows to realign");
526 }
527
528 // fill out the rest of the DP_BlockInfo structure
529 for (b=0; b<blocks.size(); ++b) {
530 range = blocks[b]->GetRangeOfRow(0);
531 dpBlocks->blockPositions[b] = range->from;
532 dpBlocks->blockSizes[b] = range->to - range->from + 1;
533 }
534
535 // if we're not realigning, freeze blocks to original slave position
536 if (blocksToRealign.size() == 0)
537 THROW_MESSAGE("need at least one block to realign");
538
539 vector < bool > realignBlock(blocks.size(), false);
540 for (r=0; r<blocksToRealign.size(); ++r) {
541 if (blocksToRealign[r] < blocks.size())
542 realignBlock[blocksToRealign[r]] = true;
543 else
544 THROW_MESSAGE("block to realign is out of range");
545 }
546
547 // Block-align each of the extracted rows.
548 toRealignIt = toRealign.begin();
549 toRealignEnd = toRealign.end();
550 for (i=0; i < nDifferentToRealign; ++i, ++toRealignIt) {
551 row = sortedRowsToRealign[i];
552 rowStr = NStr::UIntToString(row) + " (" + GetSeqIdStringForRow(row) + ")\n";
553
554 // find correct from/to in the original unsorted vectors
555 r = 0;
556 while (r < nToRealign && rowsToRealign[r] != row) {
557 ++r;
558 }
559 if (r < nToRealign) {
560 queryFrom = queryFromVec[r];
561 queryTo = queryToVec[r];
562 } else { // should never happen...
563 ++nFailed;
564 failed.insert(row);
565 failedMsg.append("in original sorted list could not find expected row " + rowStr);
566 continue;
567 }
568
569 (*toRealignIt)->GetUngappedAlignedBlocks(&blocks);
570 for (b=0; b<blocks.size(); ++b) {
571 if (!realignBlock[b])
572 dpBlocks->freezeBlocks[b] = blocks[b]->GetRangeOfRow(1)->from;
573 // TRACE_MESSAGE("block " << (b+1) << " is " << (realignBlock[b] ? "to be realigned" : "frozen"));
574 }
575
576 // verify query range
577 r = (*toRealignIt)->GetSequenceOfRow(1)->Length();
578 if (queryTo == kMax_UInt)
579 queryTo = r - 1;
580 if (queryFrom >= r || queryTo >= r || queryFrom > queryTo) {
581 ++nFailed;
582 failed.insert(row);
583 failedMsg.append("bad queryFrom/To range for row " + rowStr);
584 continue;
585 }
586 TRACE_MESSAGE("row " << row << " length = " << r << "; queryFrom " << queryFrom << ", queryTo " << queryTo);
587
588 // set up PSSM
589 g_dpPSSM = m_currentMultiple->GetPSSM();
590 g_dpBlocks = dpBlocks;
591 g_dpQuery = (*toRealignIt)->GetSequenceOfRow(1);
592
593 // show score before alignment
594 int originalScore = 0;
595 BlockMultipleAlignment::UngappedAlignedBlockList ob;
596 (*toRealignIt)->GetUngappedAlignedBlocks(&ob);
597 BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator l, o, le = blocks.end();
598 for (l=blocks.begin(), o=ob.begin(); l!=le; ++l, ++o) {
599 const Block::Range
600 *masterRange = (*l)->GetRangeOfRow(0),
601 *range = (*o)->GetRangeOfRow(1);
602 for (unsigned int i=0; i<(*l)->m_width; ++i)
603 originalScore += GetPSSMScoreOfCharWithAverageOfBZ(
604 m_currentMultiple->GetPSSM(), masterRange->from + i, g_dpQuery->m_sequenceString[range->from + i]);
605 }
606 INFO_MESSAGE("score of extracted row with PSSM(N-" << nDifferentToRealign << ") before realignment: " << originalScore);
607
608 // call the block aligner
609 if (DP_GlobalBlockAlign(dpBlocks, ScoreByPSSM, queryFrom, queryTo, &dpResult) != STRUCT_DP_FOUND_ALIGNMENT ||
610 dpResult->nBlocks != blocks.size() || dpResult->firstBlock != 0) {
611 ++nFailed;
612 failed.insert(row);
613 failedMsg.append("DP_GlobalBlockAlign() failed for row " + rowStr);
614 } else {
615
616 INFO_MESSAGE("score of new alignment of extracted row with PSSM(N-" << nDifferentToRealign << "): " << dpResult->score);
617
618 // Adjust new alignment according to dp result; first check if any of the
619 // blocks will present a problem so we don't modify N-terminal blocks if an
620 // error is encountered in a more C-terminal block.
621 BlockMultipleAlignment::ModifiableUngappedAlignedBlockList modBlocks;
622 (*toRealignIt)->GetModifiableUngappedAlignedBlocks(&modBlocks);
623 for (b=0; b<modBlocks.size(); ++b) {
624 if (!realignBlock[b]) {
625 // just to check...
626 if ((int)dpResult->blockPositions[b] != modBlocks[b]->GetRangeOfRow(1)->from) {
627 ++nFailed;
628 failed.insert(row);
629 failedMsg.append("dpResult block doesn't match frozen position on slave for row " + rowStr);
630 break;
631 }
632 }
633 }
634 if (failed.count(row) == 0) {
635 for (b=0; b<modBlocks.size(); ++b) {
636 if (realignBlock[b]) {
637 modBlocks[b]->SetRangeOfRow(1,
638 dpResult->blockPositions[b], dpResult->blockPositions[b] + dpBlocks->blockSizes[b] - 1);
639
640 }
641 }
642 }
643 }
644
645 DP_DestroyAlignmentResult(dpResult);
646 dpResult = NULL;
647 }
648
649
650 // The toRealign container will hold the orignial, unmodified alignment
651 // obtained by ExtractRows above.
652 if (nFailed > 0) {
653 ERR_POST(ncbi::Error << "Encountered problems realigning some rows; those implicated are unchanged:\n " + failedMsg + "\n\n");
654 }
655
656 // merge realigned rows back onto the end of the multiple alignment,
657 // in order of increasing row number
658 TRACE_MESSAGE("merging DP results back into multiple alignment");
659 toRealignIt = toRealign.end();
660 --toRealignIt;
661
662 // Failures here are fatal as the re-aligned sequence cannot be replaced
663 // back in its original location in the alignment.
664 for (int j=nDifferentToRealign-1; j >=0; --j, --toRealignIt) {
665 row = sortedRowsToRealign[j];
666 rowStr = NStr::UIntToString(row) + " (" + GetSeqIdStringForRow(row) + ")";
667 if (!m_currentMultiple->MergeAlignment(*toRealignIt))
668 THROW_MESSAGE("MergeAlignment() failed for row " + rowStr);
669
670 // move extracted row back to where it was
671 vector < unsigned int > rowOrder(m_currentMultiple->NRows());
672 for (r=0; r<m_currentMultiple->NRows(); ++r) {
673 if (r < row)
674 rowOrder[r] = r;
675 else if (r == row)
676 rowOrder[r] = m_currentMultiple->NRows() - 1;
677 else
678 rowOrder[r] = r - 1;
679 }
680 if (!m_currentMultiple->ReorderRows(rowOrder))
681 THROW_MESSAGE("ReorderRows() failed when reinserting row " + rowStr);
682 }
683
684 // remove other alignment data, since it no longer matches the multiple
685 RemoveAlignAnnot();
686
687 status = true;
688
689 } catch (CException& e) {
690 ERROR_MESSAGE("DoLeaveNOut(): exception: " << e.GetMsg());
691 }
692
693 CException::EnableBackgroundReporting(prevState);
694
695 DELETE_ALL_AND_CLEAR(toRealign, BlockMultipleAlignment::AlignmentList);
696
697 DP_DestroyBlockInfo(dpBlocks);
698 DP_DestroyAlignmentResult(dpResult);
699
700 g_dpPSSM = NULL;
701 g_dpBlocks = NULL;
702 g_dpQuery = NULL;
703
704 return status;
705 }
706
RemoveMultiple(void)707 void AlignmentUtility::RemoveMultiple(void)
708 {
709 if (m_currentMultiple) {
710 delete m_currentMultiple;
711 m_currentMultiple = NULL;
712 }
713 }
714
RemoveAlignAnnot(void)715 void AlignmentUtility::RemoveAlignAnnot(void)
716 {
717 m_seqAnnots.clear();
718 if (m_alignmentSet) {
719 delete m_alignmentSet;
720 m_alignmentSet = NULL;
721 }
722 }
723
GetSeqAnnots(void)724 const AlignmentUtility::SeqAnnotList& AlignmentUtility::GetSeqAnnots(void)
725 {
726 if (!m_alignmentSet || m_seqAnnots.size() == 0) {
727 if (m_alignmentSet)
728 ERROR_MESSAGE("ack - shouldn't have m_alignmentSet but empty m_seqAnnots");
729 m_alignmentSet = AlignmentSet::CreateFromMultiple(
730 m_currentMultiple, &m_seqAnnots, *m_sequenceSet);
731 }
732 return m_seqAnnots;
733 }
734
GetPSSM(void)735 const BLAST_Matrix * AlignmentUtility::GetPSSM(void)
736 {
737 // first we need to do IBM -> BlockMultipleAlignment
738 if (!m_currentMultiple && !DoIBM())
739 return NULL;
740
741 return m_currentMultiple->GetPSSM();
742 }
743
GetBlockMultipleAlignment(void)744 const BlockMultipleAlignment * AlignmentUtility::GetBlockMultipleAlignment(void)
745 {
746 // first we need to do IBM -> BlockMultipleAlignment
747 if (!m_currentMultiple && !DoIBM())
748 return NULL;
749
750 return m_currentMultiple;
751 }
752
SetBlockMultipleAlignment(const BlockMultipleAlignment * bma)753 void AlignmentUtility::SetBlockMultipleAlignment(const BlockMultipleAlignment* bma) {
754 if (bma) {
755 RemoveMultiple();
756 RemoveAlignAnnot();
757 m_currentMultiple = bma->Clone();
758 }
759 }
760
ScoreRowByPSSM(unsigned int row)761 int AlignmentUtility::ScoreRowByPSSM(unsigned int row)
762 {
763 // first we need to do IBM -> BlockMultipleAlignment
764 if (!m_currentMultiple && !DoIBM())
765 return kMin_Int;
766
767 if (row >= m_currentMultiple->NRows()) {
768 ERROR_MESSAGE("AlignmentUtility::ScoreRowByPSSM() - row out of range");
769 return kMin_Int;
770 }
771
772 BlockMultipleAlignment::UngappedAlignedBlockList blocks;
773 m_currentMultiple->GetUngappedAlignedBlocks(&blocks);
774 if (blocks.size() == 0) {
775 WARNING_MESSAGE("AlignmentUtility::ScoreRowByPSSM() - alignment has no blocks");
776 return kMin_Int;
777 }
778
779 const Sequence *sequence = m_currentMultiple->GetSequenceOfRow(row);
780
781 int score = 0;
782 BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator b, be = blocks.end();
783 for (b=blocks.begin(); b!=be; ++b) {
784 const Block::Range
785 *masterRange = (*b)->GetRangeOfRow(0),
786 *range = (*b)->GetRangeOfRow(row);
787 for (unsigned int i=0; i<(*b)->m_width; ++i)
788 score += GetPSSMScoreOfCharWithAverageOfBZ(
789 m_currentMultiple->GetPSSM(), masterRange->from + i, sequence->m_sequenceString[range->from + i]);
790 }
791
792 return score;
793 }
794
GetNRows()795 unsigned int AlignmentUtility::GetNRows() {
796 const BlockMultipleAlignment* bma = GetBlockMultipleAlignment();
797 return (bma) ? bma->NRows() : 0;
798 }
799
GetNRows() const800 unsigned int AlignmentUtility::GetNRows() const {
801 unsigned int nRows = 0;
802 if (m_currentMultiple) {
803 nRows = m_currentMultiple->NRows();
804 } else {
805 AlignmentUtility* dummy = Clone();
806 nRows = (dummy) ? dummy->GetNRows() : 0;
807 delete dummy;
808 }
809 return nRows;
810 }
811
IsRowPDB(unsigned int row)812 bool AlignmentUtility::IsRowPDB(unsigned int row) {
813 bool result = false;
814 const BlockMultipleAlignment* bma = GetBlockMultipleAlignment();
815 const Sequence* sequence = (bma) ? bma->GetSequenceOfRow(row) : NULL;
816
817 if (sequence) {
818 result = sequence->GetPreferredIdentifier().IsPdb();
819 }
820
821 return result;
822 }
823
IsRowPDB(unsigned int row) const824 bool AlignmentUtility::IsRowPDB(unsigned int row) const {
825 bool result = false;
826 if (m_currentMultiple) {
827 const Sequence* sequence = m_currentMultiple->GetSequenceOfRow(row);
828 if (sequence) {
829 result = sequence->GetPreferredIdentifier().IsPdb();
830 }
831 } else {
832 AlignmentUtility* dummy = Clone();
833 result = (dummy && dummy->IsRowPDB(row));
834 }
835 return result;
836 }
837
SequenceIdToString(const Sequence & sequence)838 string SequenceIdToString(const Sequence& sequence) {
839
840 static const string defStr = "Non-GI/PDB Sequence Type";
841 string s = defStr;
842 const CSeq_id& id = sequence.GetPreferredIdentifier();
843
844 if (id.IsPdb()) {
845 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
846 s = "PDB " + id.GetPdb().GetMol().Get() + '_' + id.GetPdb().GetEffectiveChain_id();
847 #else
848 char chain = id.GetPdb().GetChain();
849 s = "PDB " + id.GetPdb().GetMol().Get() + '_' + chain;
850 #endif
851 } else if (id.IsGi()) {
852 s = "GI " + NStr::NumericToString(id.GetGi());
853 }
854 return s;
855 }
856
GetSeqIdStringForRow(unsigned int row)857 string AlignmentUtility::GetSeqIdStringForRow(unsigned int row) {
858
859 static const string defStr = "<Could not find a sequence for row ";
860 string idStr = defStr + NStr::IntToString(row + 1) + '>';
861 const BlockMultipleAlignment* bma = GetBlockMultipleAlignment();
862 const Sequence* sequence = (bma) ? bma->GetSequenceOfRow(row) : NULL;
863
864 if (sequence) {
865 idStr = SequenceIdToString(*sequence);
866 }
867 return idStr;
868 }
869
GetSeqIdStringForRow(unsigned int row) const870 string AlignmentUtility::GetSeqIdStringForRow(unsigned int row) const {
871 static const string defStr = "<Could not find a sequence for row ";
872 string idStr = defStr + NStr::IntToString(row + 1) + '>';
873 if (m_currentMultiple) {
874 const Sequence* sequence = m_currentMultiple->GetSequenceOfRow(row);
875 if (sequence) {
876 idStr = SequenceIdToString(*sequence);
877 }
878 } else {
879 AlignmentUtility* dummy = Clone();
880 if (dummy) {
881 idStr = dummy->GetSeqIdStringForRow(row);
882 }
883 }
884 return idStr;
885 }
886
887 END_SCOPE(struct_util)
888