1 /* $Id: cuCdUpdater.cpp 609836 2020-06-08 15:56:03Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Charlie Liu
27 *
28 * File Description:
29 *
30 * Update CDs
31 *
32 * ===========================================================================
33 */
34
35 #include <ncbi_pch.hpp>
36 #include <algo/structure/cd_utils/cuAlign.hpp>
37 #include <algo/structure/cd_utils/cuTaxClient.hpp>
38 #include <objects/id1/Entry_complexities.hpp>
39 #include <objects/id1/ID1server_maxcomplex.hpp>
40 #include <objects/seqalign/Dense_seg.hpp>
41 #include <objects/general/User_object.hpp>
42 #include <objects/general/User_field.hpp>
43 #include <algo/blast/api/blast_exception.hpp>
44 #include <algo/blast/api/effsearchspace_calc.hpp>
45 #include <algo/blast/api/psiblast_options.hpp>
46 #include <objects/blastdb/Blast_def_line.hpp>
47 #include <objtools/blast/services/blast_services.hpp>
48 #include <algo/structure/cd_utils/cuSequence.hpp>
49 #include <algo/structure/cd_utils/cuCdUpdateParameters.hpp>
50 #include <algo/structure/cd_utils/cuCD.hpp>
51 #include <algo/structure/cd_utils/cuCdUpdater.hpp>
52 #include <algo/structure/cd_utils/cuUtils.hpp>
53 #include <algo/structure/cd_utils/cuCdReadWriteASN.hpp>
54 #include <algo/structure/cd_utils/cuHitsDistributor.hpp>
55 #include <algo/structure/cd_utils/cuSequenceTable.hpp>
56 #include <algo/structure/cd_utils/cuPssmMaker.hpp>
57 #include <algo/structure/cd_utils/cuBlockIntersector.hpp>
58 #include <algo/structure/cd_utils/cuBlockFormater.hpp>
59 #include <objects/scoremat/Pssm.hpp>
60 #include <algo/structure/cd_utils/cuAlignmentCollection.hpp>
61 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(cd_utils)62 BEGIN_SCOPE(cd_utils)
63
64 CDUpdateStats::CDUpdateStats() : numBlastHits(0), numRedundant(0), numObsolete(0),
65 numFilteredByOverlap(0){}
66
toString(bool detailed)67 string CDUpdateStats::toString(bool detailed)
68 {
69 int added = numBlastHits - envSeq.size() - fragmented.size() - overlap.size()
70 - noSeq.size() - badAlign.size() - numRedundant - oldNewPairs.size();
71 string result = "Total number of sequences added to the pending list of the Cd:" + NStr::IntToString(added)
72 +". ";
73 if (numFilteredByOverlap > 0)
74 {
75 result += "Total number of pending sequences that are not moved to normal alignment because of insufficient overlapping:"
76 + NStr::IntToString(numFilteredByOverlap) + ". ";
77 }
78 if (!detailed)
79 return result;
80 result += "Number of Blast Hits = ";
81 result += NStr::IntToString(numBlastHits);
82 result += ". ";
83
84 result += toString(envSeq, "Environmental Sequences");
85 result += toString(fragmented, "Sequence Fragments");
86 result += toString(overlap, "Alignments overlapping a row already in CD");
87 result += toString(noSeq, "Alignments with no sequence data");
88 result += toString(badAlign, "Alignments that are corrupted or do not match with the CD");
89 if (numRedundant > 0)
90 {
91 result += "Alignments removed due to redundancy:";
92 result += NStr::IntToString(numRedundant);
93 result += ". ";
94 }
95 result += toString(oldNewPairs, "New sequences that can replace old sequences (in parentheses) already in CD");
96 result += ". ";
97 if (numObsolete > 0)
98 {
99 result += "Numer of obsolete sequences removed:";
100 result += NStr::IntToString(numObsolete);
101 result += ". ";
102 }
103 return result;
104 }
105
toString(vector<TGi> & gis,string type)106 string CDUpdateStats::toString(vector<TGi>& gis, string type)
107 {
108 if (gis.size() <= 0)
109 return "";
110 string result = "Number of " + type + " =";
111 result += NStr::UIntToString((unsigned)gis.size());
112 result += ":\n";
113 result += toString(gis);
114 result += "\n";
115 return result;
116 }
117
toString(vector<TGi> & gis)118 string CDUpdateStats::toString(vector<TGi>& gis)
119 {
120 string result;
121 for(unsigned int i = 0; i < gis.size(); i++)
122 {
123 result += NStr::NumericToString<TGi>( gis[i] );
124 result += ",";
125 }
126 return result;
127 }
128
toString(vector<OldNewGiPair> & giPairs,string type)129 string CDUpdateStats::toString(vector<OldNewGiPair>& giPairs, string type)
130 {
131 if (giPairs.size() <= 0)
132 return "";
133 string result = "Number of " + type + " =";
134 result += NStr::UIntToString((unsigned)giPairs.size());
135 result += ":\n";
136 for (unsigned int i = 0 ; i < giPairs.size(); i++)
137 {
138 result += NStr::NumericToString<TGi>(giPairs[i].second);
139 result += "(";
140 result += NStr::NumericToString<TGi>(giPairs[i].first);
141 result += ")";
142 result += ",";
143 }
144 result += "\n";
145 return result;
146 }
147
148
149 /*---------------------------------------------------------------------------------*/
150 /* UpdaterInterface */
151 /*---------------------------------------------------------------------------------*/
152
153 list<UpdaterInterface*> UpdaterInterface::m_updaterList;
154
addUpdater(UpdaterInterface * updater)155 void UpdaterInterface::addUpdater(UpdaterInterface* updater)
156 {
157 m_updaterList.push_back(updater);
158 }
159
IsEmpty()160 bool UpdaterInterface::IsEmpty()
161 {
162 return m_updaterList.empty();
163 }
164
removeUpdaters(const vector<CCdCore * > & cds)165 void UpdaterInterface::removeUpdaters(const vector<CCdCore*>& cds)
166 {
167 for (unsigned int i = 0; i < cds.size(); i++)
168 {
169 for (list<UpdaterInterface*>::iterator lit = m_updaterList.begin();
170 lit != m_updaterList.end(); lit++)
171 {
172 if ((*lit)->hasCd(cds[i]))
173 {
174 UpdaterInterface* cup = *lit;
175 m_updaterList.erase(lit);
176 if (cup)
177 {
178 //cup->getCd()->ClearUpdateInfo();
179 delete cup;
180 }
181 break;
182 }
183 }
184 }
185 }
186
removeUpdaters(const vector<UpdaterInterface * > & updaters)187 void UpdaterInterface::removeUpdaters(const vector<UpdaterInterface*>& updaters)
188 {
189 for (unsigned int i = 0; i < updaters.size(); i++)
190 {
191 for (list<UpdaterInterface*>::iterator lit = m_updaterList.begin();
192 lit != m_updaterList.end(); lit++)
193 {
194 if ((*lit) == updaters[i])
195 {
196 UpdaterInterface* cup = *lit;
197 m_updaterList.erase(lit);
198 if (cup)
199 {
200 //cup->getCd()->ClearUpdateInfo();
201 delete cup;
202 }
203 break;
204 }
205 }
206 }
207 }
208
checkAllBlasts(vector<UpdaterInterface * > & blasted)209 int UpdaterInterface::checkAllBlasts(vector<UpdaterInterface*>& blasted)
210 {
211 list<UpdaterInterface*>::iterator lit = m_updaterList.begin();
212 while(lit != m_updaterList.end())
213 {
214 UpdaterInterface* updater = *lit;
215 if(updater->getBlastHits())
216 {
217 blasted.push_back(updater);
218 }
219 lit++;
220 }
221 return blasted.size();
222 }
223
224 /*---------------------------------------------------------------------------------*/
225 /* GroupUpdater */
226 /*---------------------------------------------------------------------------------*/
227
GroupUpdater(vector<CCdCore * > & cds,CdUpdateParameters & config)228 GroupUpdater::GroupUpdater(vector<CCdCore*>& cds, CdUpdateParameters& config)
229 {
230 for (unsigned int i = 0; i < cds.size(); i++)
231 m_cdUpdaters.push_back(new CDUpdater(cds[i], config));
232 }
233
~GroupUpdater()234 GroupUpdater::~GroupUpdater() //delete all in m_cdUpdaters
235 {
236 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
237 delete (m_cdUpdaters[i]);
238 }
239
240 //UpdaterInterface
submitBlast(bool wait,int row)241 int GroupUpdater::submitBlast(bool wait, int row)
242 {
243 int count = 0;
244 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
245 {
246 if (!(m_cdUpdaters[i])->submitBlast(wait,row)) {
247 return 0; // return 0 if one fails so legacy "if" statements still work.
248 }
249 else {
250 count++;
251 }
252 }
253 return count;
254 }
255
getBlastHits()256 bool GroupUpdater::getBlastHits()
257 {
258 bool allDone = true;
259 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
260 {
261 if (!(m_cdUpdaters[i]->getBlastHits()))
262 allDone = false;
263 }
264 return allDone;
265 }
266
processBlastHits()267 bool GroupUpdater::processBlastHits()
268 {
269 if (!getBlastHits())
270 {
271 LOG_POST("Not all BLASTs on the group are done. Thus updating this group can't be done at this time.\n");
272 return false;
273 }
274 //distribute
275 HitDistributor dist;
276 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
277 {
278 dist.addBatch(m_cdUpdaters[i]->getAlignments());
279 }
280 //dist.dump("DistTab.txt");
281 dist.distribute();
282 //update individual CDs
283 bool allDone = true;
284 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
285 {
286 if (!(m_cdUpdaters[i]->processBlastHits()))
287 allDone = false;
288 }
289 return allDone;
290 }
291
getCds(vector<CCdCore * > & cds)292 void GroupUpdater::getCds(vector<CCdCore*>& cds)
293 {
294 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
295 {
296 m_cdUpdaters[i]->getCds(cds);
297 }
298 }
299
hasCd(CCdCore * cd)300 bool GroupUpdater::hasCd(CCdCore* cd)
301 {
302 for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
303 {
304 if (m_cdUpdaters[i]->hasCd(cd))
305 return true;
306 }
307 return false;;
308 }
309
310 /*---------------------------------------------------------------------------------*/
311 /* CDUpdater */
312 /*---------------------------------------------------------------------------------*/
313
CDUpdater(CCdCore * cd,CdUpdateParameters & config)314 CDUpdater::CDUpdater(CCdCore* cd, CdUpdateParameters& config)
315 : m_config(config), m_cd(cd), m_guideAlignment(0), m_processPendingThreshold(-1), m_hitsNeeded(-1),
316 m_blastQueryRow(0)
317 {
318 }
319
320
~CDUpdater()321 CDUpdater::~CDUpdater()
322 {
323 if (m_guideAlignment)
324 delete m_guideAlignment;
325 }
326
submitBlast(bool wait,int row)327 int CDUpdater::submitBlast(bool wait, int row)
328 {
329 m_blastQueryRow = row;
330 bool blasted = false;
331 try {
332 blasted = blast(wait, row);
333 }
334 catch (blast::CBlastException& be) {
335 blasted = false;
336 m_lastError = "Blast exception in submitBlast for row " + NStr::IntToString(row) + ":\n";
337 m_lastError += be.ReportAll();
338 }
339 catch (CException& e) {
340 blasted = false;
341 m_lastError = "NCBI C++ Toolkit exception in submitBlast for row " + NStr::IntToString(row) + ":\n";
342 m_lastError += e.ReportAll();
343 }
344 catch (...) {
345 blasted = false;
346 m_lastError = "Unknown exception in submitBlast for row " + NStr::IntToString(row) + "\n";
347 }
348
349 if (blasted)
350 {
351 LOG_POST("RID of Blast for the update of " << m_cd->GetAccession() << " is " << getRid());
352 }
353 else
354 {
355 LOG_POST("Update of " << m_cd->GetAccession() << " failed due to error\n" << getLastError());
356 }
357
358 return(blasted ? 1 : 0);
359 }
360
getBlastHits()361 bool CDUpdater::getBlastHits()
362 {
363 if (!m_hits.Empty())
364 return true;
365 else
366 return getHits(m_hits);
367 }
368
processBlastHits()369 bool CDUpdater::processBlastHits()
370 {
371 bool updated = false;
372
373 //m_Data.cacheCD(m_cd->GetAccession().c_str(),"Updated by Blast",EDITTYPE_ALIGNMENT);
374 if (!m_hits.Empty())
375 {
376 updated = true;
377 update(m_cd, *m_hits);
378 //LOG_POST("Stats of Updating "<<m_cd->GetAccession()<< ":\n"<<getStats().toString());
379 unsigned numNoAlignment = getStats().badAlign.size();
380 if (numNoAlignment > 0)
381 LOG_POST("There are "<<numNoAlignment
382 <<" hits whose alignments do not overlap with the CD. This may indicate there are long insert to the CD alignment. You can find the GIs for those hits in the log.");
383 }
384 else
385 {
386 LOG_POST("Found no BLAST hits to process for CD " << m_cd->GetAccession() << ".\n");
387 }
388 return updated;
389 }
390
getCds(vector<CCdCore * > & cds)391 void CDUpdater::getCds(vector<CCdCore*>& cds)
392 {
393 cds.push_back(m_cd);
394 }
395
hasCd(CCdCore * cd)396 bool CDUpdater::hasCd(CCdCore* cd)
397 {
398 return m_cd == cd;
399 }
400
blast(bool wait,int row)401 bool CDUpdater::blast(bool wait, int row)
402 {
403 blast::CRemoteBlast* rblast;
404 blast::CBlastProteinOptionsHandle* blastopt;
405 CPSIBlastOptionsHandle * psiopt = NULL;
406 if (m_config.blastType == eBLAST)
407 {
408 blastopt = new blast::CBlastProteinOptionsHandle(blast::CBlastOptions::eRemote);
409 rblast = new blast::CRemoteBlast(blastopt);
410 }
411 else
412 {
413 psiopt = new blast::CPSIBlastOptionsHandle(blast::CBlastOptions::eRemote);
414 //psiopt->SetCompositionBasedStats(eNoCompositionBasedStats);
415 rblast = new blast::CRemoteBlast(psiopt);
416 blastopt = psiopt;
417 }
418 //rblast-> SetVerbose();
419 blastopt->SetSegFiltering(false);
420 if (m_config.numHits > 0)
421 blastopt->SetHitlistSize(m_config.numHits);
422 if (m_config.evalue > 0)
423 blastopt->SetEvalueThreshold(m_config.evalue);
424 if (m_config.identityThreshold > 0) //does not seem to do anything with RemoteBlast
425 blastopt->SetPercentIdentity((double)m_config.identityThreshold);
426 rblast->SetDatabase(CdUpdateParameters::getBlastDatabaseName(m_config.database));
427
428 string entrezQuery;
429 if (m_config.organism != eAll_organisms)
430 {
431 entrezQuery += CdUpdateParameters::getOrganismName(m_config.organism);
432 entrezQuery += "[Organism]";
433 }
434 if (m_config.entrezQuery.size() > 0)
435 entrezQuery += m_config.entrezQuery;
436 if (!entrezQuery.empty())
437 rblast->SetEntrezQuery(entrezQuery.c_str());
438
439 //set PSSM here
440 if (m_config.blastType == eBLAST)
441 {
442 CRef<objects::CBioseq_set> bioseqs(new CBioseq_set);
443 list< CRef< CSeq_entry > >& seqList = bioseqs->SetSeq_set();
444 CRef< CSeq_entry > seqOld;
445
446 if (!m_cd->GetSeqEntryForRow(row, seqOld)) {
447 delete rblast;
448 return false;
449 }
450
451 seqList.push_back(seqOld);
452 CRef< CSeq_id > seqId = seqOld->SetSeq().SetId().front();
453 TMaskedQueryRegions masks;
454 int lo = m_cd->GetLowerBound(row);
455 int hi = m_cd->GetUpperBound(row);
456 int len = m_cd->GetSequenceStringByRow(row).length();
457 if (lo > 0)
458 masks.push_back(CRef<CSeqLocInfo>( new CSeqLocInfo(new CSeq_interval(*seqId, 0,lo-1),0)));
459 if (hi < (len-1))
460 masks.push_back(CRef<CSeqLocInfo> ((new CSeqLocInfo(new CSeq_interval(*seqId, hi + 1, len - 1),0))));
461 if (masks.size() > 0)
462 {
463 TSeqLocInfoVector masking_locations;
464 masking_locations.push_back(masks);
465 rblast->SetQueries(bioseqs,masking_locations);
466 }
467 else
468 rblast->SetQueries(bioseqs);
469 //debug
470 /*
471 string err;
472 if (!WriteASNToFile("blast_query", *bioseqs, false,&err))
473 LOG_POST("Failed to write to blast_query");
474 */
475 //end of debug
476 }
477 else //psi-blast
478 {
479 bool useConsensus = true;
480 PssmMaker pm(m_cd, useConsensus, true);
481 PssmMakerOptions config;
482 config.unalignedSegThreshold = 35;
483 config.requestFrequencyRatios = true;
484 pm.setOptions(config);
485 CRef<CPssmWithParameters> pssm = pm.make();
486
487 m_guideAlignment = new BlockModelPair(pm.getGuideAlignment());
488 m_guideAlignment->degap();
489 m_guideAlignment->reverse(); //keep con::master
490 m_consensus = pm.getConsensus();
491
492 psiopt->SetPseudoCount(pm.getPseudoCount());
493
494 rblast->SetQueries(pssm);
495 }
496
497
498 // Submit and, if requested, wait for blast results.
499 // Trap any exceptions and return 'false' in all such cases.
500 bool blasted = false;
501 try {
502 if (wait) {
503 blasted = rblast-> SubmitSync();
504 m_rid = rblast->GetRID();
505 //LOG_POST("RID="<<m_rid);
506 getBlastHits();
507 }
508
509 blasted = rblast->Submit();
510
511 if (!blasted) {
512 m_lastError = rblast->GetErrors();
513 }
514
515 } catch (CRemoteBlastException& e) {
516 m_lastError = "RemoteBlast exception in CDUpdater::blast() for row " + NStr::IntToString(row) + ":\n";
517 m_lastError += e.ReportAll();
518 // string err = e.GetErrCodeString();
519 // LOG_POST("RemoteBlast exception in CDUpdater::blast() for row " << NStr::IntToString(row) << ": error code = " << err);
520 }
521 catch (CException& e) {
522 m_lastError = "NCBI C++ Toolkit exception in CDUpdater::blast() for row " + NStr::IntToString(row) + ":\n";
523 m_lastError += e.ReportAll();
524 }
525 catch (...) {
526 m_lastError = "Unknown exception in CDUpdater::blast() for row " + NStr::IntToString(row) + "\n";
527 }
528
529 m_rid = rblast->GetRID();
530 delete rblast;
531 return blasted;
532 }
533
getHits(CRef<CSeq_align_set> & hits)534 bool CDUpdater::getHits(CRef<CSeq_align_set> & hits)
535 {
536 bool done = false;
537 blast::CRemoteBlast rblast(getRid());
538 try {
539 //LOG_POST("Calling RemoteBlast::CheckDone().\n");
540 done = rblast.CheckDone();
541 //LOG_POST("Returned from RemoteBlast::CheckDone().\n");
542 if (done)
543 {
544 hits = rblast.GetAlignments();
545 }
546 } catch (...) {
547 LOG_POST("Exception while getting BLAST hits of CD " << m_cd->GetAccession() << " for RID " << getRid());
548 }
549 return done;
550 }
551
checkDone()552 bool CDUpdater::checkDone()
553 {
554 bool done = false;
555 blast::CRemoteBlast rblast(getRid());
556 try {
557 //LOG_POST("Calling RemoteBlast::CheckDone().\n");
558 done = rblast.CheckDone();
559 //LOG_POST("Returned from RemoteBlast::CheckDone().\n");
560
561 //CCdCore::UpdateInfo ui = m_cd->GetUpdateInfo();
562 //ui.status = CCdCore::BLAST_DONE;
563 //m_cd->SetUpdateInfo(ui);
564
565 } catch (...) {
566 LOG_POST("Exception during CheckDone for CD " << m_cd->GetAccession() << ", RID " << getRid());
567 }
568 return done;
569 }
570
checkBlastAndUpdate()571 bool CDUpdater::checkBlastAndUpdate()
572 {
573 CRef<CSeq_align_set> seqAligns;
574 if(getHits(seqAligns))
575 {
576 if(!seqAligns.Empty())
577 {
578 update(m_cd, *seqAligns);
579 SetUpdateDate(m_cd);
580 //LOG_POST("Stats of Updating "<<m_cd->GetAccession()<<" are "<<getStats().toString());
581 unsigned numNoAlignment = getStats().badAlign.size();
582 if (numNoAlignment > 0)
583 LOG_POST("There are hits whose alignments do not overlap with the CD. This may indicate there are long insert to the CD alignment. You find the GIs for those hits in the log\n");
584 }
585 else
586 {
587 LOG_POST("Got no alignment for BLAST hits for CD " << m_cd->GetAccession() << ". will try again to retrieve the hits.\n");
588 return true;
589 }
590 return true;
591 }
592 else
593 return false;
594 }
595
ComputePercentIdentity(const CRef<CSeq_align> & alignment,const string & queryString,const string & subjectString)596 double CDUpdater::ComputePercentIdentity(const CRef< CSeq_align >& alignment, const string& queryString, const string& subjectString)
597 {
598 double result = 0.0;
599 unsigned int nIdent = 0;
600 unsigned int qLen = queryString.length(), sLen = subjectString.length();
601 unsigned int i, j, qStart, qStop, sStart, sStop;
602
603 if (alignment.Empty() || qLen == 0 || sLen == 0) return result;
604
605 // Note that is is only %id in the aligned region, and doesn't factor in any non-identities
606 // implicit in any parts of the query N- and/or C-terminal to the alignment.
607 const CSeq_align::C_Segs::TDenseg& denseg = alignment->GetSegs().GetDenseg();
608 double denom = (denseg.GetSeqStop(0) - denseg.GetSeqStart(0) + 1);
609
610 CDense_seg::TStarts starts = denseg.GetStarts();
611 CDense_seg::TLens lens = denseg.GetLens();
612
613 for (i = 0; i < lens.size(); ++i) {
614 // Do this check before implicit cast to unsigned int when assigning to qStart, sStart.
615 if (starts[2*i] < 0 || starts[2*i + 1] < 0) continue; // gap
616 qStart = starts[2*i];
617 sStart = starts[2*i + 1];
618
619 qStop = qStart + lens[i] - 1;
620 sStop = sStart + lens[i] - 1;
621 if (qStop >= qLen || sStop >= sLen) continue; // string index out of range
622
623 for (j = 0; j < lens[i]; ++j) {
624 if (queryString[qStart + j] == subjectString[sStart + j]) ++nIdent;
625 }
626 }
627 result = 100.0*nIdent/denom;
628 // LOG_POST(nIdent << " identities found (" << result << "%)\nquery: " << queryString << "\nsubject: " << subjectString);
629
630 return result;
631 }
632
update(CCdCore * cd,CSeq_align_set & alignments)633 bool CDUpdater::update(CCdCore* cd, CSeq_align_set& alignments)
634 {
635 if ( !cd || (!alignments.IsSet()))
636 return false;
637
638 double pidScore = 0.0;
639 list< CRef< CSeq_align > >& seqAligns = alignments.Set();
640 m_stats.numBlastHits = seqAligns.size();
641 vector< CRef< CBioseq > > bioseqs;
642 LOG_POST("Got "<<m_stats.numBlastHits<<" blast hits for CD " << cd->GetAccession() << ".");
643 retrieveAllSequences(alignments, bioseqs);
644
645 SequenceTable seqTable;
646 CDRefresher* refresher = 0;
647 if (m_config.replaceOldAcc)
648 refresher = new CDRefresher(cd);
649 vector< CRef< CBioseq > > bioseqVec;
650 for (unsigned int i = 0; i < bioseqs.size(); i++)
651 {
652 bioseqVec.clear();
653 SplitBioseqByBlastDefline (bioseqs[i], bioseqVec);
654 seqTable.addSequences(bioseqVec, true); //as a group
655 }
656
657 // debugging *** ***
658 //string err;
659 //WriteASNToFile("alignments.txt", alignments, false, &err);
660
661 // debugging *** ***
662 //seqTable.dump("seqTable.txt");
663 //LOG_POST("Retrieved "<<bioseqs.size()<<" Bioseqs for blast hits\n");
664 //LOG_POST("Process BLAST Hits and add them to "<<cd->GetAccession());
665 int completed = 0;
666 list< CRef< CSeq_align > >::iterator it = seqAligns.begin();
667 //CID1Client id1Client;
668 //id1Client.SetAllowDeadEntries(false);
669
670 CRef< CSeq_id > seqID, querySeqID;
671 CRef<CSeq_entry> seqEntry;
672 CRef< CBioseq > queryBioseq(new CBioseq);
673 string queryString, subjectString;
674
675 // Get bioseq corresponding to the query.
676 if (it != seqAligns.end()) {
677 querySeqID = (*it)->SetSegs().SetDenseg().GetIds()[0];
678 if (!cd->CopyBioseqForSeqId(querySeqID, queryBioseq)) {
679 queryBioseq.Reset();
680
681 // This message isn't relevant when the query is a PSSM.
682 if (m_config.blastType == eBLAST)
683 LOG_POST("No bioseq found in CD " << cd->GetAccession() << " for update query.");
684 } else {
685 queryString = GetRawSequenceString(*queryBioseq);
686 }
687 }
688
689
690 //for BLAST, if the master is PDB, the master seqid (gi from BLAST) needs to be changed
691 if (m_config.blastType == eBLAST && it != seqAligns.end())
692 {
693 CSeq_align::C_Segs& oldSegs = (*it)->SetSegs();
694 CRef< CSeq_align::C_Segs::TDenseg> denseg( &(oldSegs.SetDenseg()) );
695 vector< CRef< CSeq_id > >& seqIds= denseg->SetIds();
696 CRef< CSeq_entry > masterSeq;
697 cd->GetSeqEntryForRow(m_blastQueryRow, masterSeq);
698 vector< CRef< CSeq_id > > pdbIds;
699 GetAllIdsFromSeqEntry(masterSeq, pdbIds, true);
700 if ((pdbIds.size() > 0) && SeqEntryHasSeqId(masterSeq, *seqIds[0]) && (!seqIds[0] ->IsPdb()))
701 m_masterPdb = pdbIds[0];
702 }
703 for(; it != seqAligns.end(); it++)
704 {
705 pidScore = 0.0;
706 CRef< CSeq_align > seqAlignRef = *it;
707 //seqAlign from BLAST is in Denseg
708 CSeq_align::C_Segs::TDenseg& denseg = seqAlignRef->SetSegs().SetDenseg();
709
710 // 9/25/08: CRemoteBlast dropped the identity count from the scores, and is a
711 // pending issue (JIRA ticket http://jira.be-md.ncbi.nlm.nih.gov/browse/SB-114).
712 // Workaround is to compute the % identity directly, as done below in
713 // the function ComputePercentIdentity.
714 if (false && m_config.identityThreshold > 0)
715 {
716
717 // Note: if the type isn't in the seq-align, pidScore remains 0.0 and the
718 // scan of seqAligns will be aborted.
719 seqAlignRef->GetNamedScore(CSeq_align::eScore_IdentityCount, pidScore);
720
721 int start = denseg.GetSeqStart(0);
722 int stop = denseg.GetSeqStop(0);
723 pidScore = 100*pidScore/(stop - start + 1);
724 if ((int)pidScore < m_config.identityThreshold)
725 break; //stop
726 }
727 //the second is slave
728 if (denseg.GetDim() > 1)
729 seqID = denseg.GetIds()[1];
730 else
731 break; //should not be here
732
733 TGi gi = seqID->GetGi();
734 vector< CRef< CBioseq > > bioseqVec;
735 if(seqTable.findSequencesInTheGroup(seqID, bioseqVec) > 0)
736 {
737 //one SeqAlign returned from BLAST may represent the hits on several
738 //different Bioseq that all have the same seq_data
739 //pick the Seq_id from the most useful Bioseq
740 if (bioseqVec.size() > 1)
741 {
742 int index = pickBioseq(refresher, seqAlignRef, bioseqVec);
743 seqEntry = new CSeq_entry;
744 seqEntry->SetSeq(*bioseqVec[index]);
745 seqID = denseg.GetIds()[1];
746 gi = seqID->GetGi();
747 subjectString = GetRawSequenceString(*bioseqVec[index]);
748 }
749 else
750 {
751 seqEntry = new CSeq_entry;
752 seqEntry->SetSeq(*bioseqVec[0]);
753 subjectString = GetRawSequenceString(*bioseqVec[0]);
754 }
755
756 // when there's no identity count in the seq-align, compute it directly
757 if (m_config.identityThreshold > 0)
758 {
759 pidScore = ComputePercentIdentity(seqAlignRef, queryString, subjectString);
760 if ((int)pidScore < m_config.identityThreshold)
761 break; //stop
762 }
763
764 //change seqAlign from denseg to dendiag
765 //use pdb_id if available
766 if(!modifySeqAlignSeqEntry(cd, *it, seqEntry))
767 {
768 m_stats.badAlign.push_back(gi);
769 continue;
770 }
771
772 bool passed = true;
773 if (!m_config.noFilter)
774 passed = passedFilters(cd, *it, seqEntry);
775 if(passed) //not framented; not environmental seq
776 {
777 // add merge fragment later here
778
779 //remaster the seqAlign from consensus to CD.master
780 if (m_config.blastType == ePSI_BLAST)
781 {
782 BlockModelPair bmp(*it);
783 bmp.remaster(*m_guideAlignment);
784 CRef<CSeq_align> saRef = bmp.toSeqAlign();
785 if (saRef.Empty())
786 {
787 m_stats.badAlign.push_back(gi);
788 //LOG_POST("No valid alignment after remastering to the CD for maste. gi|%d is ignored", gi);
789 continue;
790 }
791 else
792 (*it) = saRef;
793 }
794 //check to see if it is necessary to replace old sequences
795 TGi replacedGi = INVALID_GI;
796 if (m_config.replaceOldAcc)
797 replacedGi = refresher->refresh(seqAlignRef, seqEntry);
798 if (replacedGi > ZERO_GI)
799 {
800 m_stats.oldNewPairs.push_back(CDUpdateStats::OldNewGiPair(replacedGi, gi));
801 }
802 else
803 {
804 cd->AddPendingSeqAlign(*(it));
805 //just add sequence now. redundancy will be removed later
806 cd->AddSequence(seqEntry);
807 }
808 }
809 }
810 else
811 m_stats.noSeq.push_back(gi);
812 completed++;
813 if (m_hitsNeeded > 0)
814 {
815 if (completed >= m_hitsNeeded)
816 break;
817 }
818 if ((completed % 500) == 0)
819 LOG_POST("Processed "<<completed<<" of "<<m_stats.numBlastHits<<" hits.");
820 }
821
822 // always keep normal rows w/ automatic NR
823 LOG_POST("Finishing processing hits for "<<cd->GetAccession());
824
825 if (m_processPendingThreshold > 0)
826 {
827 m_stats.numFilteredByOverlap = mergePending(cd, m_processPendingThreshold,true);
828 }
829 if (refresher)
830 delete refresher;
831 return true;
832 }
833
mergePending(CCdCore * cd,int threshold,bool remaster)834 int CDUpdater::mergePending(CCdCore* cd, int threshold, bool remaster)
835 {
836 int excluded = processPendingToNormal(threshold, cd);
837 if (remaster)
838 {//check and remaster if necessary
839 CRef< CSeq_entry > seqEntry;
840 cd->GetSeqEntryForRow(0,seqEntry);
841 vector< CRef< CSeq_id > > seqIds;
842 GetAllIdsFromSeqEntry(seqEntry,seqIds, true);
843 if (seqIds.size() == 0)
844 {
845 int nRows = cd->GetNumRows();
846 int i = 1;
847 for (; i < nRows; i++)
848 {
849 CRef< CSeq_id > SeqID;
850 if (cd->GetSeqIDForRow(i-1,1, SeqID))
851 {
852 if (SeqID->IsPdb())
853 break;
854 }
855 }
856 if (i < nRows)
857 {
858 string err;
859 ReMasterCdWithoutUnifiedBlocks(cd, i, true);
860 }
861 }
862 }
863 return excluded;
864 }
865
866
pickBioseq(CDRefresher * refresher,CRef<CSeq_align> seqAlignRef,vector<CRef<CBioseq>> & bioseqVec)867 int CDUpdater::pickBioseq(CDRefresher* refresher, CRef< CSeq_align > seqAlignRef,
868 vector< CRef< CBioseq > >& bioseqVec)
869 {
870 CSeq_align::C_Segs::TDenseg& denseg = seqAlignRef->SetSegs().SetDenseg();
871 //the second is slave
872 vector< CRef< CSeq_id > >& seqIdVec = denseg.SetIds();
873 CRef< CSeq_id > seqID;
874 assert(denseg.GetDim() > 1);
875 seqID = seqIdVec[1];
876 int index = -1;
877 //if the current seqId's bioseq has a PDB , no change
878 for (int i = 0; i < (int) bioseqVec.size(); i++)
879 {
880 if(BioseqHasSeqId(*(bioseqVec[i]), *seqID))
881 {
882 index = i;
883 const CBioseq::TId& ids = bioseqVec[i]->GetId();
884 CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
885 for (; it != itend; ++it)
886 {
887 if ((*it)->IsPdb())
888 {
889 return index;
890 }
891 }
892 }
893 }
894 assert(index >= 0);
895 //use other PDB if there is one.
896 for (int i = 0; i < (int) bioseqVec.size(); i++)
897 {
898 if (i==index)
899 continue;
900 const CBioseq::TId& ids = bioseqVec[i]->GetId();
901 CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
902 CRef< CSeq_id > giId;
903 bool foundPDB =false;
904 for (; it != itend; ++it)
905 {
906 if ((*it)->IsPdb())
907 foundPDB = 1;
908 else if ((*it)->IsGi())
909 giId = *it;
910 }
911 if (foundPDB)
912 {
913 seqIdVec[1] = giId; //replace id in SeqAlign
914 return i;
915 }
916 }
917
918 //use the one whose older version is already in CD
919 //this can be used to replace the old one later
920 if (refresher)
921 {
922 for (int i = 0; i < (int) bioseqVec.size(); i++)
923 {
924 if (refresher->hasOlderVersion(bioseqVec[i]))
925 {
926 const CBioseq::TId& ids = bioseqVec[i]->GetId();
927 CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
928 for (; it != itend; ++it)
929 {
930 if ((*it)->IsGi())
931 {
932 seqIdVec[1] = (*it); //replace
933 return i;
934 }
935 }
936 }
937 }
938 }
939 return index;
940 }
941
findRowsWithOldSeq(CCdCore * cd,CBioseq & bioseq)942 bool CDUpdater::findRowsWithOldSeq(CCdCore* cd, CBioseq& bioseq)
943 {
944 CRef< CBioseq > bioseqRef(&bioseq);
945 TGi giNew = getGi(bioseqRef);
946 int num = cd->GetNumRows();
947 vector<int> rows;
948 string nAcc;
949 int nVer=0;
950 CRef< CSeq_id > seqId;
951 if (!GetAccAndVersion(bioseqRef, nAcc, nVer, seqId))
952 return false;
953 bool foundOldSeq = false;
954 for (int i = 0; i < num; i++)
955 {
956 cd->GetBioseqForRow(i, bioseqRef);
957 string oAcc;
958 int oVer=0;
959 if (GetAccAndVersion(bioseqRef, oAcc, oVer,seqId))
960 {
961 TGi giOld = getGi(bioseqRef);
962 if ((oAcc == nAcc) && (giNew != giOld))
963 {
964 rows.push_back(i);
965 m_stats.oldNewPairs.push_back(CDUpdateStats::OldNewGiPair(giOld, giNew));
966 foundOldSeq = true;
967 }
968 }
969 }
970 return foundOldSeq;
971 }
972
passedFilters(CCdCore * cd,CRef<CSeq_align> seqAlign,CRef<CSeq_entry> seqEntry)973 bool CDUpdater::passedFilters(CCdCore* cd, CRef< CSeq_align > seqAlign,
974 CRef< CSeq_entry > seqEntry)
975 {
976 //filter environmental seq
977 CRef< CBioseq > bioseq;
978 TGi gi = getGi(seqEntry);
979 if (!GetOneBioseqFromSeqEntry(seqEntry, bioseq))
980 {
981 m_stats.noSeq.push_back(gi);
982 return false; //no seq is not acceptable
983 }
984
985 //filter fragmented
986 if (m_config.missingResidueThreshold > 0 && isFragmentedSeq(cd, seqAlign, seqEntry))
987 {
988 m_stats.fragmented.push_back(gi);
989 return false;
990 }
991 // filter overlapping updates unless disabled
992 if (m_config.allowedOverlapWithCDRow >= 0 && overlapWithCDRow(cd, seqAlign))
993 {
994 m_stats.overlap.push_back(gi);
995 return false;
996 }
997 return true;
998 }
999
overlapWithCDRow(CCdCore * cd,CRef<CSeq_align> seqAlign)1000 bool CDUpdater::overlapWithCDRow(CCdCore* cd,CRef< CSeq_align > seqAlign)
1001 {
1002 // Ignore overlaps by disabling this check when requested.
1003 int overlap = m_config.allowedOverlapWithCDRow;
1004 if (overlap < 0) return false;
1005
1006 bool result = false;
1007 BlockModel bm(seqAlign);
1008 int lo, hi;
1009 int lastPos = bm.getLastAlignedPosition();
1010 int firstPos = bm.getFirstAlignedPosition();
1011 CRef< CSeq_id > seqId = bm.getSeqId();
1012 CRef< CSeq_id > seqIdRow;
1013
1014 // Scan until the first overlap of significant size found.
1015 // Do not return 'false' after first seq-id match in case there are repeats.
1016 for(int i = 0; !result && i < cd->GetNumRows(); i++)
1017 {
1018 if(cd->GetSeqIDFromAlignment(i, seqIdRow))
1019 {
1020 if (SeqIdsMatch(seqId, seqIdRow))
1021 {
1022 lo = cd->GetLowerBound(i);
1023 hi = cd->GetUpperBound(i);
1024 if (lo + overlap <= firstPos)
1025 result = (hi - overlap >= firstPos);
1026 else
1027 result = (lo + overlap <lastPos);
1028
1029 if (result) {
1030 if (overlap > 0) {
1031 LOG_POST("CD sequence " << i << " [" << lo << ", " << hi << "] and proposed update with range [" << firstPos << ", " << lastPos << "] exceed maximum allowed overlap = " << overlap);
1032 } else {
1033 LOG_POST("Disallowed overlap of CD sequence " << i << " [" << lo << ", " << hi << "] and proposed update with range [" << firstPos << ", " << lastPos << "]");
1034 }
1035 }
1036 // if (lo <= firstPos)
1037 // return hi >=firstPos;
1038 // else
1039 // return lo <lastPos;
1040 }
1041 }
1042 }
1043 return result;
1044 }
1045
isFragmentedSeq(CCdCore * cd,CRef<CSeq_align> seqAlign,CRef<CSeq_entry> seqEntry)1046 bool CDUpdater::isFragmentedSeq(CCdCore* cd, CRef< CSeq_align > seqAlign,
1047 CRef< CSeq_entry > seqEntry)
1048 {
1049 int pssmLen = m_consensus.size(); //equal to PSSM length
1050 int lenAligned = GetNumAlignedResidues(seqAlign);
1051 if (lenAligned >= pssmLen)
1052 return false;
1053 BlockModel master(seqAlign, false);
1054 int mGapToN = master.getGapToNTerminal(0);
1055
1056 //master is consensus at this point
1057 int mGapToC = master.getGapToCTerminal(master.getBlocks().size() -1, m_consensus.size());
1058 BlockModel slave(seqAlign);
1059 CRef< CBioseq > bioseq;
1060 if (!GetOneBioseqFromSeqEntry(seqEntry, bioseq))
1061 return true; //no seq is a fragmented seq
1062
1063 int seqLen = GetSeqLength(*bioseq);
1064 int sGapToN = slave.getGapToNTerminal(0);
1065 int sGapToC = slave.getGapToCTerminal(slave.getBlocks().size() - 1, seqLen);
1066 int allowed = m_config.missingResidueThreshold;
1067 if ( ( mGapToN - sGapToN > allowed) ||
1068 (mGapToC - sGapToC > allowed))
1069 return true;
1070 else
1071 return false;
1072 }
1073
retrieveSeq(CID1Client & client,CRef<CSeq_id> seqID,CRef<CSeq_entry> & seqEntry)1074 bool CDUpdater::retrieveSeq(CID1Client& client, CRef<CSeq_id> seqID, CRef<CSeq_entry>& seqEntry)
1075 {
1076 try {
1077 //seqEntry = client.AskGetsefromgi(maxplex);
1078 seqEntry = client.FetchEntry(*seqID, 1);
1079 } catch (...)
1080 {
1081 return false;
1082 }
1083 return true;
1084 }
1085
findSeq(CRef<CSeq_id> seqID,vector<CRef<CBioseq>> & bioseqs,CRef<CSeq_entry> & seqEntry)1086 bool CDUpdater::findSeq(CRef<CSeq_id> seqID, vector< CRef< CBioseq > >& bioseqs, CRef<CSeq_entry>& seqEntry)
1087 {
1088 for (unsigned int i = 0; i < bioseqs.size(); i++)
1089 {
1090 const CBioseq::TId& ids = bioseqs[i]->SetId();
1091 CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
1092 for (; it != itend; ++it) {
1093 if (SeqIdsMatch(seqID, *it))
1094 {
1095 seqEntry = new CSeq_entry;
1096 seqEntry->SetSeq(*bioseqs[i]);
1097 return true;
1098 }
1099 }
1100 }
1101 return false;
1102 }
1103
retrieveAllSequences(CSeq_align_set & alignments,vector<CRef<CBioseq>> & bioseqs)1104 void CDUpdater::retrieveAllSequences(CSeq_align_set& alignments, vector< CRef< CBioseq > >& bioseqs)
1105 {
1106 vector< CRef<CSeq_id> > seqids;
1107 unsigned int batchSize = 500;
1108 unsigned int maxBatchSize = 2000;
1109 string dbName = CdUpdateParameters::getBlastDatabaseName(m_config.database);
1110
1111 list< CRef< CSeq_align > >& seqAligns = alignments.Set();
1112 list< CRef< CSeq_align > >::iterator lit = seqAligns.begin();
1113 for (; lit != seqAligns.end(); lit++)
1114 {
1115 seqids.push_back((*lit)->SetSegs().SetDenseg().GetIds()[1]);
1116 list< CRef< CSeq_align > >::iterator next = lit;
1117 next++;
1118 //the batch is full or reach the end
1119 if (seqids.size() >= batchSize || (next == (seqAligns.end())) )
1120 {
1121 string errors, warnings;
1122 vector< CRef< CBioseq > > bioseqBatch;
1123 try {
1124 //LOG_POST("Calling CBlastServices::GetSequences().\n");
1125 // For Blast v5 databases, not all members of an identical protein group are in the database
1126 // which may cause GetSequences to not find the exact sequence specified from such a group
1127 // if it wasn't one of the representatives (5 as of early 2020).
1128 CBlastServices::GetSequences(seqids, dbName, 'p', bioseqBatch, errors,warnings);
1129 LOG_POST("Returned from CBlastServices::GetSequences('" << dbName << "') with a batch of " << bioseqBatch.size() << " sequences.");
1130 }
1131 catch (blast::CBlastException& be)
1132 {
1133 if (seqids.size() > maxBatchSize)
1134 {
1135 seqids.clear(); //give up on retrieving sequence on these hits.
1136 //LOG_POST("Retrieving sequences from RemoteBlast failed after repeated tries. Giving up on these %d blast hits");
1137 }
1138 else
1139 LOG_POST("Retrieving sequences from RemoteBlast failed with an exception of "<<be.GetErrCodeString());
1140 continue;
1141 } catch (...)
1142 {
1143 LOG_POST("Unspecified exception during CBlastServices::GetSequences(). Skipping to next Seq-align.\n");
1144 continue;
1145 }
1146
1147 if (seqids.size()!= bioseqBatch.size())
1148 {
1149 LOG_POST("Ask for "<< seqids.size()<<" sequences. Got "<<bioseqBatch.size()<<" back\n");
1150 LOG_POST("Error="<<errors<<"\nWarnings="<<warnings);
1151 }
1152 seqids.clear();
1153 for (unsigned int i = 0 ; i < bioseqBatch.size(); i++)
1154 {
1155 bioseqs.push_back(bioseqBatch[i]);
1156 }
1157 }
1158 }
1159 }
1160
getGi(CRef<CSeq_entry> seqEntry)1161 TGi CDUpdater::getGi(CRef< CSeq_entry > seqEntry)
1162 {
1163 vector< CRef< CSeq_id > > seqIds;
1164 GetAllIdsFromSeqEntry(seqEntry, seqIds);
1165 for (unsigned int i = 0; i < seqIds.size(); i++)
1166 if (seqIds[i]->IsGi())
1167 return seqIds[i]->GetGi();
1168 return ZERO_GI;
1169 }
1170
getGi(CRef<CBioseq> bioseq)1171 TGi CDUpdater::getGi(CRef<CBioseq> bioseq)
1172 {
1173 const list< CRef< CSeq_id > >& seqIds = bioseq->GetId();
1174 list< CRef< CSeq_id > >::const_iterator cit = seqIds.begin();
1175 for (; cit != seqIds.end(); cit++)
1176 if ((*cit)->IsGi())
1177 return (*cit)->GetGi();
1178 return ZERO_GI;
1179 }
1180
1181 //change seqAlign from denseg to dendiag
1182 //remaster back to the master.
1183 //use pdb_id if available
modifySeqAlignSeqEntry(CCdCore * cd,CRef<CSeq_align> & seqAlign,CRef<CSeq_entry> seqEntry)1184 bool CDUpdater::modifySeqAlignSeqEntry(CCdCore* cd, CRef< CSeq_align >& seqAlign,
1185 CRef< CSeq_entry > seqEntry)
1186 {
1187 CSeq_align::C_Segs& oldSegs = seqAlign->SetSegs();
1188 CRef< CSeq_align::C_Segs::TDenseg> denseg( &(oldSegs.SetDenseg()) );
1189 vector< CRef< CSeq_id > >& seqIds= denseg->SetIds();
1190 if(seqIds.size() <= 1)
1191 return false;
1192
1193 if (!m_masterPdb.Empty())
1194 {
1195 seqIds[0].Reset(m_masterPdb.GetPointer());
1196 }
1197
1198 //if slave has a pdb-id use it in seqAlign
1199 vector< CRef< CSeq_id > > slaveIds;
1200 GetAllIdsFromSeqEntry(seqEntry, slaveIds, true); //pdb only
1201 if (slaveIds.size() > 0)
1202 seqIds[1].Reset( (slaveIds[0]).GetPointer() );
1203
1204 if (seqEntry->IsSet())
1205 {
1206 //pick the right BioSeq from the Set
1207 CRef< CBioseq > bioseq;
1208 if (GetOneBioseqFromSeqEntry(seqEntry, bioseq, seqIds[1].GetPointer()))
1209 {
1210 if (!reformatBioseq(bioseq, seqEntry, m_client))
1211 return false;
1212 seqEntry->SetSeq(*bioseq);
1213 }
1214 else
1215 return false;
1216 }
1217 else
1218 {
1219 CRef< CBioseq > bioseq(&seqEntry->SetSeq());
1220 if (!reformatBioseq(bioseq, seqEntry, m_client))
1221 return false;
1222 }
1223
1224 CSeq_align::C_Segs::TDendiag& dendiag = seqAlign->SetSegs().SetDendiag();
1225 Denseg2DenseDiagList(*denseg, dendiag);
1226 /*
1227 BlockModelPair bmp(seqAlign);
1228 bmp.remaster(*m_guideAlignment);
1229 seqAlign = bmp.toSeqAlign();*/
1230
1231 return true;
1232 }
1233
1234 //get org-ref from seqEntry if bioseq does not have one
1235 //remove all unnecessary fields
1236 //replace ftable with mmdb-id
reformatBioseq(CRef<CBioseq> bioseq,CRef<CSeq_entry> seqEntry,CEntrez2Client & client)1237 bool CDUpdater::reformatBioseq(CRef< CBioseq > bioseq, CRef< CSeq_entry > seqEntry, CEntrez2Client& client )
1238 {
1239 //get BioSource if there is none in bioseq
1240 CSeq_descr& seqDescr = bioseq->SetDescr();
1241 bool hasSource = false;
1242 bool hasTitle = false;
1243 //reset all fields except the source field
1244
1245 //need trim even if bioseq is not a Set
1246 if (seqDescr.IsSet())
1247 {
1248 list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
1249 list< CRef< CSeqdesc > >::iterator cit = descrList.begin();
1250 while (cit != descrList.end())
1251 {
1252 if ((*cit)->IsSource() && (!hasSource)) //only keep one source field
1253 {
1254 hasSource = true;
1255 cit++;
1256 }
1257 else if ( (*cit)->IsTitle())
1258 {
1259 cit++;
1260 hasTitle = true;
1261 }
1262 //extract taxid/taxname from "TaxNamesData" field
1263 //blastdb uses it to send tax info
1264 else if ((*cit)->IsUser() && (!hasSource))
1265 {
1266 if ((*cit)->SetUser().SetType().SetStr() == "TaxNamesData")
1267 {
1268 vector< CRef< CUser_field > >& fields = (*cit)->SetUser().SetData();
1269 if ( fields.size() > 0)
1270 {
1271 CRef< CUser_field > field = fields[0];
1272 TTaxId taxid = TAX_ID_FROM(CObject_id::TId, field->GetLabel().GetId());
1273 string taxname = field->GetData().GetStrs()[0];
1274 //create a source seedsc and add it
1275 CRef< CSeqdesc > source(new CSeqdesc);
1276 COrg_ref& orgRef = source->SetSource().SetOrg();
1277 orgRef.SetTaxId(taxid);
1278 orgRef.SetTaxname(taxname);
1279 descrList.push_back(source);
1280 hasSource = true;
1281 }
1282 }
1283 cit = descrList.erase(cit);
1284 }
1285 else
1286 cit = descrList.erase(cit);
1287 }
1288 }
1289 if (!hasSource)
1290 {
1291 //get source or org-ref from seqEntry
1292 if (seqEntry->IsSet())
1293 {
1294 const list< CRef< CSeqdesc > >& descrList = seqEntry->GetSet().GetDescr().Get();
1295 list< CRef< CSeqdesc > >::const_iterator cit = descrList.begin();
1296 for (; cit != descrList.end(); cit++)
1297 {
1298 if ((*cit)->IsSource())
1299 {
1300 seqDescr.Set().push_back(*cit);
1301 break;
1302 }
1303 }
1304 }
1305 }
1306 // if bioSeq is pdb
1307 //replace annot field with mmdb-id
1308 //otherwise reset annot field
1309 bioseq->ResetAnnot();
1310 const list< CRef< CSeq_id > >& seqIds = bioseq->GetId();
1311 list< CRef< CSeq_id > >::const_iterator cit = seqIds.begin();
1312 bool isPdb = false;
1313 for (; cit != seqIds.end(); cit++)
1314 {
1315 if ((*cit)->IsPdb())
1316 {
1317 isPdb = true;
1318 break;
1319 }
1320 }
1321 if (isPdb)
1322 {
1323 //CEntrez2Client client;
1324 vector<TIntId> uids;
1325 string pdb = (*cit)->GetPdb().GetMol().Get();
1326 pdb += "[ACCN]";
1327 try {
1328 client.Query(pdb, "structure", uids);
1329 } catch (CException& e)
1330 {
1331 LOG_POST("\nFailed to retrieve mmdb-id for "<<pdb<<" because the error:\n "<<e.ReportAll());
1332 return false;
1333 }
1334 int mmdbId = 0;
1335 if (uids.size() > 0)
1336 {
1337 mmdbId = uids[0];
1338 CRef<CSeq_id> mmdbTag (new CSeq_id);
1339 CSeq_id::TGeneral& generalId = mmdbTag->SetGeneral();
1340 generalId.SetDb("mmdb");
1341 generalId.SetTag().SetId(mmdbId);
1342 CRef< CSeq_annot> seqAnnot (new CSeq_annot);
1343 seqAnnot->SetData().SetIds().push_back(mmdbTag);
1344 bioseq->SetAnnot().push_back(seqAnnot);
1345 }
1346 if (!hasTitle)
1347 {
1348 CRef< CPDB_block > pdbBlock;
1349 if (GetPDBBlockFromSeqEntry(seqEntry, pdbBlock))
1350 {
1351 CRef< CSeqdesc > seqDesc(new CSeqdesc);
1352 if (pdbBlock->CanGetCompound())
1353 {
1354 const list< string >& compounds = pdbBlock->GetCompound();
1355 if (compounds.size() != 0)
1356 seqDesc->SetTitle(*(compounds.begin()));
1357 seqDescr.Set().push_back(seqDesc);
1358 }
1359 }
1360 }
1361 }
1362 return true;
1363
1364 }
1365
GetAllIdsFromSeqEntry(CRef<CSeq_entry> seqEntry,vector<CRef<CSeq_id>> & slaveIds,bool pdbOnly)1366 int CDUpdater::GetAllIdsFromSeqEntry(CRef< CSeq_entry > seqEntry,
1367 vector< CRef< CSeq_id > >& slaveIds, bool pdbOnly)
1368 {
1369 if (seqEntry->IsSeq())
1370 {
1371 const list< CRef< CSeq_id > >& seqIdList = seqEntry->GetSeq().GetId();
1372 list< CRef< CSeq_id > >::const_iterator lsii;
1373 for (lsii = seqIdList.begin(); lsii != seqIdList.end(); ++lsii)
1374 {
1375 if (pdbOnly)
1376 {
1377 if ((*lsii)->IsPdb())
1378 slaveIds.push_back(*lsii);
1379 }
1380 else
1381 slaveIds.push_back(*lsii);
1382 }
1383 return slaveIds.size();
1384 }
1385 else
1386 {
1387 list< CRef< CSeq_entry > >::const_iterator lsei;
1388 const list< CRef< CSeq_entry > >& seqEntryList = seqEntry->GetSet().GetSeq_set();
1389 for (lsei = seqEntryList.begin(); lsei != seqEntryList.end(); ++lsei)
1390 {
1391 GetAllIdsFromSeqEntry(*lsei, slaveIds, pdbOnly); // RECURSIVE!!
1392 }
1393 return slaveIds.size();
1394 }
1395 }
1396 //get only protein
GetOneBioseqFromSeqEntry(CRef<CSeq_entry> seqEntry,CRef<CBioseq> & bioseq,const CSeq_id * seqId)1397 bool CDUpdater::GetOneBioseqFromSeqEntry(CRef< CSeq_entry > seqEntry,
1398 CRef< CBioseq >& bioseq,const CSeq_id* seqId)
1399 {
1400 if (seqEntry->IsSeq())
1401 {
1402 if (seqEntry->GetSeq().IsAa())
1403 {
1404 if (seqId)
1405 {
1406 if (SeqEntryHasSeqId(seqEntry, *seqId))
1407 {
1408 bioseq.Reset(&seqEntry->SetSeq());
1409 return true;
1410 }
1411 else
1412 return false;
1413 }
1414 else
1415 {
1416 bioseq.Reset(&seqEntry->SetSeq());
1417 return true;
1418 }
1419 }
1420 else
1421 return false;
1422
1423 }
1424 else
1425 {
1426 list< CRef< CSeq_entry > >::const_iterator lsei;
1427 const list< CRef< CSeq_entry > >& seqEntryList = seqEntry->GetSet().GetSeq_set();
1428 for (lsei = seqEntryList.begin(); lsei != seqEntryList.end(); ++lsei)
1429 {
1430 if (GetOneBioseqFromSeqEntry(*lsei, bioseq, seqId)) // RECURSIVE!!
1431 return true;
1432 }
1433 return false;
1434 }
1435 }
1436
SeqEntryHasSeqId(CRef<CSeq_entry> seqEntry,const CSeq_id & seqId)1437 bool CDUpdater::SeqEntryHasSeqId(CRef< CSeq_entry > seqEntry, const CSeq_id& seqId)
1438 {
1439 vector< CRef< CSeq_id > > seqIds;
1440 GetAllIdsFromSeqEntry(seqEntry, seqIds,false);
1441 for (unsigned int i = 0; i < seqIds.size(); i++)
1442 {
1443 if (seqIds[i]->Match(seqId))
1444 return true;
1445 }
1446 return false;
1447 }
1448
BioseqHasSeqId(const CBioseq & bioseq,const CSeq_id & seqId)1449 bool CDUpdater::BioseqHasSeqId(const CBioseq& bioseq, const CSeq_id& seqId)
1450 {
1451 const CBioseq::TId& ids = bioseq.GetId();
1452 CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
1453 for (; it != itend; ++it)
1454 {
1455 if ((*it)->Match(seqId))
1456 {
1457 return true;
1458 }
1459 }
1460 return false;
1461 }
1462
1463
SplitBioseqByBlastDefline(CRef<CBioseq> orig,vector<CRef<CBioseq>> & bioseqs)1464 int CDUpdater::SplitBioseqByBlastDefline (CRef<CBioseq> orig, vector< CRef<CBioseq> >& bioseqs)
1465 {
1466 CRef<CBlast_def_line_set> blastDefLine = GetBlastDefline(*orig);
1467 RemoveBlastDefline(*orig);
1468 list< CRef< CBlast_def_line > >& deflines = blastDefLine->Set();
1469 //most cases
1470 if (deflines.size() <= 1)
1471 {
1472 bioseqs.push_back(orig);
1473 return 1;
1474 }
1475 //PDBs likely
1476 int order = 0;
1477 for (list< CRef< CBlast_def_line > >::iterator iter = deflines.begin();
1478 iter != deflines.end(); iter++)
1479 {
1480 CRef<CBioseq> splitBioseq(new CBioseq);
1481 splitBioseq->Assign(*orig);
1482 reformatBioseqByBlastDefline(splitBioseq, *iter, order);
1483 bioseqs.push_back(splitBioseq);
1484 order++;
1485 }
1486 return deflines.size();
1487 }
1488
reformatBioseqByBlastDefline(CRef<CBioseq> bioseq,CRef<CBlast_def_line> blastDefline,int order)1489 void CDUpdater::reformatBioseqByBlastDefline(CRef<CBioseq> bioseq, CRef< CBlast_def_line > blastDefline, int order)
1490 {
1491 CSeq_descr& seqDescr = bioseq->SetDescr();
1492 int sourceOrder = 0;
1493 if (seqDescr.IsSet())
1494 {
1495 list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
1496 list< CRef< CSeqdesc > >::iterator cit = descrList.begin();
1497 while (cit != descrList.end())
1498 {
1499 if ((*cit)->IsSource()) //only keep one source field
1500 {
1501 if (sourceOrder == order)
1502 cit++; //keep
1503 else
1504 cit = descrList.erase(cit);
1505
1506 // Do this for both cases; if sourceOrder == order must increment
1507 // otherwise will keep all sources *after* order.
1508 sourceOrder++;
1509 }
1510 else if ( (*cit)->IsTitle())
1511 cit = descrList.erase(cit);
1512 }
1513 //add the title from the defLine
1514 CRef< CSeqdesc > title(new CSeqdesc);
1515 title->SetTitle(blastDefline->GetTitle());
1516 descrList.push_back(title);
1517 }
1518
1519 //add seq_ids from the defline
1520 bioseq->SetId().assign(blastDefline->GetSeqid().begin(), blastDefline->GetSeqid().end());
1521 }
1522
1523 // IMPORTANT: This code was forked from src/objtools/align_format/align_format_util.cpp.
1524 // Check for changes in original source if this forked version misbehaves in the future.
1525 /// Efficiently decode a Blast-def-line-set from binary ASN.1.
1526 /// @param oss Octet string sequence of binary ASN.1 data.
1527 /// @param bdls Blast def line set decoded from oss.
OssToDefline(const CUser_field::TData::TOss & oss,CBlast_def_line_set & bdls)1528 void CDUpdater::OssToDefline(const CUser_field::TData::TOss & oss, CBlast_def_line_set& bdls)
1529 {
1530 typedef const CUser_field::TData::TOss TOss;
1531
1532 const char * data = NULL;
1533 size_t size = 0;
1534 string temp;
1535
1536 if (oss.size() == 1) {
1537 // In the single-element case, no copies are needed.
1538
1539 const vector<char> & v = *oss.front();
1540 data = & v[0];
1541 size = v.size();
1542 } else {
1543 // Determine the octet string length and do one allocation.
1544
1545 ITERATE (TOss, iter1, oss) {
1546 size += (**iter1).size();
1547 }
1548
1549 temp.reserve(size);
1550
1551 ITERATE (TOss, iter3, oss) {
1552 // 23.2.4[1] "The elements of a vector are stored contiguously".
1553 temp.append(& (**iter3)[0], (*iter3)->size());
1554 }
1555
1556 data = & temp[0];
1557 }
1558
1559 CObjectIStreamAsnBinary inpstr(data, size);
1560 inpstr >> bdls;
1561 }
1562
1563
1564
1565 // IMPORTANT: This code was forked from src/objtools/align_format/align_format_util.cpp.
1566 // That method uses the object manager, however, so we're not calling the function directly.
1567 // Check for changes in original source if this forked version misbehaves in the future.
GetBlastDefline(const CBioseq & bioseq)1568 CRef<CBlast_def_line_set> CDUpdater::GetBlastDefline (const CBioseq& bioseq)
1569 {
1570 static const string asnDeflineObjLabel = "ASN1_BlastDefLine";
1571
1572 CRef<CBlast_def_line_set> bdls(new CBlast_def_line_set());
1573 if(bioseq.IsSetDescr()){
1574 const CSeq_descr& desc = bioseq.GetDescr();
1575 const list< CRef< CSeqdesc > >& descList = desc.Get();
1576 for (list<CRef< CSeqdesc > >::const_iterator iter = descList.begin(); iter != descList.end(); iter++){
1577
1578 if((*iter)->IsUser()){
1579 const CUser_object& uobj = (*iter)->GetUser();
1580 const CObject_id& uobjid = uobj.GetType();
1581 if(uobjid.IsStr()){
1582
1583 const string& label = uobjid.GetStr();
1584 if (label == asnDeflineObjLabel){
1585 const vector< CRef< CUser_field > >& usf = uobj.GetData();
1586
1587 if(usf.front()->GetData().IsOss()){ //only one user field
1588 typedef const CUser_field::TData::TOss TOss;
1589 const TOss& oss = usf.front()->GetData().GetOss();
1590 OssToDefline(oss, *bdls);
1591 }
1592 }
1593 }
1594 }
1595 }
1596 }
1597 return bdls;
1598 }
1599
RemoveBlastDefline(CBioseq & handle)1600 void CDUpdater::RemoveBlastDefline (CBioseq& handle)
1601 {
1602 static const string asnDeflineObjLabel = "ASN1_BlastDefLine";
1603 if(handle.IsSetDescr())
1604 {
1605 CSeq_descr& desc = handle.SetDescr();
1606 list< CRef< CSeqdesc > >& descList = desc.Set();
1607 for (list<CRef< CSeqdesc > >::iterator iter = descList.begin(); iter != descList.end(); iter++)
1608 {
1609 if((*iter)->IsUser())
1610 {
1611 const CUser_object& uobj = (*iter)->GetUser();
1612 const CObject_id& uobjid = uobj.GetType();
1613 if(uobjid.IsStr())
1614 {
1615 const string& label = uobjid.GetStr();
1616 if (label == asnDeflineObjLabel)
1617 {
1618 descList.erase(iter);
1619 return;
1620 }
1621 }
1622 }
1623 }
1624 }
1625 }
1626
processPendingToNormal(int overlap,CCdCore * cd)1627 int CDUpdater::processPendingToNormal(int overlap, CCdCore* cd)
1628 {
1629 AlignmentCollection ac(cd); //default:pending only
1630 int num = ac.GetNumRows();
1631 int seqlen = cd->GetSequenceStringByRow(0).size();
1632 if (seqlen <= 0)
1633 return num;
1634 vector< CRef< CSeq_align > > seqAlignVec;
1635 for (int i = 0; i < num; i++)
1636 seqAlignVec.push_back(ac.getSeqAlign(i));
1637 cd_utils::BlockFormater bf(seqAlignVec, seqlen);
1638 list< CRef< CSeq_align > >& seqAlignList = cd->GetSeqAligns();
1639 if (seqAlignList.size() > 0)
1640 {
1641 BlockModelPair bmp(*seqAlignList.begin());
1642 if (bmp.getMaster() == bmp.getSlave()) //aligned to self; used as a seed
1643 seqAlignList.erase(seqAlignList.begin());
1644 if (seqAlignList.size() > 0)
1645 bf.setReferenceSeqAlign(*seqAlignList.begin());
1646 }
1647 int numGood = bf.findIntersectingBlocks(overlap);
1648 bf.formatBlocksForQualifiedRows(seqAlignList);
1649 set<int> goodRows;
1650 vector<int> rows;
1651 bf.getQualifiedRows(rows);
1652 for (unsigned int r = 0; r < rows.size(); r++)
1653 goodRows.insert(rows[r]);
1654 cd->ErasePendingRows(goodRows);
1655 return num - numGood;
1656 }
1657
1658
CDRefresher(CCdCore * cd)1659 CDRefresher::CDRefresher (CCdCore* cd)
1660 : m_cd(cd)
1661 {
1662 addSequences(cd->SetSequences());
1663 }
1664
addSequences(CSeq_entry & seqEntry)1665 void CDRefresher::addSequences(CSeq_entry& seqEntry)
1666 {
1667 if (seqEntry.IsSet())
1668 {
1669 list< CRef< CSeq_entry > >& seqSet = seqEntry.SetSet().SetSeq_set();
1670 list< CRef< CSeq_entry > >::iterator it = seqSet.begin();
1671 for (; it != seqSet.end(); it++)
1672 addSequences(*(*it));
1673 }
1674 else
1675 {
1676 CRef< CBioseq > bioseq(&(seqEntry.SetSeq()));
1677 addSequence(bioseq);
1678 }
1679 }
1680
addSequence(CRef<CBioseq> bioseq)1681 void CDRefresher::addSequence(CRef< CBioseq > bioseq)
1682 {
1683 string acc;
1684 int ver=0;
1685 CRef< CSeq_id > textId;
1686 if (GetAccAndVersion(bioseq, acc, ver, textId))
1687 m_accSeqMap.insert(AccessionBioseqMap::value_type(acc, bioseq));
1688 }
1689
hasOlderVersion(CRef<CBioseq> bioseq)1690 bool CDRefresher::hasOlderVersion(CRef< CBioseq > bioseq)
1691 {
1692 string nAcc;
1693 int nVer=0;
1694 CRef< CSeq_id > textId;
1695 if (!GetAccAndVersion(bioseq, nAcc, nVer, textId))
1696 return false;
1697 AccessionBioseqMap::iterator it = m_accSeqMap.find(nAcc);
1698 if (it != m_accSeqMap.end())
1699 {
1700 TGi newgi = CDUpdater::getGi(bioseq);
1701 TGi oldgi = CDUpdater::getGi(it->second);
1702 return newgi != oldgi;
1703 }
1704 else
1705 return false;
1706 }
1707
1708 //return the gi that's replaced; return -1 if none is replaced
refresh(CRef<CSeq_align> seqAlign,CRef<CSeq_entry> seqEntry)1709 TGi CDRefresher::refresh(CRef< CSeq_align> seqAlign, CRef< CSeq_entry > seqEntry)
1710 {
1711 CRef< CBioseq > bioseqRef;
1712 if (!CDUpdater::GetOneBioseqFromSeqEntry(seqEntry, bioseqRef))
1713 return INVALID_GI;
1714 if (!hasOlderVersion(bioseqRef))
1715 return INVALID_GI;
1716 string nAcc;
1717 int nVer=0;
1718 CRef< CSeq_id > textId;
1719 if (!GetAccAndVersion(bioseqRef, nAcc, nVer, textId))
1720 return INVALID_GI;
1721 CRef< CBioseq > oldBioseq = m_accSeqMap[nAcc];
1722 string newStr, oldStr;
1723 GetNcbieaaString(*bioseqRef, newStr);
1724 GetNcbieaaString(*oldBioseq, oldStr);
1725 if (newStr.size() != oldStr.size())
1726 return INVALID_GI;
1727 //proceed to do the placement
1728 vector< CRef< CSeq_id > > newIds;
1729 CDUpdater::GetAllIdsFromSeqEntry(seqEntry, newIds);
1730 CRef< CSeq_id > giId, pdbId;
1731 for (unsigned int i = 0; i < newIds.size(); i++)
1732 {
1733 if (newIds[i]->IsGi())
1734 giId = newIds[i];
1735 else if (newIds[i]->IsPdb())
1736 pdbId = newIds[i];
1737 }
1738 list< CRef< CSeq_align > >& seqAlignList = m_cd->GetSeqAligns();
1739 bool replaced = false;
1740 for (list< CRef< CSeq_align > >::iterator lit = seqAlignList.begin(); lit != seqAlignList.end(); lit++)
1741 {
1742 CRef< CSeq_align >& seqAlign = *lit;
1743 CRef< CSeq_id > idInAlign;
1744 GetSeqID(seqAlign, idInAlign, true);
1745 if (CDUpdater::BioseqHasSeqId(*oldBioseq, *idInAlign))
1746 {
1747 BlockModelPair bmp(seqAlign);
1748
1749 if (pdbId.NotNull())
1750 {
1751 bmp.getSlave().setSeqId(pdbId);
1752 seqAlign = bmp.toSeqAlign();
1753 replaced =true;
1754 }
1755 else if (giId.NotNull())
1756 {
1757 bmp.getSlave().setSeqId(giId);
1758 seqAlign = bmp.toSeqAlign();
1759 replaced = true;
1760 }
1761 }
1762 }
1763 if (replaced)
1764 {
1765 m_cd->AddSequence(seqEntry);
1766 return CDUpdater::getGi(oldBioseq);
1767 }
1768 else
1769 return INVALID_GI;
1770 }
1771
1772 END_SCOPE(cd_utils)
1773 END_NCBI_SCOPE
1774