1 /* $Id: cuCdUpdater.cpp 609836 2020-06-08 15:56:03Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Charlie Liu
27  *
28  * File Description:
29  *
30  *       Update CDs
31  *
32  * ===========================================================================
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <algo/structure/cd_utils/cuAlign.hpp>
37 #include <algo/structure/cd_utils/cuTaxClient.hpp>
38 #include <objects/id1/Entry_complexities.hpp>
39 #include <objects/id1/ID1server_maxcomplex.hpp>
40 #include <objects/seqalign/Dense_seg.hpp>
41 #include <objects/general/User_object.hpp>
42 #include <objects/general/User_field.hpp>
43 #include <algo/blast/api/blast_exception.hpp>
44 #include <algo/blast/api/effsearchspace_calc.hpp>
45 #include <algo/blast/api/psiblast_options.hpp>
46 #include <objects/blastdb/Blast_def_line.hpp>
47 #include <objtools/blast/services/blast_services.hpp>
48 #include <algo/structure/cd_utils/cuSequence.hpp>
49 #include <algo/structure/cd_utils/cuCdUpdateParameters.hpp>
50 #include <algo/structure/cd_utils/cuCD.hpp>
51 #include <algo/structure/cd_utils/cuCdUpdater.hpp>
52 #include <algo/structure/cd_utils/cuUtils.hpp>
53 #include <algo/structure/cd_utils/cuCdReadWriteASN.hpp>
54 #include <algo/structure/cd_utils/cuHitsDistributor.hpp>
55 #include <algo/structure/cd_utils/cuSequenceTable.hpp>
56 #include <algo/structure/cd_utils/cuPssmMaker.hpp>
57 #include <algo/structure/cd_utils/cuBlockIntersector.hpp>
58 #include <algo/structure/cd_utils/cuBlockFormater.hpp>
59 #include <objects/scoremat/Pssm.hpp>
60 #include <algo/structure/cd_utils/cuAlignmentCollection.hpp>
61 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(cd_utils)62 BEGIN_SCOPE(cd_utils)
63 
64 CDUpdateStats::CDUpdateStats() : numBlastHits(0), numRedundant(0), numObsolete(0),
65 	numFilteredByOverlap(0){}
66 
toString(bool detailed)67 string CDUpdateStats::toString(bool detailed)
68 {
69 	int added = numBlastHits - envSeq.size() - fragmented.size() - overlap.size()
70 		- noSeq.size() - badAlign.size() - numRedundant - oldNewPairs.size();
71 	string result = "Total number of sequences added to the pending list of the Cd:" + NStr::IntToString(added)
72 		+". ";
73 	if (numFilteredByOverlap > 0)
74 	{
75 		result += "Total number of pending sequences  that are not moved to normal alignment because of insufficient overlapping:"
76 			+ NStr::IntToString(numFilteredByOverlap) + ". ";
77 	}
78 	if (!detailed)
79 		return result;
80 	result += "Number of Blast Hits = ";
81 	result += NStr::IntToString(numBlastHits);
82 	result += ". ";
83 
84 	result += toString(envSeq, "Environmental Sequences");
85 	result += toString(fragmented, "Sequence Fragments");
86 	result += toString(overlap, "Alignments overlapping a row already in CD");
87 	result += toString(noSeq, "Alignments with no sequence data");
88 	result += toString(badAlign, "Alignments that are corrupted or do not match with the CD");
89 	if (numRedundant > 0)
90 	{
91 		result += "Alignments removed due to redundancy:";
92 		result += NStr::IntToString(numRedundant);
93 		result += ". ";
94 	}
95 	result += toString(oldNewPairs, "New sequences that can replace old sequences (in  parentheses) already in CD");
96 	result += ". ";
97 	if (numObsolete > 0)
98 	{
99 		result += "Numer of obsolete sequences removed:";
100 		result += NStr::IntToString(numObsolete);
101 		result += ". ";
102 	}
103 	return result;
104 }
105 
toString(vector<TGi> & gis,string type)106 string CDUpdateStats::toString(vector<TGi>& gis, string type)
107 {
108 	if (gis.size() <= 0)
109 		return "";
110 	string result = "Number of " + type + " =";
111 	result += NStr::UIntToString((unsigned)gis.size());
112 	result += ":\n";
113 	result += toString(gis);
114 	result += "\n";
115 	return result;
116 }
117 
toString(vector<TGi> & gis)118 string CDUpdateStats::toString(vector<TGi>& gis)
119 {
120 	string result;
121 	for(unsigned int i = 0; i < gis.size(); i++)
122 	{
123 		result += NStr::NumericToString<TGi>( gis[i] );
124 		result += ",";
125 	}
126 	return result;
127 }
128 
toString(vector<OldNewGiPair> & giPairs,string type)129 string CDUpdateStats::toString(vector<OldNewGiPair>& giPairs, string type)
130 {
131 	if (giPairs.size() <= 0)
132 		return "";
133 	string result = "Number of " + type + " =";
134 	result += NStr::UIntToString((unsigned)giPairs.size());
135 	result += ":\n";
136 	for (unsigned int i = 0 ; i < giPairs.size(); i++)
137 	{
138 		result += NStr::NumericToString<TGi>(giPairs[i].second);
139 		result += "(";
140 		result += NStr::NumericToString<TGi>(giPairs[i].first);
141 		result += ")";
142 		result += ",";
143 	}
144 	result += "\n";
145 	return result;
146 }
147 
148 
149 /*---------------------------------------------------------------------------------*/
150 /*                     UpdaterInterface                                                   */
151 /*---------------------------------------------------------------------------------*/
152 
153 list<UpdaterInterface*> UpdaterInterface::m_updaterList;
154 
addUpdater(UpdaterInterface * updater)155 void UpdaterInterface::addUpdater(UpdaterInterface* updater)
156 {
157 	m_updaterList.push_back(updater);
158 }
159 
IsEmpty()160 bool UpdaterInterface::IsEmpty()
161 {
162 	return m_updaterList.empty();
163 }
164 
removeUpdaters(const vector<CCdCore * > & cds)165 void UpdaterInterface::removeUpdaters(const vector<CCdCore*>& cds)
166 {
167 	for (unsigned int i = 0; i < cds.size(); i++)
168 	{
169 		for (list<UpdaterInterface*>::iterator lit = m_updaterList.begin();
170 			lit != m_updaterList.end(); lit++)
171 		{
172 			if ((*lit)->hasCd(cds[i]))
173 			{
174 				UpdaterInterface* cup = *lit;
175 				m_updaterList.erase(lit);
176 				if (cup)
177 				{
178 					//cup->getCd()->ClearUpdateInfo();
179 					delete cup;
180 				}
181 				break;
182 			}
183 		}
184 	}
185 }
186 
removeUpdaters(const vector<UpdaterInterface * > & updaters)187 void UpdaterInterface::removeUpdaters(const vector<UpdaterInterface*>& updaters)
188 {
189 	for (unsigned int i = 0; i < updaters.size(); i++)
190 	{
191 		for (list<UpdaterInterface*>::iterator lit = m_updaterList.begin();
192 			lit != m_updaterList.end(); lit++)
193 		{
194 			if ((*lit) == updaters[i])
195 			{
196 				UpdaterInterface* cup = *lit;
197 				m_updaterList.erase(lit);
198 				if (cup)
199 				{
200 					//cup->getCd()->ClearUpdateInfo();
201 					delete cup;
202 				}
203 				break;
204 			}
205 		}
206 	}
207 }
208 
checkAllBlasts(vector<UpdaterInterface * > & blasted)209 int UpdaterInterface::checkAllBlasts(vector<UpdaterInterface*>& blasted)
210 {
211 	list<UpdaterInterface*>::iterator lit = m_updaterList.begin();
212 	while(lit != m_updaterList.end())
213 	{
214 		UpdaterInterface* updater = *lit;
215 		if(updater->getBlastHits())
216 		{
217 			blasted.push_back(updater);
218 		}
219 		lit++;
220 	}
221 	return blasted.size();
222 }
223 
224 /*---------------------------------------------------------------------------------*/
225 /*                     GroupUpdater                                                   */
226 /*---------------------------------------------------------------------------------*/
227 
GroupUpdater(vector<CCdCore * > & cds,CdUpdateParameters & config)228 GroupUpdater::GroupUpdater(vector<CCdCore*>& cds, CdUpdateParameters& config)
229 {
230 	for (unsigned int i = 0; i < cds.size(); i++)
231 		m_cdUpdaters.push_back(new CDUpdater(cds[i], config));
232 }
233 
~GroupUpdater()234 GroupUpdater::~GroupUpdater() //delete all in m_cdUpdaters
235 {
236 	for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
237 		delete (m_cdUpdaters[i]);
238 }
239 
240 	//UpdaterInterface
submitBlast(bool wait,int row)241 int GroupUpdater::submitBlast(bool wait, int row)
242 {
243     int count = 0;
244     for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
245 	{
246         if (!(m_cdUpdaters[i])->submitBlast(wait,row)) {
247 			return 0;  // return 0 if one fails so legacy "if" statements still work.
248         }
249         else {
250             count++;
251         }
252 	}
253 	return count;
254 }
255 
getBlastHits()256 bool GroupUpdater::getBlastHits()
257 {
258 	bool allDone = true;
259 	for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
260 	{
261 		if (!(m_cdUpdaters[i]->getBlastHits()))
262 			allDone = false;
263 	}
264 	return allDone;
265 }
266 
processBlastHits()267 bool GroupUpdater::processBlastHits()
268 {
269 	if (!getBlastHits())
270 	{
271 		LOG_POST("Not all BLASTs on the group are done.  Thus updating this group can't be done at this time.\n");
272 		return false;
273 	}
274 	//distribute
275 	HitDistributor dist;
276 	for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
277 	{
278 		dist.addBatch(m_cdUpdaters[i]->getAlignments());
279 	}
280 	//dist.dump("DistTab.txt");
281 	dist.distribute();
282 	//update individual CDs
283 	bool allDone = true;
284 	for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
285 	{
286 		if (!(m_cdUpdaters[i]->processBlastHits()))
287 			allDone = false;
288 	}
289 	return allDone;
290 }
291 
getCds(vector<CCdCore * > & cds)292 void GroupUpdater::getCds(vector<CCdCore*>& cds)
293 {
294 	for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
295 	{
296 		m_cdUpdaters[i]->getCds(cds);
297 	}
298 }
299 
hasCd(CCdCore * cd)300 bool GroupUpdater::hasCd(CCdCore* cd)
301 {
302 	for (unsigned int i = 0; i < m_cdUpdaters.size(); i++)
303 	{
304 		if (m_cdUpdaters[i]->hasCd(cd))
305 			return true;
306 	}
307 	return false;;
308 }
309 
310 /*---------------------------------------------------------------------------------*/
311 /*                     CDUpdater                                                   */
312 /*---------------------------------------------------------------------------------*/
313 
CDUpdater(CCdCore * cd,CdUpdateParameters & config)314 CDUpdater::CDUpdater(CCdCore* cd, CdUpdateParameters& config)
315 : m_config(config), m_cd(cd), m_guideAlignment(0), m_processPendingThreshold(-1), m_hitsNeeded(-1),
316   m_blastQueryRow(0)
317 {
318 }
319 
320 
~CDUpdater()321 CDUpdater::~CDUpdater()
322 {
323 	if (m_guideAlignment)
324 		delete m_guideAlignment;
325 }
326 
submitBlast(bool wait,int row)327 int CDUpdater::submitBlast(bool wait, int row)
328 {
329 	m_blastQueryRow = row;
330 	bool blasted = false;
331 	try {
332 		blasted = blast(wait, row);
333 	}
334 	catch (blast::CBlastException& be) {
335 		blasted = false;
336 		m_lastError = "Blast exception in submitBlast for row " + NStr::IntToString(row) + ":\n";
337         m_lastError += be.ReportAll();
338 	}
339 	catch (CException& e) {
340 		blasted = false;
341 		m_lastError = "NCBI C++ Toolkit exception in submitBlast for row " + NStr::IntToString(row) + ":\n";
342         m_lastError += e.ReportAll();
343 	}
344     catch (...) {
345         blasted = false;
346 		m_lastError = "Unknown exception in submitBlast for row " + NStr::IntToString(row) + "\n";
347     }
348 
349 	if (blasted)
350 	{
351 		LOG_POST("RID of Blast for the update of " << m_cd->GetAccession() << " is " << getRid());
352 	}
353 	else
354 	{
355 		LOG_POST("Update of " << m_cd->GetAccession() << " failed due to error\n" << getLastError());
356 	}
357 
358     return(blasted ? 1 : 0);
359 }
360 
getBlastHits()361 bool CDUpdater::getBlastHits()
362 {
363 	if (!m_hits.Empty())
364 		return true;
365 	else
366 		return getHits(m_hits);
367 }
368 
processBlastHits()369 bool CDUpdater::processBlastHits()
370 {
371 	bool updated = false;
372 
373 	//m_Data.cacheCD(m_cd->GetAccession().c_str(),"Updated by Blast",EDITTYPE_ALIGNMENT);
374 	if (!m_hits.Empty())
375 	{
376 		updated = true;
377 		update(m_cd, *m_hits);
378 		//LOG_POST("Stats of Updating "<<m_cd->GetAccession()<< ":\n"<<getStats().toString());
379 		unsigned numNoAlignment = getStats().badAlign.size();
380 		if (numNoAlignment > 0)
381 			LOG_POST("There are "<<numNoAlignment
382 			<<" hits whose alignments do not overlap with the CD.  This may indicate there are long insert to the CD alignment.  You can find the GIs for those hits in the log.");
383 	}
384 	else
385 	{
386 		LOG_POST("Found no BLAST hits to process for CD " << m_cd->GetAccession() << ".\n");
387 	}
388 	return updated;
389 }
390 
getCds(vector<CCdCore * > & cds)391 void CDUpdater::getCds(vector<CCdCore*>& cds)
392 {
393 	cds.push_back(m_cd);
394 }
395 
hasCd(CCdCore * cd)396 bool CDUpdater::hasCd(CCdCore* cd)
397 {
398 	return m_cd == cd;
399 }
400 
blast(bool wait,int row)401 bool CDUpdater::blast(bool wait, int row)
402 {
403 	blast::CRemoteBlast* rblast;
404 	blast::CBlastProteinOptionsHandle* blastopt;
405 	CPSIBlastOptionsHandle * psiopt = NULL;
406 	if (m_config.blastType == eBLAST)
407 	{
408 		blastopt = new blast::CBlastProteinOptionsHandle(blast::CBlastOptions::eRemote);
409 		rblast = new blast::CRemoteBlast(blastopt);
410 	}
411 	else
412 	{
413 		psiopt = new blast::CPSIBlastOptionsHandle(blast::CBlastOptions::eRemote);
414 		//psiopt->SetCompositionBasedStats(eNoCompositionBasedStats);
415 		rblast = new blast::CRemoteBlast(psiopt);
416 		blastopt = psiopt;
417 	}
418 	//rblast-> SetVerbose();
419 	blastopt->SetSegFiltering(false);
420 	if (m_config.numHits > 0)
421 		blastopt->SetHitlistSize(m_config.numHits);
422 	if (m_config.evalue > 0)
423 		blastopt->SetEvalueThreshold(m_config.evalue);
424 	if (m_config.identityThreshold > 0) //does not seem to do anything with RemoteBlast
425 		blastopt->SetPercentIdentity((double)m_config.identityThreshold);
426 	rblast->SetDatabase(CdUpdateParameters::getBlastDatabaseName(m_config.database));
427 
428 	string entrezQuery;
429 	if (m_config.organism != eAll_organisms)
430 	{
431 		entrezQuery += CdUpdateParameters::getOrganismName(m_config.organism);
432 		entrezQuery += "[Organism]";
433 	}
434 	if (m_config.entrezQuery.size() > 0)
435 		entrezQuery += m_config.entrezQuery;
436 	if (!entrezQuery.empty())
437 		rblast->SetEntrezQuery(entrezQuery.c_str());
438 
439 	//set PSSM here
440 	if (m_config.blastType == eBLAST)
441 	{
442 		CRef<objects::CBioseq_set> bioseqs(new CBioseq_set);
443 		list< CRef< CSeq_entry > >& seqList = bioseqs->SetSeq_set();
444 		CRef< CSeq_entry > seqOld;
445 
446 		if (!m_cd->GetSeqEntryForRow(row, seqOld)) {
447             delete rblast;
448 			return false;
449         }
450 
451 		seqList.push_back(seqOld);
452 		CRef< CSeq_id > seqId = seqOld->SetSeq().SetId().front();
453 		TMaskedQueryRegions masks;
454 		int lo = m_cd->GetLowerBound(row);
455 		int hi = m_cd->GetUpperBound(row);
456 		int len = m_cd->GetSequenceStringByRow(row).length();
457 		if (lo > 0)
458 			masks.push_back(CRef<CSeqLocInfo>( new CSeqLocInfo(new CSeq_interval(*seqId, 0,lo-1),0)));
459 		if (hi < (len-1))
460 			masks.push_back(CRef<CSeqLocInfo> ((new CSeqLocInfo(new CSeq_interval(*seqId, hi + 1, len - 1),0))));
461 		if (masks.size() > 0)
462 		{
463 			TSeqLocInfoVector masking_locations;
464 			masking_locations.push_back(masks);
465 			rblast->SetQueries(bioseqs,masking_locations);
466 		}
467 		else
468 			rblast->SetQueries(bioseqs);
469 		//debug
470 		/*
471 		string err;
472 		if (!WriteASNToFile("blast_query", *bioseqs, false,&err))
473 			LOG_POST("Failed to write to blast_query");
474 		*/
475 		//end of debug
476 	}
477 	else //psi-blast
478 	{
479 		bool useConsensus = true;
480 		PssmMaker pm(m_cd, useConsensus, true);
481 		PssmMakerOptions config;
482 		config.unalignedSegThreshold = 35;
483 		config.requestFrequencyRatios = true;
484 		pm.setOptions(config);
485 		CRef<CPssmWithParameters> pssm = pm.make();
486 
487 		m_guideAlignment = new BlockModelPair(pm.getGuideAlignment());
488 		m_guideAlignment->degap();
489 		m_guideAlignment->reverse(); //keep con::master
490 		m_consensus = pm.getConsensus();
491 
492 		psiopt->SetPseudoCount(pm.getPseudoCount());
493 
494 		rblast->SetQueries(pssm);
495 	}
496 
497 
498     //  Submit and, if requested, wait for blast results.
499     //  Trap any exceptions and return 'false' in all such cases.
500 	bool blasted = false;
501 	try {
502 		if (wait) {
503 			blasted = rblast-> SubmitSync();
504 			m_rid = rblast->GetRID();
505 			//LOG_POST("RID="<<m_rid);
506 			getBlastHits();
507 		}
508 
509 		blasted = rblast->Submit();
510 
511         if (!blasted) {
512             m_lastError = rblast->GetErrors();
513         }
514 
515 	} catch (CRemoteBlastException& e) {
516 		m_lastError = "RemoteBlast exception in CDUpdater::blast() for row " + NStr::IntToString(row) + ":\n";
517         m_lastError += e.ReportAll();
518 //		string err = e.GetErrCodeString();
519 //		LOG_POST("RemoteBlast exception in CDUpdater::blast() for row " << NStr::IntToString(row) << ":  error code = " << err);
520 	}
521 	catch (CException& e) {
522 		m_lastError = "NCBI C++ Toolkit exception in CDUpdater::blast() for row " + NStr::IntToString(row) + ":\n";
523         m_lastError += e.ReportAll();
524 	}
525     catch (...) {
526 		m_lastError = "Unknown exception in CDUpdater::blast() for row " + NStr::IntToString(row) + "\n";
527     }
528 
529 	m_rid = rblast->GetRID();
530 	delete rblast;
531 	return blasted;
532 }
533 
getHits(CRef<CSeq_align_set> & hits)534 bool CDUpdater::getHits(CRef<CSeq_align_set> & hits)
535 {
536     bool done = false;
537 	blast::CRemoteBlast rblast(getRid());
538 	try {
539 		//LOG_POST("Calling RemoteBlast::CheckDone().\n");
540 		done = rblast.CheckDone();
541 		//LOG_POST("Returned from RemoteBlast::CheckDone().\n");
542 		if (done)
543 		{
544 			hits = rblast.GetAlignments();
545 		}
546 	} catch (...) {
547 		LOG_POST("Exception while getting BLAST hits of CD " << m_cd->GetAccession() << " for RID " << getRid());
548 	}
549 	return done;
550 }
551 
checkDone()552 bool CDUpdater::checkDone()
553 {
554     bool done = false;
555 	blast::CRemoteBlast rblast(getRid());
556 	try {
557 		//LOG_POST("Calling RemoteBlast::CheckDone().\n");
558 		done = rblast.CheckDone();
559 		//LOG_POST("Returned from RemoteBlast::CheckDone().\n");
560 
561 			//CCdCore::UpdateInfo ui = m_cd->GetUpdateInfo();
562 			//ui.status = CCdCore::BLAST_DONE;
563 			//m_cd->SetUpdateInfo(ui);
564 
565 	} catch (...) {
566 		LOG_POST("Exception during CheckDone for CD " << m_cd->GetAccession() << ", RID " << getRid());
567 	}
568 	return done;
569 }
570 
checkBlastAndUpdate()571 bool CDUpdater::checkBlastAndUpdate()
572 {
573 	CRef<CSeq_align_set> seqAligns;
574 	if(getHits(seqAligns))
575 	{
576 		if(!seqAligns.Empty())
577 		{
578 			update(m_cd, *seqAligns);
579 			SetUpdateDate(m_cd);
580 			//LOG_POST("Stats of Updating "<<m_cd->GetAccession()<<" are "<<getStats().toString());
581 			unsigned numNoAlignment = getStats().badAlign.size();
582 			if (numNoAlignment > 0)
583 				LOG_POST("There are hits whose alignments do not overlap with the CD.  This may indicate there are long insert to the CD alignment.  You find the GIs for those hits in the log\n");
584 		}
585 		else
586 		{
587 			LOG_POST("Got no alignment for BLAST hits for CD " << m_cd->GetAccession() << ". will try again to retrieve the hits.\n");
588 			return true;
589 		}
590 		return true;
591 	}
592 	else
593 		return false;
594 }
595 
ComputePercentIdentity(const CRef<CSeq_align> & alignment,const string & queryString,const string & subjectString)596 double CDUpdater::ComputePercentIdentity(const CRef< CSeq_align >& alignment, const string& queryString, const string& subjectString)
597 {
598     double result = 0.0;
599     unsigned int nIdent = 0;
600     unsigned int qLen = queryString.length(), sLen = subjectString.length();
601     unsigned int i, j, qStart, qStop, sStart, sStop;
602 
603     if (alignment.Empty() || qLen == 0 || sLen == 0) return result;
604 
605     //  Note that is is only %id in the aligned region, and doesn't factor in any non-identities
606     //  implicit in any parts of the query N- and/or C-terminal to the alignment.
607 	const CSeq_align::C_Segs::TDenseg& denseg = alignment->GetSegs().GetDenseg();
608     double denom = (denseg.GetSeqStop(0) - denseg.GetSeqStart(0) + 1);
609 
610     CDense_seg::TStarts starts = denseg.GetStarts();
611     CDense_seg::TLens lens = denseg.GetLens();
612 
613     for (i = 0; i < lens.size(); ++i) {
614         //  Do this check before implicit cast to unsigned int when assigning to qStart, sStart.
615         if (starts[2*i] < 0 || starts[2*i + 1] < 0) continue;  //  gap
616         qStart = starts[2*i];
617         sStart = starts[2*i + 1];
618 
619         qStop = qStart + lens[i] - 1;
620         sStop = sStart + lens[i] - 1;
621         if (qStop >= qLen || sStop >= sLen) continue;  //  string index out of range
622 
623         for (j = 0; j < lens[i]; ++j) {
624             if (queryString[qStart + j] == subjectString[sStart + j]) ++nIdent;
625         }
626     }
627     result = 100.0*nIdent/denom;
628 //    LOG_POST(nIdent << " identities found (" << result << "%)\nquery:    " << queryString << "\nsubject:  " << subjectString);
629 
630     return result;
631 }
632 
update(CCdCore * cd,CSeq_align_set & alignments)633 bool CDUpdater::update(CCdCore* cd, CSeq_align_set& alignments)
634 {
635 	if ( !cd || (!alignments.IsSet()))
636 		return false;
637 
638     double pidScore = 0.0;
639 	list< CRef< CSeq_align > >& seqAligns = alignments.Set();
640 	m_stats.numBlastHits = seqAligns.size();
641 	vector< CRef< CBioseq > > bioseqs;
642 	LOG_POST("Got "<<m_stats.numBlastHits<<" blast hits for CD " << cd->GetAccession() << ".");
643 	retrieveAllSequences(alignments, bioseqs);
644 
645 	SequenceTable seqTable;
646 	CDRefresher* refresher = 0;
647 	if (m_config.replaceOldAcc)
648 		refresher = new CDRefresher(cd);
649 	vector< CRef< CBioseq > > bioseqVec;
650 	for (unsigned int i = 0; i < bioseqs.size(); i++)
651 	{
652 		bioseqVec.clear();
653 		SplitBioseqByBlastDefline (bioseqs[i], bioseqVec);
654 		seqTable.addSequences(bioseqVec, true); //as a group
655 	}
656 
657     // debugging *** ***
658     //string err;
659     //WriteASNToFile("alignments.txt", alignments, false, &err);
660 
661 	// debugging *** ***
662 	//seqTable.dump("seqTable.txt");
663 	//LOG_POST("Retrieved "<<bioseqs.size()<<" Bioseqs for blast hits\n");
664 	//LOG_POST("Process BLAST Hits and add them to "<<cd->GetAccession());
665 	int completed = 0;
666 	list< CRef< CSeq_align > >::iterator it = seqAligns.begin();
667 	//CID1Client id1Client;
668 	//id1Client.SetAllowDeadEntries(false);
669 
670 	CRef< CSeq_id > seqID, querySeqID;
671 	CRef<CSeq_entry> seqEntry;
672     CRef< CBioseq > queryBioseq(new CBioseq);
673     string queryString, subjectString;
674 
675     //  Get bioseq corresponding to the query.
676     if (it != seqAligns.end()) {
677         querySeqID = (*it)->SetSegs().SetDenseg().GetIds()[0];
678         if (!cd->CopyBioseqForSeqId(querySeqID, queryBioseq)) {
679             queryBioseq.Reset();
680 
681             //  This message isn't relevant when the query is a PSSM.
682             if (m_config.blastType == eBLAST)
683                 LOG_POST("No bioseq found in CD " << cd->GetAccession() << " for update query.");
684         } else {
685             queryString = GetRawSequenceString(*queryBioseq);
686         }
687     }
688 
689 
690 	//for BLAST, if the master is PDB, the master seqid (gi from BLAST) needs to be changed
691     if (m_config.blastType == eBLAST && it != seqAligns.end())
692 	{
693 		CSeq_align::C_Segs& oldSegs = (*it)->SetSegs();
694 		CRef< CSeq_align::C_Segs::TDenseg> denseg( &(oldSegs.SetDenseg()) );
695 		vector< CRef< CSeq_id > >& seqIds= denseg->SetIds();
696 		CRef< CSeq_entry > masterSeq;
697 		cd->GetSeqEntryForRow(m_blastQueryRow, masterSeq);
698 		vector< CRef< CSeq_id > > pdbIds;
699 		GetAllIdsFromSeqEntry(masterSeq, pdbIds, true);
700 		if ((pdbIds.size() > 0) && SeqEntryHasSeqId(masterSeq, *seqIds[0]) && (!seqIds[0] ->IsPdb()))
701 			m_masterPdb = pdbIds[0];
702 	}
703 	for(; it != seqAligns.end(); it++)
704     {
705         pidScore = 0.0;
706 		CRef< CSeq_align > seqAlignRef = *it;
707 		//seqAlign from BLAST is in Denseg
708 		CSeq_align::C_Segs::TDenseg& denseg = seqAlignRef->SetSegs().SetDenseg();
709 
710         //  9/25/08:  CRemoteBlast dropped the identity count from the scores, and is a
711         //  pending issue (JIRA ticket http://jira.be-md.ncbi.nlm.nih.gov/browse/SB-114).
712         //  Workaround is to compute the % identity directly, as done below in
713         //  the function ComputePercentIdentity.
714 		if (false && m_config.identityThreshold > 0)
715 		{
716 
717             //  Note:  if the type isn't in the seq-align, pidScore remains 0.0 and the
718             //  scan of seqAligns will be aborted.
719             seqAlignRef->GetNamedScore(CSeq_align::eScore_IdentityCount, pidScore);
720 
721 			int start = denseg.GetSeqStart(0);
722 			int stop = denseg.GetSeqStop(0);
723 			pidScore = 100*pidScore/(stop - start + 1);
724 			if ((int)pidScore < m_config.identityThreshold)
725 				break; //stop
726 		}
727 		//the second is slave
728 		if (denseg.GetDim() > 1)
729 			seqID = denseg.GetIds()[1];
730 		else
731 			break; //should not be here
732 
733         TGi gi  = seqID->GetGi();
734 		vector< CRef< CBioseq > > bioseqVec;
735 		if(seqTable.findSequencesInTheGroup(seqID, bioseqVec) > 0)
736 		{
737 			//one SeqAlign returned from BLAST may represent the hits on several
738 			//different Bioseq that all have the same seq_data
739 			//pick the Seq_id from the most useful Bioseq
740 			if (bioseqVec.size() > 1)
741 			{
742 				int index = pickBioseq(refresher, seqAlignRef, bioseqVec);
743 				seqEntry = new CSeq_entry;
744 				seqEntry->SetSeq(*bioseqVec[index]);
745 				seqID = denseg.GetIds()[1];
746 				gi  = seqID->GetGi();
747                 subjectString = GetRawSequenceString(*bioseqVec[index]);
748 			}
749 			else
750 			{
751 				seqEntry = new CSeq_entry;
752 				seqEntry->SetSeq(*bioseqVec[0]);
753                 subjectString = GetRawSequenceString(*bioseqVec[0]);
754 			}
755 
756             //  when there's no identity count in the seq-align, compute it directly
757     		if (m_config.identityThreshold > 0)
758 	    	{
759                 pidScore = ComputePercentIdentity(seqAlignRef, queryString, subjectString);
760         		if ((int)pidScore < m_config.identityThreshold)
761 		    		break; //stop
762             }
763 
764 			//change seqAlign from denseg to dendiag
765 			//use pdb_id if available
766 			if(!modifySeqAlignSeqEntry(cd, *it, seqEntry))
767 			{
768 				m_stats.badAlign.push_back(gi);
769 				continue;
770 			}
771 
772 			bool passed = true;
773 			if (!m_config.noFilter)
774 				passed = passedFilters(cd, *it, seqEntry);
775 			if(passed) //not framented; not environmental seq
776 			{
777 				// add merge fragment later here
778 
779 				//remaster the seqAlign from consensus to CD.master
780 				if (m_config.blastType == ePSI_BLAST)
781 				{
782 					BlockModelPair bmp(*it);
783 					bmp.remaster(*m_guideAlignment);
784 					CRef<CSeq_align> saRef = bmp.toSeqAlign();
785 					if (saRef.Empty())
786 					{
787 						m_stats.badAlign.push_back(gi);
788 						//LOG_POST("No valid alignment after remastering to the CD for maste.  gi|%d is ignored", gi);
789 						continue;
790 					}
791 					else
792 						(*it) = saRef;
793 				}
794 				//check to see if it is necessary to replace old sequences
795 				TGi replacedGi = INVALID_GI;
796 				if (m_config.replaceOldAcc)
797 					replacedGi = refresher->refresh(seqAlignRef, seqEntry);
798 				if (replacedGi > ZERO_GI)
799 				{
800 					m_stats.oldNewPairs.push_back(CDUpdateStats::OldNewGiPair(replacedGi, gi));
801 				}
802 				else
803 				{
804 					cd->AddPendingSeqAlign(*(it));
805 					//just add sequence now.  redundancy will be removed later
806                     cd->AddSequence(seqEntry);
807 				}
808 			}
809 		}
810 		else
811 			m_stats.noSeq.push_back(gi);
812 		completed++;
813 		if (m_hitsNeeded > 0)
814 		{
815 			if (completed >= m_hitsNeeded)
816 				break;
817 		}
818 		if ((completed % 500) == 0)
819 			LOG_POST("Processed "<<completed<<" of "<<m_stats.numBlastHits<<" hits.");
820 	}
821 
822     //  always keep normal rows w/ automatic NR
823     LOG_POST("Finishing processing hits for "<<cd->GetAccession());
824 
825 	if (m_processPendingThreshold > 0)
826 	{
827 		m_stats.numFilteredByOverlap = mergePending(cd, m_processPendingThreshold,true);
828 	}
829 	if (refresher)
830 		delete refresher;
831 	return true;
832 }
833 
mergePending(CCdCore * cd,int threshold,bool remaster)834 int CDUpdater::mergePending(CCdCore* cd, int threshold, bool remaster)
835 {
836 	int excluded = processPendingToNormal(threshold, cd);
837 	if (remaster)
838 	{//check and remaster if necessary
839 		CRef< CSeq_entry > seqEntry;
840 		cd->GetSeqEntryForRow(0,seqEntry);
841 		vector< CRef< CSeq_id > > seqIds;
842 		GetAllIdsFromSeqEntry(seqEntry,seqIds, true);
843 		if (seqIds.size() == 0)
844 		{
845 			int nRows = cd->GetNumRows();
846 			int i = 1;
847 			for (; i < nRows; i++)
848 			{
849 				CRef< CSeq_id >  SeqID;
850 				if (cd->GetSeqIDForRow(i-1,1, SeqID))
851 				{
852 					if (SeqID->IsPdb())
853 						break;
854 				}
855 			}
856 			if (i < nRows)
857 			{
858 				string err;
859 				ReMasterCdWithoutUnifiedBlocks(cd, i, true);
860 			}
861 		}
862 	}
863 	return excluded;
864 }
865 
866 
pickBioseq(CDRefresher * refresher,CRef<CSeq_align> seqAlignRef,vector<CRef<CBioseq>> & bioseqVec)867 int CDUpdater::pickBioseq(CDRefresher* refresher, CRef< CSeq_align > seqAlignRef,
868 						   vector< CRef< CBioseq > >&  bioseqVec)
869 {
870 	CSeq_align::C_Segs::TDenseg& denseg = seqAlignRef->SetSegs().SetDenseg();
871 	//the second is slave
872 	vector< CRef< CSeq_id > >& seqIdVec = denseg.SetIds();
873 	CRef< CSeq_id > seqID;
874 	assert(denseg.GetDim() > 1);
875 	seqID = seqIdVec[1];
876 	int index = -1;
877 	//if the current seqId's bioseq has a PDB , no change
878 	for (int i = 0; i < (int) bioseqVec.size(); i++)
879 	{
880 		if(BioseqHasSeqId(*(bioseqVec[i]), *seqID))
881 		{
882 			index = i;
883 			const CBioseq::TId& ids = bioseqVec[i]->GetId();
884 			CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
885 			for (; it != itend; ++it)
886 			{
887 				if ((*it)->IsPdb())
888 				{
889 					 return index;
890 				}
891 			}
892 		}
893 	}
894 	assert(index >= 0);
895 	//use other PDB if there is one.
896 	for (int i = 0; i < (int) bioseqVec.size(); i++)
897 	{
898 		if (i==index)
899 			continue;
900 		const CBioseq::TId& ids = bioseqVec[i]->GetId();
901 		CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
902 		CRef< CSeq_id > giId;
903 		bool foundPDB =false;
904 		for (; it != itend; ++it)
905 		{
906 			if ((*it)->IsPdb())
907 				foundPDB = 1;
908 			else if ((*it)->IsGi())
909 				giId = *it;
910 		}
911 		if (foundPDB)
912 		{
913 			seqIdVec[1] = giId; //replace id in SeqAlign
914 			return i;
915 		}
916 	}
917 
918 	//use the one whose older version is already in CD
919 	//this can be used to replace the old one later
920 	if (refresher)
921 	{
922 		for (int i = 0; i < (int) bioseqVec.size(); i++)
923 		{
924 			if (refresher->hasOlderVersion(bioseqVec[i]))
925 			{
926 				const CBioseq::TId& ids = bioseqVec[i]->GetId();
927 				CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
928 				for (; it != itend; ++it)
929 				{
930 					if ((*it)->IsGi())
931 					{
932 						seqIdVec[1] = (*it); //replace
933 						return i;
934 					}
935 				}
936 			}
937 		}
938 	}
939 	return index;
940 }
941 
findRowsWithOldSeq(CCdCore * cd,CBioseq & bioseq)942 bool CDUpdater::findRowsWithOldSeq(CCdCore* cd, CBioseq& bioseq)
943 {
944 	CRef< CBioseq > bioseqRef(&bioseq);
945 	TGi giNew = getGi(bioseqRef);
946 	int num = cd->GetNumRows();
947 	vector<int> rows;
948 	string nAcc;
949 	int nVer=0;
950 	CRef< CSeq_id > seqId;
951 	if (!GetAccAndVersion(bioseqRef, nAcc, nVer, seqId))
952 		return false;
953 	bool foundOldSeq = false;
954 	for (int i = 0; i < num; i++)
955 	{
956 		cd->GetBioseqForRow(i, bioseqRef);
957 		string oAcc;
958 		int oVer=0;
959 		if (GetAccAndVersion(bioseqRef, oAcc, oVer,seqId))
960 		{
961 			TGi giOld = getGi(bioseqRef);
962 			if ((oAcc == nAcc) && (giNew != giOld))
963 			{
964 				rows.push_back(i);
965 				m_stats.oldNewPairs.push_back(CDUpdateStats::OldNewGiPair(giOld, giNew));
966 				foundOldSeq = true;
967 			}
968 		}
969 	}
970 	return foundOldSeq;
971 }
972 
passedFilters(CCdCore * cd,CRef<CSeq_align> seqAlign,CRef<CSeq_entry> seqEntry)973 bool CDUpdater::passedFilters(CCdCore* cd, CRef< CSeq_align > seqAlign,
974 						CRef< CSeq_entry > seqEntry)
975 {
976 	//filter environmental seq
977 	CRef< CBioseq > bioseq;
978 	TGi gi =  getGi(seqEntry);
979 	if (!GetOneBioseqFromSeqEntry(seqEntry, bioseq))
980 	{
981 		m_stats.noSeq.push_back(gi);
982 		return false; //no seq is not acceptable
983 	}
984 
985 	//filter fragmented
986 	if (m_config.missingResidueThreshold > 0 && isFragmentedSeq(cd, seqAlign, seqEntry))
987 	{
988 		m_stats.fragmented.push_back(gi);
989 		return false;
990 	}
991     //  filter overlapping updates unless disabled
992 	if (m_config.allowedOverlapWithCDRow >= 0 && overlapWithCDRow(cd, seqAlign))
993 	{
994 		m_stats.overlap.push_back(gi);
995 		return false;
996 	}
997 	return true;
998 }
999 
overlapWithCDRow(CCdCore * cd,CRef<CSeq_align> seqAlign)1000 bool CDUpdater::overlapWithCDRow(CCdCore* cd,CRef< CSeq_align > seqAlign)
1001 {
1002     //  Ignore overlaps by disabling this check when requested.
1003     int overlap = m_config.allowedOverlapWithCDRow;
1004     if (overlap < 0) return false;
1005 
1006     bool result = false;
1007 	BlockModel bm(seqAlign);
1008     int lo, hi;
1009 	int lastPos = bm.getLastAlignedPosition();
1010 	int firstPos = bm.getFirstAlignedPosition();
1011 	CRef< CSeq_id > seqId = bm.getSeqId();
1012 	CRef< CSeq_id > seqIdRow;
1013 
1014     //  Scan until the first overlap of significant size found.
1015     //  Do not return 'false' after first seq-id match in case there are repeats.
1016 	for(int i = 0; !result && i < cd->GetNumRows(); i++)
1017 	{
1018 		if(cd->GetSeqIDFromAlignment(i, seqIdRow))
1019 		{
1020 			if (SeqIdsMatch(seqId, seqIdRow))
1021 			{
1022 				lo = cd->GetLowerBound(i);
1023 				hi = cd->GetUpperBound(i);
1024 				if (lo + overlap <= firstPos)
1025                     result = (hi - overlap >= firstPos);
1026 				else
1027 					result = (lo + overlap <lastPos);
1028 
1029                 if (result) {
1030                     if (overlap > 0) {
1031                         LOG_POST("CD sequence " << i << " [" << lo << ", " << hi << "] and proposed update with range [" << firstPos << ", " << lastPos << "] exceed maximum allowed overlap = " << overlap);
1032                     } else {
1033                         LOG_POST("Disallowed overlap of CD sequence " << i << " [" << lo << ", " << hi << "] and proposed update with range [" << firstPos << ", " << lastPos << "]");
1034                     }
1035                 }
1036 //				if (lo <= firstPos)
1037 //					return hi >=firstPos;
1038 //				else
1039 //					return lo <lastPos;
1040 			}
1041 		}
1042 	}
1043 	return result;
1044 }
1045 
isFragmentedSeq(CCdCore * cd,CRef<CSeq_align> seqAlign,CRef<CSeq_entry> seqEntry)1046 bool CDUpdater::isFragmentedSeq(CCdCore* cd, CRef< CSeq_align > seqAlign,
1047 						CRef< CSeq_entry > seqEntry)
1048 {
1049 	int pssmLen = m_consensus.size(); //equal to PSSM length
1050 	int lenAligned = GetNumAlignedResidues(seqAlign);
1051 	if (lenAligned >= pssmLen)
1052 		return false;
1053 	BlockModel master(seqAlign, false);
1054 	int mGapToN = master.getGapToNTerminal(0);
1055 
1056 	//master is consensus at this point
1057 	int mGapToC = master.getGapToCTerminal(master.getBlocks().size() -1, m_consensus.size());
1058 	BlockModel slave(seqAlign);
1059 	CRef< CBioseq > bioseq;
1060 	if (!GetOneBioseqFromSeqEntry(seqEntry, bioseq))
1061 		return true; //no seq is a fragmented seq
1062 
1063 	int seqLen = GetSeqLength(*bioseq);
1064 	int sGapToN = slave.getGapToNTerminal(0);
1065 	int sGapToC = slave.getGapToCTerminal(slave.getBlocks().size() - 1, seqLen);
1066 	int allowed = m_config.missingResidueThreshold;
1067 	if ( ( mGapToN - sGapToN > allowed) ||
1068 		 (mGapToC - sGapToC > allowed))
1069 		 return true;
1070 	else
1071 		return false;
1072 }
1073 
retrieveSeq(CID1Client & client,CRef<CSeq_id> seqID,CRef<CSeq_entry> & seqEntry)1074 bool CDUpdater::retrieveSeq(CID1Client& client, CRef<CSeq_id> seqID, CRef<CSeq_entry>& seqEntry)
1075 {
1076 	try {
1077 		//seqEntry = client.AskGetsefromgi(maxplex);
1078 		seqEntry = client.FetchEntry(*seqID, 1);
1079 	} catch (...)
1080 	{
1081 		return false;
1082 	}
1083 	return true;
1084 }
1085 
findSeq(CRef<CSeq_id> seqID,vector<CRef<CBioseq>> & bioseqs,CRef<CSeq_entry> & seqEntry)1086 bool CDUpdater::findSeq(CRef<CSeq_id> seqID, vector< CRef< CBioseq > >& bioseqs, CRef<CSeq_entry>& seqEntry)
1087 {
1088     for (unsigned int i = 0; i < bioseqs.size(); i++)
1089 	{
1090 		const CBioseq::TId& ids = bioseqs[i]->SetId();
1091         CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
1092         for (; it != itend; ++it) {
1093 		    if (SeqIdsMatch(seqID, *it))
1094 		    {
1095 			    seqEntry = new CSeq_entry;
1096 			    seqEntry->SetSeq(*bioseqs[i]);
1097 			    return true;
1098 		    }
1099         }
1100 	}
1101 	return false;
1102 }
1103 
retrieveAllSequences(CSeq_align_set & alignments,vector<CRef<CBioseq>> & bioseqs)1104 void CDUpdater::retrieveAllSequences(CSeq_align_set& alignments, vector< CRef< CBioseq > >& bioseqs)
1105 {
1106 	vector< CRef<CSeq_id> > seqids;
1107 	unsigned int batchSize = 500;
1108 	unsigned int maxBatchSize = 2000;
1109     string dbName = CdUpdateParameters::getBlastDatabaseName(m_config.database);
1110 
1111 	list< CRef< CSeq_align > >& seqAligns = alignments.Set();
1112 	list< CRef< CSeq_align > >::iterator lit = seqAligns.begin();
1113 	for (; lit != seqAligns.end(); lit++)
1114 	{
1115 		seqids.push_back((*lit)->SetSegs().SetDenseg().GetIds()[1]);
1116 		list< CRef< CSeq_align > >::iterator next = lit;
1117 		next++;
1118 		//the batch is full or reach the end
1119 		if (seqids.size() >= batchSize || (next == (seqAligns.end())) )
1120 		{
1121 			string errors, warnings;
1122 			vector< CRef< CBioseq > > bioseqBatch;
1123 			try {
1124 				//LOG_POST("Calling CBlastServices::GetSequences().\n");
1125                 //  For Blast v5 databases, not all members of an identical protein group are in the database
1126                 //  which may cause GetSequences to not find the exact sequence specified from such a group
1127                 //  if it wasn't one of the representatives (5 as of early 2020).
1128 				CBlastServices::GetSequences(seqids, dbName, 'p', bioseqBatch, errors,warnings);
1129 				LOG_POST("Returned from CBlastServices::GetSequences('" << dbName << "') with a batch of " << bioseqBatch.size() << " sequences.");
1130 			}
1131 			catch (blast::CBlastException& be)
1132             {
1133 				if (seqids.size() > maxBatchSize)
1134 				{
1135 					seqids.clear(); //give up on retrieving sequence on these hits.
1136 					//LOG_POST("Retrieving sequences from RemoteBlast failed after repeated tries. Giving up on these %d blast hits");
1137 				}
1138 				else
1139 					LOG_POST("Retrieving sequences from RemoteBlast failed with an exception of "<<be.GetErrCodeString());
1140 				continue;
1141 			} catch (...)
1142             {
1143                 LOG_POST("Unspecified exception during CBlastServices::GetSequences().  Skipping to next Seq-align.\n");
1144                 continue;
1145             }
1146 
1147 			if (seqids.size()!= bioseqBatch.size())
1148 			{
1149 				LOG_POST("Ask for "<< seqids.size()<<" sequences.  Got "<<bioseqBatch.size()<<" back\n");
1150 				LOG_POST("Error="<<errors<<"\nWarnings="<<warnings);
1151 			}
1152 			seqids.clear();
1153 			for (unsigned int i = 0 ; i < bioseqBatch.size(); i++)
1154 			{
1155 				bioseqs.push_back(bioseqBatch[i]);
1156 			}
1157 		}
1158 	}
1159 }
1160 
getGi(CRef<CSeq_entry> seqEntry)1161 TGi CDUpdater::getGi(CRef< CSeq_entry > seqEntry)
1162 {
1163 	vector< CRef< CSeq_id > > seqIds;
1164 	GetAllIdsFromSeqEntry(seqEntry, seqIds);
1165 	for (unsigned int i = 0; i < seqIds.size(); i++)
1166 		if (seqIds[i]->IsGi())
1167 			return seqIds[i]->GetGi();
1168 	return ZERO_GI;
1169 }
1170 
getGi(CRef<CBioseq> bioseq)1171 TGi CDUpdater::getGi(CRef<CBioseq> bioseq)
1172 {
1173 	const list< CRef< CSeq_id > >& seqIds = bioseq->GetId();
1174 	list< CRef< CSeq_id > >::const_iterator cit = seqIds.begin();
1175 	for (; cit != seqIds.end(); cit++)
1176 		if ((*cit)->IsGi())
1177 			return (*cit)->GetGi();
1178 	return ZERO_GI;
1179 }
1180 
1181 //change seqAlign from denseg to dendiag
1182 //remaster back to the master.
1183 //use pdb_id if available
modifySeqAlignSeqEntry(CCdCore * cd,CRef<CSeq_align> & seqAlign,CRef<CSeq_entry> seqEntry)1184 bool CDUpdater::modifySeqAlignSeqEntry(CCdCore* cd, CRef< CSeq_align >& seqAlign,
1185 						CRef< CSeq_entry > seqEntry)
1186 {
1187 	CSeq_align::C_Segs& oldSegs = seqAlign->SetSegs();
1188 	CRef< CSeq_align::C_Segs::TDenseg> denseg( &(oldSegs.SetDenseg()) );
1189 	vector< CRef< CSeq_id > >& seqIds= denseg->SetIds();
1190 	if(seqIds.size() <= 1)
1191 		return false;
1192 
1193 	if (!m_masterPdb.Empty())
1194 	{
1195 		seqIds[0].Reset(m_masterPdb.GetPointer());
1196 	}
1197 
1198 	//if slave has a pdb-id use it in seqAlign
1199 	vector< CRef< CSeq_id > > slaveIds;
1200 	GetAllIdsFromSeqEntry(seqEntry, slaveIds, true); //pdb only
1201 	if (slaveIds.size() > 0)
1202 		seqIds[1].Reset( (slaveIds[0]).GetPointer() );
1203 
1204 	if (seqEntry->IsSet())
1205 	{
1206 		//pick the right BioSeq from the Set
1207 		CRef< CBioseq > bioseq;
1208 		if (GetOneBioseqFromSeqEntry(seqEntry, bioseq, seqIds[1].GetPointer()))
1209 		{
1210 			if (!reformatBioseq(bioseq, seqEntry, m_client))
1211 				return false;
1212 			seqEntry->SetSeq(*bioseq);
1213 		}
1214 		else
1215 			return false;
1216 	}
1217 	else
1218 	{
1219 		CRef< CBioseq > bioseq(&seqEntry->SetSeq());
1220 		if (!reformatBioseq(bioseq, seqEntry, m_client))
1221 			return false;
1222 	}
1223 
1224 	CSeq_align::C_Segs::TDendiag& dendiag = seqAlign->SetSegs().SetDendiag();
1225 	Denseg2DenseDiagList(*denseg, dendiag);
1226 	/*
1227 	BlockModelPair bmp(seqAlign);
1228 	bmp.remaster(*m_guideAlignment);
1229 	seqAlign = bmp.toSeqAlign();*/
1230 
1231 	return true;
1232 }
1233 
1234 //get org-ref from seqEntry if bioseq does not have one
1235 //remove all unnecessary fields
1236 //replace ftable with mmdb-id
reformatBioseq(CRef<CBioseq> bioseq,CRef<CSeq_entry> seqEntry,CEntrez2Client & client)1237 bool CDUpdater::reformatBioseq(CRef< CBioseq > bioseq, CRef< CSeq_entry > seqEntry, CEntrez2Client& client )
1238 {
1239 	//get BioSource if there is none in bioseq
1240 	CSeq_descr& seqDescr = bioseq->SetDescr();
1241 	bool hasSource = false;
1242 	bool hasTitle = false;
1243 	//reset all fields except the source field
1244 
1245 	//need trim even if bioseq is not a Set
1246 	if (seqDescr.IsSet())
1247 	{
1248 		list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
1249 		list< CRef< CSeqdesc > >::iterator cit = descrList.begin();
1250 		while (cit != descrList.end())
1251 		{
1252 			if ((*cit)->IsSource() && (!hasSource)) //only keep one source field
1253 			{
1254 				hasSource = true;
1255 				cit++;
1256 			}
1257 			else if ( (*cit)->IsTitle())
1258 			{
1259 				cit++;
1260 				hasTitle = true;
1261 			}
1262 			 //extract taxid/taxname from "TaxNamesData" field
1263 			//blastdb uses it to send tax info
1264 			else if ((*cit)->IsUser() && (!hasSource))
1265 			{
1266 				if ((*cit)->SetUser().SetType().SetStr() == "TaxNamesData")
1267 				{
1268 					vector< CRef< CUser_field > >& fields = (*cit)->SetUser().SetData();
1269 					if ( fields.size() > 0)
1270 					{
1271 						CRef< CUser_field > field = fields[0];
1272 						TTaxId taxid = TAX_ID_FROM(CObject_id::TId, field->GetLabel().GetId());
1273 						string taxname = field->GetData().GetStrs()[0];
1274 						//create a source seedsc and add it
1275 						CRef< CSeqdesc > source(new CSeqdesc);
1276 						COrg_ref& orgRef = source->SetSource().SetOrg();
1277 						orgRef.SetTaxId(taxid);
1278 						orgRef.SetTaxname(taxname);
1279 						descrList.push_back(source);
1280 						hasSource = true;
1281 					}
1282 				}
1283 				cit = descrList.erase(cit);
1284 			}
1285 			else
1286 				cit = descrList.erase(cit);
1287 		}
1288 	}
1289 	if (!hasSource)
1290 	{
1291 		//get source or org-ref from seqEntry
1292 		if (seqEntry->IsSet())
1293 		{
1294 			const list< CRef< CSeqdesc > >& descrList = seqEntry->GetSet().GetDescr().Get();
1295 			list< CRef< CSeqdesc > >::const_iterator cit = descrList.begin();
1296 			for (; cit != descrList.end(); cit++)
1297 			{
1298 				if ((*cit)->IsSource())
1299 				{
1300 					seqDescr.Set().push_back(*cit);
1301 					break;
1302 				}
1303 			}
1304 		}
1305 	}
1306 	// if bioSeq is pdb
1307 	//replace annot field with mmdb-id
1308 	//otherwise reset annot field
1309 	bioseq->ResetAnnot();
1310 	const list< CRef< CSeq_id > >& seqIds = bioseq->GetId();
1311 	list< CRef< CSeq_id > >::const_iterator cit = seqIds.begin();
1312 	bool isPdb = false;
1313 	for (; cit != seqIds.end(); cit++)
1314 	{
1315 		if ((*cit)->IsPdb())
1316 		{
1317 			isPdb = true;
1318 			break;
1319 		}
1320 	}
1321 	if (isPdb)
1322 	{
1323 		//CEntrez2Client client;
1324 		vector<TIntId> uids;
1325 		string pdb = (*cit)->GetPdb().GetMol().Get();
1326 		pdb += "[ACCN]";
1327 		try {
1328 			client.Query(pdb, "structure", uids);
1329 		} catch (CException& e)
1330 		{
1331 			LOG_POST("\nFailed to retrieve mmdb-id for "<<pdb<<" because the error:\n "<<e.ReportAll());
1332 			return false;
1333 		}
1334 		int mmdbId = 0;
1335 		if (uids.size() > 0)
1336 		{
1337 			mmdbId = uids[0];
1338 			CRef<CSeq_id> mmdbTag (new CSeq_id);
1339 			CSeq_id::TGeneral& generalId = mmdbTag->SetGeneral();
1340 			generalId.SetDb("mmdb");
1341 			generalId.SetTag().SetId(mmdbId);
1342 			CRef< CSeq_annot> seqAnnot (new CSeq_annot);
1343 			seqAnnot->SetData().SetIds().push_back(mmdbTag);
1344 			bioseq->SetAnnot().push_back(seqAnnot);
1345 		}
1346 		if (!hasTitle)
1347 		{
1348 			CRef< CPDB_block > pdbBlock;
1349 			if (GetPDBBlockFromSeqEntry(seqEntry, pdbBlock))
1350 			{
1351 				CRef< CSeqdesc > seqDesc(new CSeqdesc);
1352 				if (pdbBlock->CanGetCompound())
1353 				{
1354 					const list< string >& compounds = pdbBlock->GetCompound();
1355 					if (compounds.size() != 0)
1356 						seqDesc->SetTitle(*(compounds.begin()));
1357 					seqDescr.Set().push_back(seqDesc);
1358 				}
1359 			}
1360 		}
1361 	}
1362 	return true;
1363 
1364 }
1365 
GetAllIdsFromSeqEntry(CRef<CSeq_entry> seqEntry,vector<CRef<CSeq_id>> & slaveIds,bool pdbOnly)1366 int CDUpdater::GetAllIdsFromSeqEntry(CRef< CSeq_entry > seqEntry,
1367 		vector< CRef< CSeq_id > >& slaveIds, bool pdbOnly)
1368 {
1369 	if (seqEntry->IsSeq())
1370 	{
1371 		const list< CRef< CSeq_id > >& seqIdList = seqEntry->GetSeq().GetId();
1372 		list< CRef< CSeq_id > >::const_iterator lsii;
1373 		for (lsii = seqIdList.begin(); lsii != seqIdList.end(); ++lsii)
1374 		{
1375 			if (pdbOnly)
1376 			{
1377 				if ((*lsii)->IsPdb())
1378 					slaveIds.push_back(*lsii);
1379 			}
1380 			else
1381 				slaveIds.push_back(*lsii);
1382 		}
1383 		return slaveIds.size();
1384 	}
1385 	else
1386 	{
1387 		list< CRef< CSeq_entry > >::const_iterator lsei;
1388 		const list< CRef< CSeq_entry > >& seqEntryList = seqEntry->GetSet().GetSeq_set();
1389 		for (lsei = seqEntryList.begin(); lsei != seqEntryList.end(); ++lsei)
1390 		{
1391 			 GetAllIdsFromSeqEntry(*lsei, slaveIds, pdbOnly);  //  RECURSIVE!!
1392 		}
1393 		return slaveIds.size();
1394 	}
1395 }
1396 //get only protein
GetOneBioseqFromSeqEntry(CRef<CSeq_entry> seqEntry,CRef<CBioseq> & bioseq,const CSeq_id * seqId)1397 bool CDUpdater::GetOneBioseqFromSeqEntry(CRef< CSeq_entry > seqEntry,
1398 		CRef< CBioseq >& bioseq,const CSeq_id* seqId)
1399 {
1400 	if (seqEntry->IsSeq())
1401 	{
1402 		if (seqEntry->GetSeq().IsAa())
1403 		{
1404 			if (seqId)
1405 			{
1406 				if (SeqEntryHasSeqId(seqEntry, *seqId))
1407 				{
1408 					bioseq.Reset(&seqEntry->SetSeq());
1409 					return true;
1410 				}
1411 				else
1412 					return false;
1413 			}
1414 			else
1415 			{
1416 				bioseq.Reset(&seqEntry->SetSeq());
1417 				return true;
1418 			}
1419 		}
1420 		else
1421 			return false;
1422 
1423 	}
1424 	else
1425 	{
1426 		list< CRef< CSeq_entry > >::const_iterator lsei;
1427 		const list< CRef< CSeq_entry > >& seqEntryList = seqEntry->GetSet().GetSeq_set();
1428 		for (lsei = seqEntryList.begin(); lsei != seqEntryList.end(); ++lsei)
1429 		{
1430 			 if (GetOneBioseqFromSeqEntry(*lsei, bioseq, seqId))  //  RECURSIVE!!
1431 				 return true;
1432 		}
1433 		return false;
1434 	}
1435 }
1436 
SeqEntryHasSeqId(CRef<CSeq_entry> seqEntry,const CSeq_id & seqId)1437 bool CDUpdater::SeqEntryHasSeqId(CRef< CSeq_entry > seqEntry, const CSeq_id& seqId)
1438 {
1439 	vector< CRef< CSeq_id > > seqIds;
1440 	GetAllIdsFromSeqEntry(seqEntry, seqIds,false);
1441 	for (unsigned int i = 0; i < seqIds.size(); i++)
1442 	{
1443 		if (seqIds[i]->Match(seqId))
1444 			return true;
1445 	}
1446 	return false;
1447 }
1448 
BioseqHasSeqId(const CBioseq & bioseq,const CSeq_id & seqId)1449 bool CDUpdater::BioseqHasSeqId(const CBioseq& bioseq, const CSeq_id& seqId)
1450 {
1451 	const CBioseq::TId& ids = bioseq.GetId();
1452     CBioseq::TId::const_iterator it = ids.begin(), itend = ids.end();
1453     for (; it != itend; ++it)
1454 	{
1455 	    if ((*it)->Match(seqId))
1456 	    {
1457 		    return true;
1458 	    }
1459     }
1460 	return false;
1461 }
1462 
1463 
SplitBioseqByBlastDefline(CRef<CBioseq> orig,vector<CRef<CBioseq>> & bioseqs)1464 int CDUpdater::SplitBioseqByBlastDefline (CRef<CBioseq> orig, vector< CRef<CBioseq> >& bioseqs)
1465 {
1466 	CRef<CBlast_def_line_set> blastDefLine = GetBlastDefline(*orig);
1467 	RemoveBlastDefline(*orig);
1468 	list< CRef< CBlast_def_line > >& deflines = blastDefLine->Set();
1469 	//most cases
1470 	if (deflines.size() <= 1)
1471 	{
1472 		bioseqs.push_back(orig);
1473 		return 1;
1474 	}
1475 	//PDBs likely
1476 	int order = 0;
1477 	for (list< CRef< CBlast_def_line > >::iterator iter = deflines.begin();
1478 		iter != deflines.end(); iter++)
1479 	{
1480 		CRef<CBioseq> splitBioseq(new CBioseq);
1481 		splitBioseq->Assign(*orig);
1482 		reformatBioseqByBlastDefline(splitBioseq, *iter, order);
1483 		bioseqs.push_back(splitBioseq);
1484 		order++;
1485 	}
1486 	return deflines.size();
1487 }
1488 
reformatBioseqByBlastDefline(CRef<CBioseq> bioseq,CRef<CBlast_def_line> blastDefline,int order)1489 void CDUpdater::reformatBioseqByBlastDefline(CRef<CBioseq> bioseq, CRef< CBlast_def_line > blastDefline, int order)
1490 {
1491 	CSeq_descr& seqDescr = bioseq->SetDescr();
1492 	int sourceOrder = 0;
1493 	if (seqDescr.IsSet())
1494 	{
1495 		list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
1496 		list< CRef< CSeqdesc > >::iterator cit = descrList.begin();
1497 		while (cit != descrList.end())
1498 		{
1499 			if ((*cit)->IsSource()) //only keep one source field
1500 			{
1501 				if (sourceOrder == order)
1502 					cit++; //keep
1503 				else
1504 					cit = descrList.erase(cit);
1505 
1506                 //  Do this for both cases; if sourceOrder == order must increment
1507                 //  otherwise will keep all sources *after* order.
1508                 sourceOrder++;
1509 			}
1510 			else if ( (*cit)->IsTitle())
1511 				cit = descrList.erase(cit);
1512 		}
1513 		//add the title from the defLine
1514 		CRef< CSeqdesc > title(new CSeqdesc);
1515 		title->SetTitle(blastDefline->GetTitle());
1516 		descrList.push_back(title);
1517 	}
1518 
1519 	//add seq_ids from the defline
1520 	bioseq->SetId().assign(blastDefline->GetSeqid().begin(), blastDefline->GetSeqid().end());
1521 }
1522 
1523 //  IMPORTANT:  This code was forked from src/objtools/align_format/align_format_util.cpp.
1524 //  Check for changes in original source if this forked version misbehaves in the future.
1525 /// Efficiently decode a Blast-def-line-set from binary ASN.1.
1526 /// @param oss Octet string sequence of binary ASN.1 data.
1527 /// @param bdls Blast def line set decoded from oss.
OssToDefline(const CUser_field::TData::TOss & oss,CBlast_def_line_set & bdls)1528 void CDUpdater::OssToDefline(const CUser_field::TData::TOss & oss, CBlast_def_line_set& bdls)
1529 {
1530     typedef const CUser_field::TData::TOss TOss;
1531 
1532     const char * data = NULL;
1533     size_t size = 0;
1534     string temp;
1535 
1536     if (oss.size() == 1) {
1537         // In the single-element case, no copies are needed.
1538 
1539         const vector<char> & v = *oss.front();
1540         data = & v[0];
1541         size = v.size();
1542     } else {
1543         // Determine the octet string length and do one allocation.
1544 
1545         ITERATE (TOss, iter1, oss) {
1546             size += (**iter1).size();
1547         }
1548 
1549         temp.reserve(size);
1550 
1551         ITERATE (TOss, iter3, oss) {
1552             // 23.2.4[1] "The elements of a vector are stored contiguously".
1553             temp.append(& (**iter3)[0], (*iter3)->size());
1554         }
1555 
1556         data = & temp[0];
1557     }
1558 
1559     CObjectIStreamAsnBinary inpstr(data, size);
1560     inpstr >> bdls;
1561 }
1562 
1563 
1564 
1565 //  IMPORTANT:  This code was forked from src/objtools/align_format/align_format_util.cpp.
1566 //  That method uses the object manager, however, so we're not calling the function directly.
1567 //  Check for changes in original source if this forked version misbehaves in the future.
GetBlastDefline(const CBioseq & bioseq)1568 CRef<CBlast_def_line_set>  CDUpdater::GetBlastDefline (const CBioseq& bioseq)
1569 {
1570 	static const string asnDeflineObjLabel = "ASN1_BlastDefLine";
1571 
1572     CRef<CBlast_def_line_set> bdls(new CBlast_def_line_set());
1573     if(bioseq.IsSetDescr()){
1574         const CSeq_descr& desc = bioseq.GetDescr();
1575         const list< CRef< CSeqdesc > >& descList = desc.Get();
1576         for (list<CRef< CSeqdesc > >::const_iterator iter = descList.begin(); iter != descList.end(); iter++){
1577 
1578             if((*iter)->IsUser()){
1579                 const CUser_object& uobj = (*iter)->GetUser();
1580                 const CObject_id& uobjid = uobj.GetType();
1581                 if(uobjid.IsStr()){
1582 
1583                     const string& label = uobjid.GetStr();
1584                     if (label == asnDeflineObjLabel){
1585                         const vector< CRef< CUser_field > >& usf = uobj.GetData();
1586 
1587                         if(usf.front()->GetData().IsOss()){ //only one user field
1588                             typedef const CUser_field::TData::TOss TOss;
1589                             const TOss& oss = usf.front()->GetData().GetOss();
1590                             OssToDefline(oss, *bdls);
1591                         }
1592                     }
1593                 }
1594             }
1595         }
1596     }
1597     return bdls;
1598 }
1599 
RemoveBlastDefline(CBioseq & handle)1600 void CDUpdater::RemoveBlastDefline (CBioseq& handle)
1601 {
1602 	static const string asnDeflineObjLabel = "ASN1_BlastDefLine";
1603 	if(handle.IsSetDescr())
1604 	{
1605 		CSeq_descr& desc = handle.SetDescr();
1606 		list< CRef< CSeqdesc > >& descList = desc.Set();
1607 		for (list<CRef< CSeqdesc > >::iterator iter = descList.begin(); iter != descList.end(); iter++)
1608 		{
1609 			if((*iter)->IsUser())
1610 			{
1611 				const CUser_object& uobj = (*iter)->GetUser();
1612 				const CObject_id& uobjid = uobj.GetType();
1613 				if(uobjid.IsStr())
1614 				{
1615 					const string& label = uobjid.GetStr();
1616 					if (label == asnDeflineObjLabel)
1617 					{
1618 						descList.erase(iter);
1619 						return;
1620 					}
1621 				}
1622 			}
1623 		}
1624 	}
1625 }
1626 
processPendingToNormal(int overlap,CCdCore * cd)1627 int CDUpdater::processPendingToNormal(int overlap, CCdCore* cd)
1628 {
1629 	AlignmentCollection ac(cd); //default:pending only
1630 	int num = ac.GetNumRows();
1631 	int seqlen = cd->GetSequenceStringByRow(0).size();
1632 	if (seqlen <= 0)
1633 		return num;
1634 	vector< CRef< CSeq_align > > seqAlignVec;
1635 	for (int i = 0; i < num; i++)
1636 		seqAlignVec.push_back(ac.getSeqAlign(i));
1637 	cd_utils::BlockFormater bf(seqAlignVec, seqlen);
1638 	list< CRef< CSeq_align > >& seqAlignList = cd->GetSeqAligns();
1639 	if (seqAlignList.size() > 0)
1640 	{
1641 		BlockModelPair bmp(*seqAlignList.begin());
1642 		if (bmp.getMaster() == bmp.getSlave()) //aligned to self; used as a seed
1643 			seqAlignList.erase(seqAlignList.begin());
1644 		if (seqAlignList.size() > 0)
1645 			bf.setReferenceSeqAlign(*seqAlignList.begin());
1646 	}
1647 	int numGood = bf.findIntersectingBlocks(overlap);
1648 	bf.formatBlocksForQualifiedRows(seqAlignList);
1649 	set<int> goodRows;
1650 	vector<int> rows;
1651 	bf.getQualifiedRows(rows);
1652 	for (unsigned int r = 0; r < rows.size(); r++)
1653 		goodRows.insert(rows[r]);
1654 	cd->ErasePendingRows(goodRows);
1655 	return num - numGood;
1656 }
1657 
1658 
CDRefresher(CCdCore * cd)1659 CDRefresher::CDRefresher (CCdCore* cd)
1660 : m_cd(cd)
1661 {
1662 	addSequences(cd->SetSequences());
1663 }
1664 
addSequences(CSeq_entry & seqEntry)1665 void CDRefresher::addSequences(CSeq_entry& seqEntry)
1666 {
1667 	if (seqEntry.IsSet())
1668 	{
1669 		list< CRef< CSeq_entry > >& seqSet = seqEntry.SetSet().SetSeq_set();
1670 		list< CRef< CSeq_entry > >::iterator it = seqSet.begin();
1671 		for (; it != seqSet.end(); it++)
1672 			addSequences(*(*it));
1673 	}
1674 	else
1675 	{
1676 		CRef< CBioseq > bioseq(&(seqEntry.SetSeq()));
1677 		addSequence(bioseq);
1678 	}
1679 }
1680 
addSequence(CRef<CBioseq> bioseq)1681 void CDRefresher::addSequence(CRef< CBioseq > bioseq)
1682 {
1683 	string acc;
1684 	int ver=0;
1685 	CRef< CSeq_id > textId;
1686 	if (GetAccAndVersion(bioseq, acc, ver, textId))
1687 		m_accSeqMap.insert(AccessionBioseqMap::value_type(acc, bioseq));
1688 }
1689 
hasOlderVersion(CRef<CBioseq> bioseq)1690 bool CDRefresher::hasOlderVersion(CRef< CBioseq > bioseq)
1691 {
1692 	string nAcc;
1693 	int nVer=0;
1694 	CRef< CSeq_id > textId;
1695 	if (!GetAccAndVersion(bioseq, nAcc, nVer, textId))
1696 		return false;
1697 	AccessionBioseqMap::iterator it = m_accSeqMap.find(nAcc);
1698 	if (it != m_accSeqMap.end())
1699 	{
1700 		TGi newgi = CDUpdater::getGi(bioseq);
1701 		TGi oldgi = CDUpdater::getGi(it->second);
1702 		return newgi != oldgi;
1703 	}
1704 	else
1705 		return false;
1706 }
1707 
1708 	//return the gi that's replaced; return -1 if none is replaced
refresh(CRef<CSeq_align> seqAlign,CRef<CSeq_entry> seqEntry)1709 TGi CDRefresher::refresh(CRef< CSeq_align> seqAlign, CRef< CSeq_entry > seqEntry)
1710 {
1711 	CRef< CBioseq > bioseqRef;
1712 	if (!CDUpdater::GetOneBioseqFromSeqEntry(seqEntry, bioseqRef))
1713 		return INVALID_GI;
1714 	if (!hasOlderVersion(bioseqRef))
1715 		return INVALID_GI;
1716 	string nAcc;
1717 	int nVer=0;
1718 	CRef< CSeq_id > textId;
1719 	if (!GetAccAndVersion(bioseqRef, nAcc, nVer, textId))
1720 		return INVALID_GI;
1721 	CRef< CBioseq > oldBioseq = m_accSeqMap[nAcc];
1722 	string newStr, oldStr;
1723 	GetNcbieaaString(*bioseqRef, newStr);
1724 	GetNcbieaaString(*oldBioseq, oldStr);
1725 	if (newStr.size() != oldStr.size())
1726 		return INVALID_GI;
1727 	//proceed to do the placement
1728 	vector< CRef< CSeq_id > > newIds;
1729 	CDUpdater::GetAllIdsFromSeqEntry(seqEntry, newIds);
1730 	CRef< CSeq_id > giId, pdbId;
1731 	for (unsigned int i = 0; i < newIds.size(); i++)
1732 	{
1733 		if (newIds[i]->IsGi())
1734 			giId = newIds[i];
1735 		else if (newIds[i]->IsPdb())
1736 			pdbId = newIds[i];
1737 	}
1738 	list< CRef< CSeq_align > >& seqAlignList = m_cd->GetSeqAligns();
1739 	bool replaced = false;
1740 	for (list< CRef< CSeq_align > >::iterator lit = seqAlignList.begin(); lit != seqAlignList.end(); lit++)
1741 	{
1742 		CRef< CSeq_align >& seqAlign = *lit;
1743 		CRef< CSeq_id > idInAlign;
1744 		GetSeqID(seqAlign, idInAlign, true);
1745 		if (CDUpdater::BioseqHasSeqId(*oldBioseq, *idInAlign))
1746 		{
1747 			BlockModelPair bmp(seqAlign);
1748 
1749 			if (pdbId.NotNull())
1750 			{
1751 				bmp.getSlave().setSeqId(pdbId);
1752 				seqAlign = bmp.toSeqAlign();
1753 				replaced =true;
1754 			}
1755 			else if (giId.NotNull())
1756 			{
1757 				bmp.getSlave().setSeqId(giId);
1758 				seqAlign = bmp.toSeqAlign();
1759 				replaced = true;
1760 			}
1761 		}
1762 	}
1763 	if (replaced)
1764 	{
1765 		m_cd->AddSequence(seqEntry);
1766 		return CDUpdater::getGi(oldBioseq);
1767 	}
1768 	else
1769 		return INVALID_GI;
1770 }
1771 
1772 END_SCOPE(cd_utils)
1773 END_NCBI_SCOPE
1774