1 /* $Id: read_blast_result_lib.cpp 434869 2014-05-12 17:42:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 * major top-level routines, including main
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include "read_blast_result.hpp"
35
36
37 // static members definition
38 // algorithm control
39 double CReadBlastApp::m_small_tails_threshold;
40 int CReadBlastApp::m_n_best_hit;
41 double CReadBlastApp::m_eThreshold;
42 double CReadBlastApp::m_entireThreshold;
43 double CReadBlastApp::m_partThreshold;
44 int CReadBlastApp::m_rna_overlapThreshold;
45 int CReadBlastApp::m_cds_overlapThreshold;
46 double CReadBlastApp::m_trnascan_scoreThreshold;
47 int CReadBlastApp::m_shortProteinThreshold;
48 // verbosity
49 int CReadBlastApp::m_verbosity_threshold;
50 int CReadBlastApp::m_current_verbosity;
51 stack < int > CReadBlastApp::m_saved_verbosity;
52
s_GetFormat(const string & name)53 ESerialDataFormat s_GetFormat(const string& name)
54 {
55 if (name == "asn") {
56 return eSerial_AsnText;
57 } else if (name == "asnb") {
58 return eSerial_AsnBinary;
59 } else if (name == "xml") {
60 return eSerial_Xml;
61 } else {
62 // Should be caught by argument processing, but as an illustration...
63 THROW1_TRACE(runtime_error, "Bad serial format name " + name);
64 }
65 }
66
67
getGIs(CBioseq::TAnnot::const_iterator & annot)68 vector<long> CReadBlastApp::getGIs(CBioseq::TAnnot::const_iterator& annot)
69 {
70 vector<long> result;
71 result.clear();
72 ITERATE(CSeq_align::TScore, score, (*(*annot)->GetData().GetAlign().begin())->GetScore())
73 {
74 string name = (*score)->GetId().GetStr();
75 if(name!="use_this_gi") continue;
76 result.push_back((*score)->GetValue().GetInt());
77 }
78
79 return result;
80 }
81
giMatch(const vector<long> & left,const vector<long> & right)82 bool CReadBlastApp::giMatch(const vector<long>& left, const vector<long>& right)
83 {
84
85 bool result=false;
86 if(PrintDetails()) NcbiCerr << "giMatch starts" << NcbiEndl;
87 IncreaseVerbosity();
88 ITERATE(vector<long>, gi1, left)
89 {
90 IncreaseVerbosity();
91 ITERATE(vector<long>, gi2, right)
92 {
93 if(PrintDetails()) NcbiCerr << "try: gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
94 if (*gi1==*gi2)
95 {
96 if(PrintDetails()) NcbiCerr << "giMatch!!! : gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
97 return true;
98 }
99 }
100 DecreaseVerbosity();
101 }
102 DecreaseVerbosity();
103 if(PrintDetails()) NcbiCerr << "giMatch oops" << NcbiEndl;
104
105 return result;
106 }
107
dumpAlignment(const string & alignment,const string & file)108 void CReadBlastApp::dumpAlignment( const string& alignment, const string& file)
109 {
110 ofstream out(file.c_str());
111 out<< alignment;
112 }
113
dump_fasta_for_pretty_blast(diagMap & diag)114 void CReadBlastApp::dump_fasta_for_pretty_blast(diagMap& diag)
115 {
116 // open out2 here
117 string fn = GetArgs()["in"].AsString();
118 string::size_type ipos=fn.rfind(".");
119 if(ipos!=string::npos) fn.erase(ipos);
120 ipos=fn.rfind("/");
121 fn.erase(0,ipos+1);
122 string fncdd = fn + "_cdd_html.fsa";
123 fn+= "_html.fsa";
124
125 CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
126 CScope scope(*objmgr);
127
128 auto_ptr<CNcbiOfstream> out2 (new CNcbiOfstream (fn.c_str()));
129 CFastaOstream fasta_out(*out2);
130
131 auto_ptr<CNcbiOfstream> out2_cdd ( new CNcbiOfstream (fncdd.c_str()));
132 CFastaOstream fasta_out_cdd(*out2_cdd);
133
134 for (CTypeConstIterator<CBioseq> seq = ConstBegin(); seq; ++seq)
135 {
136 if( hasProblems(*seq, diag, eFrameShift )||
137 hasProblems(*seq, diag, ePartial)
138 )
139 {
140 CBioseq_Handle handle = scope.AddBioseq(*seq);
141 if( hasProblems(*seq, diag, eFrameShift )) fasta_out.Write(handle);
142 if( hasProblems(*seq, diag, ePartial)) fasta_out_cdd.Write(handle);
143 }
144 }
145
146 }
147
printOverlapReport(distanceReportStr * report,ostream & out)148 void CReadBlastApp::printOverlapReport
149 (
150 distanceReportStr *report,
151 ostream& out
152 )
153 {
154 out << "q1 | lcl | " << report->q_id_left << " | " << report->q_name_left << NcbiEndl;
155 out << "q2 | lcl | " << report->q_id_right << " | " << report->q_name_right << NcbiEndl;
156 out << report->q_id_left
157 << (report->left_strand == eNa_strand_plus ? "(+)" : "(-)" )
158 << "\t"
159 << report->q_id_right
160 << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
161 << NcbiEndl;
162 string range1 = report->loc1.Empty()
163 ? printed_range(report->q_loc_left_from, report->q_loc_left_to)
164 : printed_ranges(*(report->loc1));
165 string range2 = report->loc2.Empty()
166 ? printed_range(report->q_loc_right_from, report->q_loc_right_to)
167 : printed_ranges(*(report->loc2));
168 out
169 << "[" << range1 << "] " << report->left_frame << "\t\t"
170 << "[" << range2 << "] " << report->right_frame
171 << NcbiEndl;
172 out << NcbiEndl;
173
174 out << "Overlap: " << report->space << " aa" << NcbiEndl;
175 }
176
printPerfectHit(const perfectHitStr & hit,ostream & out)177 void CReadBlastApp::printPerfectHit
178 (
179 const perfectHitStr& hit,
180 ostream& out
181 )
182 {
183
184 out << "Perfect hit: " << hit.s_name << NcbiEndl;
185
186 }
187
printGeneralInfo(ostream & out)188 void CReadBlastApp::printGeneralInfo(ostream& out)
189 {
190 out << "General info extracted from the header of the submission:" << NcbiEndl;
191 out << NcbiEndl;
192 out << "reldate: ";
193 if ( IsSubmit() && m_Submit.GetSub().CanGetReldate() )
194 {
195 string date;
196 m_Submit.GetSub().GetReldate().GetDate(&date);
197 out << date.c_str();
198 }
199 else
200 out << "unspecified";
201 out << NcbiEndl;
202
203
204 // simple texts
205 out << "tool: ";
206 if ( IsSubmit() && m_Submit.GetSub().CanGetTool() )
207 out << m_Submit.GetSub().GetTool();
208 else
209 out << "unspecified";
210 out << NcbiEndl;
211
212 out << "user_tag: ";
213 if ( IsSubmit() && m_Submit.GetSub().CanGetUser_tag() )
214 out << m_Submit.GetSub().GetUser_tag();
215 else
216 out << "unspecified";
217 out << NcbiEndl;
218
219 out << "comment: ";
220 if ( IsSubmit() && m_Submit.GetSub().CanGetComment() )
221 out << m_Submit.GetSub().GetComment();
222 else
223 out << "unspecified";
224 out << NcbiEndl;
225
226
227 out << NcbiEndl;
228 }
229
printReport(distanceReportStr * report,ostream & out)230 void CReadBlastApp::printReport
231 (
232 distanceReportStr *report,
233 ostream& out
234 )
235 {
236 out << "q1 | lcl | " << report->q_id_left << " | " << report->q_name_left << NcbiEndl;
237 out << "q2 | lcl | " << report->q_id_right << " | " << report->q_name_right << NcbiEndl;
238 out << "s | " << report->s_name << NcbiEndl;
239 out << report->q_id_left
240 << (report->left_strand == eNa_strand_plus ? "(+)" : "(-)" )
241 << "\t"
242 << report->q_id_right
243 << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
244 << NcbiEndl;
245 out
246 << "[" << report->q_loc_left_from << "..." << report->q_loc_left_to << "]\t\t"
247 << "[" << report->q_loc_right_from << "..." << report->q_loc_right_to << "]"
248 << NcbiEndl;
249 out << NcbiEndl;
250 out
251 << "q\t" << report->q_left_left << "\t" << report->q_left_middle << "\t" << report->q_left_right
252 << "\t" << report->space
253 << "\t" << report->q_right_left << "\t" << report->q_right_middle << "\t" << report->q_right_right
254 << NcbiEndl;
255 out
256 << "s\t" << report->s_left_left << "\t" << report->s_left_middle << "\t" << "..."
257 << "\t" << "..."
258 << "\t" << "..." << "\t" << report->s_left_right << "\t" << "..."
259 << NcbiEndl;
260 out
261 << "s\t" << "..." << "\t" << report->s_right_left << "\t" << "..."
262 << "\t" << "..."
263 << "\t" << "..." << "\t" << report->s_right_middle << "\t" << report->s_right_right
264 << NcbiEndl;
265 out << "diff_left, diff_right: " << report->diff_left << ", " << report->diff_right << NcbiEndl;
266 out << "diff_edge_left, diff_edge_right: " << report->diff_edge_left << ", " << report->diff_edge_right << NcbiEndl;
267 if(report->diff_edge_left > 0)
268 {
269 out << "Potential deletetion of a nucleotide sequence equivalent to " << report->diff_edge_left << " occurred." << NcbiEndl;
270 }
271 else if (report->diff_edge_left < 0)
272 {
273 if ( -report->diff_edge_left < report->space )
274 out << "Potential insertion of a nucleotide sequence equivalent to " << -report->diff_edge_left << " occurred." << NcbiEndl;
275 else
276 out << "Subjects are shifted apart by " << -report->diff_edge_left << NcbiEndl;
277 }
278 else
279 {
280 out << "Potential sequencing error or replacement mutation without insertion or deletion of a nucleotide sequence occurred." << NcbiEndl;
281 }
282
283
284 }
285
GetRNAfeats(const LocMap & loc_map,CSeq_annot::C_Data::TFtable & rna_feats,const CSeq_annot::C_Data::TFtable & feats)286 void CReadBlastApp::GetRNAfeats
287 (
288 const LocMap& loc_map,
289 CSeq_annot::C_Data::TFtable& rna_feats,
290 const CSeq_annot::C_Data::TFtable& feats
291 )
292 {
293 if(PrintDetails()) NcbiCerr << "GetRNAfeats() starts" << NcbiEndl;
294 IncreaseVerbosity();
295 ITERATE(CSeq_annot::C_Data::TFtable, f, feats)
296 {
297 if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature start " << NcbiEndl;
298 if( !(*f)->GetData().IsRna() ) continue;
299 if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature is RNA" << NcbiEndl;
300 string loc_string = GetLocationString( (*f)->GetLocation() );
301 if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
302 << loc_string
303 << "] feature location" << NcbiEndl;
304 LocMap::const_iterator aname = loc_map.find(loc_string);
305 if(aname == loc_map.end())
306 {
307 // error
308 NcbiCerr << "CReadBlastApp::GetRNAfeats(): ERROR: cannot find gene for location "
309 << loc_string
310 << NcbiEndl;
311 continue;
312 }
313 rna_feats.push_back(aname->second);
314 if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
315 << aname->first
316 << "] feature end" << NcbiEndl;
317 }
318 DecreaseVerbosity();
319 if(PrintDetails()) NcbiCerr << "GetRNAfeats(): rna_feats size: "
320 << rna_feats.size()
321 << NcbiEndl;
322 if(PrintDetails()) NcbiCerr << "GetRNAfeats() ends" << NcbiEndl;
323
324 return;
325 }
326
327
GetLocusTag(const CSeq_feat & f,const LocMap & loc_map)328 string GetLocusTag(const CSeq_feat& f, const LocMap& loc_map)
329 {
330 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: start" << NcbiEndl;
331 string loc_string = GetLocationString(f.GetLocation());
332 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: I" << NcbiEndl;
333 LocMap::const_iterator aname = loc_map.find(loc_string);
334 if(aname == loc_map.end())
335 {
336 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not in locmap" << NcbiEndl;
337 return loc_string;
338 }
339 else
340 {
341 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: in locmap" << NcbiEndl;
342 const CGene_ref& gene=aname->second->GetData().GetGene();
343 string qname="";
344 if(gene.IsSetLocus_tag())
345 {
346 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: getting it" << NcbiEndl;
347 return gene.GetLocus_tag();
348 }
349
350 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not getting it" << NcbiEndl;
351 return qname;
352 }
353
354 if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: INTERNAL ERROR: should be never here" << NcbiEndl;
355 return loc_string;
356 }
get_nucleotide_seq(const CBioseq & seq)357 const CBioseq& CReadBlastApp::get_nucleotide_seq(const CBioseq& seq)
358 {
359 const CBioseq_set::TSeq_set* seqset = get_parent_seqset(seq);
360 const CSeq_loc& loc = getGenomicLocation(seq);
361 CTypeConstIterator<CSeq_interval> inter = ::ConstBegin(loc);
362 const CSeq_id& nu_id = inter->GetId();
363
364 ITERATE(CBioseq_set::TSeq_set, seq_nu, *seqset)
365 {
366 if(!(*seq_nu)->IsSeq()) continue;
367 if(!(*seq_nu)->GetSeq().IsNa()) continue;
368 const CBioseq::TId& nu_ids = (*seq_nu)->GetSeq().GetId();
369 ITERATE(CBioseq::TId, cnu_id , nu_ids)
370 {
371 if((*cnu_id)->Compare(nu_id) == CSeq_id::e_YES) return (*seq_nu)->GetSeq();
372 }
373 }
374 NcbiCerr << "CReadBlastApp::get_nucleotide_seq: INTERNAL FATAL ERROR: could not find nucleotide sequence" << NcbiEndl;
375 throw;
376 return seq; // formality
377 }
378
get_parent_seqset(const CBioseq & seq)379 CBioseq_set::TSeq_set* get_parent_seqset(const CBioseq& seq)
380 {
381 string name = GetStringDescr(seq);
382 CSeq_entry* parent = seq.GetParentEntry();
383 if(parent==NULL)
384 {
385 NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no parent\n";
386 return NULL;
387 }
388 if(parent->IsSeq())
389 {
390 while(parent!=NULL && !parent->IsSet())
391 parent = parent->GetParentEntry();
392 if(parent==NULL)
393 {
394 NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no set ancestor\n";
395 return NULL;
396 }
397 }
398 else if(!parent->IsSet()) return NULL;
399
400 if(!parent->GetSet().CanGetSeq_set())
401 {
402 NcbiCerr << "get_parent_seqset: WARNING: " << name << ": (grand)parent set does not have Seq_set\n";
403 return NULL;
404 }
405 return &(parent->SetSet().SetSeq_set());
406 }
407
408
let1_2_let3(char let1)409 string let1_2_let3(char let1)
410 {
411 switch(let1)
412 {
413 case 'A': return "Ala"; break;
414 case 'Q': return "Gln"; break;
415 case 'W': return "Trp"; break;
416 case 'E': return "Glu"; break;
417 case 'R': return "Arg"; break;
418 case 'T': return "Thr"; break;
419 case 'Y': return "Tyr"; break;
420 case 'U': return "Sec"; break;
421 case 'I': return "Ile"; break;
422 case 'J': return "Xeu"; break; // Ile or Leu, there is no conventional three-letter code
423 case 'P': return "Pro"; break;
424 case 'S': return "Ser"; break;
425 case 'D': return "Asp"; break;
426 case 'F': return "Phe"; break;
427 case 'G': return "Gly"; break;
428 case 'H': return "His"; break;
429 case 'K': return "Lys"; break;
430 case 'L': return "Leu"; break;
431 case 'Z': return "Glx"; break;
432 case 'X': return "Ukn"; break;
433 case 'C': return "Cys"; break;
434 case 'V': return "Val"; break;
435 case 'B': return "Asx"; break;
436 case 'N': return "Asn"; break;
437 case 'M': return "Met"; break;
438 case 'O': return "Pyl"; break;
439 default: break;
440 }
441 NcbiCerr << "let1_2_let3: ERROR: char " << let1 << "(" << (int)let1 << ") is not handled" << NcbiEndl;
442
443 return "";
444 }
445
collectPerfectHits(vector<perfectHitStr> & perfect,const CBioseq & seq)446 int CReadBlastApp::collectPerfectHits(vector<perfectHitStr>& perfect, const CBioseq& seq)
447 {
448 ITERATE(CBioseq::TAnnot, annot, seq.GetAnnot())
449 {
450 if(!(*annot)->GetData().IsAlign()) continue;
451 check_alignment(annot, seq, perfect);
452 }
453 return perfect.size();
454 }
455
check_alignment(CBioseq::TAnnot::const_iterator & annot,const CBioseq & seq,vector<perfectHitStr> & results)456 void CReadBlastApp::check_alignment
457 (
458 CBioseq::TAnnot::const_iterator& annot,
459 const CBioseq& seq,
460 vector<perfectHitStr>& results
461 )
462 {
463 int sLen = getLenScore(annot);
464 int qLen = getQueryLen(seq);
465 string q_name = GetProtName(seq);
466
467 string s_name = getAnnotName(annot);
468
469 int qFrom, qTo, sFrom, sTo;
470 getBounds(annot, &qFrom, &qTo, &sFrom, &sTo);
471
472 bool result=false;
473
474 int qtails = qLen - qTo + qFrom - 1;
475 int stails = sLen - sTo + sFrom - 1;
476 double thr = m_small_tails_threshold; // default should be .1
477 result =
478 (double)qtails/qLen < thr &&
479 (double)stails/sLen < thr &&
480 s_name.find("hypothetical") == string::npos;
481 // hypothetical exhonerating hits do not count
482
483 if(result)
484 {
485 results.resize(results.size()+1);
486 int ihit = results.size()-1;
487 results[ihit].s_name = s_name;
488 }
489 }
490
is_prot_entry(const CBioseq & seq)491 bool CReadBlastApp::is_prot_entry(const CBioseq& seq)
492 {
493 bool result;
494 // if(PrintDetails()) NcbiCerr << "is_prot_entry?" << NcbiEndl;
495 result=seq.GetInst().IsAa();
496 // if(PrintDetails()) NcbiCerr << result << NcbiEndl;
497 return result;
498 }
499
500
SortSeqs(void)501 int CReadBlastApp::SortSeqs(void)
502 {
503 if(PrintDetails()) NcbiCerr << "SortSeqs: start " << NcbiEndl;
504 PushVerbosity();
505 if(IsSubmit())
506 {
507 if(PrintDetails()) NcbiCerr << "SortSeqs: IsSubmit" << NcbiEndl;
508 if (
509 m_Submit.GetData().IsEntrys()
510 &&
511 (*m_Submit.GetData().GetEntrys().begin())->IsSet()
512 )
513 {
514 SortSeqs((*m_Submit.SetData().SetEntrys().begin())->SetSet().SetSeq_set());
515 }
516 else
517 {
518 NcbiCerr << "ERROR: submit file does not have proper seqset" << NcbiEndl;
519 }
520 }
521 else
522 {
523 if(PrintDetails()) NcbiCerr << "SortSeqs: IsEntry" << NcbiEndl;
524 SortSeqs(m_Entry.SetSet().SetSeq_set());
525 }
526
527 PopVerbosity();
528 if(PrintDetails()) NcbiCerr << "SortSeqs: end" << NcbiEndl;
529 return -1;
530 }
531
532
533
SetParents(CSeq_entry * parent,CBioseq_set::TSeq_set & where)534 int CReadBlastApp::SetParents(CSeq_entry* parent, CBioseq_set::TSeq_set& where)
535 {
536 /*
537 NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, where)
538 {
539 if((*left)->IsSet())
540 {
541 CBioseq_set::TSeq_set& seqs_down = (*left)->SetSet().SetSeq_set();
542 SetParents(parent, seqs_down);
543 }
544 else
545 {
546 (*left)->SetParentEntry(parent);
547 }
548 }
549 */
550 return 0;
551 }
552
553 // TSimpleSeq
554
less_simple_seq(const TSimpleSeq & first,const TSimpleSeq & second)555 bool CReadBlastApp::less_simple_seq(const TSimpleSeq& first,
556 const TSimpleSeq& second)
557 {
558 return first.key < second.key;
559 } // less_seq
560
less_pair(const pair<int,int> & first,const pair<int,int> & second)561 bool CReadBlastApp::less_pair(const pair<int,int>& first,
562 const pair<int,int>& second)
563 {
564 if(first.first != second.first) return first.first < second.first;
565 return first.second < second.second;
566 }
567
568
569
less_seq(const CRef<CSeq_entry> & first,const CRef<CSeq_entry> & second)570 bool CReadBlastApp::less_seq(const CRef<CSeq_entry>& first,
571 const CRef<CSeq_entry>& second)
572 {
573 // if(PrintDetails()) NcbiCerr << "less_seq start" << NcbiEndl;
574 if (first->IsSet() || second->IsSet())
575 {
576 return false;
577 }
578 // if(PrintDetails()) NcbiCerr << "less_seq both seqs" << NcbiEndl;
579 if (!hasGenomicLocation(first->GetSeq()))
580 {
581 if(first->GetSeq().IsAa())
582 NcbiCerr << "less_seq first does not have genomic location or is nucleotide seq" << NcbiEndl;
583 return true; // to take care of nucleotide sequence
584 }
585 if (!hasGenomicLocation(second->GetSeq()))
586 {
587 if(second->GetSeq().IsAa())
588 NcbiCerr << "less_seq second does not have genomic location or is nucleotide seq" << NcbiEndl;
589 return false;
590 }
591 // if(PrintDetails()) NcbiCerr << "less_seq both have genomic locations" << NcbiEndl;
592 const CSeq_loc& left_genomic_int = getGenomicLocation(first->GetSeq());
593 const CSeq_loc& right_genomic_int = getGenomicLocation(second->GetSeq());
594 TSeqPos from1, to1, from2, to2;
595
596 ENa_strand strand1;
597 ENa_strand strand2;
598
599 getFromTo(left_genomic_int, from1, to1, strand1);
600 getFromTo(right_genomic_int, from2, to2, strand2);
601 /*
602 if(PrintDetails()) NcbiCerr << "less_seq comparing"
603 << ": " << GetStringDescr( first->GetSeq()) << " = " << from1 << "-" << to1
604 << ", " << GetStringDescr(second->GetSeq()) << " = " << from2 << "-" << to2
605 << NcbiEndl;
606 */
607 return from1 < from2;
608 } // less_seq
609
SortSeqs(CBioseq_set::TSeq_set & seqs)610 int CReadBlastApp::SortSeqs(CBioseq_set::TSeq_set& seqs)
611 {
612 if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): start"
613 << ": " << seqs.size()
614 << NcbiEndl;
615
616 NON_CONST_ITERATE(CBioseq_set::TSeq_set, seq, seqs)
617 {
618 if ((*seq)->IsSet()) SortSeqs((*seq)->SetSet().SetSeq_set());
619 }
620
621 seqs.sort(&less_seq); // does not do anything
622 int n =1;
623
624 if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): stop" << NcbiEndl;
625 return n;
626 }
627
getCoreDataType(istream & in)628 ECoreDataType CReadBlastApp::getCoreDataType(istream& in)
629 {
630 char buffer[1024];
631 in.getline(buffer, 1024);
632 in.seekg(0);
633
634 string result = buffer;
635 result.erase(result.find_first_of(": "));
636
637 if(result=="Seq-entry")
638 {
639 result = buffer;
640 if(result.find("set") != string::npos)
641 return eEntry;
642 return eUndefined;
643 }
644 if(result=="Seq-submit") return eSubmit;
645 if(NStr::StartsWith(result, ">Feature")) return eTbl;
646 return eUndefined;
647 }
648
IsSubmit()649 bool CReadBlastApp::IsSubmit()
650 {
651 return m_coreDataType == eSubmit;
652 }
653
IsEntry()654 bool CReadBlastApp::IsEntry()
655 {
656 return m_coreDataType == eEntry;
657 }
658
IsTbl()659 bool CReadBlastApp::IsTbl()
660 {
661 return m_coreDataType == eTbl;
662 }
663
664
Begin(void)665 CBeginInfo CReadBlastApp::Begin(void)
666 {
667 if(m_coreDataType == eSubmit) return ::Begin(m_Submit);
668 if(m_coreDataType == eEntry) return ::Begin(m_Entry);
669
670 return ::Begin(m_Submit); // some platforms might have warnings without it
671 }
672
ConstBegin(void)673 CConstBeginInfo CReadBlastApp::ConstBegin(void)
674 {
675 if(m_coreDataType == eSubmit) return ::ConstBegin(m_Submit);
676 if(m_coreDataType == eEntry) return ::ConstBegin(m_Entry);
677
678 return ::ConstBegin(m_Submit); // some platforms might have warnings without it
679 }
680
681
next_w(char * w)682 char *CReadBlastApp::next_w(char *w)
683 {
684 while(*w && !isspace(*w)) ++w;
685 while(isspace(*w)) ++w;
686 return(w);
687
688 }
skip_space(char * w)689 char *CReadBlastApp::skip_space(char *w)
690 {
691 while(*w && isspace(*w)) ++w;
692 return(w);
693
694 }
skip_toprot(CTypeIterator<CBioseq> & seq)695 int CReadBlastApp::skip_toprot(CTypeIterator<CBioseq>& seq)
696 {
697 int nums=0;
698 for(nums=0 ;seq.IsValid(); ++seq, nums++)
699 {
700 const CSeq_inst& inst = seq->GetInst();
701 if(!inst.IsAa()) continue;
702 break;
703 }
704 return nums;
705 }
706
skip_toprot(CTypeConstIterator<CBioseq> & seq)707 int CReadBlastApp::skip_toprot(CTypeConstIterator<CBioseq>& seq)
708 {
709 int nums=0;
710 for(nums=0 ;seq; ++seq, nums++)
711 {
712 const CSeq_inst& inst = seq->GetInst();
713 if(!inst.IsAa()) continue;
714 break;
715 }
716 return nums;
717 }
718
skip_toprot(CBioseq_set::TSeq_set::const_iterator & seq,const CBioseq_set::TSeq_set & seqs)719 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::const_iterator& seq,
720 const CBioseq_set::TSeq_set& seqs)
721 {
722
723 IncreaseVerbosity();
724 while( seq != seqs.end() &&
725 (
726 (*seq)->IsSet() || // set of sequences, skip
727 !is_prot_entry((*seq)->GetSeq()) // not a protein, skip
728 )
729 )
730 {
731 if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
732 // << CSeq_id::GetStringDescr ((*seq)->GetSeq(), CSeq_id::eFormat_FastA) << NcbiEndl;
733 << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
734 ++seq;
735 }
736 DecreaseVerbosity();
737
738 return (seq != seqs.end()) ;
739
740 }
741
742
skip_toprot(CBioseq_set::TSeq_set::iterator & seq,CBioseq_set::TSeq_set & seqs)743 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::iterator& seq,
744 CBioseq_set::TSeq_set& seqs)
745 {
746
747 IncreaseVerbosity();
748 while( seq != seqs.end() &&
749 (
750 (*seq)->IsSet() || // set of sequences, skip
751 !is_prot_entry((*seq)->GetSeq()) // not a protein, skip
752 )
753 )
754 {
755 if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
756 << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
757 ++seq;
758 }
759 DecreaseVerbosity();
760
761 return (seq != seqs.end()) ;
762
763 }
764
765
skip_to_valid_seq_cand(CBioseq_set::TSeq_set::const_iterator & seq,const CBioseq_set::TSeq_set & seqs)766 bool CReadBlastApp::skip_to_valid_seq_cand(
767 CBioseq_set::TSeq_set::const_iterator& seq,
768 const CBioseq_set::TSeq_set& seqs)
769 {
770 IncreaseVerbosity();
771 while( seq != seqs.end() &&
772 (
773 (*seq)->IsSet() || // set of sequences, skip
774 !is_prot_entry((*seq)->GetSeq()) || // not a protein, skip
775 !has_blast_hits((*seq)->GetSeq()) // does not have a blast , skip
776 )
777 )
778 {
779 if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
780 << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
781 ++seq;
782 }
783 DecreaseVerbosity();
784
785 return (seq != seqs.end()) ;
786 }
787
skip_to_valid_seq_cand(CBioseq_set::TSeq_set::iterator & seq,CBioseq_set::TSeq_set & seqs)788 bool CReadBlastApp::skip_to_valid_seq_cand(
789 CBioseq_set::TSeq_set::iterator& seq,
790 CBioseq_set::TSeq_set& seqs)
791 {
792 IncreaseVerbosity();
793 while( seq != seqs.end() &&
794 (
795 (*seq)->IsSet() || // set of sequences, skip
796 !is_prot_entry((*seq)->GetSeq()) || // not a protein, skip
797 !has_blast_hits((*seq)->GetSeq()) // does not have a blast , skip
798 )
799 )
800 {
801 if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
802 << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
803 ++seq;
804 }
805 DecreaseVerbosity();
806
807 return (seq != seqs.end()) ;
808 }
809
810 // ReadPreviousAcc(args["parentacc"].AsString(), m_previous_genome)
ReadPreviousAcc(const string & file,list<long> & input_acc)811 bool CReadBlastApp::ReadPreviousAcc(const string& file, list<long>& input_acc)
812 {
813 const unsigned int max_acc_len=0xF;
814 ifstream in(file.c_str());
815 if(!in.is_open()) return false; // the input parameter is not a file
816 list<long> scratch_acc;
817 while(in.good())
818 {
819 char buffer[max_acc_len+2];
820 in.getline(buffer, max_acc_len+1);
821 /*
822 if(buffer[0] < 'A' || buffer[0] > 'Z' ||
823 buffer[1] < 'A' || buffer[1] > 'Z' ||
824 buffer[2] != '_') return false;
825 */
826 string test(buffer);
827 // remove white spaces
828 string::size_type ipos=string::npos;
829 while((ipos=test.find_first_of("\t\n\r ")) != string::npos)
830 test.erase(ipos,1);
831 // too long, may be it is a wrong file
832 if(test.size()>max_acc_len) return false;
833 // has non-digits, go away
834 if(test.find_first_not_of("0123456789") != string::npos) return false;
835 // accept
836 scratch_acc.push_back(atol(test.c_str()));
837 }
838 if(!scratch_acc.size()) return false;
839 input_acc = scratch_acc;
840 return true;
841 }
842
843 /////////////////////////////////////////////////////////////////////////////
844 // MAIN
845
846
main(int argc,const char * argv[])847 int main(int argc, const char* argv[])
848 {
849 // Execute main application function
850 return CReadBlastApp().AppMain(argc, argv, 0, eDS_Default, 0);
851 }
852
853
854