1 /*  $Id: read_blast_result_lib.cpp 434869 2014-05-12 17:42:01Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *   major top-level routines, including main
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include "read_blast_result.hpp"
35 
36 
37 // static members definition
38 // algorithm control
39 double CReadBlastApp::m_small_tails_threshold;
40 int    CReadBlastApp::m_n_best_hit;
41 double CReadBlastApp::m_eThreshold;
42 double CReadBlastApp::m_entireThreshold;
43 double CReadBlastApp::m_partThreshold;
44 int    CReadBlastApp::m_rna_overlapThreshold;
45 int    CReadBlastApp::m_cds_overlapThreshold;
46 double CReadBlastApp::m_trnascan_scoreThreshold;
47 int    CReadBlastApp::m_shortProteinThreshold;
48 // verbosity
49 int    CReadBlastApp::m_verbosity_threshold;
50 int    CReadBlastApp::m_current_verbosity;
51 stack < int > CReadBlastApp::m_saved_verbosity;
52 
s_GetFormat(const string & name)53 ESerialDataFormat s_GetFormat(const string& name)
54 {
55     if (name == "asn") {
56         return eSerial_AsnText;
57     } else if (name == "asnb") {
58         return eSerial_AsnBinary;
59     } else if (name == "xml") {
60         return eSerial_Xml;
61     } else {
62         // Should be caught by argument processing, but as an illustration...
63         THROW1_TRACE(runtime_error, "Bad serial format name " + name);
64     }
65 }
66 
67 
getGIs(CBioseq::TAnnot::const_iterator & annot)68 vector<long> CReadBlastApp::getGIs(CBioseq::TAnnot::const_iterator& annot)
69 {
70   vector<long> result;
71   result.clear();
72   ITERATE(CSeq_align::TScore, score, (*(*annot)->GetData().GetAlign().begin())->GetScore())
73     {
74     string name = (*score)->GetId().GetStr();
75     if(name!="use_this_gi") continue;
76     result.push_back((*score)->GetValue().GetInt());
77     }
78 
79   return result;
80 }
81 
giMatch(const vector<long> & left,const vector<long> & right)82 bool CReadBlastApp::giMatch(const vector<long>& left, const vector<long>& right)
83 {
84 
85   bool result=false;
86   if(PrintDetails()) NcbiCerr << "giMatch starts" << NcbiEndl;
87   IncreaseVerbosity();
88   ITERATE(vector<long>, gi1, left)
89     {
90     IncreaseVerbosity();
91     ITERATE(vector<long>, gi2, right)
92       {
93       if(PrintDetails()) NcbiCerr << "try: gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
94       if (*gi1==*gi2)
95         {
96         if(PrintDetails()) NcbiCerr << "giMatch!!! : gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
97         return true;
98         }
99       }
100     DecreaseVerbosity();
101     }
102   DecreaseVerbosity();
103   if(PrintDetails()) NcbiCerr << "giMatch oops" << NcbiEndl;
104 
105   return result;
106 }
107 
dumpAlignment(const string & alignment,const string & file)108 void CReadBlastApp::dumpAlignment( const string& alignment, const string& file)
109 {
110   ofstream out(file.c_str());
111   out<< alignment;
112 }
113 
dump_fasta_for_pretty_blast(diagMap & diag)114 void CReadBlastApp::dump_fasta_for_pretty_blast(diagMap& diag)
115 {
116   // open out2 here
117    string fn = GetArgs()["in"].AsString();
118    string::size_type ipos=fn.rfind(".");
119    if(ipos!=string::npos) fn.erase(ipos);
120    ipos=fn.rfind("/");
121    fn.erase(0,ipos+1);
122    string fncdd = fn + "_cdd_html.fsa";
123    fn+= "_html.fsa";
124 
125    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
126    CScope scope(*objmgr);
127 
128    auto_ptr<CNcbiOfstream> out2  (new CNcbiOfstream (fn.c_str()));
129    CFastaOstream fasta_out(*out2);
130 
131    auto_ptr<CNcbiOfstream> out2_cdd ( new CNcbiOfstream (fncdd.c_str()));
132    CFastaOstream fasta_out_cdd(*out2_cdd);
133 
134    for (CTypeConstIterator<CBioseq> seq = ConstBegin();  seq;  ++seq)
135       {
136       if( hasProblems(*seq, diag, eFrameShift )||
137           hasProblems(*seq, diag, ePartial)
138         )
139         {
140         CBioseq_Handle handle = scope.AddBioseq(*seq);
141         if( hasProblems(*seq, diag, eFrameShift )) fasta_out.Write(handle);
142         if( hasProblems(*seq, diag, ePartial)) fasta_out_cdd.Write(handle);
143         }
144       }
145 
146 }
147 
printOverlapReport(distanceReportStr * report,ostream & out)148 void CReadBlastApp::printOverlapReport
149   (
150   distanceReportStr *report,
151   ostream& out
152   )
153 {
154   out << "q1 | lcl | " << report->q_id_left  << " | "   << report->q_name_left  << NcbiEndl;
155   out << "q2 | lcl | " << report->q_id_right << " | "   << report->q_name_right << NcbiEndl;
156   out      << report->q_id_left
157            << (report->left_strand  == eNa_strand_plus ? "(+)" : "(-)" )
158            << "\t"
159            << report->q_id_right
160            << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
161            << NcbiEndl;
162   string range1 = report->loc1.Empty()
163       ? printed_range(report->q_loc_left_from, report->q_loc_left_to)
164       : printed_ranges(*(report->loc1));
165   string range2 = report->loc2.Empty()
166       ? printed_range(report->q_loc_right_from, report->q_loc_right_to)
167       : printed_ranges(*(report->loc2));
168   out
169            << "[" <<  range1 << "] " << report->left_frame << "\t\t"
170            << "[" <<  range2 << "] " << report->right_frame
171            << NcbiEndl;
172   out << NcbiEndl;
173 
174   out << "Overlap: " << report->space << " aa" << NcbiEndl;
175 }
176 
printPerfectHit(const perfectHitStr & hit,ostream & out)177 void CReadBlastApp::printPerfectHit
178   (
179   const perfectHitStr& hit,
180   ostream& out
181   )
182 {
183 
184   out << "Perfect hit: " << hit.s_name << NcbiEndl;
185 
186 }
187 
printGeneralInfo(ostream & out)188 void CReadBlastApp::printGeneralInfo(ostream& out)
189 {
190   out << "General info extracted from the header of the submission:" << NcbiEndl;
191   out << NcbiEndl;
192   out << "reldate: ";
193   if ( IsSubmit() && m_Submit.GetSub().CanGetReldate() )
194     {
195     string date;
196     m_Submit.GetSub().GetReldate().GetDate(&date);
197     out << date.c_str();
198     }
199   else
200     out << "unspecified";
201   out << NcbiEndl;
202 
203 
204 // simple texts
205   out << "tool: ";
206   if ( IsSubmit() && m_Submit.GetSub().CanGetTool() )
207     out << m_Submit.GetSub().GetTool();
208   else
209     out << "unspecified";
210   out << NcbiEndl;
211 
212   out << "user_tag: ";
213   if ( IsSubmit() && m_Submit.GetSub().CanGetUser_tag() )
214     out << m_Submit.GetSub().GetUser_tag();
215   else
216     out << "unspecified";
217   out << NcbiEndl;
218 
219   out << "comment: ";
220   if ( IsSubmit() && m_Submit.GetSub().CanGetComment() )
221     out << m_Submit.GetSub().GetComment();
222   else
223     out << "unspecified";
224   out << NcbiEndl;
225 
226 
227   out << NcbiEndl;
228 }
229 
printReport(distanceReportStr * report,ostream & out)230 void CReadBlastApp::printReport
231   (
232   distanceReportStr *report,
233   ostream& out
234   )
235 {
236   out << "q1 | lcl | " << report->q_id_left  << " | "   << report->q_name_left  << NcbiEndl;
237   out << "q2 | lcl | " << report->q_id_right << " | "   << report->q_name_right << NcbiEndl;
238   out << "s | "   << report->s_name       << NcbiEndl;
239   out      << report->q_id_left
240            << (report->left_strand  == eNa_strand_plus ? "(+)" : "(-)" )
241            << "\t"
242            << report->q_id_right
243            << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
244            << NcbiEndl;
245   out
246            << "[" <<  report->q_loc_left_from  << "..." << report->q_loc_left_to  << "]\t\t"
247            << "[" <<  report->q_loc_right_from << "..." << report->q_loc_right_to << "]"
248            << NcbiEndl;
249   out      << NcbiEndl;
250   out
251            << "q\t" << report->q_left_left  << "\t" << report->q_left_middle  << "\t" << report->q_left_right
252            << "\t"  << report->space
253            << "\t"  << report->q_right_left << "\t" << report->q_right_middle << "\t" << report->q_right_right
254            << NcbiEndl;
255   out
256            << "s\t" << report->s_left_left  << "\t" << report->s_left_middle  << "\t" << "..."
257            << "\t"  << "..."
258            << "\t"  << "..."                << "\t" << report->s_left_right   << "\t" << "..."
259            << NcbiEndl;
260   out
261            << "s\t" << "..."                << "\t" << report->s_right_left   << "\t" << "..."
262            << "\t"  << "..."
263            << "\t"  << "..."                << "\t" << report->s_right_middle << "\t" << report->s_right_right
264            << NcbiEndl;
265   out << "diff_left, diff_right: " << report->diff_left << ", " << report->diff_right << NcbiEndl;
266   out << "diff_edge_left, diff_edge_right: " << report->diff_edge_left << ", " << report->diff_edge_right << NcbiEndl;
267   if(report->diff_edge_left > 0)
268     {
269     out << "Potential deletetion of a nucleotide sequence equivalent to " << report->diff_edge_left << " occurred." << NcbiEndl;
270     }
271   else if (report->diff_edge_left < 0)
272     {
273     if ( -report->diff_edge_left < report->space )
274       out << "Potential insertion of a nucleotide sequence equivalent to " << -report->diff_edge_left << " occurred." << NcbiEndl;
275     else
276       out << "Subjects are shifted apart by " << -report->diff_edge_left << NcbiEndl;
277     }
278   else
279     {
280     out << "Potential sequencing error or replacement mutation without insertion or deletion of a nucleotide sequence occurred." << NcbiEndl;
281     }
282 
283 
284 }
285 
GetRNAfeats(const LocMap & loc_map,CSeq_annot::C_Data::TFtable & rna_feats,const CSeq_annot::C_Data::TFtable & feats)286 void CReadBlastApp::GetRNAfeats
287   (
288   const LocMap& loc_map,
289   CSeq_annot::C_Data::TFtable& rna_feats,
290   const CSeq_annot::C_Data::TFtable& feats
291   )
292 {
293   if(PrintDetails()) NcbiCerr << "GetRNAfeats() starts"  << NcbiEndl;
294   IncreaseVerbosity();
295   ITERATE(CSeq_annot::C_Data::TFtable, f, feats)
296     {
297     if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature start "  << NcbiEndl;
298     if( !(*f)->GetData().IsRna() ) continue;
299     if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature is RNA"  << NcbiEndl;
300     string loc_string = GetLocationString( (*f)->GetLocation() );
301     if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
302                                 << loc_string
303                                 << "] feature location"  << NcbiEndl;
304     LocMap::const_iterator aname = loc_map.find(loc_string);
305     if(aname == loc_map.end())
306       {
307       // error
308       NcbiCerr << "CReadBlastApp::GetRNAfeats(): ERROR: cannot find gene for location "
309                << loc_string
310                << NcbiEndl;
311       continue;
312       }
313     rna_feats.push_back(aname->second);
314     if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
315                                 << aname->first
316                                 << "] feature end"  << NcbiEndl;
317     }
318   DecreaseVerbosity();
319   if(PrintDetails()) NcbiCerr << "GetRNAfeats(): rna_feats size: "
320                               << rna_feats.size()
321                               << NcbiEndl;
322   if(PrintDetails()) NcbiCerr << "GetRNAfeats() ends"  << NcbiEndl;
323 
324   return;
325 }
326 
327 
GetLocusTag(const CSeq_feat & f,const LocMap & loc_map)328 string GetLocusTag(const CSeq_feat& f, const LocMap& loc_map)
329 {
330   if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: start" << NcbiEndl;
331   string loc_string = GetLocationString(f.GetLocation());
332   if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: I" << NcbiEndl;
333   LocMap::const_iterator aname = loc_map.find(loc_string);
334   if(aname == loc_map.end())
335     {
336     if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not in locmap" << NcbiEndl;
337     return loc_string;
338     }
339   else
340     {
341     if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: in locmap" << NcbiEndl;
342     const CGene_ref& gene=aname->second->GetData().GetGene();
343     string qname="";
344     if(gene.IsSetLocus_tag())
345       {
346       if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: getting it" << NcbiEndl;
347       return gene.GetLocus_tag();
348       }
349 
350     if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not getting it" << NcbiEndl;
351     return qname;
352     }
353 
354  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: INTERNAL ERROR: should be never here" << NcbiEndl;
355  return loc_string;
356 }
get_nucleotide_seq(const CBioseq & seq)357 const CBioseq& CReadBlastApp::get_nucleotide_seq(const CBioseq& seq)
358 {
359   const CBioseq_set::TSeq_set* seqset = get_parent_seqset(seq);
360   const CSeq_loc&  loc = getGenomicLocation(seq);
361   CTypeConstIterator<CSeq_interval> inter = ::ConstBegin(loc);
362   const CSeq_id& nu_id = inter->GetId();
363 
364   ITERATE(CBioseq_set::TSeq_set, seq_nu, *seqset)
365     {
366     if(!(*seq_nu)->IsSeq()) continue;
367     if(!(*seq_nu)->GetSeq().IsNa()) continue;
368     const CBioseq::TId& nu_ids = (*seq_nu)->GetSeq().GetId();
369     ITERATE(CBioseq::TId, cnu_id , nu_ids)
370       {
371       if((*cnu_id)->Compare(nu_id) == CSeq_id::e_YES) return (*seq_nu)->GetSeq();
372       }
373     }
374   NcbiCerr << "CReadBlastApp::get_nucleotide_seq: INTERNAL FATAL ERROR: could not find nucleotide sequence" << NcbiEndl;
375   throw;
376   return seq; // formality
377 }
378 
get_parent_seqset(const CBioseq & seq)379 CBioseq_set::TSeq_set* get_parent_seqset(const CBioseq& seq)
380 {
381   string name = GetStringDescr(seq);
382   CSeq_entry* parent = seq.GetParentEntry();
383   if(parent==NULL)
384     {
385     NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no parent\n";
386     return NULL;
387     }
388   if(parent->IsSeq())
389     {
390     while(parent!=NULL && !parent->IsSet())
391        parent = parent->GetParentEntry();
392     if(parent==NULL)
393       {
394       NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no set ancestor\n";
395       return NULL;
396       }
397     }
398   else if(!parent->IsSet()) return NULL;
399 
400   if(!parent->GetSet().CanGetSeq_set())
401     {
402     NcbiCerr << "get_parent_seqset: WARNING: " << name << ": (grand)parent set does not have Seq_set\n";
403     return NULL;
404     }
405   return &(parent->SetSet().SetSeq_set());
406 }
407 
408 
let1_2_let3(char let1)409 string let1_2_let3(char let1)
410 {
411   switch(let1)
412     {
413     case 'A': return "Ala"; break;
414     case 'Q': return "Gln"; break;
415     case 'W': return "Trp"; break;
416     case 'E': return "Glu"; break;
417     case 'R': return "Arg"; break;
418     case 'T': return "Thr"; break;
419     case 'Y': return "Tyr"; break;
420     case 'U': return "Sec"; break;
421     case 'I': return "Ile"; break;
422     case 'J': return "Xeu"; break; // Ile or Leu, there is no conventional three-letter code
423     case 'P': return "Pro"; break;
424     case 'S': return "Ser"; break;
425     case 'D': return "Asp"; break;
426     case 'F': return "Phe"; break;
427     case 'G': return "Gly"; break;
428     case 'H': return "His"; break;
429     case 'K': return "Lys"; break;
430     case 'L': return "Leu"; break;
431     case 'Z': return "Glx"; break;
432     case 'X': return "Ukn"; break;
433     case 'C': return "Cys"; break;
434     case 'V': return "Val"; break;
435     case 'B': return "Asx"; break;
436     case 'N': return "Asn"; break;
437     case 'M': return "Met"; break;
438     case 'O': return "Pyl"; break;
439     default: break;
440     }
441   NcbiCerr << "let1_2_let3: ERROR: char " << let1 << "(" << (int)let1 << ") is not handled" << NcbiEndl;
442 
443   return "";
444 }
445 
collectPerfectHits(vector<perfectHitStr> & perfect,const CBioseq & seq)446 int CReadBlastApp::collectPerfectHits(vector<perfectHitStr>& perfect, const CBioseq& seq)
447 {
448   ITERATE(CBioseq::TAnnot, annot, seq.GetAnnot())
449     {
450     if(!(*annot)->GetData().IsAlign()) continue;
451     check_alignment(annot, seq, perfect);
452     }
453   return perfect.size();
454 }
455 
check_alignment(CBioseq::TAnnot::const_iterator & annot,const CBioseq & seq,vector<perfectHitStr> & results)456 void CReadBlastApp::check_alignment
457   (
458   CBioseq::TAnnot::const_iterator& annot,
459   const CBioseq& seq,
460   vector<perfectHitStr>& results
461   )
462 {
463   int sLen  = getLenScore(annot);
464   int qLen  = getQueryLen(seq);
465   string q_name  = GetProtName(seq);
466 
467   string s_name = getAnnotName(annot);
468 
469   int qFrom, qTo, sFrom, sTo;
470   getBounds(annot,  &qFrom,  &qTo,  &sFrom,  &sTo);
471 
472   bool result=false;
473 
474   int qtails = qLen - qTo + qFrom - 1;
475   int stails = sLen - sTo + sFrom - 1;
476   double thr = m_small_tails_threshold; // default should be .1
477   result =
478      (double)qtails/qLen < thr &&
479      (double)stails/sLen < thr &&
480      s_name.find("hypothetical") == string::npos;
481 // hypothetical exhonerating hits do not count
482 
483   if(result)
484     {
485     results.resize(results.size()+1);
486     int ihit = results.size()-1;
487     results[ihit].s_name = s_name;
488     }
489 }
490 
is_prot_entry(const CBioseq & seq)491 bool CReadBlastApp::is_prot_entry(const CBioseq& seq)
492 {
493     bool result;
494     // if(PrintDetails()) NcbiCerr << "is_prot_entry?"  << NcbiEndl;
495     result=seq.GetInst().IsAa();
496     // if(PrintDetails()) NcbiCerr << result  << NcbiEndl;
497     return result;
498 }
499 
500 
SortSeqs(void)501 int CReadBlastApp::SortSeqs(void)
502 {
503    if(PrintDetails()) NcbiCerr << "SortSeqs: start " << NcbiEndl;
504    PushVerbosity();
505    if(IsSubmit())
506      {
507      if(PrintDetails()) NcbiCerr << "SortSeqs: IsSubmit" << NcbiEndl;
508      if (
509            m_Submit.GetData().IsEntrys()
510             &&
511            (*m_Submit.GetData().GetEntrys().begin())->IsSet()
512         )
513        {
514        SortSeqs((*m_Submit.SetData().SetEntrys().begin())->SetSet().SetSeq_set());
515        }
516      else
517        {
518        NcbiCerr << "ERROR: submit file does not have proper seqset" << NcbiEndl;
519        }
520      }
521    else
522      {
523      if(PrintDetails()) NcbiCerr << "SortSeqs: IsEntry" << NcbiEndl;
524        SortSeqs(m_Entry.SetSet().SetSeq_set());
525      }
526 
527    PopVerbosity();
528    if(PrintDetails()) NcbiCerr << "SortSeqs: end" << NcbiEndl;
529    return -1;
530 }
531 
532 
533 
SetParents(CSeq_entry * parent,CBioseq_set::TSeq_set & where)534 int CReadBlastApp::SetParents(CSeq_entry* parent, CBioseq_set::TSeq_set& where)
535 {
536 /*
537    NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, where)
538      {
539      if((*left)->IsSet())
540         {
541         CBioseq_set::TSeq_set& seqs_down = (*left)->SetSet().SetSeq_set();
542         SetParents(parent, seqs_down);
543         }
544      else
545        {
546        (*left)->SetParentEntry(parent);
547        }
548      }
549 */
550     return 0;
551 }
552 
553 // TSimpleSeq
554 
less_simple_seq(const TSimpleSeq & first,const TSimpleSeq & second)555 bool CReadBlastApp::less_simple_seq(const TSimpleSeq& first,
556                                     const TSimpleSeq& second)
557 {
558    return first.key < second.key;
559 } // less_seq
560 
less_pair(const pair<int,int> & first,const pair<int,int> & second)561 bool CReadBlastApp::less_pair(const pair<int,int>& first,
562                               const pair<int,int>& second)
563 {
564    if(first.first != second.first) return first.first < second.first;
565    return first.second < second.second;
566 }
567 
568 
569 
less_seq(const CRef<CSeq_entry> & first,const CRef<CSeq_entry> & second)570 bool CReadBlastApp::less_seq(const CRef<CSeq_entry>& first,
571                              const CRef<CSeq_entry>& second)
572 {
573   // if(PrintDetails()) NcbiCerr << "less_seq start" << NcbiEndl;
574   if (first->IsSet() || second->IsSet())
575      {
576      return false;
577      }
578   // if(PrintDetails()) NcbiCerr << "less_seq both seqs" << NcbiEndl;
579   if (!hasGenomicLocation(first->GetSeq()))
580     {
581     if(first->GetSeq().IsAa())
582       NcbiCerr << "less_seq first does not have genomic location or is nucleotide seq" << NcbiEndl;
583     return true; // to take care of nucleotide sequence
584     }
585   if (!hasGenomicLocation(second->GetSeq()))
586     {
587     if(second->GetSeq().IsAa())
588       NcbiCerr << "less_seq second does not have genomic location or is nucleotide seq" << NcbiEndl;
589     return false;
590     }
591   // if(PrintDetails()) NcbiCerr << "less_seq both have genomic locations" << NcbiEndl;
592   const CSeq_loc& left_genomic_int = getGenomicLocation(first->GetSeq());
593   const CSeq_loc& right_genomic_int = getGenomicLocation(second->GetSeq());
594   TSeqPos from1, to1, from2, to2;
595 
596   ENa_strand  strand1;
597   ENa_strand  strand2;
598 
599   getFromTo(left_genomic_int, from1, to1, strand1);
600   getFromTo(right_genomic_int, from2, to2, strand2);
601 /*
602   if(PrintDetails()) NcbiCerr << "less_seq comparing"
603     << ": " << GetStringDescr( first->GetSeq()) << " = " << from1 << "-" << to1
604     << ", " << GetStringDescr(second->GetSeq()) << " = " << from2 << "-" << to2
605     << NcbiEndl;
606 */
607   return from1 < from2;
608 } // less_seq
609 
SortSeqs(CBioseq_set::TSeq_set & seqs)610 int CReadBlastApp::SortSeqs(CBioseq_set::TSeq_set& seqs)
611 {
612    if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): start"
613         << ": " << seqs.size()
614         << NcbiEndl;
615 
616    NON_CONST_ITERATE(CBioseq_set::TSeq_set, seq, seqs)
617      {
618      if ((*seq)->IsSet()) SortSeqs((*seq)->SetSet().SetSeq_set());
619      }
620 
621    seqs.sort(&less_seq); // does not do anything
622    int n =1;
623 
624   if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): stop"  << NcbiEndl;
625   return n;
626 }
627 
getCoreDataType(istream & in)628 ECoreDataType CReadBlastApp::getCoreDataType(istream& in)
629 {
630   char buffer[1024];
631   in.getline(buffer, 1024);
632   in.seekg(0);
633 
634   string result =  buffer;
635   result.erase(result.find_first_of(": "));
636 
637   if(result=="Seq-entry")
638     {
639     result =  buffer;
640     if(result.find("set") != string::npos)
641       return eEntry;
642     return eUndefined;
643     }
644   if(result=="Seq-submit") return eSubmit;
645   if(NStr::StartsWith(result, ">Feature")) return eTbl;
646   return eUndefined;
647 }
648 
IsSubmit()649 bool CReadBlastApp::IsSubmit()
650 {
651   return m_coreDataType == eSubmit;
652 }
653 
IsEntry()654 bool CReadBlastApp::IsEntry()
655 {
656   return m_coreDataType == eEntry;
657 }
658 
IsTbl()659 bool CReadBlastApp::IsTbl()
660 {
661   return m_coreDataType == eTbl;
662 }
663 
664 
Begin(void)665 CBeginInfo CReadBlastApp::Begin(void)
666 {
667   if(m_coreDataType == eSubmit) return ::Begin(m_Submit);
668   if(m_coreDataType == eEntry)  return ::Begin(m_Entry);
669 
670   return ::Begin(m_Submit); // some platforms might have warnings without it
671 }
672 
ConstBegin(void)673 CConstBeginInfo CReadBlastApp::ConstBegin(void)
674 {
675   if(m_coreDataType == eSubmit) return ::ConstBegin(m_Submit);
676   if(m_coreDataType == eEntry)  return ::ConstBegin(m_Entry);
677 
678   return ::ConstBegin(m_Submit); // some platforms might have warnings without it
679 }
680 
681 
next_w(char * w)682 char *CReadBlastApp::next_w(char *w)
683 {
684   while(*w && !isspace(*w)) ++w;
685         while(isspace(*w)) ++w;
686         return(w);
687 
688 }
skip_space(char * w)689 char *CReadBlastApp::skip_space(char *w)
690 {
691         while(*w && isspace(*w)) ++w;
692         return(w);
693 
694 }
skip_toprot(CTypeIterator<CBioseq> & seq)695 int CReadBlastApp::skip_toprot(CTypeIterator<CBioseq>& seq)
696 {
697     int nums=0;
698     for(nums=0 ;seq.IsValid(); ++seq, nums++)
699       {
700       const CSeq_inst& inst = seq->GetInst();
701       if(!inst.IsAa()) continue;
702       break;
703       }
704     return nums;
705 }
706 
skip_toprot(CTypeConstIterator<CBioseq> & seq)707 int CReadBlastApp::skip_toprot(CTypeConstIterator<CBioseq>& seq)
708 {
709     int nums=0;
710     for(nums=0 ;seq; ++seq, nums++)
711       {
712       const CSeq_inst& inst = seq->GetInst();
713       if(!inst.IsAa()) continue;
714       break;
715       }
716     return nums;
717 }
718 
skip_toprot(CBioseq_set::TSeq_set::const_iterator & seq,const CBioseq_set::TSeq_set & seqs)719 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::const_iterator& seq,
720      const CBioseq_set::TSeq_set& seqs)
721 {
722 
723      IncreaseVerbosity();
724      while( seq != seqs.end() &&
725               (
726               (*seq)->IsSet()                    ||        // set of sequences, skip
727               !is_prot_entry((*seq)->GetSeq())             // not a protein, skip
728               )
729            )
730         {
731         if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
732                    // <<  CSeq_id::GetStringDescr ((*seq)->GetSeq(), CSeq_id::eFormat_FastA) << NcbiEndl;
733                    <<  GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
734         ++seq;
735         }
736      DecreaseVerbosity();
737 
738   return (seq != seqs.end()) ;
739 
740 }
741 
742 
skip_toprot(CBioseq_set::TSeq_set::iterator & seq,CBioseq_set::TSeq_set & seqs)743 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::iterator& seq,
744      CBioseq_set::TSeq_set& seqs)
745 {
746 
747      IncreaseVerbosity();
748      while( seq != seqs.end() &&
749               (
750               (*seq)->IsSet()                    ||        // set of sequences, skip
751               !is_prot_entry((*seq)->GetSeq())             // not a protein, skip
752               )
753            )
754         {
755         if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
756                    <<  GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
757         ++seq;
758         }
759      DecreaseVerbosity();
760 
761   return (seq != seqs.end()) ;
762 
763 }
764 
765 
skip_to_valid_seq_cand(CBioseq_set::TSeq_set::const_iterator & seq,const CBioseq_set::TSeq_set & seqs)766 bool CReadBlastApp::skip_to_valid_seq_cand(
767      CBioseq_set::TSeq_set::const_iterator& seq,
768      const CBioseq_set::TSeq_set& seqs)
769 {
770      IncreaseVerbosity();
771      while( seq != seqs.end() &&
772               (
773               (*seq)->IsSet()                    ||        // set of sequences, skip
774               !is_prot_entry((*seq)->GetSeq())   ||        // not a protein, skip
775               !has_blast_hits((*seq)->GetSeq())            // does not have a blast , skip
776               )
777            )
778         {
779         if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
780                    <<  GetStringDescr ((*seq)->GetSeq())  << NcbiEndl;
781         ++seq;
782         }
783      DecreaseVerbosity();
784 
785   return (seq != seqs.end()) ;
786 }
787 
skip_to_valid_seq_cand(CBioseq_set::TSeq_set::iterator & seq,CBioseq_set::TSeq_set & seqs)788 bool CReadBlastApp::skip_to_valid_seq_cand(
789      CBioseq_set::TSeq_set::iterator& seq,
790      CBioseq_set::TSeq_set& seqs)
791 {
792      IncreaseVerbosity();
793      while( seq != seqs.end() &&
794               (
795               (*seq)->IsSet()                    ||        // set of sequences, skip
796               !is_prot_entry((*seq)->GetSeq())   ||        // not a protein, skip
797               !has_blast_hits((*seq)->GetSeq())            // does not have a blast , skip
798               )
799            )
800         {
801         if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
802                    <<  GetStringDescr ((*seq)->GetSeq())  << NcbiEndl;
803         ++seq;
804         }
805      DecreaseVerbosity();
806 
807   return (seq != seqs.end()) ;
808 }
809 
810 // ReadPreviousAcc(args["parentacc"].AsString(), m_previous_genome)
ReadPreviousAcc(const string & file,list<long> & input_acc)811 bool CReadBlastApp::ReadPreviousAcc(const string& file, list<long>& input_acc)
812 {
813    const unsigned int max_acc_len=0xF;
814    ifstream in(file.c_str());
815    if(!in.is_open()) return false; // the input parameter is not a file
816    list<long> scratch_acc;
817    while(in.good())
818      {
819      char buffer[max_acc_len+2];
820      in.getline(buffer, max_acc_len+1);
821 /*
822      if(buffer[0] < 'A' || buffer[0] > 'Z' ||
823         buffer[1] < 'A' || buffer[1] > 'Z' ||
824         buffer[2] != '_') return false;
825 */
826      string test(buffer);
827 // remove white spaces
828      string::size_type ipos=string::npos;
829      while((ipos=test.find_first_of("\t\n\r ")) != string::npos)
830         test.erase(ipos,1);
831 // too long, may be it is a wrong file
832      if(test.size()>max_acc_len) return false;
833 // has non-digits, go away
834      if(test.find_first_not_of("0123456789") != string::npos) return false;
835 // accept
836      scratch_acc.push_back(atol(test.c_str()));
837      }
838    if(!scratch_acc.size()) return false;
839    input_acc = scratch_acc;
840    return true;
841 }
842 
843 /////////////////////////////////////////////////////////////////////////////
844 //  MAIN
845 
846 
main(int argc,const char * argv[])847 int main(int argc, const char* argv[])
848 {
849     // Execute main application function
850     return CReadBlastApp().AppMain(argc, argv, 0, eDS_Default, 0);
851 }
852 
853 
854