1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 
35 
reportProblems(const bool report_and_forget,diagMap & diag,ostream & out,const CBioseq::TAnnot & annots,const EProblem type)36 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
37    const CBioseq::TAnnot& annots, const EProblem type)
38 {
39    ITERATE(CBioseq::TAnnot, gen_feature, annots)
40      {
41      if ( !(*gen_feature)->GetData().IsFtable() ) continue;
42      reportProblems(report_and_forget, diag, out, (*gen_feature)->GetData().GetFtable(), type);
43      }
44 }
45 
reportProblems(const bool report_and_forget,diagMap & diag,ostream & out,const CSeq_annot::C_Data::TFtable & feats,const EProblem type)46 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
47    const CSeq_annot::C_Data::TFtable& feats, const EProblem type)
48 {
49   ITERATE(CSeq_annot::C_Data::TFtable, f, feats)
50     {
51     if( !(*f)->GetData().IsGene() ) continue;
52     string qname; (*f)->GetData().GetGene().GetLabel(&qname);
53     if( diag.find(qname) == diag.end() ) continue;
54     reportProblems(qname, diag, out, type);
55     if(report_and_forget) erase_problems(qname, diag, type);
56     }
57 }
reportProblems(const string & qname,diagMap & diag,ostream & out,const EProblem type)58 void CReadBlastApp::reportProblems(const string& qname, diagMap& diag, ostream& out, const EProblem type)
59 {
60       if(!hasProblems(qname, diag, type)) return;
61       if(PrintDetails()) NcbiCerr << "problem type: " << ProblemType(type) << NcbiEndl;
62       reportProblemSequenceName(qname, out);
63 
64 // reporting
65       IncreaseVerbosity();
66       NON_CONST_ITERATE(list<problemStr>, problem, diag[qname].problems)
67         {
68           if(PrintDetails()) NcbiCerr << "current problem: " << ProblemType(problem->type) << NcbiEndl;
69           if( !(problem->type & type) ) continue;
70           if(PrintDetails()) NcbiCerr << "it is that problem"  << NcbiEndl;
71           if(!problem->message.size()) continue;
72           if(PrintDetails()) NcbiCerr << "has nonzero message"  << NcbiEndl;
73           reportProblemType(problem->type, out);
74           reportProblemMessage(problem->message, out);
75           if(PrintDetails()) NcbiCerr << "current problem: done: " << problem->type << NcbiEndl;
76         }
77       DecreaseVerbosity();
78 }
erase_problems(const string & qname,diagMap & diag,const EProblem type)79 void CReadBlastApp::erase_problems(const string& qname, diagMap& diag, const EProblem type)
80 {
81       IncreaseVerbosity();
82       for(list<problemStr>::iterator problem = diag[qname].problems.begin(); problem!=diag[qname].problems.end();)
83         {
84         if(PrintDetails()) NcbiCerr << "erasing? current problem: " << problem->type << NcbiEndl;
85         if( !(problem->type & type) )
86              {
87              problem++;
88              continue;
89              }
90         if(PrintDetails()) NcbiCerr << "it is that problem for erazing"  << NcbiEndl;
91         problem=diag[qname].problems.erase(problem);
92         if(PrintDetails()) NcbiCerr << "erased"  << NcbiEndl;
93         if(PrintDetails()) NcbiCerr << "next current problem: " << problem->type << NcbiEndl;
94         }
95       DecreaseVerbosity();
96 }
reportProblems(const bool report_and_forget,diagMap & diag,ostream & out,const EProblem type)97 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out, const EProblem type)
98 {
99    IncreaseVerbosity();
100    map<string,bool> done;
101    if(PrintDetails()) NcbiCerr << "reportProblems: all problems of type ("  << ProblemType(type) << ")" << NcbiEndl;
102    for (CTypeConstIterator<CBioseq> seq = ConstBegin();  seq;  ++seq)
103       {
104       if ( !is_prot_entry(*seq) )
105         {
106         reportProblems(report_and_forget, diag, out, seq->GetAnnot(), type);
107         }
108       string qname1 = GetStringDescr (*seq);
109       string qname2 = CSeq_id::GetStringDescr (*seq, CSeq_id::eFormat_FastA);
110       if(PrintDetails()) NcbiCerr << "reportProblems: start: "
111           << qname1 << " or "
112           << qname2
113           << NcbiEndl;
114       string qnames[2];
115       qnames[0]=qname1; qnames[1]=qname2;
116       for(int i=0; i<2; i++)
117         {
118         string& qname = qnames[i];
119         if( diag.find(qname) != diag.end() )
120           {
121           reportProblems(qname, diag, out, type);
122           if(report_and_forget) erase_problems(qname, diag, type);
123           done[qname]=true;
124           if(PrintDetails()) NcbiCerr << "reportProblems: full end: " << qname << NcbiEndl;
125           }
126         }
127       }
128 
129    ITERATE(diagMap, problem, diag)
130      {
131      string qname = problem->first;
132      if(done.find(qname)!=done.end()) continue;
133      reportProblems(qname, diag, out, type);
134      if(report_and_forget) erase_problems(qname, diag, type);
135      if(PrintDetails()) NcbiCerr << "reportProblems: alternative problems: " << qname << NcbiEndl;
136      }
137    DecreaseVerbosity();
138 }
139 // hasProblemType(seq, diag[qname].problems), problem->type
140 
hasProblems(const CBioseq & seq,diagMap & diag,const EProblem type)141 bool CReadBlastApp::hasProblems(const CBioseq& seq, diagMap& diag, const EProblem type)
142 {
143    string qname = GetStringDescr (seq);
144    string qname2 = CSeq_id::GetStringDescr (seq, CSeq_id::eFormat_FastA);
145    return hasProblems(qname, diag, type) || hasProblems(qname2, diag, type);
146 }
147 
hasProblems(const string & qname,diagMap & diag,const EProblem type)148 bool CReadBlastApp::hasProblems(const string& qname, diagMap& diag, const EProblem type)
149 {
150   if(PrintDetails()) NcbiCerr << "hasProblems: start: " << qname << NcbiEndl;
151   if( type != eAllProblems && diag.find(qname) == diag.end() ) return false;
152   if ( type == eAllProblems && diag[qname].problems.size()>0) return true;
153 
154   IncreaseVerbosity();
155   ITERATE(list<problemStr>, problem, diag[qname].problems)
156     {
157     if(PrintDetails()) NcbiCerr << "hasProblems: checking problem type: " << "\t"
158                                << ProblemType(problem->type) << "\t"
159                                << qname << "\t"
160                                << NcbiEndl;
161     if (problem->type & type)
162       {
163       if(PrintDetails()) NcbiCerr << "hasProblems: end: does have problem: " << qname << NcbiEndl;
164       return true;
165       }
166     }
167   DecreaseVerbosity();
168   if(PrintDetails()) NcbiCerr << "hasProblems: end: no problem: " << qname << NcbiEndl;
169   return false;
170 }
171 
reportProblemMessage(const string & message,ostream & out)172 void CReadBlastApp::reportProblemMessage(const string& message, ostream& out)
173 {
174    out << message.c_str() << NcbiEndl;
175 }
176 
ProblemType(const EProblem type)177 string CReadBlastApp::ProblemType(const EProblem type)
178 {
179    CNcbiStrstream strres;
180    strres << "unknown_problem_type=" << type << '\0';
181    string result=strres.str();
182    if(type & eOverlap)
183       result = "Potential overlap found";
184    else if(type & eRnaOverlap)
185       result =  "Potential RNA overlap found";
186    else if(type & eCompleteOverlap)
187       result =  "Complete overlap found";
188    else if(type & eRemoveOverlap)
189       result =  "overlap marked for removal";
190    else if(type & eRelFrameShift)
191       result =  "Something relevant to frame shift found";
192    else if(type & eFrameShift)
193       result =  "Potential frame shift evidence found";
194    else if(type & eMayBeNotFrameShift)
195       result =  "Evidence absolving from the frame shift accusation found";
196    else if(type & ePartial)
197       result =  "Potential partial protein annotation found";
198    else if(type & eShortProtein)
199       result =  "Short annotation found";
200    else if(type & eTRNAMissing)
201       result =  "tRNA is missing in the list of independently annotated tRNAs";
202    else if(type & eTRNAAbsent)
203       result =  "RNA is missing in the list of annotated RNAs in the input";
204    else if(type & eTRNABadStrand)
205       result =  "RNA is present at the wrong strand";
206    else if(type & eTRNAUndefStrand)
207       result =  "RNA is present with undefined strand";
208    else if(type & eTRNAComMismatch)
209       result =  "tRNA is a complete mismatch";
210    else if(type & eTRNAMismatch)
211       result =  "tRNA has mismatched ends";
212 
213    return result;
214 
215 }
216 
reportProblemType(const EProblem type,ostream & out)217 void CReadBlastApp::reportProblemType(const EProblem type, ostream& out)
218 {
219    out
220       << "---"
221       << " ";
222    string stype = ProblemType(type);
223    if(!stype.empty())
224       {
225       out << stype;
226       }
227    else
228       {
229       NcbiCerr << "FATAL: internal error: unknown problem type" << type << NcbiEndl;
230       throw;
231       }
232    out
233       << " "
234       << "---"
235       << NcbiEndl;
236 }
237 
reportProblemSequenceName(const string & name,ostream & out)238 void CReadBlastApp::reportProblemSequenceName(const string& name, ostream& out)
239 {
240    out
241       << "====="
242       << " ";
243    out << name.c_str() ;
244    out
245       << " "
246       << "====="
247       << NcbiEndl;
248 }
249 
FixStrands(void)250 int CReadBlastApp::FixStrands(void)
251 {
252   TProblem_locs problem_locs;
253   if(PrintDetails()) NcbiCerr << "FixStrands: start " << NcbiEndl;
254 
255 // relevant problems
256   CollectRNAFeatures(problem_locs);
257 
258 // first, count features matching each range
259   for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
260     {
261     if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
262     const CSeq_loc&  loc = feat->GetLocation();
263     ENa_strand strand;
264     TSeqPos from, to;
265     getFromTo(loc, from, to, strand);
266     string range = printed_range(from, to);
267     if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene [" << range << "]" << NcbiEndl;
268 
269     if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
270     if(PrintDetails()) NcbiCerr << "FixStrands: range: " << range << NcbiEndl;
271     problem_locs[range].count++;
272     if(feat->GetData().IsRna()) problem_locs[range].rnacount++;
273     if(feat->GetData().IsGene()) problem_locs[range].genecount++;
274     }
275 // now fix
276   for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
277     {
278     if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
279     if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene for fixing" << NcbiEndl;
280     CSeq_loc&  loc = feat->SetLocation();
281     ENa_strand strand;
282     TSeqPos from, to;
283     getFromTo(loc, from, to, strand);
284     string range = printed_range(from, to);
285     if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
286     if(PrintDetails()) NcbiCerr << "FixStrands: range for fix: " << range << NcbiEndl;
287     if(   problem_locs[range].count!=2
288        || problem_locs[range].rnacount!=1
289        || problem_locs[range].genecount!=1
290       )
291       {
292       // no touching
293       // warning
294       NcbiCerr << "CReadBlastApp::FixStrands: "
295                << "WARNING: "
296                << "location found, but the number of features with that location is confusing, "
297                << "no fixing for "
298                << "[" << problem_locs[range].name << "]"
299                << "(" << range << ")"
300                << NcbiEndl;
301       continue;
302       }
303 // over all intervals
304     int ninter=0;
305     for(CTypeIterator<CSeq_interval> inter = ::Begin(loc);  inter; ++inter, ++ninter)
306       {
307       inter->SetStrand(problem_locs[range].strand);
308       NcbiCerr << "CReadBlastApp::FixStrands: "
309                << "[" << problem_locs[range].name << "] "
310                << "fixed"
311                << NcbiEndl;
312       }
313     NcbiCerr << "CReadBlastApp::FixStrands: ninters= " << ninter  << NcbiEndl;
314     } // for(CTypeIterator<CSeq_feat>
315   if(PrintDetails()) NcbiCerr << "FixStrands: end" << NcbiEndl;
316   return 1;
317 }
318 
RemoveProblems(map<string,string> & problem_names,LocMap & loc_map)319 int CReadBlastApp::RemoveProblems(map<string,string>& problem_names, LocMap& loc_map)
320 {
321    if(PrintDetails()) NcbiCerr << "RemoveProblems: start " << NcbiEndl;
322 
323    PushVerbosity();
324    if(IsSubmit())
325      {
326      if ( m_Submit.GetData().IsEntrys())
327        {
328        for(CSeq_submit::C_Data::TEntrys::iterator entry = m_Submit.SetData().SetEntrys().begin();
329            entry != m_Submit.SetData().SetEntrys().end();)
330        // NON_CONST_ITERATE(CSeq_submit::C_Data::TEntrys, entry, m_Submit.SetData().SetEntrys())
331          {
332          int removeme = RemoveProblems(**entry, problem_names, loc_map);
333          if(PrintDetails())
334             NcbiCerr
335                  << "RemoveProblems(void): doing entry: removeme =  "
336                  << removeme
337                  << NcbiEndl;
338          if(removeme)
339            {
340            NcbiCerr << "RemoveProblems(): WARNING: "
341                     << "CSeq_entry deleted, loss of annotation might occur"
342                     << NcbiEndl;
343            entry=m_Submit.SetData().SetEntrys().erase(entry);
344            }
345          else entry++;
346          }
347        }
348      else
349        {
350        NcbiCerr << "ERROR: submit file does not have proper seqset"<< NcbiEndl;
351        }
352      }
353    else
354      {
355      if(PrintDetails())
356          NcbiCerr
357                  << "RemoveProblems(void): case is single entry "
358                  << NcbiEndl;
359      RemoveProblems(m_Entry, problem_names, loc_map);
360      }
361 
362    PopVerbosity();
363    if(PrintDetails()) NcbiCerr << "RemoveProblems: end" << NcbiEndl;
364    return 1;
365 }
366 
RemoveProblems(CSeq_entry & entry,map<string,string> & problem_seqs,LocMap & loc_map)367 int CReadBlastApp::RemoveProblems(CSeq_entry& entry, map<string, string>& problem_seqs, LocMap& loc_map)
368 {
369    int removeme=0;
370    if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): start" << NcbiEndl;
371    if(entry.IsSeq())
372      {
373      removeme=RemoveProblems(entry.SetSeq(), problem_seqs, loc_map);
374      if(PrintDetails())
375          NcbiCerr
376                  << "RemoveProblems(CSeq_entry)(seq case): removeme = "
377                  << removeme
378                  << NcbiEndl;
379 
380      }
381    else //several seqs
382      {
383      CBioseq_set& bioset = entry.SetSet();
384      removeme=RemoveProblems(bioset, problem_seqs, loc_map);
385      CBioseq_set::TSeq_set& entries =  bioset.SetSeq_set();
386      int size=0;
387      for (CTypeConstIterator<CBioseq> seq = ::ConstBegin(entry);  seq;  ++seq, size++);
388      if(PrintDetails())
389          NcbiCerr
390                  << "RemoveProblems(CSeq_entry)(set case): removeme = "
391                  << removeme
392                  << ", entries.size = "
393                  << entries.size()
394                  << ", total seqs = "
395                  << size
396                  << NcbiEndl;
397      if (size<=1) NormalizeSeqentry(entry);
398      } // entry.IsSet()
399    if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): end" << NcbiEndl;
400    return removeme;
401 }
402 
NormalizeSeqentry(CSeq_entry & entry)403 void CReadBlastApp::NormalizeSeqentry(CSeq_entry& entry)
404 {
405   if(!entry.IsSet()) return;
406   CBioseq_set& bioset = entry.SetSet();
407   CBioseq_set::TSeq_set& entries =  bioset.SetSeq_set();
408   if(entries.size()!=1) return;
409 // 1. merge descriptions
410   CBioseq& seq = (*entries.begin())->SetSeq();
411   if(bioset.IsSetDescr())
412     {
413     CSeq_descr::Tdata& descs = bioset.SetDescr().Set();
414     for(CSeq_descr::Tdata::iterator desc = descs.begin(); desc!=descs.end(); )
415       {
416       seq.SetDescr().Set().push_back(*desc);
417       desc=descs.erase(desc);
418       }
419     } // if(entry.SetSet().IsSetDescr())
420 // 2.  move CBioseq under the CSeq_entr
421   CRef<CBioseq> pseq (&seq);
422   entry.SetSeq(*pseq);
423   NcbiCerr << "NormalizeSeqentry(CSeq_entry...): "
424            << "WARNING: "
425            << "converted sequence set to sequence"
426            << NcbiEndl;
427   return;
428 }
429 
RemoveProblems(CBioseq_set & setseq,map<string,string> & problem_seqs,LocMap & loc_map)430 int CReadBlastApp::RemoveProblems(CBioseq_set& setseq, map<string, string>& problem_seqs, LocMap& loc_map)
431 {
432    bool noseqs=false;
433    bool noannot=false;
434    int removeme=0;
435    if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq_set): start" << NcbiEndl;
436    if(setseq.IsSetSeq_set())
437      {
438      int all_entries_removed = RemoveProblems(setseq.SetSeq_set(), problem_seqs, loc_map);
439      if(all_entries_removed > 0) {/* mandatory, no deletion */; noseqs=true;}
440      }
441    if(setseq.IsSetAnnot())
442      {
443      int all_annot_removed = RemoveProblems(setseq.SetAnnot(), problem_seqs, loc_map);
444      if(all_annot_removed > 0) {setseq.ResetAnnot(); noannot=true;}
445      }
446    if(noseqs ) removeme = 1;
447    if(PrintDetails())
448          NcbiCerr
449                  << "RemoveProblems(CBioseq_set): noseqs = "
450                  << noseqs
451                  << ", noannot = "
452                  << noannot
453                  << ", removeme (return) = "
454                  << removeme
455                  << NcbiEndl;
456 
457    return removeme;
458 }
459 
RemoveProblems(CBioseq & seq,map<string,string> & problem_seqs,LocMap & loc_map)460 int CReadBlastApp::RemoveProblems(CBioseq& seq, map<string, string>& problem_seqs, LocMap& loc_map)
461 {
462    int remove=0;
463    if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): start" << NcbiEndl;
464    if(!seq.IsAa())  // nucleotide sequnce
465      {
466      if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): nuc" << NcbiEndl;
467      if(seq.IsSetAnnot())
468        {
469        int annotations_removed = RemoveProblems(seq.SetAnnot(), problem_seqs, loc_map);
470        if(annotations_removed) seq.ResetAnnot();
471        }
472      }
473    else // aminoacid sequence
474 // checkif needed to kill it
475      {
476      string thisName = GetStringDescr(seq);
477      string origName = thisName;
478      string::size_type ipos = thisName.rfind('|'); if(ipos!=string::npos) thisName.erase(0, ipos+1);
479      ipos = thisName.rfind('_'); if(ipos!=string::npos) ipos= thisName.rfind('_', ipos-1);
480      if(PrintDetails())
481          NcbiCerr
482                  << "RemoveProblems(CBioseq): remove? sequence "
483                  << "[" << origName << "]"
484                  << " looking for "
485                  << "[" << thisName << "]"
486                  << NcbiEndl;
487 
488      if(problem_seqs.find(thisName) != problem_seqs.end())
489        {
490        NcbiCerr
491                  << "RemoveProblems(CBioseq): sequence "
492                  << "[" << origName << "]"
493                  << " is marked for removal, because of a match to "
494                  << "[" << thisName << "]"
495                  << NcbiEndl;
496        remove=1; // whack the sequence
497        }
498      }
499    if(PrintDetails())
500          NcbiCerr
501                  << "RemoveProblems(CBioseq): remove =  "
502                  << remove
503                  << NcbiEndl;
504 
505 
506    return remove;
507 }
508 
509 
RemoveProblems(CBioseq_set::TSeq_set & entries,map<string,string> & problem_seqs,LocMap & loc_map)510 int CReadBlastApp::RemoveProblems(CBioseq_set::TSeq_set& entries, map<string, string>& problem_seqs, LocMap& loc_map)
511 {
512    IncreaseVerbosity();
513    int remove=0;
514    for(CBioseq_set::TSeq_set::iterator entries_end = entries.end(), entry=entries.begin(); entry != entries_end; )
515      {
516      int removeseq=RemoveProblems(**entry, problem_seqs, loc_map);
517      if(PrintDetails())
518          NcbiCerr
519                  << "RemoveProblems(CBioseq_set::TSeq_set): removeseq = "
520                  << removeseq
521                  << NcbiEndl;
522 
523      if(removeseq) entry=entries.erase(entry);
524      else entry++;
525      } // each seqs
526    if(entries.size()==0) remove=1;
527    if(PrintDetails())
528          NcbiCerr
529                  << "RemoveProblems(CBioseq_set::TSeq_set): nentries = "
530                  << entries.size()
531                  << NcbiEndl;
532 
533    DecreaseVerbosity();
534    return remove;
535 }
536 
RemoveProblems(CBioseq::TAnnot & annots,map<string,string> & problem_seqs,LocMap & loc_map)537 int CReadBlastApp::RemoveProblems(CBioseq::TAnnot& annots, map<string, string>& problem_seqs, LocMap& loc_map)
538 {
539   int remove=0;
540   if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): start" << NcbiEndl;
541   for(CBioseq::TAnnot::iterator annot=annots.begin(); annot!=annots.end(); )
542     {
543     int removeme=0;
544     if( (*annot)->GetData().IsFtable()) removeme=RemoveProblems((*annot)->SetData().SetFtable(), problem_seqs, loc_map);
545     if(removeme)
546       {
547       NcbiCerr << "RemoveProblems(annots, problem_seqs): "
548                  << "INFO: "
549                  << "annotation has empty feature table and it will be removed"
550                  << NcbiEndl;
551       annot=annots.erase(annot);
552       }
553     else annot++;
554     }
555   if(annots.size()==0) remove=1;
556   if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): end" << NcbiEndl;
557 
558   return remove;
559 }
560 
RemoveProblems(CSeq_annot::C_Data::TFtable & table,map<string,string> & problem_seqs,LocMap & loc_map)561 int CReadBlastApp::RemoveProblems(CSeq_annot::C_Data::TFtable& table, map<string, string>& problem_seqs, LocMap& loc_map)
562 // this one needs cleaning
563 {
564   int removeme=0;
565   if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): start" << NcbiEndl;
566   GetLocMap(loc_map, table);
567   for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
568     {
569     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat 000" << NcbiEndl;
570     bool gene, cdregion;
571     gene = (*feat)->GetData().IsGene();
572     cdregion = (*feat)->GetData().IsCdregion();
573     bool del_feature=false;
574     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat I" << NcbiEndl;
575     string real_loc_string = GetLocationString((*feat)->GetLocation());
576     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat II" << NcbiEndl;
577 
578     string loc_string = GetLocusTag(**feat, loc_map); // more general, returns location string
579     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat: (" << real_loc_string  << ")(" << loc_string << ")" << NcbiEndl;
580 //
581 // case *: matching locus tag
582 //
583     if(problem_seqs.find(loc_string) != problem_seqs.end())
584       {
585       if((*feat)->GetData().IsImp() &&
586          (*feat)->GetData().GetImp().CanGetKey())
587          {
588          NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, key = " << (*feat)->GetData().GetImp().GetKey()  << NcbiEndl;
589          }
590       if((*feat)->GetData().IsImp() &&
591           (*feat)->CanGetComment() )
592          {
593          NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, comment = " << (*feat)->GetComment()  << NcbiEndl;
594          }
595 /*
596       if((*feat)->GetData().IsImp() &&
597          (*feat)->GetData().GetImp().CanGetKey() &&
598          (*feat)->GetData().GetImp().GetKey() == "misc_feature" &&
599          (*feat)->CanGetComment() &&
600          (*feat)->GetComment().find("potential frameshift") != string::npos
601         ) del_feature = false; // this is a new feature, that we are not supposed to delete
602 */
603       if((*feat)->GetData().IsImp() &&
604          (*feat)->GetData().GetImp().CanGetKey() &&
605          (*feat)->GetData().GetImp().GetKey() == "misc_feature"
606         ) del_feature = false; // this is a new feature, we do not delete them
607       else del_feature=true;
608       }
609 
610     if ( PrintDetails() )
611       {
612       NcbiCerr << "RemoveProblems: feature " << loc_string << ": ";
613       if(del_feature) NcbiCerr << "WILL BE REMOVED";
614       else            NcbiCerr << "stays until further analysis for it";
615       NcbiCerr << NcbiEndl;
616       }
617     if(del_feature)
618         {
619         NcbiCerr << "RemoveProblems: WARNING: feature "
620                  << "{" << (*feat)->GetData().SelectionName((*feat)->GetData().Which()) << "} "
621                  << loc_string << ": ";
622         NcbiCerr << "will be removed because of a problem: ";
623         NcbiCerr << problem_seqs.find(loc_string)->second;
624         NcbiCerr << NcbiEndl;
625         }
626     if(!del_feature && gene  && (*feat)->GetData().GetGene().CanGetLocus_tag() )
627 //
628 // case *: gene
629 //
630       {
631       string locus_tag = (*feat)->GetData().GetGene().GetLocus_tag();
632       if(problem_seqs.find(locus_tag) != problem_seqs.end()) del_feature=true;
633       if ( PrintDetails() )
634         {
635         NcbiCerr << "RemoveProblems: gene " << locus_tag << ": ";
636         if(del_feature)
637            NcbiCerr << "WILL BE REMOVED";
638         else
639            NcbiCerr << "stays";
640         NcbiCerr << NcbiEndl;
641         }
642       if(del_feature)
643         {
644         NcbiCerr << "RemoveProblems: WARNING: gene " << locus_tag << ": ";
645         NcbiCerr << "will be removed because of a problem: ";
646         NcbiCerr << problem_seqs.find(locus_tag)->second;
647         NcbiCerr << NcbiEndl;
648         }
649       }
650     if(!del_feature && cdregion && (*feat)->CanGetProduct() )
651       {
652 //
653 // case *: cdregion
654 //
655       string productName;
656       if( (*feat)->CanGetProduct() &&
657           (*feat)->GetProduct().IsWhole() &&
658           (*feat)->GetProduct().GetWhole().IsGeneral() &&
659           (*feat)->GetProduct().GetWhole().GetGeneral().CanGetTag() &&
660           (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().IsStr() )
661         {
662         productName = (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().GetStr();
663         }
664       else if (
665            (*feat)->CanGetProduct() &&
666           (*feat)->GetProduct().IsWhole())
667         {
668         productName = (*feat)->GetProduct().GetWhole().AsFastaString();
669         }
670 // strip leading contig ID if any
671       string::size_type ipos=productName.rfind('_', productName.size());
672       if(ipos != string::npos)
673         {
674         string::size_type ipos2;
675         ipos2=productName.rfind('_', ipos-1);
676         if(ipos2 != string::npos) productName.erase(0, ipos2+1);
677       // "1103032000567_RAZWK3B_00550" -> "RAZWK3B_00550";
678         else
679           {
680           ipos2=productName.rfind('|', ipos-1);
681           if(ipos2 != string::npos) productName.erase(0, ipos2+1);
682           }
683 // lcl|Xoryp_00025 -> Xoryp_00025
684         }
685       if(productName.length() && problem_seqs.find(productName) != problem_seqs.end()) del_feature=true;
686       if ( PrintDetails() )
687         {
688         NcbiCerr << "RemoveProblems: cdregion " << productName << ": ";
689         if(del_feature)
690            NcbiCerr << "WILL BE REMOVED";
691         else
692            NcbiCerr << "stays";
693         NcbiCerr << NcbiEndl;
694         }
695       }
696     if(del_feature)
697       {
698       if(problem_seqs.find(real_loc_string) == problem_seqs.end())
699         {
700         problem_seqs[real_loc_string]=problem_seqs[loc_string]; // saving for later
701         }
702       }
703     if(del_feature) feat=table.erase(feat);
704     else feat++;
705     }
706   if(table.size()==0) removeme=1;
707   if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): end" << NcbiEndl;
708   return removeme;
709 }
710 
711 // RemoveInterim
712 
RemoveInterim(void)713 int CReadBlastApp::RemoveInterim(void)
714 {
715 
716    int nremoved=0;
717 
718    if(PrintDetails()) NcbiCerr << "RemoveInterim: start " << NcbiEndl;
719    PushVerbosity();
720    for(CTypeIterator<CBioseq> seq=Begin(); seq; ++seq)
721      {
722      if(seq->IsSetAnnot() && seq->IsAa()) nremoved+= RemoveInterim(seq->SetAnnot());
723      if(seq->IsSetAnnot() && seq->IsNa()) nremoved+= RemoveInterim2(seq->SetAnnot());
724      }
725 
726    PopVerbosity();
727    if(PrintDetails()) NcbiCerr << "RemoveInterim: end" << NcbiEndl;
728    return nremoved;
729 }
730 // protein sequence annotations
RemoveInterim(CBioseq::TAnnot & annots)731 int CReadBlastApp::RemoveInterim(CBioseq::TAnnot& annots)
732 {
733    int nremoved=0;
734    if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): start " << NcbiEndl;
735 
736    for(CBioseq::TAnnot::iterator annot=annots.begin(), annot_end = annots.end(); annot != annot_end; )
737      {
738      bool erased = false;
739      if((*annot)->GetData().IsAlign())
740        {
741        nremoved++; erased = true;
742        }
743      if ( (*annot)->GetData().IsFtable())
744        {
745        int dremoved=0;
746        CSeq_annot::C_Data::TFtable& table = (*annot)->SetData().SetFtable();
747        for(CSeq_annot::C_Data::TFtable::iterator feat=table.begin(), feat_end= table.end(); feat !=  feat_end; )
748          {
749          string test = "Genomic Location:";
750          if ((*feat)->IsSetData() && (*feat)->GetData().IsProt() &&
751               (*feat)->IsSetComment() && (*feat)->GetComment().substr(0, test.size()) == test  )
752            {
753            table.erase(feat++); dremoved++;
754            nremoved++;
755            }
756          else
757            {
758            feat++;
759            }
760          }
761 
762 /*
763   this is really crappy way of doing it!
764 */
765 /*
766        while( (*annot)->GetData().GetFtable().size()>1 )
767          {
768          (*annot)->SetData().SetFtable().pop_back();
769          nremoved++;
770          dremoved++;
771          }
772 */
773        if(PrintDetails()) NcbiCerr << "RemoveInterim(CBioseq::TAnnot& annots): dremoved = "
774         << dremoved
775         << ", left=" << (*annot)->GetData().GetFtable().size()
776         << NcbiEndl;
777        if((*annot)->SetData().SetFtable().size() == 0)
778          {
779          nremoved++;
780          erased = true;
781          }
782        }
783      if(erased) annot=annots.erase(annot);
784      else annot++;
785      }
786 
787   if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): end" << NcbiEndl;
788   return nremoved;
789 }
790 
791 // nucleotide sequence annotations
RemoveInterim2(CBioseq::TAnnot & annots)792 int CReadBlastApp::RemoveInterim2(CBioseq::TAnnot& annots)
793 {
794    int nremoved=0;
795    if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): start " << NcbiEndl;
796 
797    NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, annots)
798      {
799      if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature start" << NcbiEndl;
800      if ( !(*gen_feature)->GetData().IsFtable() ) continue;
801      if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature is ftable" << NcbiEndl;
802      map<string,bool> feat_defined;
803      CSeq_annot::C_Data::TFtable& table = (*gen_feature)->SetData().SetFtable();
804      for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
805        {
806        if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat start" << NcbiEndl;
807        const CSeq_feat& featr = **feat;
808        CNcbiStrstream buff;
809        buff << MSerial_AsnText  << featr;
810        buff << '\0';
811        if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): feat ASN:" << NcbiEndl;
812        if(PrintDetails()) NcbiCerr << buff.str() << NcbiEndl;
813        if(feat_defined.find(buff.str()) != feat_defined.end()) // dupe
814          {
815          if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat dupe: erase" << NcbiEndl;
816          feat=table.erase(feat);
817          }
818        else
819          {
820          if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map" << NcbiEndl;
821          feat_defined[buff.str()]=true;
822          if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map done" << NcbiEndl;
823          // feat_defined[featr]=true;
824          feat++;
825          }
826        if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat end" << NcbiEndl;
827        }
828 
829      if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature end" << NcbiEndl;
830      }
831 
832   if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): end" << NcbiEndl;
833   return nremoved;
834 }
835 
836 
837 
838 
diagName(const string & type,const string & value)839 string diagName(const string& type, const string& value)
840 {
841    return type + "|" + value;
842 }
843 
addProblems(list<problemStr> & dest,const list<problemStr> & src)844 int addProblems(list<problemStr>& dest, const list<problemStr>& src)
845 {
846    int n=0;
847    ITERATE(list<problemStr>, src_p, src)
848      {
849      dest.push_back(*src_p);
850      n++;
851      }
852   return n;
853 }
854 
CollectRNAFeatures(TProblem_locs & problem_locs)855 int CReadBlastApp::CollectRNAFeatures(TProblem_locs& problem_locs)
856 {
857   ITERATE( diagMap, feat, m_diag)
858     {
859     ITERATE(list<problemStr>, problem, feat->second.problems)
860       {
861       bool added = false;
862       string name = feat->first;
863       string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
864       ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
865       if(ipos!=string::npos) name.erase(0, ipos+1);
866       string range = printed_range(problem->i1, problem->i2);
867       if(
868           (problem->type & eTRNABadStrand || problem->type & eTRNAUndefStrand)   // irrelevant problem
869          && !problem->misc_feat_message.empty()
870        )
871         {
872         problem_locs[range].strand = problem->strand;
873         problem_locs[range].name = name;
874         problem_locs[range].count =
875         problem_locs[range].rnacount =
876         problem_locs[range].genecount =  0;
877         added=true;
878         }
879       if(PrintDetails())
880         NcbiCerr << "CReadBlastApp::CollectRNAFeatures: " << feat->first
881         << "[" << range << "]: "
882         << "(" << name << ")"
883         << (added ? "added" : "skipped")  << NcbiEndl;
884       }
885     }
886   return problem_locs.size();
887 
888 }
889 
CollectFrameshiftedSeqs(map<string,string> & problem_names)890 int CReadBlastApp::CollectFrameshiftedSeqs(map<string,string>& problem_names)
891 {
892   const CArgs& args = GetArgs();
893   bool keep_frameshifted = args["kfs"].HasValue();
894   ITERATE( diagMap, feat, m_diag)
895     {
896     ITERATE(list<problemStr>, problem, feat->second.problems)
897       {
898       bool added = false;
899       string name = feat->first;
900       string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
901       ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
902       if(ipos!=string::npos) name.erase(0, ipos+1);
903       if(
904           (problem->type == eFrameShift && !keep_frameshifted)
905           ||
906           problem->type == eRemoveOverlap
907           ||
908           problem->type & eShortProtein
909         )
910         { problem_names[name]=ProblemType(problem->type); added=true; }
911       if(PrintDetails())
912         NcbiCerr << "CollectFrameshiftedSeqs: " << feat->first
913         << ": "
914         << "(" << name << ")"
915         << (added ? "added" : "skipped")  << NcbiEndl;
916       }
917     }
918   return problem_names.size();
919 }
920 
append_misc_feature(CBioseq_set::TSeq_set & seqs,const string & name,EProblem problem_type)921 void CReadBlastApp::append_misc_feature(CBioseq_set::TSeq_set& seqs, const string& name, EProblem problem_type)
922 {
923   if(m_diag.find(name)==m_diag.end())
924     {
925     // should not happen
926     NcbiCerr << "append_misc_feature: FATAL: do not have problems for " << name << NcbiEndl;
927     throw;
928     return;
929     }
930 
931   NON_CONST_ITERATE(  CBioseq_set::TSeq_set, na, seqs)
932     {
933     if( is_prot_entry((*na)->GetSeq())  ) continue;
934     list<CRef<CSeq_id> >& na_id = (*na)->SetSeq().SetId();
935     NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, (*na)->SetSeq().SetAnnot())
936       {
937       if ( !(*gen_feature)->GetData().IsFtable() ) continue;
938       typedef map<string, bool> Tmessage_misced;
939       typedef map<EProblem, Tmessage_misced> Tproblem_misced;
940       Tproblem_misced problem_misced; // list of problems fixed
941       ITERATE(list<problemStr>, problem, m_diag[name].problems)
942         {
943         if ( !(problem->type & problem_type) ) continue;
944         int from=-1, to=-1;
945 //        string lt1, lt2;
946         ENa_strand strand;
947         string message="";
948         from = problem->i1;
949         if(from<0) continue;
950 // add new feature
951         to   = problem->i2;
952 /*
953         lt1  = problem->id1;
954         lt2  = problem->id2;
955 */
956         strand = problem->strand;
957         message = problem->misc_feat_message;
958         if(message.size()==0) continue; // do not print empty misc_feat, they are empty for a reason
959         if(problem_misced.find(problem->type) != problem_misced.end() &&
960            problem_misced[problem->type].find(message) != problem_misced[problem->type].end()
961           ) continue;
962         else problem_misced[problem->type][message] = true;
963         SIZE_TYPE pos;
964         while((pos=message.find_first_of("\n\r"))!=string::npos)
965           {
966           message[pos]=' ';
967           }
968         CRef<CSeq_feat> feat(new CSeq_feat());
969 
970         feat->SetComment(message);
971         feat->SetData().SetImp().SetKey("misc_feature");
972         feat->SetLocation().SetInt().SetFrom(from);
973         feat->SetLocation().SetInt().SetTo(to);
974         feat->SetLocation().SetInt().SetId(**na_id.begin());
975         feat->SetLocation().SetInt().SetStrand(strand);
976         (*gen_feature)->SetData().SetFtable().push_back(feat);
977         }
978 
979       }
980     break;
981     }
982 
983   return;
984 }
985 
986 
987 
988 
989