1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34
35
reportProblems(const bool report_and_forget,diagMap & diag,ostream & out,const CBioseq::TAnnot & annots,const EProblem type)36 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
37 const CBioseq::TAnnot& annots, const EProblem type)
38 {
39 ITERATE(CBioseq::TAnnot, gen_feature, annots)
40 {
41 if ( !(*gen_feature)->GetData().IsFtable() ) continue;
42 reportProblems(report_and_forget, diag, out, (*gen_feature)->GetData().GetFtable(), type);
43 }
44 }
45
reportProblems(const bool report_and_forget,diagMap & diag,ostream & out,const CSeq_annot::C_Data::TFtable & feats,const EProblem type)46 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
47 const CSeq_annot::C_Data::TFtable& feats, const EProblem type)
48 {
49 ITERATE(CSeq_annot::C_Data::TFtable, f, feats)
50 {
51 if( !(*f)->GetData().IsGene() ) continue;
52 string qname; (*f)->GetData().GetGene().GetLabel(&qname);
53 if( diag.find(qname) == diag.end() ) continue;
54 reportProblems(qname, diag, out, type);
55 if(report_and_forget) erase_problems(qname, diag, type);
56 }
57 }
reportProblems(const string & qname,diagMap & diag,ostream & out,const EProblem type)58 void CReadBlastApp::reportProblems(const string& qname, diagMap& diag, ostream& out, const EProblem type)
59 {
60 if(!hasProblems(qname, diag, type)) return;
61 if(PrintDetails()) NcbiCerr << "problem type: " << ProblemType(type) << NcbiEndl;
62 reportProblemSequenceName(qname, out);
63
64 // reporting
65 IncreaseVerbosity();
66 NON_CONST_ITERATE(list<problemStr>, problem, diag[qname].problems)
67 {
68 if(PrintDetails()) NcbiCerr << "current problem: " << ProblemType(problem->type) << NcbiEndl;
69 if( !(problem->type & type) ) continue;
70 if(PrintDetails()) NcbiCerr << "it is that problem" << NcbiEndl;
71 if(!problem->message.size()) continue;
72 if(PrintDetails()) NcbiCerr << "has nonzero message" << NcbiEndl;
73 reportProblemType(problem->type, out);
74 reportProblemMessage(problem->message, out);
75 if(PrintDetails()) NcbiCerr << "current problem: done: " << problem->type << NcbiEndl;
76 }
77 DecreaseVerbosity();
78 }
erase_problems(const string & qname,diagMap & diag,const EProblem type)79 void CReadBlastApp::erase_problems(const string& qname, diagMap& diag, const EProblem type)
80 {
81 IncreaseVerbosity();
82 for(list<problemStr>::iterator problem = diag[qname].problems.begin(); problem!=diag[qname].problems.end();)
83 {
84 if(PrintDetails()) NcbiCerr << "erasing? current problem: " << problem->type << NcbiEndl;
85 if( !(problem->type & type) )
86 {
87 problem++;
88 continue;
89 }
90 if(PrintDetails()) NcbiCerr << "it is that problem for erazing" << NcbiEndl;
91 problem=diag[qname].problems.erase(problem);
92 if(PrintDetails()) NcbiCerr << "erased" << NcbiEndl;
93 if(PrintDetails()) NcbiCerr << "next current problem: " << problem->type << NcbiEndl;
94 }
95 DecreaseVerbosity();
96 }
reportProblems(const bool report_and_forget,diagMap & diag,ostream & out,const EProblem type)97 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out, const EProblem type)
98 {
99 IncreaseVerbosity();
100 map<string,bool> done;
101 if(PrintDetails()) NcbiCerr << "reportProblems: all problems of type (" << ProblemType(type) << ")" << NcbiEndl;
102 for (CTypeConstIterator<CBioseq> seq = ConstBegin(); seq; ++seq)
103 {
104 if ( !is_prot_entry(*seq) )
105 {
106 reportProblems(report_and_forget, diag, out, seq->GetAnnot(), type);
107 }
108 string qname1 = GetStringDescr (*seq);
109 string qname2 = CSeq_id::GetStringDescr (*seq, CSeq_id::eFormat_FastA);
110 if(PrintDetails()) NcbiCerr << "reportProblems: start: "
111 << qname1 << " or "
112 << qname2
113 << NcbiEndl;
114 string qnames[2];
115 qnames[0]=qname1; qnames[1]=qname2;
116 for(int i=0; i<2; i++)
117 {
118 string& qname = qnames[i];
119 if( diag.find(qname) != diag.end() )
120 {
121 reportProblems(qname, diag, out, type);
122 if(report_and_forget) erase_problems(qname, diag, type);
123 done[qname]=true;
124 if(PrintDetails()) NcbiCerr << "reportProblems: full end: " << qname << NcbiEndl;
125 }
126 }
127 }
128
129 ITERATE(diagMap, problem, diag)
130 {
131 string qname = problem->first;
132 if(done.find(qname)!=done.end()) continue;
133 reportProblems(qname, diag, out, type);
134 if(report_and_forget) erase_problems(qname, diag, type);
135 if(PrintDetails()) NcbiCerr << "reportProblems: alternative problems: " << qname << NcbiEndl;
136 }
137 DecreaseVerbosity();
138 }
139 // hasProblemType(seq, diag[qname].problems), problem->type
140
hasProblems(const CBioseq & seq,diagMap & diag,const EProblem type)141 bool CReadBlastApp::hasProblems(const CBioseq& seq, diagMap& diag, const EProblem type)
142 {
143 string qname = GetStringDescr (seq);
144 string qname2 = CSeq_id::GetStringDescr (seq, CSeq_id::eFormat_FastA);
145 return hasProblems(qname, diag, type) || hasProblems(qname2, diag, type);
146 }
147
hasProblems(const string & qname,diagMap & diag,const EProblem type)148 bool CReadBlastApp::hasProblems(const string& qname, diagMap& diag, const EProblem type)
149 {
150 if(PrintDetails()) NcbiCerr << "hasProblems: start: " << qname << NcbiEndl;
151 if( type != eAllProblems && diag.find(qname) == diag.end() ) return false;
152 if ( type == eAllProblems && diag[qname].problems.size()>0) return true;
153
154 IncreaseVerbosity();
155 ITERATE(list<problemStr>, problem, diag[qname].problems)
156 {
157 if(PrintDetails()) NcbiCerr << "hasProblems: checking problem type: " << "\t"
158 << ProblemType(problem->type) << "\t"
159 << qname << "\t"
160 << NcbiEndl;
161 if (problem->type & type)
162 {
163 if(PrintDetails()) NcbiCerr << "hasProblems: end: does have problem: " << qname << NcbiEndl;
164 return true;
165 }
166 }
167 DecreaseVerbosity();
168 if(PrintDetails()) NcbiCerr << "hasProblems: end: no problem: " << qname << NcbiEndl;
169 return false;
170 }
171
reportProblemMessage(const string & message,ostream & out)172 void CReadBlastApp::reportProblemMessage(const string& message, ostream& out)
173 {
174 out << message.c_str() << NcbiEndl;
175 }
176
ProblemType(const EProblem type)177 string CReadBlastApp::ProblemType(const EProblem type)
178 {
179 CNcbiStrstream strres;
180 strres << "unknown_problem_type=" << type << '\0';
181 string result=strres.str();
182 if(type & eOverlap)
183 result = "Potential overlap found";
184 else if(type & eRnaOverlap)
185 result = "Potential RNA overlap found";
186 else if(type & eCompleteOverlap)
187 result = "Complete overlap found";
188 else if(type & eRemoveOverlap)
189 result = "overlap marked for removal";
190 else if(type & eRelFrameShift)
191 result = "Something relevant to frame shift found";
192 else if(type & eFrameShift)
193 result = "Potential frame shift evidence found";
194 else if(type & eMayBeNotFrameShift)
195 result = "Evidence absolving from the frame shift accusation found";
196 else if(type & ePartial)
197 result = "Potential partial protein annotation found";
198 else if(type & eShortProtein)
199 result = "Short annotation found";
200 else if(type & eTRNAMissing)
201 result = "tRNA is missing in the list of independently annotated tRNAs";
202 else if(type & eTRNAAbsent)
203 result = "RNA is missing in the list of annotated RNAs in the input";
204 else if(type & eTRNABadStrand)
205 result = "RNA is present at the wrong strand";
206 else if(type & eTRNAUndefStrand)
207 result = "RNA is present with undefined strand";
208 else if(type & eTRNAComMismatch)
209 result = "tRNA is a complete mismatch";
210 else if(type & eTRNAMismatch)
211 result = "tRNA has mismatched ends";
212
213 return result;
214
215 }
216
reportProblemType(const EProblem type,ostream & out)217 void CReadBlastApp::reportProblemType(const EProblem type, ostream& out)
218 {
219 out
220 << "---"
221 << " ";
222 string stype = ProblemType(type);
223 if(!stype.empty())
224 {
225 out << stype;
226 }
227 else
228 {
229 NcbiCerr << "FATAL: internal error: unknown problem type" << type << NcbiEndl;
230 throw;
231 }
232 out
233 << " "
234 << "---"
235 << NcbiEndl;
236 }
237
reportProblemSequenceName(const string & name,ostream & out)238 void CReadBlastApp::reportProblemSequenceName(const string& name, ostream& out)
239 {
240 out
241 << "====="
242 << " ";
243 out << name.c_str() ;
244 out
245 << " "
246 << "====="
247 << NcbiEndl;
248 }
249
FixStrands(void)250 int CReadBlastApp::FixStrands(void)
251 {
252 TProblem_locs problem_locs;
253 if(PrintDetails()) NcbiCerr << "FixStrands: start " << NcbiEndl;
254
255 // relevant problems
256 CollectRNAFeatures(problem_locs);
257
258 // first, count features matching each range
259 for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
260 {
261 if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
262 const CSeq_loc& loc = feat->GetLocation();
263 ENa_strand strand;
264 TSeqPos from, to;
265 getFromTo(loc, from, to, strand);
266 string range = printed_range(from, to);
267 if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene [" << range << "]" << NcbiEndl;
268
269 if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
270 if(PrintDetails()) NcbiCerr << "FixStrands: range: " << range << NcbiEndl;
271 problem_locs[range].count++;
272 if(feat->GetData().IsRna()) problem_locs[range].rnacount++;
273 if(feat->GetData().IsGene()) problem_locs[range].genecount++;
274 }
275 // now fix
276 for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
277 {
278 if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
279 if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene for fixing" << NcbiEndl;
280 CSeq_loc& loc = feat->SetLocation();
281 ENa_strand strand;
282 TSeqPos from, to;
283 getFromTo(loc, from, to, strand);
284 string range = printed_range(from, to);
285 if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
286 if(PrintDetails()) NcbiCerr << "FixStrands: range for fix: " << range << NcbiEndl;
287 if( problem_locs[range].count!=2
288 || problem_locs[range].rnacount!=1
289 || problem_locs[range].genecount!=1
290 )
291 {
292 // no touching
293 // warning
294 NcbiCerr << "CReadBlastApp::FixStrands: "
295 << "WARNING: "
296 << "location found, but the number of features with that location is confusing, "
297 << "no fixing for "
298 << "[" << problem_locs[range].name << "]"
299 << "(" << range << ")"
300 << NcbiEndl;
301 continue;
302 }
303 // over all intervals
304 int ninter=0;
305 for(CTypeIterator<CSeq_interval> inter = ::Begin(loc); inter; ++inter, ++ninter)
306 {
307 inter->SetStrand(problem_locs[range].strand);
308 NcbiCerr << "CReadBlastApp::FixStrands: "
309 << "[" << problem_locs[range].name << "] "
310 << "fixed"
311 << NcbiEndl;
312 }
313 NcbiCerr << "CReadBlastApp::FixStrands: ninters= " << ninter << NcbiEndl;
314 } // for(CTypeIterator<CSeq_feat>
315 if(PrintDetails()) NcbiCerr << "FixStrands: end" << NcbiEndl;
316 return 1;
317 }
318
RemoveProblems(map<string,string> & problem_names,LocMap & loc_map)319 int CReadBlastApp::RemoveProblems(map<string,string>& problem_names, LocMap& loc_map)
320 {
321 if(PrintDetails()) NcbiCerr << "RemoveProblems: start " << NcbiEndl;
322
323 PushVerbosity();
324 if(IsSubmit())
325 {
326 if ( m_Submit.GetData().IsEntrys())
327 {
328 for(CSeq_submit::C_Data::TEntrys::iterator entry = m_Submit.SetData().SetEntrys().begin();
329 entry != m_Submit.SetData().SetEntrys().end();)
330 // NON_CONST_ITERATE(CSeq_submit::C_Data::TEntrys, entry, m_Submit.SetData().SetEntrys())
331 {
332 int removeme = RemoveProblems(**entry, problem_names, loc_map);
333 if(PrintDetails())
334 NcbiCerr
335 << "RemoveProblems(void): doing entry: removeme = "
336 << removeme
337 << NcbiEndl;
338 if(removeme)
339 {
340 NcbiCerr << "RemoveProblems(): WARNING: "
341 << "CSeq_entry deleted, loss of annotation might occur"
342 << NcbiEndl;
343 entry=m_Submit.SetData().SetEntrys().erase(entry);
344 }
345 else entry++;
346 }
347 }
348 else
349 {
350 NcbiCerr << "ERROR: submit file does not have proper seqset"<< NcbiEndl;
351 }
352 }
353 else
354 {
355 if(PrintDetails())
356 NcbiCerr
357 << "RemoveProblems(void): case is single entry "
358 << NcbiEndl;
359 RemoveProblems(m_Entry, problem_names, loc_map);
360 }
361
362 PopVerbosity();
363 if(PrintDetails()) NcbiCerr << "RemoveProblems: end" << NcbiEndl;
364 return 1;
365 }
366
RemoveProblems(CSeq_entry & entry,map<string,string> & problem_seqs,LocMap & loc_map)367 int CReadBlastApp::RemoveProblems(CSeq_entry& entry, map<string, string>& problem_seqs, LocMap& loc_map)
368 {
369 int removeme=0;
370 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): start" << NcbiEndl;
371 if(entry.IsSeq())
372 {
373 removeme=RemoveProblems(entry.SetSeq(), problem_seqs, loc_map);
374 if(PrintDetails())
375 NcbiCerr
376 << "RemoveProblems(CSeq_entry)(seq case): removeme = "
377 << removeme
378 << NcbiEndl;
379
380 }
381 else //several seqs
382 {
383 CBioseq_set& bioset = entry.SetSet();
384 removeme=RemoveProblems(bioset, problem_seqs, loc_map);
385 CBioseq_set::TSeq_set& entries = bioset.SetSeq_set();
386 int size=0;
387 for (CTypeConstIterator<CBioseq> seq = ::ConstBegin(entry); seq; ++seq, size++);
388 if(PrintDetails())
389 NcbiCerr
390 << "RemoveProblems(CSeq_entry)(set case): removeme = "
391 << removeme
392 << ", entries.size = "
393 << entries.size()
394 << ", total seqs = "
395 << size
396 << NcbiEndl;
397 if (size<=1) NormalizeSeqentry(entry);
398 } // entry.IsSet()
399 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): end" << NcbiEndl;
400 return removeme;
401 }
402
NormalizeSeqentry(CSeq_entry & entry)403 void CReadBlastApp::NormalizeSeqentry(CSeq_entry& entry)
404 {
405 if(!entry.IsSet()) return;
406 CBioseq_set& bioset = entry.SetSet();
407 CBioseq_set::TSeq_set& entries = bioset.SetSeq_set();
408 if(entries.size()!=1) return;
409 // 1. merge descriptions
410 CBioseq& seq = (*entries.begin())->SetSeq();
411 if(bioset.IsSetDescr())
412 {
413 CSeq_descr::Tdata& descs = bioset.SetDescr().Set();
414 for(CSeq_descr::Tdata::iterator desc = descs.begin(); desc!=descs.end(); )
415 {
416 seq.SetDescr().Set().push_back(*desc);
417 desc=descs.erase(desc);
418 }
419 } // if(entry.SetSet().IsSetDescr())
420 // 2. move CBioseq under the CSeq_entr
421 CRef<CBioseq> pseq (&seq);
422 entry.SetSeq(*pseq);
423 NcbiCerr << "NormalizeSeqentry(CSeq_entry...): "
424 << "WARNING: "
425 << "converted sequence set to sequence"
426 << NcbiEndl;
427 return;
428 }
429
RemoveProblems(CBioseq_set & setseq,map<string,string> & problem_seqs,LocMap & loc_map)430 int CReadBlastApp::RemoveProblems(CBioseq_set& setseq, map<string, string>& problem_seqs, LocMap& loc_map)
431 {
432 bool noseqs=false;
433 bool noannot=false;
434 int removeme=0;
435 if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq_set): start" << NcbiEndl;
436 if(setseq.IsSetSeq_set())
437 {
438 int all_entries_removed = RemoveProblems(setseq.SetSeq_set(), problem_seqs, loc_map);
439 if(all_entries_removed > 0) {/* mandatory, no deletion */; noseqs=true;}
440 }
441 if(setseq.IsSetAnnot())
442 {
443 int all_annot_removed = RemoveProblems(setseq.SetAnnot(), problem_seqs, loc_map);
444 if(all_annot_removed > 0) {setseq.ResetAnnot(); noannot=true;}
445 }
446 if(noseqs ) removeme = 1;
447 if(PrintDetails())
448 NcbiCerr
449 << "RemoveProblems(CBioseq_set): noseqs = "
450 << noseqs
451 << ", noannot = "
452 << noannot
453 << ", removeme (return) = "
454 << removeme
455 << NcbiEndl;
456
457 return removeme;
458 }
459
RemoveProblems(CBioseq & seq,map<string,string> & problem_seqs,LocMap & loc_map)460 int CReadBlastApp::RemoveProblems(CBioseq& seq, map<string, string>& problem_seqs, LocMap& loc_map)
461 {
462 int remove=0;
463 if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): start" << NcbiEndl;
464 if(!seq.IsAa()) // nucleotide sequnce
465 {
466 if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): nuc" << NcbiEndl;
467 if(seq.IsSetAnnot())
468 {
469 int annotations_removed = RemoveProblems(seq.SetAnnot(), problem_seqs, loc_map);
470 if(annotations_removed) seq.ResetAnnot();
471 }
472 }
473 else // aminoacid sequence
474 // checkif needed to kill it
475 {
476 string thisName = GetStringDescr(seq);
477 string origName = thisName;
478 string::size_type ipos = thisName.rfind('|'); if(ipos!=string::npos) thisName.erase(0, ipos+1);
479 ipos = thisName.rfind('_'); if(ipos!=string::npos) ipos= thisName.rfind('_', ipos-1);
480 if(PrintDetails())
481 NcbiCerr
482 << "RemoveProblems(CBioseq): remove? sequence "
483 << "[" << origName << "]"
484 << " looking for "
485 << "[" << thisName << "]"
486 << NcbiEndl;
487
488 if(problem_seqs.find(thisName) != problem_seqs.end())
489 {
490 NcbiCerr
491 << "RemoveProblems(CBioseq): sequence "
492 << "[" << origName << "]"
493 << " is marked for removal, because of a match to "
494 << "[" << thisName << "]"
495 << NcbiEndl;
496 remove=1; // whack the sequence
497 }
498 }
499 if(PrintDetails())
500 NcbiCerr
501 << "RemoveProblems(CBioseq): remove = "
502 << remove
503 << NcbiEndl;
504
505
506 return remove;
507 }
508
509
RemoveProblems(CBioseq_set::TSeq_set & entries,map<string,string> & problem_seqs,LocMap & loc_map)510 int CReadBlastApp::RemoveProblems(CBioseq_set::TSeq_set& entries, map<string, string>& problem_seqs, LocMap& loc_map)
511 {
512 IncreaseVerbosity();
513 int remove=0;
514 for(CBioseq_set::TSeq_set::iterator entries_end = entries.end(), entry=entries.begin(); entry != entries_end; )
515 {
516 int removeseq=RemoveProblems(**entry, problem_seqs, loc_map);
517 if(PrintDetails())
518 NcbiCerr
519 << "RemoveProblems(CBioseq_set::TSeq_set): removeseq = "
520 << removeseq
521 << NcbiEndl;
522
523 if(removeseq) entry=entries.erase(entry);
524 else entry++;
525 } // each seqs
526 if(entries.size()==0) remove=1;
527 if(PrintDetails())
528 NcbiCerr
529 << "RemoveProblems(CBioseq_set::TSeq_set): nentries = "
530 << entries.size()
531 << NcbiEndl;
532
533 DecreaseVerbosity();
534 return remove;
535 }
536
RemoveProblems(CBioseq::TAnnot & annots,map<string,string> & problem_seqs,LocMap & loc_map)537 int CReadBlastApp::RemoveProblems(CBioseq::TAnnot& annots, map<string, string>& problem_seqs, LocMap& loc_map)
538 {
539 int remove=0;
540 if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): start" << NcbiEndl;
541 for(CBioseq::TAnnot::iterator annot=annots.begin(); annot!=annots.end(); )
542 {
543 int removeme=0;
544 if( (*annot)->GetData().IsFtable()) removeme=RemoveProblems((*annot)->SetData().SetFtable(), problem_seqs, loc_map);
545 if(removeme)
546 {
547 NcbiCerr << "RemoveProblems(annots, problem_seqs): "
548 << "INFO: "
549 << "annotation has empty feature table and it will be removed"
550 << NcbiEndl;
551 annot=annots.erase(annot);
552 }
553 else annot++;
554 }
555 if(annots.size()==0) remove=1;
556 if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): end" << NcbiEndl;
557
558 return remove;
559 }
560
RemoveProblems(CSeq_annot::C_Data::TFtable & table,map<string,string> & problem_seqs,LocMap & loc_map)561 int CReadBlastApp::RemoveProblems(CSeq_annot::C_Data::TFtable& table, map<string, string>& problem_seqs, LocMap& loc_map)
562 // this one needs cleaning
563 {
564 int removeme=0;
565 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): start" << NcbiEndl;
566 GetLocMap(loc_map, table);
567 for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
568 {
569 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat 000" << NcbiEndl;
570 bool gene, cdregion;
571 gene = (*feat)->GetData().IsGene();
572 cdregion = (*feat)->GetData().IsCdregion();
573 bool del_feature=false;
574 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat I" << NcbiEndl;
575 string real_loc_string = GetLocationString((*feat)->GetLocation());
576 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat II" << NcbiEndl;
577
578 string loc_string = GetLocusTag(**feat, loc_map); // more general, returns location string
579 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat: (" << real_loc_string << ")(" << loc_string << ")" << NcbiEndl;
580 //
581 // case *: matching locus tag
582 //
583 if(problem_seqs.find(loc_string) != problem_seqs.end())
584 {
585 if((*feat)->GetData().IsImp() &&
586 (*feat)->GetData().GetImp().CanGetKey())
587 {
588 NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, key = " << (*feat)->GetData().GetImp().GetKey() << NcbiEndl;
589 }
590 if((*feat)->GetData().IsImp() &&
591 (*feat)->CanGetComment() )
592 {
593 NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, comment = " << (*feat)->GetComment() << NcbiEndl;
594 }
595 /*
596 if((*feat)->GetData().IsImp() &&
597 (*feat)->GetData().GetImp().CanGetKey() &&
598 (*feat)->GetData().GetImp().GetKey() == "misc_feature" &&
599 (*feat)->CanGetComment() &&
600 (*feat)->GetComment().find("potential frameshift") != string::npos
601 ) del_feature = false; // this is a new feature, that we are not supposed to delete
602 */
603 if((*feat)->GetData().IsImp() &&
604 (*feat)->GetData().GetImp().CanGetKey() &&
605 (*feat)->GetData().GetImp().GetKey() == "misc_feature"
606 ) del_feature = false; // this is a new feature, we do not delete them
607 else del_feature=true;
608 }
609
610 if ( PrintDetails() )
611 {
612 NcbiCerr << "RemoveProblems: feature " << loc_string << ": ";
613 if(del_feature) NcbiCerr << "WILL BE REMOVED";
614 else NcbiCerr << "stays until further analysis for it";
615 NcbiCerr << NcbiEndl;
616 }
617 if(del_feature)
618 {
619 NcbiCerr << "RemoveProblems: WARNING: feature "
620 << "{" << (*feat)->GetData().SelectionName((*feat)->GetData().Which()) << "} "
621 << loc_string << ": ";
622 NcbiCerr << "will be removed because of a problem: ";
623 NcbiCerr << problem_seqs.find(loc_string)->second;
624 NcbiCerr << NcbiEndl;
625 }
626 if(!del_feature && gene && (*feat)->GetData().GetGene().CanGetLocus_tag() )
627 //
628 // case *: gene
629 //
630 {
631 string locus_tag = (*feat)->GetData().GetGene().GetLocus_tag();
632 if(problem_seqs.find(locus_tag) != problem_seqs.end()) del_feature=true;
633 if ( PrintDetails() )
634 {
635 NcbiCerr << "RemoveProblems: gene " << locus_tag << ": ";
636 if(del_feature)
637 NcbiCerr << "WILL BE REMOVED";
638 else
639 NcbiCerr << "stays";
640 NcbiCerr << NcbiEndl;
641 }
642 if(del_feature)
643 {
644 NcbiCerr << "RemoveProblems: WARNING: gene " << locus_tag << ": ";
645 NcbiCerr << "will be removed because of a problem: ";
646 NcbiCerr << problem_seqs.find(locus_tag)->second;
647 NcbiCerr << NcbiEndl;
648 }
649 }
650 if(!del_feature && cdregion && (*feat)->CanGetProduct() )
651 {
652 //
653 // case *: cdregion
654 //
655 string productName;
656 if( (*feat)->CanGetProduct() &&
657 (*feat)->GetProduct().IsWhole() &&
658 (*feat)->GetProduct().GetWhole().IsGeneral() &&
659 (*feat)->GetProduct().GetWhole().GetGeneral().CanGetTag() &&
660 (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().IsStr() )
661 {
662 productName = (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().GetStr();
663 }
664 else if (
665 (*feat)->CanGetProduct() &&
666 (*feat)->GetProduct().IsWhole())
667 {
668 productName = (*feat)->GetProduct().GetWhole().AsFastaString();
669 }
670 // strip leading contig ID if any
671 string::size_type ipos=productName.rfind('_', productName.size());
672 if(ipos != string::npos)
673 {
674 string::size_type ipos2;
675 ipos2=productName.rfind('_', ipos-1);
676 if(ipos2 != string::npos) productName.erase(0, ipos2+1);
677 // "1103032000567_RAZWK3B_00550" -> "RAZWK3B_00550";
678 else
679 {
680 ipos2=productName.rfind('|', ipos-1);
681 if(ipos2 != string::npos) productName.erase(0, ipos2+1);
682 }
683 // lcl|Xoryp_00025 -> Xoryp_00025
684 }
685 if(productName.length() && problem_seqs.find(productName) != problem_seqs.end()) del_feature=true;
686 if ( PrintDetails() )
687 {
688 NcbiCerr << "RemoveProblems: cdregion " << productName << ": ";
689 if(del_feature)
690 NcbiCerr << "WILL BE REMOVED";
691 else
692 NcbiCerr << "stays";
693 NcbiCerr << NcbiEndl;
694 }
695 }
696 if(del_feature)
697 {
698 if(problem_seqs.find(real_loc_string) == problem_seqs.end())
699 {
700 problem_seqs[real_loc_string]=problem_seqs[loc_string]; // saving for later
701 }
702 }
703 if(del_feature) feat=table.erase(feat);
704 else feat++;
705 }
706 if(table.size()==0) removeme=1;
707 if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): end" << NcbiEndl;
708 return removeme;
709 }
710
711 // RemoveInterim
712
RemoveInterim(void)713 int CReadBlastApp::RemoveInterim(void)
714 {
715
716 int nremoved=0;
717
718 if(PrintDetails()) NcbiCerr << "RemoveInterim: start " << NcbiEndl;
719 PushVerbosity();
720 for(CTypeIterator<CBioseq> seq=Begin(); seq; ++seq)
721 {
722 if(seq->IsSetAnnot() && seq->IsAa()) nremoved+= RemoveInterim(seq->SetAnnot());
723 if(seq->IsSetAnnot() && seq->IsNa()) nremoved+= RemoveInterim2(seq->SetAnnot());
724 }
725
726 PopVerbosity();
727 if(PrintDetails()) NcbiCerr << "RemoveInterim: end" << NcbiEndl;
728 return nremoved;
729 }
730 // protein sequence annotations
RemoveInterim(CBioseq::TAnnot & annots)731 int CReadBlastApp::RemoveInterim(CBioseq::TAnnot& annots)
732 {
733 int nremoved=0;
734 if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): start " << NcbiEndl;
735
736 for(CBioseq::TAnnot::iterator annot=annots.begin(), annot_end = annots.end(); annot != annot_end; )
737 {
738 bool erased = false;
739 if((*annot)->GetData().IsAlign())
740 {
741 nremoved++; erased = true;
742 }
743 if ( (*annot)->GetData().IsFtable())
744 {
745 int dremoved=0;
746 CSeq_annot::C_Data::TFtable& table = (*annot)->SetData().SetFtable();
747 for(CSeq_annot::C_Data::TFtable::iterator feat=table.begin(), feat_end= table.end(); feat != feat_end; )
748 {
749 string test = "Genomic Location:";
750 if ((*feat)->IsSetData() && (*feat)->GetData().IsProt() &&
751 (*feat)->IsSetComment() && (*feat)->GetComment().substr(0, test.size()) == test )
752 {
753 table.erase(feat++); dremoved++;
754 nremoved++;
755 }
756 else
757 {
758 feat++;
759 }
760 }
761
762 /*
763 this is really crappy way of doing it!
764 */
765 /*
766 while( (*annot)->GetData().GetFtable().size()>1 )
767 {
768 (*annot)->SetData().SetFtable().pop_back();
769 nremoved++;
770 dremoved++;
771 }
772 */
773 if(PrintDetails()) NcbiCerr << "RemoveInterim(CBioseq::TAnnot& annots): dremoved = "
774 << dremoved
775 << ", left=" << (*annot)->GetData().GetFtable().size()
776 << NcbiEndl;
777 if((*annot)->SetData().SetFtable().size() == 0)
778 {
779 nremoved++;
780 erased = true;
781 }
782 }
783 if(erased) annot=annots.erase(annot);
784 else annot++;
785 }
786
787 if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): end" << NcbiEndl;
788 return nremoved;
789 }
790
791 // nucleotide sequence annotations
RemoveInterim2(CBioseq::TAnnot & annots)792 int CReadBlastApp::RemoveInterim2(CBioseq::TAnnot& annots)
793 {
794 int nremoved=0;
795 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): start " << NcbiEndl;
796
797 NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, annots)
798 {
799 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature start" << NcbiEndl;
800 if ( !(*gen_feature)->GetData().IsFtable() ) continue;
801 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature is ftable" << NcbiEndl;
802 map<string,bool> feat_defined;
803 CSeq_annot::C_Data::TFtable& table = (*gen_feature)->SetData().SetFtable();
804 for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
805 {
806 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat start" << NcbiEndl;
807 const CSeq_feat& featr = **feat;
808 CNcbiStrstream buff;
809 buff << MSerial_AsnText << featr;
810 buff << '\0';
811 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): feat ASN:" << NcbiEndl;
812 if(PrintDetails()) NcbiCerr << buff.str() << NcbiEndl;
813 if(feat_defined.find(buff.str()) != feat_defined.end()) // dupe
814 {
815 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat dupe: erase" << NcbiEndl;
816 feat=table.erase(feat);
817 }
818 else
819 {
820 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map" << NcbiEndl;
821 feat_defined[buff.str()]=true;
822 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map done" << NcbiEndl;
823 // feat_defined[featr]=true;
824 feat++;
825 }
826 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat end" << NcbiEndl;
827 }
828
829 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature end" << NcbiEndl;
830 }
831
832 if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): end" << NcbiEndl;
833 return nremoved;
834 }
835
836
837
838
diagName(const string & type,const string & value)839 string diagName(const string& type, const string& value)
840 {
841 return type + "|" + value;
842 }
843
addProblems(list<problemStr> & dest,const list<problemStr> & src)844 int addProblems(list<problemStr>& dest, const list<problemStr>& src)
845 {
846 int n=0;
847 ITERATE(list<problemStr>, src_p, src)
848 {
849 dest.push_back(*src_p);
850 n++;
851 }
852 return n;
853 }
854
CollectRNAFeatures(TProblem_locs & problem_locs)855 int CReadBlastApp::CollectRNAFeatures(TProblem_locs& problem_locs)
856 {
857 ITERATE( diagMap, feat, m_diag)
858 {
859 ITERATE(list<problemStr>, problem, feat->second.problems)
860 {
861 bool added = false;
862 string name = feat->first;
863 string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
864 ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
865 if(ipos!=string::npos) name.erase(0, ipos+1);
866 string range = printed_range(problem->i1, problem->i2);
867 if(
868 (problem->type & eTRNABadStrand || problem->type & eTRNAUndefStrand) // irrelevant problem
869 && !problem->misc_feat_message.empty()
870 )
871 {
872 problem_locs[range].strand = problem->strand;
873 problem_locs[range].name = name;
874 problem_locs[range].count =
875 problem_locs[range].rnacount =
876 problem_locs[range].genecount = 0;
877 added=true;
878 }
879 if(PrintDetails())
880 NcbiCerr << "CReadBlastApp::CollectRNAFeatures: " << feat->first
881 << "[" << range << "]: "
882 << "(" << name << ")"
883 << (added ? "added" : "skipped") << NcbiEndl;
884 }
885 }
886 return problem_locs.size();
887
888 }
889
CollectFrameshiftedSeqs(map<string,string> & problem_names)890 int CReadBlastApp::CollectFrameshiftedSeqs(map<string,string>& problem_names)
891 {
892 const CArgs& args = GetArgs();
893 bool keep_frameshifted = args["kfs"].HasValue();
894 ITERATE( diagMap, feat, m_diag)
895 {
896 ITERATE(list<problemStr>, problem, feat->second.problems)
897 {
898 bool added = false;
899 string name = feat->first;
900 string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
901 ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
902 if(ipos!=string::npos) name.erase(0, ipos+1);
903 if(
904 (problem->type == eFrameShift && !keep_frameshifted)
905 ||
906 problem->type == eRemoveOverlap
907 ||
908 problem->type & eShortProtein
909 )
910 { problem_names[name]=ProblemType(problem->type); added=true; }
911 if(PrintDetails())
912 NcbiCerr << "CollectFrameshiftedSeqs: " << feat->first
913 << ": "
914 << "(" << name << ")"
915 << (added ? "added" : "skipped") << NcbiEndl;
916 }
917 }
918 return problem_names.size();
919 }
920
append_misc_feature(CBioseq_set::TSeq_set & seqs,const string & name,EProblem problem_type)921 void CReadBlastApp::append_misc_feature(CBioseq_set::TSeq_set& seqs, const string& name, EProblem problem_type)
922 {
923 if(m_diag.find(name)==m_diag.end())
924 {
925 // should not happen
926 NcbiCerr << "append_misc_feature: FATAL: do not have problems for " << name << NcbiEndl;
927 throw;
928 return;
929 }
930
931 NON_CONST_ITERATE( CBioseq_set::TSeq_set, na, seqs)
932 {
933 if( is_prot_entry((*na)->GetSeq()) ) continue;
934 list<CRef<CSeq_id> >& na_id = (*na)->SetSeq().SetId();
935 NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, (*na)->SetSeq().SetAnnot())
936 {
937 if ( !(*gen_feature)->GetData().IsFtable() ) continue;
938 typedef map<string, bool> Tmessage_misced;
939 typedef map<EProblem, Tmessage_misced> Tproblem_misced;
940 Tproblem_misced problem_misced; // list of problems fixed
941 ITERATE(list<problemStr>, problem, m_diag[name].problems)
942 {
943 if ( !(problem->type & problem_type) ) continue;
944 int from=-1, to=-1;
945 // string lt1, lt2;
946 ENa_strand strand;
947 string message="";
948 from = problem->i1;
949 if(from<0) continue;
950 // add new feature
951 to = problem->i2;
952 /*
953 lt1 = problem->id1;
954 lt2 = problem->id2;
955 */
956 strand = problem->strand;
957 message = problem->misc_feat_message;
958 if(message.size()==0) continue; // do not print empty misc_feat, they are empty for a reason
959 if(problem_misced.find(problem->type) != problem_misced.end() &&
960 problem_misced[problem->type].find(message) != problem_misced[problem->type].end()
961 ) continue;
962 else problem_misced[problem->type][message] = true;
963 SIZE_TYPE pos;
964 while((pos=message.find_first_of("\n\r"))!=string::npos)
965 {
966 message[pos]=' ';
967 }
968 CRef<CSeq_feat> feat(new CSeq_feat());
969
970 feat->SetComment(message);
971 feat->SetData().SetImp().SetKey("misc_feature");
972 feat->SetLocation().SetInt().SetFrom(from);
973 feat->SetLocation().SetInt().SetTo(to);
974 feat->SetLocation().SetInt().SetId(**na_id.begin());
975 feat->SetLocation().SetInt().SetStrand(strand);
976 (*gen_feature)->SetData().SetFtable().push_back(feat);
977 }
978
979 }
980 break;
981 }
982
983 return;
984 }
985
986
987
988
989