1 /*  $Id: overlaps.cpp 618094 2020-10-09 21:37:50Z ucko $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 
35 // all things overlaps
36 // that i've done
37 
find_overlap(TSimpleSeqs::iterator & seq,const TSimpleSeqs::iterator & ext_rna,TSimpleSeqs & seqs,TSimpleSeqs & best_seq)38 int CReadBlastApp::find_overlap(TSimpleSeqs::iterator& seq, const TSimpleSeqs::iterator& ext_rna,
39    TSimpleSeqs& seqs, TSimpleSeqs& best_seq)
40 {
41    int nseq=0;
42    int from = ext_rna->exons[0].from;
43    int to   = ext_rna->exons[ext_rna->exons.size()-1].to;
44    string ext_rna_range = printed_range(ext_rna);
45    for(;seq!=seqs.end(); seq++, nseq++)
46      {
47      int from2 = seq->exons[0].from;
48      int to2   = seq->exons[seq->exons.size()-1].to;
49      bool over_origin = seq->exons.size()>1 && to2-from2 > m_length/2;
50      string ext_rna_range2 = printed_range(seq);
51      if(PrintDetails()) NcbiCerr << "find_overlap"
52           << "[" << ext_rna_range << "]"
53           << "[" << ext_rna_range2 << "]" << ", trying..." << NcbiEndl;
54 
55      if(to2>=from || over_origin)
56        {
57        if(!over_origin) { if(PrintDetails()) NcbiCerr << "to2>=from" << NcbiEndl; }
58        else             { if(PrintDetails()) NcbiCerr << "over_origin" << NcbiEndl; }
59        if(from2<=to || over_origin)
60          {
61          if(!over_origin) { if(PrintDetails()) NcbiCerr << "from2<=to" << NcbiEndl;}
62          else             { if(PrintDetails()) NcbiCerr << "over_origin 2" << NcbiEndl;}
63          TSimpleSeqs::iterator seq2 = seq;
64          for(;seq2!=seqs.end(); seq2++)
65            {
66            int from2 = seq2->exons[0].from;
67            // int to2   = seq2->exons[seq->exons.size()-1].to;
68            string ext_rna_range2 = printed_range(seq2);
69            if(PrintDetails()) NcbiCerr << "\tfind_overlap"
70               << "[" << ext_rna_range << "]"
71               << "[" << ext_rna_range2 << "]" << ", trying 2..." << NcbiEndl;
72            if(from2>to) break;// last seq
73            int this_overlap;
74            overlaps(ext_rna, seq2, this_overlap);
75            if(PrintDetails()) NcbiCerr << "\tfind_overlap"
76               << "[" << ext_rna_range << "]"
77               << "[" << ext_rna_range2 << "]" << ", overlap = " << this_overlap << NcbiEndl;
78            if(this_overlap>0)
79              {
80 //           best_seq.push_back(*seq);  // this is serious.
81              best_seq.push_back(*seq2);
82              }
83            }
84          }
85        break;
86        }
87      }
88    return nseq;
89 }
90 
find_overlap(TSimpleSeqs::iterator & seq,const TSimpleSeqs::iterator & ext_rna,TSimpleSeqs & seqs,int & overlap)91 int CReadBlastApp::find_overlap(TSimpleSeqs::iterator& seq, const TSimpleSeqs::iterator& ext_rna,
92    TSimpleSeqs& seqs, int& overlap)
93 {
94    int nseq=0;
95    int from = ext_rna->exons[0].from;
96    int to   = ext_rna->exons[ext_rna->exons.size()-1].to;
97    CNcbiStrstream ext_rna_range_stream; ext_rna_range_stream << from << "..." << to << '\0';
98    string ext_rna_range = ext_rna_range_stream.str();
99    TSimpleSeqs::iterator& best_seq = seq;
100    for(;seq!=seqs.end(); seq++, nseq++)
101      {
102      int from2 = seq->exons[0].from;
103      int to2   = seq->exons[seq->exons.size()-1].to;
104      CNcbiStrstream ext_rna_range_stream2; ext_rna_range_stream2 << from2 << "..." << to2 << '\0';
105      string ext_rna_range2 = ext_rna_range_stream2.str();
106      if(PrintDetails()) NcbiCerr << "find_overlap"
107           << "[" << ext_rna_range << "]"
108           << "[" << ext_rna_range2 << "]" << ", trying..." << NcbiEndl;
109 
110      if(to2>=from)
111        {
112        if(PrintDetails()) NcbiCerr << "to2>=from" << NcbiEndl;
113        if(from2<=to)
114          {
115          if(PrintDetails()) NcbiCerr << "from2<=to" << NcbiEndl;
116          TSimpleSeqs::iterator seq2 = seq;
117          for(;seq2!=seqs.end(); seq2++)
118            {
119            int from2 = seq2->exons[0].from;
120            int to2   = seq2->exons[seq->exons.size()-1].to;
121            CNcbiStrstream ext_rna_range_stream2; ext_rna_range_stream2 << from2 << "..." << to2 << '\0';
122            string ext_rna_range2 = ext_rna_range_stream2.str();
123            if(PrintDetails()) NcbiCerr << "\tfind_overlap"
124               << "[" << ext_rna_range << "]"
125               << "[" << ext_rna_range2 << "]" << ", trying 2..." << NcbiEndl;
126            if(from2>to) break;// last seq
127            int this_overlap;
128            overlaps(ext_rna, seq2, this_overlap);
129            if(PrintDetails()) NcbiCerr << "\tfind_overlap"
130               << "[" << ext_rna_range << "]"
131               << "[" << ext_rna_range2 << "]" << ", overlap = " << this_overlap << NcbiEndl;
132            if(this_overlap>overlap)
133              {
134              overlap=this_overlap;
135              best_seq = seq; // this one
136              }
137            }
138          }
139        break;
140        }
141      }
142    return nseq;
143 }
144 
overlaps(const TSimpleSeqs::iterator & seq1,const TSimpleSeqs::iterator & seq2,int & overlap)145 int CReadBlastApp::overlaps(const TSimpleSeqs::iterator& seq1, const TSimpleSeqs::iterator& seq2, int& overlap)
146 {
147   overlap = 0;
148   for(TSimplePairs::const_iterator e1=seq1->exons.begin(); e1!=seq1->exons.end(); e1++)
149     {
150     for(TSimplePairs::const_iterator e2=seq2->exons.begin(); e2!=seq2->exons.end(); e2++)
151       {
152       int o = min(e2->to, e1->to)-max(e1->from, e2->from)+1;
153       if(o>0) overlap+=o;
154       }
155     }
156   return overlap;
157 }
158 
overlaps_prot_na(CBioseq & seq,const CBioseq::TAnnot & annots)159 bool CReadBlastApp::overlaps_prot_na
160   (
161   CBioseq& seq,
162   const CBioseq::TAnnot& annots
163   )
164 {
165   bool result = false;
166   if(PrintDetails()) NcbiCerr << "overlaps_prot_na[annots] starts" << NcbiEndl;
167   IncreaseVerbosity();
168   ITERATE(CBioseq::TAnnot, gen_feature, annots)
169     {
170     if ( !(*gen_feature)->GetData().IsFtable() ) continue;
171     bool lres;
172 
173     lres = overlaps_prot_na(seq, (*gen_feature)->GetData().GetFtable());
174     result = lres || result;
175     }
176   DecreaseVerbosity();
177   if(PrintDetails()) NcbiCerr << "overlaps_prot_na[annots] ends" << NcbiEndl;
178   return result;
179 }
overlaps_na(const CBioseq::TAnnot & annots)180 bool CReadBlastApp::overlaps_na
181   (
182   const CBioseq::TAnnot& annots
183   )
184 {
185    bool result = false;
186    if(PrintDetails()) NcbiCerr << "overlaps_na[annots] starts" << NcbiEndl;
187    IncreaseVerbosity();
188    ITERATE(CBioseq::TAnnot, gen_feature, annots)
189      {
190      if ( !(*gen_feature)->GetData().IsFtable() ) continue;
191      bool lres;
192 
193      lres = overlaps_na((*gen_feature)->GetData().GetFtable());
194      result = lres || result;
195 
196      }
197    DecreaseVerbosity();
198    if(PrintDetails()) NcbiCerr << "overlaps_na[annots] ends" << NcbiEndl;
199    return result;
200 }
201 
overlaps_prot_na(CBioseq & seq,const CSeq_annot::C_Data::TFtable & feats)202 bool CReadBlastApp::overlaps_prot_na
203   (
204   CBioseq& seq,
205   const CSeq_annot::C_Data::TFtable& feats
206   )
207 {
208    if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats] starts" << NcbiEndl;
209    IncreaseVerbosity();
210    bool result = false;
211    LocMap loc_map;
212    GetLocMap(loc_map, feats); // gene features for each location
213    EMyFeatureType seq_type = get_my_seq_type(seq);
214    string n2 = GetStringDescr (seq);
215    string name2 = GetProtName(seq);
216 /*
217    bool hasLoc = hasGenomicInterval(seq);
218    const CSeq_interval& seq_interval = getGenomicInterval(seq);
219 */
220    // bool hasLoc = hasGenomicLocation(seq);
221    const CSeq_loc& seq_interval = getGenomicLocation(seq);
222    TSeqPos from2, to2;
223    ENa_strand strand2;
224    getFromTo(seq_interval, from2, to2, strand2);
225 
226    ITERATE(CSeq_annot::C_Data::TFtable, f1, feats)
227      {
228      int overlap;
229      if( !(*f1)->GetData().IsRna() ) continue;
230 //     bool lres=overlaps_prot_na(n2, seq_interval, **f1, overlap);
231      bool lres=overlaps(seq_interval, (*f1)->GetLocation(), overlap);
232      // CSeqFeatData::E_Choice esite = (*f1)->GetData().Which();
233      if(lres && overlap >= m_rna_overlapThreshold)
234        {
235        string n1  =  GetLocusTag(**f1, loc_map);
236        string trna_type = get_trna_string(**f1);
237        string name1;
238        if(trna_type.size()>0) name1 = trna_type;
239        else name1 = GetRNAname(**f1);
240        EMyFeatureType rna_feat_type = get_my_feat_type(**f1, loc_map);
241 // check overlaps with protein
242        if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: FOUND OVERLAP" << NcbiEndl;
243        TSeqPos from1, to1;
244        ENa_strand strand1;
245        getFromTo((*f1)->GetLocation(), from1, to1, strand1);
246        int min1, min2, max1, max2;
247        min1 = min(from1, to1);
248        min2 = min(from2, to2);
249        max1 = max(from1, to1);
250        max2 = max(from2, to2);
251        // int mint; int maxt;
252        // mint = min(min1, min2);
253        // maxt = max(max1, max2);
254 
255        distanceReportStr *report = new distanceReportStr;
256        int left_frame  = (from1-1)%3+1;
257        int right_frame  = (from2-1)%3+1;
258 
259        report->left_strand  = strand1;
260        report->right_strand = strand2;
261        report->q_loc_left_from  = from1;
262        report->q_loc_right_from = from2;
263        report->q_loc_left_to    = to1;
264        report->q_loc_right_to   = to2;
265        report->q_id_left    = n1;
266        report->q_id_right   = n2;
267        report->q_name_left  = name1;
268        report->q_name_right = name2;
269        report->space          = overlap; // not used
270        report->left_frame     = left_frame;
271        report->right_frame    = right_frame;
272        report->loc1 = &((*f1)->GetLocation());
273        report->loc2 = &seq_interval;
274 
275        char bufferchar[20480];  memset(bufferchar, 0, 20480);
276        CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 20480));
277        printOverlapReport(report, buffer);
278 
279        CNcbiStrstream buff_misc_feat_rna;
280        buff_misc_feat_rna
281             << "potential RNA location ("
282             << name1 << ") that overlaps protein (" << get_title(seq) << ")" << '\0';
283 
284        CNcbiStrstream buff_misc_feat_protein;
285        buff_misc_feat_protein
286             << "potential protein location ("
287             << get_title(seq) << ") that overlaps RNA (" << name1 << ")" << '\0';
288 
289 
290        CNcbiStrstream misc_feat_rna;
291        misc_feat_rna << buff_misc_feat_rna.str() << '\0';
292        CNcbiStrstream misc_feat_protein;
293        misc_feat_protein << buff_misc_feat_protein.str() << '\0';
294 
295        if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: created RNA buffer: "     << buff_misc_feat_rna.str()     << "\n";
296        if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: created protein buffer: " << buff_misc_feat_protein.str() << "\n";
297        problemStr problem = {eRnaOverlap,  buffer.str(), "", "", "", -1, -1, eNa_strand_unknown };
298        m_diag[n1].problems.push_back(problem);
299        bool removeit=false;
300        string removen = "";
301 //       problemStr problemCOH = {eRemoveOverlap, "", misc_feat.str(), "", "", mint, maxt, eNa_strand_unknown };
302        if      (rna_feat_type == eMyFeatureType_pseudo_tRNA && seq_type != eMyFeatureType_hypo_CDS)
303          {
304          NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: RNA location "
305            << n1 << " marked for deletion (pseudo)" << "\n";
306          removen = n1;
307          removeit=true;
308          }
309        else if (rna_feat_type == eMyFeatureType_atypical_tRNA && seq_type != eMyFeatureType_hypo_CDS)
310          {
311          NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: RNA location "
312                << n1 << " marked for deletion (atypical)" << "\n";
313          removen = n1;
314          removeit=true;
315          }
316        else if
317          (
318            (
319              (rna_feat_type == eMyFeatureType_normal_tRNA ||
320              rna_feat_type == eMyFeatureType_atypical_tRNA
321              )
322              && seq_type == eMyFeatureType_hypo_CDS
323            ) ||
324            rna_feat_type == eMyFeatureType_rRNA
325          )
326          {
327          NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: CDS and gene "
328                << n2 << " marked for deletion (hypothetical)" << "\n";
329          removen = n2;
330          removeit=true;
331          }
332        else
333          {
334          if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: no deletion\n";
335          removeit=false;
336          }
337        if(removeit)
338          {
339          if(removen == n1) // RNA location is deleted
340            {
341            problemStr problemCOH = {eRemoveOverlap, "", misc_feat_rna.str(), "", "", min1, max1, strand1};
342            m_diag[removen].problems.push_back(problemCOH);
343            }
344          else // protein location is deleted
345            {
346            problemStr problemCOH = {eRemoveOverlap, "", misc_feat_protein.str(), "", "", min2, max2, strand2};
347            m_diag[removen].problems.push_back(problemCOH);
348            }
349          if(PrintDetails())  NcbiCerr << "overlaps_prot_na[seq,feats]: sequence "
350                   << "[" << removen << "]"
351                   << " is marked for removal"
352                   << NcbiEndl;
353          try
354            {
355            CBioseq_set::TSeq_set* seqs = get_parent_seqset(seq);
356            if(seqs!=NULL) append_misc_feature(*seqs, removen, eRemoveOverlap);
357            }
358          catch(...)
359            {
360            NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for " << removen << NcbiEndl;
361            }
362          }
363        }
364      result = lres || result;
365      } //  if(lres && overlap >= m_rna_overlapThreshold)
366 
367    DecreaseVerbosity();
368    if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats] ends" << NcbiEndl;
369    return result;
370 }
371 
overlaps_na(const CSeq_annot::C_Data::TFtable & feats)372 bool CReadBlastApp::overlaps_na
373   (
374   const CSeq_annot::C_Data::TFtable& feats
375   )
376 {
377    if(PrintDetails()) NcbiCerr << "overlaps_na[feats] starts" << NcbiEndl;
378    IncreaseVerbosity();
379    bool result = false;
380    LocMap loc_map;
381    GetLocMap(loc_map, feats);
382    ITERATE(CSeq_annot::C_Data::TFtable, f1, feats)
383      {
384      if(PrintDetails()) NcbiCerr << "overlaps_na[feats] cycling through rna_feats" << NcbiEndl;
385 // check overlaps with externally defined tRNAs
386      if ( !(*f1)->GetData().IsRna() ) continue;
387      CRNA_ref::EType rna_type = (*f1)->GetData().GetRna().GetType();
388      if(rna_type != CRNA_ref::eType_tRNA && rna_type != CRNA_ref::eType_rRNA ) continue;
389      string type1;
390      if ( rna_type == CRNA_ref::eType_tRNA )
391        {
392        if ( !(*f1)->GetData().GetRna().CanGetExt() ) continue;
393        try { type1 = Get3type((*f1)->GetData().GetRna());}
394        catch  (...)
395          {
396          NcbiCerr << "overlaps_na[feats]: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
397          throw;
398          }
399        }
400      else
401        {
402        type1 = GetRRNAtype((*f1)->GetData().GetRna());
403        }
404      if(type1.size()==0) continue;
405      match_na(**f1, type1);
406      }
407    result = true;
408    DecreaseVerbosity();
409    if(PrintDetails()) NcbiCerr << "overlaps_na[feats] ends" << NcbiEndl;
410    return result;
411 }
412 
overlaps_prot_na(const string & n1,const CSeq_interval & i1,const CSeq_feat & f2,int & overlap)413 bool CReadBlastApp::overlaps_prot_na
414   (
415   const string& n1,
416   const CSeq_interval& i1,
417   const CSeq_feat& f2,
418   int& overlap
419   )
420 {
421   bool result = false;
422   if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2] starts" << NcbiEndl;
423   overlap=0;
424   string n2="not gene";
425   if(f2.GetData().IsGene()) f2.GetData().GetGene().GetLabel(&n2);
426   if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2]: input: "
427                                << n1
428                                << ","
429                                << n2
430                                << NcbiEndl;
431   result = overlaps(i1, f2.GetLocation(), overlap);
432 
433   if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2] ends" << NcbiEndl;
434   return result;
435 
436 }
437 
438 
overlaps_na(const CSeq_feat & f1,const CSeq_feat & f2,int & overlap)439 bool CReadBlastApp::overlaps_na
440   (
441   const CSeq_feat& f1,
442   const CSeq_feat& f2,
443   int& overlap
444   )
445 {
446    bool result = false;
447    if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2] starts" << NcbiEndl;
448    overlap=0;
449 
450    string n1; f1.GetData().GetGene().GetLabel(&n1);
451    string n2; f2.GetData().GetGene().GetLabel(&n2);
452    if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2]: input: "
453                                << n1
454                                << ","
455                                << n2
456                                << NcbiEndl;
457    if(n1==n2) return result;
458 
459    result = overlaps(f1.GetLocation(), f2.GetLocation(), overlap);
460 
461    if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2] ends" << NcbiEndl;
462    return result;
463 
464 }
465 template <typename t1, typename t2> bool
overlaps(const t1 & l1,const t2 & l2,int & overlap)466 CReadBlastApp::overlaps
467   (
468 /*
469   const CSeq_loc& l1,
470   const CSeq_loc& l2,
471 */
472   const t1& l1,
473   const t2& l2,
474   int& overlap
475   )
476 {
477   bool result = false;
478   overlap=0;
479   for (CTypeConstIterator<CSeq_interval> i1 = ncbi::ConstBegin(l1); i1; ++i1)
480     {
481     TSeqPos from1, to1, from2, to2;
482     ENa_strand strand1, strand2;
483     int min1, min2, max1, max2;
484     getFromTo( *i1, from1, to1, strand1);
485     min1 = min(from1, to1);
486     max1 = max(from1, to1);
487     for (CTypeConstIterator<CSeq_interval> i2 = ncbi::ConstBegin(l2); i2; ++i2)
488       {
489       getFromTo( *i2, from2, to2, strand2);
490       min2 = min(from2, to2);
491       max2 = max(from2, to2);
492       int overlap_start, overlap_end;
493       overlap_end = min(max1, max2);
494       overlap_start = max(min1, min2);
495 
496       bool result2 = overlap_end >= overlap_start;
497       if(!result2) continue;
498       overlap+=overlap_end - overlap_start + 1;
499       result=true;
500       }
501     }
502   return result;
503 }
504 
overlaps(const CSeq_loc & l1,int from2,int to2,int & overlap)505 bool CReadBlastApp::overlaps
506   (
507   const CSeq_loc& l1,
508   int from2, int to2,
509   int& overlap
510   )
511 {
512   bool result = false;
513   overlap=0;
514   TSeqPos from1, to1;
515   int min1, min2, max1, max2;
516   min2 = min(from2, to2);
517   max2 = max(from2, to2);
518   for (CTypeConstIterator<CSeq_interval> i1 = ::ConstBegin(l1); i1; ++i1)
519     {
520     ENa_strand strand1;
521     getFromTo(*i1, from1, to1, strand1);
522     min1 = min(from1, to1);
523     max1 = max(from1, to1);
524     int overlap_start, overlap_end;
525     overlap_end = min(max1, max2);
526     overlap_start = max(min1, min2);
527 
528     bool result2 = overlap_end >= overlap_start;
529     if(result2) result=true; // found one segment that overlaps with the second input segment
530     overlap+=overlap_end - overlap_start + 1;
531     }
532   return result;
533 }
534 
complete_overlap(const CSeq_loc & l1,const CSeq_loc & l2)535 bool CReadBlastApp::complete_overlap // l1 is covered by l2
536   (
537   const CSeq_loc& l1,
538   const CSeq_loc& l2
539   )
540 {
541   bool result = false;
542   for (CTypeConstIterator<CSeq_interval> i1 = ::ConstBegin(l1); i1; ++i1)
543     {
544     TSeqPos from1, to1, from2, to2;
545     ENa_strand strand1, strand2;
546     int min1, min2, max1, max2;
547     getFromTo( *i1, from1, to1, strand1);
548     min1 = min(from1, to1);
549     max1 = max(from1, to1);
550     result = false;  // assume this piece
551     for (CTypeConstIterator<CSeq_interval> i2 = ::ConstBegin(l2); i2; ++i2)
552       {
553       getFromTo( *i2, from2, to2, strand2);
554 // note that this does not take into account weird cases when two pieces of l2 are stuck together without any gap, covering i1 in combination
555       min2 = min(from2, to2);
556       max2 = max(from2, to2);
557       if(min2<=min1 && max2>=max1) // found completely covering piece
558         {
559         if(PrintDetails()) NcbiCerr << "complete_overlap: "
560             << from1 << " ... " << to1 << " "
561             << from2 << " ... " << to2 << " "
562             << NcbiEndl;
563         result=true;
564         break;
565         }
566       }
567     if(!result) return result;
568     }
569   return result;
570 }
571 
overlaps(const CBioseq & left,const CBioseq & right)572 bool CReadBlastApp::overlaps
573 // genomic interval is already stored from the NA_annotations
574   (
575   const CBioseq& left,
576   const CBioseq& right
577   )
578 {
579   bool result=false;
580   // check if prot
581   // string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
582   string qname = GetStringDescr (left);
583   string qrname = GetStringDescr (right);
584   if(PrintDetails()) NcbiCerr << "overlaps, seq level " << NcbiEndl;
585   if(PrintDetails()) NcbiCerr << "left  " <<  GetStringDescr (left) << NcbiEndl;
586   if(PrintDetails()) NcbiCerr << "right " <<  GetStringDescr (right) << NcbiEndl;
587 // assuming that protein sequences come in one piece of seq-set
588   if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
589   if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
590   if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
591   if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
592 /*
593   if(!hasGenomicInterval(left)) return result;
594   if(!hasGenomicInterval(right)) return result;
595   const CSeq_interval& left_genomic_int = getGenomicInterval(left);
596   const CSeq_interval& right_genomic_int = getGenomicInterval(right);
597 */
598   if(!hasGenomicLocation(left)) return result;
599   if(!hasGenomicLocation(right)) return result;
600   const CSeq_loc& left_genomic_int = getGenomicLocation(left);
601   const CSeq_loc& right_genomic_int = getGenomicLocation(right);
602 
603   if(PrintDetails()) NcbiCerr << "Got intervals" << NcbiEndl;
604   TSeqPos from1, to1, from2, to2;
605   ENa_strand  left_strand;
606   ENa_strand right_strand;
607   getFromTo(left_genomic_int, from1, to1, left_strand);
608   getFromTo(right_genomic_int, from2, to2, right_strand);
609 
610   if(PrintDetails()) NcbiCerr << "Got strands" << NcbiEndl;
611   int left_frame=-0xFF, right_frame=-0xFF;
612   if(left_genomic_int.IsInt())
613     {
614     left_frame  = (from1-1)%3+1;
615     }
616   if(right_genomic_int.IsInt())
617     {
618     right_frame  = (from2-1)%3+1;
619     }
620 
621   if(left_strand  != eNa_strand_plus && left_genomic_int.IsInt())  left_frame=-left_frame;
622   if(right_strand != eNa_strand_plus && right_genomic_int.IsInt()) right_frame=-right_frame;
623 /*
624   Tue 10/7/2008 9:25 AM, Bill Klimke + his consultation w/ Leigh Riley
625   opposite strands overlaps should be treated exactly the same way
626 */
627   // if(left_strand != right_strand) return result;
628   if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
629   int space =
630                  (min((int)to1, (int)to2)-
631                   max((int)from2, (int)from1)
632                   +1
633                  )/3;
634 
635   bool complete_overlaps = false;
636   int scratch_overlap;
637   result = overlaps(left_genomic_int, right_genomic_int, scratch_overlap);
638   bool  left_covered_by_right=false;
639   bool  right_covered_by_left=false;
640   if(result) complete_overlaps = (left_covered_by_right=complete_overlap(left_genomic_int, right_genomic_int))
641                        || (right_covered_by_left=complete_overlap(right_genomic_int, left_genomic_int));
642   if(PrintDetails()) NcbiCerr << "space = " << space
643      << ", complete_overlap = " << complete_overlaps
644      << ", result = " << result
645      << NcbiEndl;
646   if(result && scratch_overlap >= m_cds_overlapThreshold)  // overlap
647     {
648     distanceReportStr *report = new distanceReportStr;
649     report->left_strand  = left_strand;
650     report->right_strand = right_strand;
651     report->q_loc_left_from  = from1;
652     report->q_loc_right_from = from2;
653     report->q_loc_left_to    = to1;
654     report->q_loc_right_to   = to2;
655     report->q_id_left    = GetStringDescr (left);
656     report->q_id_right   = GetStringDescr (right);
657     report->q_name_left  = GetProtName(left);
658     report->q_name_right = GetProtName(right);
659     report->space          = space;
660     report->left_frame     = left_frame;
661     report->right_frame    = right_frame;
662 
663     char bufferchar[20480];  memset(bufferchar, 0, 20480);
664     CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 20480));
665     printOverlapReport(report, buffer);
666 /*
667     CNcbiStrstream misc_feat;
668     misc_feat << "potential protein locations )" << GetProtName(left)
669             << ") and " << printed_range(right)
670             << " overlap by " << scratch_overlap
671             << "bp"
672             << NcbiEndl << '\0';
673 */
674     CNcbiStrstream misc_feat_left;
675     misc_feat_left
676        << "potential protein location (" << GetProtName(left)
677        << ") that overlaps protein (" << GetProtName(right) << ")" << NcbiEndl << '\0';
678 
679     CNcbiStrstream misc_feat_right;
680     misc_feat_right
681        << "potential protein location (" << GetProtName(right)
682        << ") that overlaps protein (" << GetProtName(left) << ")" << NcbiEndl << '\0';
683 
684     // problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
685     // problemStr problemO = {eOverlap, buffer.str(), misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
686     // problemStr problem;
687     // problemStr problemCOH = {eRemoveOverlap   , "", misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
688 
689 
690     if(complete_overlaps)
691       {
692       // problem  = problemCO;
693       if(report->q_name_left.find("hypothetical")!=string::npos && left_covered_by_right && !right_covered_by_left)
694         {
695         NcbiCerr << "CReadBlastApp::overlaps: WARNING: sequence of a hypothetical protein "
696                  << "[" << qname << "]"
697                  << " is marked for removal because of a complete overlap"
698                  << NcbiEndl;
699         problemStr problemCOH = {eRemoveOverlap   , "", misc_feat_left.str(), "", "", (int)from1, (int)to1, left_strand};
700         problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
701         m_diag[qname].problems.push_back(problemCOH);
702         m_diag[qname].problems.push_back(problemCO);
703         try
704           {
705           CBioseq_set::TSeq_set* seqs = get_parent_seqset(left);
706           if(seqs!=NULL) append_misc_feature(*seqs, qname, eRemoveOverlap);
707           }
708         catch(...)
709           {
710           NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for "
711                    << qname << NcbiEndl;
712           }
713         }
714       if(report->q_name_right.find("hypothetical")!=string::npos && right_covered_by_left)
715         {
716         NcbiCerr << "CReadBlastApp::overlaps: WARNING: sequence of a hypothetical protein "
717                  << "[" << qrname << "]"
718                  << " is marked for removal because of a complete overlap"
719                  << NcbiEndl;
720         problemStr problemCOH = {eRemoveOverlap   , "", misc_feat_right.str(), "", "", (int)from2, (int)to2, right_strand};
721         problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
722         m_diag[qrname].problems.push_back(problemCOH);
723         m_diag[qrname].problems.push_back(problemCO);
724         try
725           {
726           CBioseq_set::TSeq_set* seqs = get_parent_seqset(right);
727           if(seqs!=NULL) append_misc_feature(*seqs, qrname, eRemoveOverlap);
728           }
729         catch(...)
730           {
731           NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for "
732                    << qrname << NcbiEndl;
733           }
734         }
735       {
736       problemStr problemO_l = {eCompleteOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
737       problemStr problemO_r = {eCompleteOverlap, "", misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
738       m_diag[qname].problems.push_back(problemO_l);
739       m_diag[qrname].problems.push_back(problemO_r);
740       }
741       }
742     else
743       {
744       problemStr problemO_l = {eOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
745       problemStr problemO_r = {eOverlap, "", misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
746       m_diag[qname].problems.push_back(problemO_l);
747       m_diag[qrname].problems.push_back(problemO_r);
748       }
749     delete report;
750     }
751   return result;
752 }
753 
754