1 /* $Id: overlaps.cpp 618094 2020-10-09 21:37:50Z ucko $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34
35 // all things overlaps
36 // that i've done
37
find_overlap(TSimpleSeqs::iterator & seq,const TSimpleSeqs::iterator & ext_rna,TSimpleSeqs & seqs,TSimpleSeqs & best_seq)38 int CReadBlastApp::find_overlap(TSimpleSeqs::iterator& seq, const TSimpleSeqs::iterator& ext_rna,
39 TSimpleSeqs& seqs, TSimpleSeqs& best_seq)
40 {
41 int nseq=0;
42 int from = ext_rna->exons[0].from;
43 int to = ext_rna->exons[ext_rna->exons.size()-1].to;
44 string ext_rna_range = printed_range(ext_rna);
45 for(;seq!=seqs.end(); seq++, nseq++)
46 {
47 int from2 = seq->exons[0].from;
48 int to2 = seq->exons[seq->exons.size()-1].to;
49 bool over_origin = seq->exons.size()>1 && to2-from2 > m_length/2;
50 string ext_rna_range2 = printed_range(seq);
51 if(PrintDetails()) NcbiCerr << "find_overlap"
52 << "[" << ext_rna_range << "]"
53 << "[" << ext_rna_range2 << "]" << ", trying..." << NcbiEndl;
54
55 if(to2>=from || over_origin)
56 {
57 if(!over_origin) { if(PrintDetails()) NcbiCerr << "to2>=from" << NcbiEndl; }
58 else { if(PrintDetails()) NcbiCerr << "over_origin" << NcbiEndl; }
59 if(from2<=to || over_origin)
60 {
61 if(!over_origin) { if(PrintDetails()) NcbiCerr << "from2<=to" << NcbiEndl;}
62 else { if(PrintDetails()) NcbiCerr << "over_origin 2" << NcbiEndl;}
63 TSimpleSeqs::iterator seq2 = seq;
64 for(;seq2!=seqs.end(); seq2++)
65 {
66 int from2 = seq2->exons[0].from;
67 // int to2 = seq2->exons[seq->exons.size()-1].to;
68 string ext_rna_range2 = printed_range(seq2);
69 if(PrintDetails()) NcbiCerr << "\tfind_overlap"
70 << "[" << ext_rna_range << "]"
71 << "[" << ext_rna_range2 << "]" << ", trying 2..." << NcbiEndl;
72 if(from2>to) break;// last seq
73 int this_overlap;
74 overlaps(ext_rna, seq2, this_overlap);
75 if(PrintDetails()) NcbiCerr << "\tfind_overlap"
76 << "[" << ext_rna_range << "]"
77 << "[" << ext_rna_range2 << "]" << ", overlap = " << this_overlap << NcbiEndl;
78 if(this_overlap>0)
79 {
80 // best_seq.push_back(*seq); // this is serious.
81 best_seq.push_back(*seq2);
82 }
83 }
84 }
85 break;
86 }
87 }
88 return nseq;
89 }
90
find_overlap(TSimpleSeqs::iterator & seq,const TSimpleSeqs::iterator & ext_rna,TSimpleSeqs & seqs,int & overlap)91 int CReadBlastApp::find_overlap(TSimpleSeqs::iterator& seq, const TSimpleSeqs::iterator& ext_rna,
92 TSimpleSeqs& seqs, int& overlap)
93 {
94 int nseq=0;
95 int from = ext_rna->exons[0].from;
96 int to = ext_rna->exons[ext_rna->exons.size()-1].to;
97 CNcbiStrstream ext_rna_range_stream; ext_rna_range_stream << from << "..." << to << '\0';
98 string ext_rna_range = ext_rna_range_stream.str();
99 TSimpleSeqs::iterator& best_seq = seq;
100 for(;seq!=seqs.end(); seq++, nseq++)
101 {
102 int from2 = seq->exons[0].from;
103 int to2 = seq->exons[seq->exons.size()-1].to;
104 CNcbiStrstream ext_rna_range_stream2; ext_rna_range_stream2 << from2 << "..." << to2 << '\0';
105 string ext_rna_range2 = ext_rna_range_stream2.str();
106 if(PrintDetails()) NcbiCerr << "find_overlap"
107 << "[" << ext_rna_range << "]"
108 << "[" << ext_rna_range2 << "]" << ", trying..." << NcbiEndl;
109
110 if(to2>=from)
111 {
112 if(PrintDetails()) NcbiCerr << "to2>=from" << NcbiEndl;
113 if(from2<=to)
114 {
115 if(PrintDetails()) NcbiCerr << "from2<=to" << NcbiEndl;
116 TSimpleSeqs::iterator seq2 = seq;
117 for(;seq2!=seqs.end(); seq2++)
118 {
119 int from2 = seq2->exons[0].from;
120 int to2 = seq2->exons[seq->exons.size()-1].to;
121 CNcbiStrstream ext_rna_range_stream2; ext_rna_range_stream2 << from2 << "..." << to2 << '\0';
122 string ext_rna_range2 = ext_rna_range_stream2.str();
123 if(PrintDetails()) NcbiCerr << "\tfind_overlap"
124 << "[" << ext_rna_range << "]"
125 << "[" << ext_rna_range2 << "]" << ", trying 2..." << NcbiEndl;
126 if(from2>to) break;// last seq
127 int this_overlap;
128 overlaps(ext_rna, seq2, this_overlap);
129 if(PrintDetails()) NcbiCerr << "\tfind_overlap"
130 << "[" << ext_rna_range << "]"
131 << "[" << ext_rna_range2 << "]" << ", overlap = " << this_overlap << NcbiEndl;
132 if(this_overlap>overlap)
133 {
134 overlap=this_overlap;
135 best_seq = seq; // this one
136 }
137 }
138 }
139 break;
140 }
141 }
142 return nseq;
143 }
144
overlaps(const TSimpleSeqs::iterator & seq1,const TSimpleSeqs::iterator & seq2,int & overlap)145 int CReadBlastApp::overlaps(const TSimpleSeqs::iterator& seq1, const TSimpleSeqs::iterator& seq2, int& overlap)
146 {
147 overlap = 0;
148 for(TSimplePairs::const_iterator e1=seq1->exons.begin(); e1!=seq1->exons.end(); e1++)
149 {
150 for(TSimplePairs::const_iterator e2=seq2->exons.begin(); e2!=seq2->exons.end(); e2++)
151 {
152 int o = min(e2->to, e1->to)-max(e1->from, e2->from)+1;
153 if(o>0) overlap+=o;
154 }
155 }
156 return overlap;
157 }
158
overlaps_prot_na(CBioseq & seq,const CBioseq::TAnnot & annots)159 bool CReadBlastApp::overlaps_prot_na
160 (
161 CBioseq& seq,
162 const CBioseq::TAnnot& annots
163 )
164 {
165 bool result = false;
166 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[annots] starts" << NcbiEndl;
167 IncreaseVerbosity();
168 ITERATE(CBioseq::TAnnot, gen_feature, annots)
169 {
170 if ( !(*gen_feature)->GetData().IsFtable() ) continue;
171 bool lres;
172
173 lres = overlaps_prot_na(seq, (*gen_feature)->GetData().GetFtable());
174 result = lres || result;
175 }
176 DecreaseVerbosity();
177 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[annots] ends" << NcbiEndl;
178 return result;
179 }
overlaps_na(const CBioseq::TAnnot & annots)180 bool CReadBlastApp::overlaps_na
181 (
182 const CBioseq::TAnnot& annots
183 )
184 {
185 bool result = false;
186 if(PrintDetails()) NcbiCerr << "overlaps_na[annots] starts" << NcbiEndl;
187 IncreaseVerbosity();
188 ITERATE(CBioseq::TAnnot, gen_feature, annots)
189 {
190 if ( !(*gen_feature)->GetData().IsFtable() ) continue;
191 bool lres;
192
193 lres = overlaps_na((*gen_feature)->GetData().GetFtable());
194 result = lres || result;
195
196 }
197 DecreaseVerbosity();
198 if(PrintDetails()) NcbiCerr << "overlaps_na[annots] ends" << NcbiEndl;
199 return result;
200 }
201
overlaps_prot_na(CBioseq & seq,const CSeq_annot::C_Data::TFtable & feats)202 bool CReadBlastApp::overlaps_prot_na
203 (
204 CBioseq& seq,
205 const CSeq_annot::C_Data::TFtable& feats
206 )
207 {
208 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats] starts" << NcbiEndl;
209 IncreaseVerbosity();
210 bool result = false;
211 LocMap loc_map;
212 GetLocMap(loc_map, feats); // gene features for each location
213 EMyFeatureType seq_type = get_my_seq_type(seq);
214 string n2 = GetStringDescr (seq);
215 string name2 = GetProtName(seq);
216 /*
217 bool hasLoc = hasGenomicInterval(seq);
218 const CSeq_interval& seq_interval = getGenomicInterval(seq);
219 */
220 // bool hasLoc = hasGenomicLocation(seq);
221 const CSeq_loc& seq_interval = getGenomicLocation(seq);
222 TSeqPos from2, to2;
223 ENa_strand strand2;
224 getFromTo(seq_interval, from2, to2, strand2);
225
226 ITERATE(CSeq_annot::C_Data::TFtable, f1, feats)
227 {
228 int overlap;
229 if( !(*f1)->GetData().IsRna() ) continue;
230 // bool lres=overlaps_prot_na(n2, seq_interval, **f1, overlap);
231 bool lres=overlaps(seq_interval, (*f1)->GetLocation(), overlap);
232 // CSeqFeatData::E_Choice esite = (*f1)->GetData().Which();
233 if(lres && overlap >= m_rna_overlapThreshold)
234 {
235 string n1 = GetLocusTag(**f1, loc_map);
236 string trna_type = get_trna_string(**f1);
237 string name1;
238 if(trna_type.size()>0) name1 = trna_type;
239 else name1 = GetRNAname(**f1);
240 EMyFeatureType rna_feat_type = get_my_feat_type(**f1, loc_map);
241 // check overlaps with protein
242 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: FOUND OVERLAP" << NcbiEndl;
243 TSeqPos from1, to1;
244 ENa_strand strand1;
245 getFromTo((*f1)->GetLocation(), from1, to1, strand1);
246 int min1, min2, max1, max2;
247 min1 = min(from1, to1);
248 min2 = min(from2, to2);
249 max1 = max(from1, to1);
250 max2 = max(from2, to2);
251 // int mint; int maxt;
252 // mint = min(min1, min2);
253 // maxt = max(max1, max2);
254
255 distanceReportStr *report = new distanceReportStr;
256 int left_frame = (from1-1)%3+1;
257 int right_frame = (from2-1)%3+1;
258
259 report->left_strand = strand1;
260 report->right_strand = strand2;
261 report->q_loc_left_from = from1;
262 report->q_loc_right_from = from2;
263 report->q_loc_left_to = to1;
264 report->q_loc_right_to = to2;
265 report->q_id_left = n1;
266 report->q_id_right = n2;
267 report->q_name_left = name1;
268 report->q_name_right = name2;
269 report->space = overlap; // not used
270 report->left_frame = left_frame;
271 report->right_frame = right_frame;
272 report->loc1 = &((*f1)->GetLocation());
273 report->loc2 = &seq_interval;
274
275 char bufferchar[20480]; memset(bufferchar, 0, 20480);
276 CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 20480));
277 printOverlapReport(report, buffer);
278
279 CNcbiStrstream buff_misc_feat_rna;
280 buff_misc_feat_rna
281 << "potential RNA location ("
282 << name1 << ") that overlaps protein (" << get_title(seq) << ")" << '\0';
283
284 CNcbiStrstream buff_misc_feat_protein;
285 buff_misc_feat_protein
286 << "potential protein location ("
287 << get_title(seq) << ") that overlaps RNA (" << name1 << ")" << '\0';
288
289
290 CNcbiStrstream misc_feat_rna;
291 misc_feat_rna << buff_misc_feat_rna.str() << '\0';
292 CNcbiStrstream misc_feat_protein;
293 misc_feat_protein << buff_misc_feat_protein.str() << '\0';
294
295 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: created RNA buffer: " << buff_misc_feat_rna.str() << "\n";
296 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: created protein buffer: " << buff_misc_feat_protein.str() << "\n";
297 problemStr problem = {eRnaOverlap, buffer.str(), "", "", "", -1, -1, eNa_strand_unknown };
298 m_diag[n1].problems.push_back(problem);
299 bool removeit=false;
300 string removen = "";
301 // problemStr problemCOH = {eRemoveOverlap, "", misc_feat.str(), "", "", mint, maxt, eNa_strand_unknown };
302 if (rna_feat_type == eMyFeatureType_pseudo_tRNA && seq_type != eMyFeatureType_hypo_CDS)
303 {
304 NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: RNA location "
305 << n1 << " marked for deletion (pseudo)" << "\n";
306 removen = n1;
307 removeit=true;
308 }
309 else if (rna_feat_type == eMyFeatureType_atypical_tRNA && seq_type != eMyFeatureType_hypo_CDS)
310 {
311 NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: RNA location "
312 << n1 << " marked for deletion (atypical)" << "\n";
313 removen = n1;
314 removeit=true;
315 }
316 else if
317 (
318 (
319 (rna_feat_type == eMyFeatureType_normal_tRNA ||
320 rna_feat_type == eMyFeatureType_atypical_tRNA
321 )
322 && seq_type == eMyFeatureType_hypo_CDS
323 ) ||
324 rna_feat_type == eMyFeatureType_rRNA
325 )
326 {
327 NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: CDS and gene "
328 << n2 << " marked for deletion (hypothetical)" << "\n";
329 removen = n2;
330 removeit=true;
331 }
332 else
333 {
334 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: no deletion\n";
335 removeit=false;
336 }
337 if(removeit)
338 {
339 if(removen == n1) // RNA location is deleted
340 {
341 problemStr problemCOH = {eRemoveOverlap, "", misc_feat_rna.str(), "", "", min1, max1, strand1};
342 m_diag[removen].problems.push_back(problemCOH);
343 }
344 else // protein location is deleted
345 {
346 problemStr problemCOH = {eRemoveOverlap, "", misc_feat_protein.str(), "", "", min2, max2, strand2};
347 m_diag[removen].problems.push_back(problemCOH);
348 }
349 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: sequence "
350 << "[" << removen << "]"
351 << " is marked for removal"
352 << NcbiEndl;
353 try
354 {
355 CBioseq_set::TSeq_set* seqs = get_parent_seqset(seq);
356 if(seqs!=NULL) append_misc_feature(*seqs, removen, eRemoveOverlap);
357 }
358 catch(...)
359 {
360 NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for " << removen << NcbiEndl;
361 }
362 }
363 }
364 result = lres || result;
365 } // if(lres && overlap >= m_rna_overlapThreshold)
366
367 DecreaseVerbosity();
368 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats] ends" << NcbiEndl;
369 return result;
370 }
371
overlaps_na(const CSeq_annot::C_Data::TFtable & feats)372 bool CReadBlastApp::overlaps_na
373 (
374 const CSeq_annot::C_Data::TFtable& feats
375 )
376 {
377 if(PrintDetails()) NcbiCerr << "overlaps_na[feats] starts" << NcbiEndl;
378 IncreaseVerbosity();
379 bool result = false;
380 LocMap loc_map;
381 GetLocMap(loc_map, feats);
382 ITERATE(CSeq_annot::C_Data::TFtable, f1, feats)
383 {
384 if(PrintDetails()) NcbiCerr << "overlaps_na[feats] cycling through rna_feats" << NcbiEndl;
385 // check overlaps with externally defined tRNAs
386 if ( !(*f1)->GetData().IsRna() ) continue;
387 CRNA_ref::EType rna_type = (*f1)->GetData().GetRna().GetType();
388 if(rna_type != CRNA_ref::eType_tRNA && rna_type != CRNA_ref::eType_rRNA ) continue;
389 string type1;
390 if ( rna_type == CRNA_ref::eType_tRNA )
391 {
392 if ( !(*f1)->GetData().GetRna().CanGetExt() ) continue;
393 try { type1 = Get3type((*f1)->GetData().GetRna());}
394 catch (...)
395 {
396 NcbiCerr << "overlaps_na[feats]: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
397 throw;
398 }
399 }
400 else
401 {
402 type1 = GetRRNAtype((*f1)->GetData().GetRna());
403 }
404 if(type1.size()==0) continue;
405 match_na(**f1, type1);
406 }
407 result = true;
408 DecreaseVerbosity();
409 if(PrintDetails()) NcbiCerr << "overlaps_na[feats] ends" << NcbiEndl;
410 return result;
411 }
412
overlaps_prot_na(const string & n1,const CSeq_interval & i1,const CSeq_feat & f2,int & overlap)413 bool CReadBlastApp::overlaps_prot_na
414 (
415 const string& n1,
416 const CSeq_interval& i1,
417 const CSeq_feat& f2,
418 int& overlap
419 )
420 {
421 bool result = false;
422 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2] starts" << NcbiEndl;
423 overlap=0;
424 string n2="not gene";
425 if(f2.GetData().IsGene()) f2.GetData().GetGene().GetLabel(&n2);
426 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2]: input: "
427 << n1
428 << ","
429 << n2
430 << NcbiEndl;
431 result = overlaps(i1, f2.GetLocation(), overlap);
432
433 if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2] ends" << NcbiEndl;
434 return result;
435
436 }
437
438
overlaps_na(const CSeq_feat & f1,const CSeq_feat & f2,int & overlap)439 bool CReadBlastApp::overlaps_na
440 (
441 const CSeq_feat& f1,
442 const CSeq_feat& f2,
443 int& overlap
444 )
445 {
446 bool result = false;
447 if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2] starts" << NcbiEndl;
448 overlap=0;
449
450 string n1; f1.GetData().GetGene().GetLabel(&n1);
451 string n2; f2.GetData().GetGene().GetLabel(&n2);
452 if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2]: input: "
453 << n1
454 << ","
455 << n2
456 << NcbiEndl;
457 if(n1==n2) return result;
458
459 result = overlaps(f1.GetLocation(), f2.GetLocation(), overlap);
460
461 if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2] ends" << NcbiEndl;
462 return result;
463
464 }
465 template <typename t1, typename t2> bool
overlaps(const t1 & l1,const t2 & l2,int & overlap)466 CReadBlastApp::overlaps
467 (
468 /*
469 const CSeq_loc& l1,
470 const CSeq_loc& l2,
471 */
472 const t1& l1,
473 const t2& l2,
474 int& overlap
475 )
476 {
477 bool result = false;
478 overlap=0;
479 for (CTypeConstIterator<CSeq_interval> i1 = ncbi::ConstBegin(l1); i1; ++i1)
480 {
481 TSeqPos from1, to1, from2, to2;
482 ENa_strand strand1, strand2;
483 int min1, min2, max1, max2;
484 getFromTo( *i1, from1, to1, strand1);
485 min1 = min(from1, to1);
486 max1 = max(from1, to1);
487 for (CTypeConstIterator<CSeq_interval> i2 = ncbi::ConstBegin(l2); i2; ++i2)
488 {
489 getFromTo( *i2, from2, to2, strand2);
490 min2 = min(from2, to2);
491 max2 = max(from2, to2);
492 int overlap_start, overlap_end;
493 overlap_end = min(max1, max2);
494 overlap_start = max(min1, min2);
495
496 bool result2 = overlap_end >= overlap_start;
497 if(!result2) continue;
498 overlap+=overlap_end - overlap_start + 1;
499 result=true;
500 }
501 }
502 return result;
503 }
504
overlaps(const CSeq_loc & l1,int from2,int to2,int & overlap)505 bool CReadBlastApp::overlaps
506 (
507 const CSeq_loc& l1,
508 int from2, int to2,
509 int& overlap
510 )
511 {
512 bool result = false;
513 overlap=0;
514 TSeqPos from1, to1;
515 int min1, min2, max1, max2;
516 min2 = min(from2, to2);
517 max2 = max(from2, to2);
518 for (CTypeConstIterator<CSeq_interval> i1 = ::ConstBegin(l1); i1; ++i1)
519 {
520 ENa_strand strand1;
521 getFromTo(*i1, from1, to1, strand1);
522 min1 = min(from1, to1);
523 max1 = max(from1, to1);
524 int overlap_start, overlap_end;
525 overlap_end = min(max1, max2);
526 overlap_start = max(min1, min2);
527
528 bool result2 = overlap_end >= overlap_start;
529 if(result2) result=true; // found one segment that overlaps with the second input segment
530 overlap+=overlap_end - overlap_start + 1;
531 }
532 return result;
533 }
534
complete_overlap(const CSeq_loc & l1,const CSeq_loc & l2)535 bool CReadBlastApp::complete_overlap // l1 is covered by l2
536 (
537 const CSeq_loc& l1,
538 const CSeq_loc& l2
539 )
540 {
541 bool result = false;
542 for (CTypeConstIterator<CSeq_interval> i1 = ::ConstBegin(l1); i1; ++i1)
543 {
544 TSeqPos from1, to1, from2, to2;
545 ENa_strand strand1, strand2;
546 int min1, min2, max1, max2;
547 getFromTo( *i1, from1, to1, strand1);
548 min1 = min(from1, to1);
549 max1 = max(from1, to1);
550 result = false; // assume this piece
551 for (CTypeConstIterator<CSeq_interval> i2 = ::ConstBegin(l2); i2; ++i2)
552 {
553 getFromTo( *i2, from2, to2, strand2);
554 // note that this does not take into account weird cases when two pieces of l2 are stuck together without any gap, covering i1 in combination
555 min2 = min(from2, to2);
556 max2 = max(from2, to2);
557 if(min2<=min1 && max2>=max1) // found completely covering piece
558 {
559 if(PrintDetails()) NcbiCerr << "complete_overlap: "
560 << from1 << " ... " << to1 << " "
561 << from2 << " ... " << to2 << " "
562 << NcbiEndl;
563 result=true;
564 break;
565 }
566 }
567 if(!result) return result;
568 }
569 return result;
570 }
571
overlaps(const CBioseq & left,const CBioseq & right)572 bool CReadBlastApp::overlaps
573 // genomic interval is already stored from the NA_annotations
574 (
575 const CBioseq& left,
576 const CBioseq& right
577 )
578 {
579 bool result=false;
580 // check if prot
581 // string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
582 string qname = GetStringDescr (left);
583 string qrname = GetStringDescr (right);
584 if(PrintDetails()) NcbiCerr << "overlaps, seq level " << NcbiEndl;
585 if(PrintDetails()) NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
586 if(PrintDetails()) NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
587 // assuming that protein sequences come in one piece of seq-set
588 if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
589 if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
590 if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
591 if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
592 /*
593 if(!hasGenomicInterval(left)) return result;
594 if(!hasGenomicInterval(right)) return result;
595 const CSeq_interval& left_genomic_int = getGenomicInterval(left);
596 const CSeq_interval& right_genomic_int = getGenomicInterval(right);
597 */
598 if(!hasGenomicLocation(left)) return result;
599 if(!hasGenomicLocation(right)) return result;
600 const CSeq_loc& left_genomic_int = getGenomicLocation(left);
601 const CSeq_loc& right_genomic_int = getGenomicLocation(right);
602
603 if(PrintDetails()) NcbiCerr << "Got intervals" << NcbiEndl;
604 TSeqPos from1, to1, from2, to2;
605 ENa_strand left_strand;
606 ENa_strand right_strand;
607 getFromTo(left_genomic_int, from1, to1, left_strand);
608 getFromTo(right_genomic_int, from2, to2, right_strand);
609
610 if(PrintDetails()) NcbiCerr << "Got strands" << NcbiEndl;
611 int left_frame=-0xFF, right_frame=-0xFF;
612 if(left_genomic_int.IsInt())
613 {
614 left_frame = (from1-1)%3+1;
615 }
616 if(right_genomic_int.IsInt())
617 {
618 right_frame = (from2-1)%3+1;
619 }
620
621 if(left_strand != eNa_strand_plus && left_genomic_int.IsInt()) left_frame=-left_frame;
622 if(right_strand != eNa_strand_plus && right_genomic_int.IsInt()) right_frame=-right_frame;
623 /*
624 Tue 10/7/2008 9:25 AM, Bill Klimke + his consultation w/ Leigh Riley
625 opposite strands overlaps should be treated exactly the same way
626 */
627 // if(left_strand != right_strand) return result;
628 if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
629 int space =
630 (min((int)to1, (int)to2)-
631 max((int)from2, (int)from1)
632 +1
633 )/3;
634
635 bool complete_overlaps = false;
636 int scratch_overlap;
637 result = overlaps(left_genomic_int, right_genomic_int, scratch_overlap);
638 bool left_covered_by_right=false;
639 bool right_covered_by_left=false;
640 if(result) complete_overlaps = (left_covered_by_right=complete_overlap(left_genomic_int, right_genomic_int))
641 || (right_covered_by_left=complete_overlap(right_genomic_int, left_genomic_int));
642 if(PrintDetails()) NcbiCerr << "space = " << space
643 << ", complete_overlap = " << complete_overlaps
644 << ", result = " << result
645 << NcbiEndl;
646 if(result && scratch_overlap >= m_cds_overlapThreshold) // overlap
647 {
648 distanceReportStr *report = new distanceReportStr;
649 report->left_strand = left_strand;
650 report->right_strand = right_strand;
651 report->q_loc_left_from = from1;
652 report->q_loc_right_from = from2;
653 report->q_loc_left_to = to1;
654 report->q_loc_right_to = to2;
655 report->q_id_left = GetStringDescr (left);
656 report->q_id_right = GetStringDescr (right);
657 report->q_name_left = GetProtName(left);
658 report->q_name_right = GetProtName(right);
659 report->space = space;
660 report->left_frame = left_frame;
661 report->right_frame = right_frame;
662
663 char bufferchar[20480]; memset(bufferchar, 0, 20480);
664 CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 20480));
665 printOverlapReport(report, buffer);
666 /*
667 CNcbiStrstream misc_feat;
668 misc_feat << "potential protein locations )" << GetProtName(left)
669 << ") and " << printed_range(right)
670 << " overlap by " << scratch_overlap
671 << "bp"
672 << NcbiEndl << '\0';
673 */
674 CNcbiStrstream misc_feat_left;
675 misc_feat_left
676 << "potential protein location (" << GetProtName(left)
677 << ") that overlaps protein (" << GetProtName(right) << ")" << NcbiEndl << '\0';
678
679 CNcbiStrstream misc_feat_right;
680 misc_feat_right
681 << "potential protein location (" << GetProtName(right)
682 << ") that overlaps protein (" << GetProtName(left) << ")" << NcbiEndl << '\0';
683
684 // problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
685 // problemStr problemO = {eOverlap, buffer.str(), misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
686 // problemStr problem;
687 // problemStr problemCOH = {eRemoveOverlap , "", misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
688
689
690 if(complete_overlaps)
691 {
692 // problem = problemCO;
693 if(report->q_name_left.find("hypothetical")!=string::npos && left_covered_by_right && !right_covered_by_left)
694 {
695 NcbiCerr << "CReadBlastApp::overlaps: WARNING: sequence of a hypothetical protein "
696 << "[" << qname << "]"
697 << " is marked for removal because of a complete overlap"
698 << NcbiEndl;
699 problemStr problemCOH = {eRemoveOverlap , "", misc_feat_left.str(), "", "", (int)from1, (int)to1, left_strand};
700 problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
701 m_diag[qname].problems.push_back(problemCOH);
702 m_diag[qname].problems.push_back(problemCO);
703 try
704 {
705 CBioseq_set::TSeq_set* seqs = get_parent_seqset(left);
706 if(seqs!=NULL) append_misc_feature(*seqs, qname, eRemoveOverlap);
707 }
708 catch(...)
709 {
710 NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for "
711 << qname << NcbiEndl;
712 }
713 }
714 if(report->q_name_right.find("hypothetical")!=string::npos && right_covered_by_left)
715 {
716 NcbiCerr << "CReadBlastApp::overlaps: WARNING: sequence of a hypothetical protein "
717 << "[" << qrname << "]"
718 << " is marked for removal because of a complete overlap"
719 << NcbiEndl;
720 problemStr problemCOH = {eRemoveOverlap , "", misc_feat_right.str(), "", "", (int)from2, (int)to2, right_strand};
721 problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
722 m_diag[qrname].problems.push_back(problemCOH);
723 m_diag[qrname].problems.push_back(problemCO);
724 try
725 {
726 CBioseq_set::TSeq_set* seqs = get_parent_seqset(right);
727 if(seqs!=NULL) append_misc_feature(*seqs, qrname, eRemoveOverlap);
728 }
729 catch(...)
730 {
731 NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for "
732 << qrname << NcbiEndl;
733 }
734 }
735 {
736 problemStr problemO_l = {eCompleteOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
737 problemStr problemO_r = {eCompleteOverlap, "", misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
738 m_diag[qname].problems.push_back(problemO_l);
739 m_diag[qrname].problems.push_back(problemO_r);
740 }
741 }
742 else
743 {
744 problemStr problemO_l = {eOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
745 problemStr problemO_r = {eOverlap, "", misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
746 m_diag[qname].problems.push_back(problemO_l);
747 m_diag[qrname].problems.push_back(problemO_r);
748 }
749 delete report;
750 }
751 return result;
752 }
753
754