1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 
fit_blast(const CBioseq & left,const CBioseq & right,string & common_subject)35 bool CReadBlastApp::fit_blast
36   (
37   const CBioseq& left,
38   const CBioseq& right,
39   string& common_subject
40   )
41 {
42   bool result=false;
43   // check if prot
44   if(PrintDetails()) NcbiCerr << "fit_blast, seq level " << NcbiEndl;
45 //  string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
46 //  string qrname = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
47   string qname = GetStringDescr (left);
48   string qrname = GetStringDescr (right);
49   // if(PrintDetails()) NcbiCerr << "left  " <<  CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA) << NcbiEndl;
50   // if(PrintDetails()) NcbiCerr << "right " << CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA) << NcbiEndl;
51   if(PrintDetails()) NcbiCerr << "left  " <<  GetStringDescr (left) << NcbiEndl;
52   if(PrintDetails()) NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
53 // assuming that protein sequences come in one piece of seq-set
54   if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
55   if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
56   if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
57   if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
58 /*
59   if(!hasGenomicInterval(left)) return result;
60   if(!hasGenomicInterval(right)) return result;
61 
62   const CSeq_interval& left_genomic_int = getGenomicInterval(left);
63   const CSeq_interval& right_genomic_int = getGenomicInterval(right);
64 */
65   if(!hasGenomicLocation(left)) return result;
66   if(!hasGenomicLocation(right)) return result;
67 
68   const CSeq_loc& left_genomic_int = getGenomicLocation(left);
69   const CSeq_loc& right_genomic_int = getGenomicLocation(right);
70   TSeqPos from1, to1, from2, to2;
71 
72   ENa_strand  left_strand;
73   ENa_strand right_strand;
74   getFromTo(left_genomic_int, from1, to1, left_strand);
75   getFromTo(right_genomic_int, from2, to2, right_strand);
76 
77   int left_frame  = (from1-1)%3+1; // might be not making sense of location is composite
78   int right_frame = (from2-1)%3+1;
79   if(left_strand  != eNa_strand_plus )  left_frame=-left_frame;
80   if(right_strand != eNa_strand_plus ) right_frame=-right_frame;
81   if(left_strand != right_strand)
82     {
83     if(PrintDetails())
84        {
85        NcbiCerr << "Strands mismatch for " << NcbiEndl;
86        NcbiCerr << "left  " <<  GetStringDescr (left) << NcbiEndl;
87        NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
88        }
89     return result;
90     }
91   if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
92   bool reverse = left_strand == eNa_strand_minus;
93   int space = ((int)from2-(int)to1)/3+1; // +1 for stop codon, /3 to convert n to aa
94   if(PrintDetails()) NcbiCerr << "space: " << space << NcbiEndl;
95   int left_qLen  = getQueryLen(left);
96   int right_qLen = getQueryLen(right);
97   int num=0, numFit=0, numGI=0;
98   int numLeft=0, numRight=0;
99   vector<perfectHitStr> left_perfect;
100   vector<perfectHitStr> right_perfect;
101   collectPerfectHits(left_perfect, left);
102   collectPerfectHits(right_perfect, right);
103 
104   if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
105   int n_frameshift_pairs=0;
106   list<problemStr> problemsl;
107   list<problemStr> problemsr;
108   IncreaseVerbosity();
109   common_subject="";
110   string any_common_subject="";
111   ITERATE(CBioseq::TAnnot, left_annot, left.GetAnnot())
112     {
113     if(PrintDetails()) NcbiCerr << "Next left annot: numLeft: " << numLeft << NcbiEndl;
114     if(!(*left_annot)->GetData().IsAlign()) continue;
115     numLeft++;
116     if(PrintDetails()) NcbiCerr << "Left annot is alignment: numLeft: " << numLeft << NcbiEndl;
117     vector<long> left_gis = getGIs(left_annot);
118     if(PrintDetails()) NcbiCerr << "Left annot is alignment: left_gis: " << left_gis.size()  << NcbiEndl;
119 
120     IncreaseVerbosity();
121     ITERATE(CBioseq::TAnnot, right_annot, right.GetAnnot())
122       {
123       if(PrintDetails()) NcbiCerr << "Next right annot: numRight: " << numRight << NcbiEndl;
124       if(!(*right_annot)->GetData().IsAlign()) continue;
125       numRight++;
126       if(PrintDetails()) NcbiCerr << "Right annot is alignment: numRight: " << numRight << NcbiEndl;
127       vector<long> right_gis = getGIs(right_annot);
128       if(PrintDetails()) NcbiCerr << "Right annot is alignment: right_gis: " << right_gis.size()  << NcbiEndl;
129       PushVerbosity();
130       if(!giMatch(left_gis, right_gis))
131          {
132          PopVerbosity();
133          continue;
134          }
135       PopVerbosity();
136       if(PrintDetails()) NcbiCerr << "!!!!!!!!! GI Match !!!!!!!!!!" << NcbiEndl;
137       numGI++;
138       distanceReportStr *report = new distanceReportStr;
139       report->left_strand  = left_strand;
140       report->right_strand = right_strand;
141       report->s_id = left_gis[0];
142       report->q_loc_left_from  = reverse ? from2: from1;
143       report->q_loc_right_from = reverse ? from1 : from2;
144       report->q_loc_left_to    = reverse ? to2 : to1;
145       report->q_loc_right_to   = reverse ? to1 : to2;
146       int frame_from=report->q_loc_left_from;
147       int frame_to  =frame_from;
148       frame_from = min(frame_from, report->q_loc_left_from);
149       frame_to   = max(frame_to  , report->q_loc_left_from);
150       frame_from = min(frame_from, report->q_loc_right_from);
151       frame_to   = max(frame_to  , report->q_loc_right_from);
152       frame_from = min(frame_from, report->q_loc_left_to  );
153       frame_to   = max(frame_to  , report->q_loc_left_to  );
154       frame_from = min(frame_from, report->q_loc_right_to );
155       frame_to   = max(frame_to  , report->q_loc_right_to );
156       report->left_frame       = left_frame;
157       report->right_frame      = right_frame;
158       if( (!reverse && fit_blast(left, right, left_annot, right_annot, left_qLen, right_qLen, space, report)) ||
159           ( reverse && fit_blast(right, left, right_annot, left_annot, right_qLen, left_qLen, space, report))
160         )
161         {
162         numFit++;
163         string this_subject = getAnnotName(left_annot);
164         if(numFit==1) any_common_subject = this_subject;
165         if(common_subject.size()==0 && this_subject.find("hypothetical")==string::npos)
166           {
167 // best subject
168           common_subject = this_subject;
169           if(PrintDetails()) NcbiCerr << "zero common_subject changed to non-hypothetical this_subject " << this_subject
170                                       << NcbiEndl;
171           }
172         if(PrintDetails()) NcbiCerr << "!!!! Blast bounds match !!!!! numFit: " << numFit << NcbiEndl;
173 
174         char bufferchar[20480];  memset(bufferchar, 0, 20480);
175         CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 20480));
176         printReport(report, buffer);
177         CNcbiStrstream misc_feat;
178         if(common_subject.size())
179           {
180           misc_feat << "potential frameshift: common BLAST hit: "
181                     << common_subject << '\0';
182           }
183         else
184           {
185           misc_feat << '\0';
186           }
187         {
188         if(PrintDetails()) NcbiCerr << "added a problem: " << misc_feat.str() << NcbiEndl;
189         if(PrintDetails()) NcbiCerr << "added a problem(left): " << qname << NcbiEndl;
190         if(PrintDetails()) NcbiCerr << "added a problem(right): " << qrname << NcbiEndl;
191         problemStr problem = { eFrameShift, buffer.str(),  misc_feat.str(),
192           qname, qrname,
193           frame_from,
194           frame_to ,
195           report->left_strand};
196         problemsl.push_back(problem);
197         }
198         {
199         problemStr problem = { eFrameShift, "", "", "", "", -1, -1, eNa_strand_unknown };
200         problemsr.push_back(problem);
201         }
202         n_frameshift_pairs++;
203         }
204       else
205         delete report;
206       }
207     DecreaseVerbosity();
208     }
209   DecreaseVerbosity();
210   num=numLeft*numRight;
211   if(common_subject.size()==0) common_subject = any_common_subject;
212 
213 
214   if(numFit) // this block determines whether it is frameshift or not
215     {
216 // one of the queries is hypothetical - frameshift!
217     if(qname.find("hypothetical") != string::npos || qrname.find("hypothetical") != string::npos)
218       {
219       if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
220                                   << "one of those is hypo: FRAMESHIFT" << NcbiEndl;
221       result = true;
222       }
223     else
224       {
225 // not enough frameshift evidence - not frameshift!
226       if(numFit<2)
227         {
228         if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
229                                   << "numFit < 2: NOT A FRAMESHIFT" << NcbiEndl;
230         result = false;
231         }
232       else
233         {
234 // there are truely exhonerating hits - not frameshift!
235         if(left_perfect.size() || right_perfect.size())
236           {
237           if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
238                                   << "there are exhonerating hits: NOT A FRAMESHIFT" << NcbiEndl;
239           result = false;
240           }
241 // the rest are frameshifts
242         else
243           {
244           if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
245                                   << "there are no exhonerating hits: FRAMESHIFT" << NcbiEndl;
246           result = true;
247           }
248         }
249       }
250     }
251   else
252     {
253     if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
254                                   << "numFit = 0: NOT A FRAMESHIFT" << NcbiEndl;
255     result = false;
256     }
257   if(PrintDetails()) NcbiCerr << "after, left  " <<  GetStringDescr (left)  << NcbiEndl;
258   if(PrintDetails()) NcbiCerr << "after, right " <<  GetStringDescr (right) << NcbiEndl;
259   if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
260   if(PrintDetails()) NcbiCerr << "final numGI, numFit : " << numGI << " " << numFit << NcbiEndl;
261   if( !result && numFit  )
262 // print exhonerating hits to stderr if detailed printing
263      {
264      char bufferchar[2048];  memset(bufferchar, 0, 2048);
265      CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 2048));
266      buffer << "Left sequence has " << left_perfect.size() << " perfect hits." << NcbiEndl;
267      ITERATE(vector<perfectHitStr>, hit, left_perfect)
268        {
269        printPerfectHit(*hit, buffer);
270        }
271      buffer << "Right sequence has " << right_perfect.size() << " perfect hits." << NcbiEndl;
272      ITERATE(vector<perfectHitStr>, hit, right_perfect)
273        {
274        printPerfectHit(*hit, buffer);
275        }
276      buffer << "In total, " << numFit << " pairs of hits (out of " << num << ") match for these two sequences." << NcbiEndl;
277      if(PrintDetails()) NcbiCerr << buffer.str();
278      if(PrintDetails()) NcbiCerr << "that is, left sequence (" << qname << ")." << NcbiEndl;
279      }
280   if(result)
281     {
282     addProblems(m_diag[qname].problems, problemsl);
283     addProblems(m_diag[qrname].problems, problemsr);
284     }
285 
286   return result;
287 }
288 
fit_blast(const CBioseq & left,const CBioseq & right,CBioseq::TAnnot::const_iterator & left_annot,CBioseq::TAnnot::const_iterator & right_annot,int left_qLen,int right_qLen,int space,distanceReportStr * report)289 bool CReadBlastApp::fit_blast
290   (
291   const CBioseq& left,
292   const CBioseq& right,
293   CBioseq::TAnnot::const_iterator& left_annot,
294   CBioseq::TAnnot::const_iterator& right_annot,
295   int left_qLen, int right_qLen,
296   int space, distanceReportStr* report
297   )
298 {
299   bool result=false;
300   // report->q_id_left    = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
301   // report->q_id_right   = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
302   report->q_id_left    = GetStringDescr (left);
303   report->q_id_right   = GetStringDescr (right);
304   report->q_name_left  = GetProtName(left);
305   report->q_name_right = GetProtName(right);
306 
307   int left_sLen  = getLenScore(left_annot);
308   int right_sLen = getLenScore(right_annot);
309 
310   int left_qFrom, left_qTo, right_qFrom, right_qTo;
311   int left_sFrom, left_sTo, right_sFrom, right_sTo;
312   getBounds(left_annot,  &left_qFrom,  &left_qTo,  &left_sFrom,  &left_sTo);
313   getBounds(right_annot, &right_qFrom, &right_qTo, &right_sFrom, &right_sTo);
314 
315   report->q_left_left    = left_qFrom-1;
316   report->q_left_middle  = left_qTo-left_qFrom+1;
317   report->q_left_right   = left_qLen-left_qTo;
318 
319   report->space          = space;
320   report->s_name         ="cannot get subject name";
321   report->s_name = getAnnotName(left_annot);
322   report->alignment_left = getAnnotComment(left_annot);
323   report->alignment_right = getAnnotComment(right_annot);
324 
325   if(PrintDetails()) NcbiCerr << "fit_blast annot level: " << report->s_name.c_str() << NcbiEndl;
326 
327   report->q_right_left   = right_qFrom-1;
328   report->q_right_middle = right_qTo-right_qFrom+1;
329   report->q_right_right  = right_qLen-right_qTo;
330 
331   report->s_left_left    = left_sFrom-1;
332   report->s_left_middle  = left_sTo-left_sFrom+1;
333   report->s_left_right   = left_sLen-left_sTo;
334 
335   report->s_right_left   = right_sFrom-1;
336   report->s_right_middle = right_sTo-right_sFrom+1;
337   report->s_right_right  = right_sLen-right_sTo;
338 
339 
340   int result_left = report->diff_left  =
341                  (report->s_left_right - report->s_right_right) -
342                  (report->q_left_right + report->space + report->q_right_left)
343                ;
344   int result_right = report->diff_right =
345                  (report->s_right_left - report->s_left_left  ) -
346                  (report->q_left_right + report->space + report->q_right_left)
347                ;
348 
349   report->diff_left  -= report->q_right_middle;
350   report->diff_right -= report->q_left_middle;
351 
352   report->diff_edge_left  = report->s_right_left -
353                            (report->s_left_left  + report->s_left_middle  +
354                             report->space + report->q_left_right + report->q_right_left );
355   report->diff_edge_right = report->s_left_right -
356                            (report->s_right_right+ report->s_right_middle +
357                             report->space + report->q_left_right + report->q_right_left );
358   if(PrintDetails())
359       printReport(report, NcbiCerr);
360 
361 
362 
363   // result = diff<150;
364 // subject's hypothetical or not does not play a role
365   result = result_left > 0 && result_right > 0; // && report->s_name.find("hypothetical")==string::npos;
366   return result;
367 }
has_blast_hits(const CBioseq & seq)368 bool CReadBlastApp::has_blast_hits(const CBioseq& seq)
369 {
370     bool result;
371     if(PrintDetails()) NcbiCerr << "has_blast_hits?"  << NcbiEndl;
372     result=seq.GetAnnot().size()>1;
373     if(PrintDetails()) NcbiCerr << result  << NcbiEndl;
374     return result;
375 }
376 
377 
378 
379