1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34
fit_blast(const CBioseq & left,const CBioseq & right,string & common_subject)35 bool CReadBlastApp::fit_blast
36 (
37 const CBioseq& left,
38 const CBioseq& right,
39 string& common_subject
40 )
41 {
42 bool result=false;
43 // check if prot
44 if(PrintDetails()) NcbiCerr << "fit_blast, seq level " << NcbiEndl;
45 // string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
46 // string qrname = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
47 string qname = GetStringDescr (left);
48 string qrname = GetStringDescr (right);
49 // if(PrintDetails()) NcbiCerr << "left " << CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA) << NcbiEndl;
50 // if(PrintDetails()) NcbiCerr << "right " << CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA) << NcbiEndl;
51 if(PrintDetails()) NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
52 if(PrintDetails()) NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
53 // assuming that protein sequences come in one piece of seq-set
54 if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
55 if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
56 if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
57 if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
58 /*
59 if(!hasGenomicInterval(left)) return result;
60 if(!hasGenomicInterval(right)) return result;
61
62 const CSeq_interval& left_genomic_int = getGenomicInterval(left);
63 const CSeq_interval& right_genomic_int = getGenomicInterval(right);
64 */
65 if(!hasGenomicLocation(left)) return result;
66 if(!hasGenomicLocation(right)) return result;
67
68 const CSeq_loc& left_genomic_int = getGenomicLocation(left);
69 const CSeq_loc& right_genomic_int = getGenomicLocation(right);
70 TSeqPos from1, to1, from2, to2;
71
72 ENa_strand left_strand;
73 ENa_strand right_strand;
74 getFromTo(left_genomic_int, from1, to1, left_strand);
75 getFromTo(right_genomic_int, from2, to2, right_strand);
76
77 int left_frame = (from1-1)%3+1; // might be not making sense of location is composite
78 int right_frame = (from2-1)%3+1;
79 if(left_strand != eNa_strand_plus ) left_frame=-left_frame;
80 if(right_strand != eNa_strand_plus ) right_frame=-right_frame;
81 if(left_strand != right_strand)
82 {
83 if(PrintDetails())
84 {
85 NcbiCerr << "Strands mismatch for " << NcbiEndl;
86 NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
87 NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
88 }
89 return result;
90 }
91 if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
92 bool reverse = left_strand == eNa_strand_minus;
93 int space = ((int)from2-(int)to1)/3+1; // +1 for stop codon, /3 to convert n to aa
94 if(PrintDetails()) NcbiCerr << "space: " << space << NcbiEndl;
95 int left_qLen = getQueryLen(left);
96 int right_qLen = getQueryLen(right);
97 int num=0, numFit=0, numGI=0;
98 int numLeft=0, numRight=0;
99 vector<perfectHitStr> left_perfect;
100 vector<perfectHitStr> right_perfect;
101 collectPerfectHits(left_perfect, left);
102 collectPerfectHits(right_perfect, right);
103
104 if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
105 int n_frameshift_pairs=0;
106 list<problemStr> problemsl;
107 list<problemStr> problemsr;
108 IncreaseVerbosity();
109 common_subject="";
110 string any_common_subject="";
111 ITERATE(CBioseq::TAnnot, left_annot, left.GetAnnot())
112 {
113 if(PrintDetails()) NcbiCerr << "Next left annot: numLeft: " << numLeft << NcbiEndl;
114 if(!(*left_annot)->GetData().IsAlign()) continue;
115 numLeft++;
116 if(PrintDetails()) NcbiCerr << "Left annot is alignment: numLeft: " << numLeft << NcbiEndl;
117 vector<long> left_gis = getGIs(left_annot);
118 if(PrintDetails()) NcbiCerr << "Left annot is alignment: left_gis: " << left_gis.size() << NcbiEndl;
119
120 IncreaseVerbosity();
121 ITERATE(CBioseq::TAnnot, right_annot, right.GetAnnot())
122 {
123 if(PrintDetails()) NcbiCerr << "Next right annot: numRight: " << numRight << NcbiEndl;
124 if(!(*right_annot)->GetData().IsAlign()) continue;
125 numRight++;
126 if(PrintDetails()) NcbiCerr << "Right annot is alignment: numRight: " << numRight << NcbiEndl;
127 vector<long> right_gis = getGIs(right_annot);
128 if(PrintDetails()) NcbiCerr << "Right annot is alignment: right_gis: " << right_gis.size() << NcbiEndl;
129 PushVerbosity();
130 if(!giMatch(left_gis, right_gis))
131 {
132 PopVerbosity();
133 continue;
134 }
135 PopVerbosity();
136 if(PrintDetails()) NcbiCerr << "!!!!!!!!! GI Match !!!!!!!!!!" << NcbiEndl;
137 numGI++;
138 distanceReportStr *report = new distanceReportStr;
139 report->left_strand = left_strand;
140 report->right_strand = right_strand;
141 report->s_id = left_gis[0];
142 report->q_loc_left_from = reverse ? from2: from1;
143 report->q_loc_right_from = reverse ? from1 : from2;
144 report->q_loc_left_to = reverse ? to2 : to1;
145 report->q_loc_right_to = reverse ? to1 : to2;
146 int frame_from=report->q_loc_left_from;
147 int frame_to =frame_from;
148 frame_from = min(frame_from, report->q_loc_left_from);
149 frame_to = max(frame_to , report->q_loc_left_from);
150 frame_from = min(frame_from, report->q_loc_right_from);
151 frame_to = max(frame_to , report->q_loc_right_from);
152 frame_from = min(frame_from, report->q_loc_left_to );
153 frame_to = max(frame_to , report->q_loc_left_to );
154 frame_from = min(frame_from, report->q_loc_right_to );
155 frame_to = max(frame_to , report->q_loc_right_to );
156 report->left_frame = left_frame;
157 report->right_frame = right_frame;
158 if( (!reverse && fit_blast(left, right, left_annot, right_annot, left_qLen, right_qLen, space, report)) ||
159 ( reverse && fit_blast(right, left, right_annot, left_annot, right_qLen, left_qLen, space, report))
160 )
161 {
162 numFit++;
163 string this_subject = getAnnotName(left_annot);
164 if(numFit==1) any_common_subject = this_subject;
165 if(common_subject.size()==0 && this_subject.find("hypothetical")==string::npos)
166 {
167 // best subject
168 common_subject = this_subject;
169 if(PrintDetails()) NcbiCerr << "zero common_subject changed to non-hypothetical this_subject " << this_subject
170 << NcbiEndl;
171 }
172 if(PrintDetails()) NcbiCerr << "!!!! Blast bounds match !!!!! numFit: " << numFit << NcbiEndl;
173
174 char bufferchar[20480]; memset(bufferchar, 0, 20480);
175 CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 20480));
176 printReport(report, buffer);
177 CNcbiStrstream misc_feat;
178 if(common_subject.size())
179 {
180 misc_feat << "potential frameshift: common BLAST hit: "
181 << common_subject << '\0';
182 }
183 else
184 {
185 misc_feat << '\0';
186 }
187 {
188 if(PrintDetails()) NcbiCerr << "added a problem: " << misc_feat.str() << NcbiEndl;
189 if(PrintDetails()) NcbiCerr << "added a problem(left): " << qname << NcbiEndl;
190 if(PrintDetails()) NcbiCerr << "added a problem(right): " << qrname << NcbiEndl;
191 problemStr problem = { eFrameShift, buffer.str(), misc_feat.str(),
192 qname, qrname,
193 frame_from,
194 frame_to ,
195 report->left_strand};
196 problemsl.push_back(problem);
197 }
198 {
199 problemStr problem = { eFrameShift, "", "", "", "", -1, -1, eNa_strand_unknown };
200 problemsr.push_back(problem);
201 }
202 n_frameshift_pairs++;
203 }
204 else
205 delete report;
206 }
207 DecreaseVerbosity();
208 }
209 DecreaseVerbosity();
210 num=numLeft*numRight;
211 if(common_subject.size()==0) common_subject = any_common_subject;
212
213
214 if(numFit) // this block determines whether it is frameshift or not
215 {
216 // one of the queries is hypothetical - frameshift!
217 if(qname.find("hypothetical") != string::npos || qrname.find("hypothetical") != string::npos)
218 {
219 if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
220 << "one of those is hypo: FRAMESHIFT" << NcbiEndl;
221 result = true;
222 }
223 else
224 {
225 // not enough frameshift evidence - not frameshift!
226 if(numFit<2)
227 {
228 if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
229 << "numFit < 2: NOT A FRAMESHIFT" << NcbiEndl;
230 result = false;
231 }
232 else
233 {
234 // there are truely exhonerating hits - not frameshift!
235 if(left_perfect.size() || right_perfect.size())
236 {
237 if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
238 << "there are exhonerating hits: NOT A FRAMESHIFT" << NcbiEndl;
239 result = false;
240 }
241 // the rest are frameshifts
242 else
243 {
244 if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
245 << "there are no exhonerating hits: FRAMESHIFT" << NcbiEndl;
246 result = true;
247 }
248 }
249 }
250 }
251 else
252 {
253 if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
254 << "numFit = 0: NOT A FRAMESHIFT" << NcbiEndl;
255 result = false;
256 }
257 if(PrintDetails()) NcbiCerr << "after, left " << GetStringDescr (left) << NcbiEndl;
258 if(PrintDetails()) NcbiCerr << "after, right " << GetStringDescr (right) << NcbiEndl;
259 if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
260 if(PrintDetails()) NcbiCerr << "final numGI, numFit : " << numGI << " " << numFit << NcbiEndl;
261 if( !result && numFit )
262 // print exhonerating hits to stderr if detailed printing
263 {
264 char bufferchar[2048]; memset(bufferchar, 0, 2048);
265 CNcbiStrstream buffer(NCBI_STRSTREAM_INIT(bufferchar, 2048));
266 buffer << "Left sequence has " << left_perfect.size() << " perfect hits." << NcbiEndl;
267 ITERATE(vector<perfectHitStr>, hit, left_perfect)
268 {
269 printPerfectHit(*hit, buffer);
270 }
271 buffer << "Right sequence has " << right_perfect.size() << " perfect hits." << NcbiEndl;
272 ITERATE(vector<perfectHitStr>, hit, right_perfect)
273 {
274 printPerfectHit(*hit, buffer);
275 }
276 buffer << "In total, " << numFit << " pairs of hits (out of " << num << ") match for these two sequences." << NcbiEndl;
277 if(PrintDetails()) NcbiCerr << buffer.str();
278 if(PrintDetails()) NcbiCerr << "that is, left sequence (" << qname << ")." << NcbiEndl;
279 }
280 if(result)
281 {
282 addProblems(m_diag[qname].problems, problemsl);
283 addProblems(m_diag[qrname].problems, problemsr);
284 }
285
286 return result;
287 }
288
fit_blast(const CBioseq & left,const CBioseq & right,CBioseq::TAnnot::const_iterator & left_annot,CBioseq::TAnnot::const_iterator & right_annot,int left_qLen,int right_qLen,int space,distanceReportStr * report)289 bool CReadBlastApp::fit_blast
290 (
291 const CBioseq& left,
292 const CBioseq& right,
293 CBioseq::TAnnot::const_iterator& left_annot,
294 CBioseq::TAnnot::const_iterator& right_annot,
295 int left_qLen, int right_qLen,
296 int space, distanceReportStr* report
297 )
298 {
299 bool result=false;
300 // report->q_id_left = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
301 // report->q_id_right = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
302 report->q_id_left = GetStringDescr (left);
303 report->q_id_right = GetStringDescr (right);
304 report->q_name_left = GetProtName(left);
305 report->q_name_right = GetProtName(right);
306
307 int left_sLen = getLenScore(left_annot);
308 int right_sLen = getLenScore(right_annot);
309
310 int left_qFrom, left_qTo, right_qFrom, right_qTo;
311 int left_sFrom, left_sTo, right_sFrom, right_sTo;
312 getBounds(left_annot, &left_qFrom, &left_qTo, &left_sFrom, &left_sTo);
313 getBounds(right_annot, &right_qFrom, &right_qTo, &right_sFrom, &right_sTo);
314
315 report->q_left_left = left_qFrom-1;
316 report->q_left_middle = left_qTo-left_qFrom+1;
317 report->q_left_right = left_qLen-left_qTo;
318
319 report->space = space;
320 report->s_name ="cannot get subject name";
321 report->s_name = getAnnotName(left_annot);
322 report->alignment_left = getAnnotComment(left_annot);
323 report->alignment_right = getAnnotComment(right_annot);
324
325 if(PrintDetails()) NcbiCerr << "fit_blast annot level: " << report->s_name.c_str() << NcbiEndl;
326
327 report->q_right_left = right_qFrom-1;
328 report->q_right_middle = right_qTo-right_qFrom+1;
329 report->q_right_right = right_qLen-right_qTo;
330
331 report->s_left_left = left_sFrom-1;
332 report->s_left_middle = left_sTo-left_sFrom+1;
333 report->s_left_right = left_sLen-left_sTo;
334
335 report->s_right_left = right_sFrom-1;
336 report->s_right_middle = right_sTo-right_sFrom+1;
337 report->s_right_right = right_sLen-right_sTo;
338
339
340 int result_left = report->diff_left =
341 (report->s_left_right - report->s_right_right) -
342 (report->q_left_right + report->space + report->q_right_left)
343 ;
344 int result_right = report->diff_right =
345 (report->s_right_left - report->s_left_left ) -
346 (report->q_left_right + report->space + report->q_right_left)
347 ;
348
349 report->diff_left -= report->q_right_middle;
350 report->diff_right -= report->q_left_middle;
351
352 report->diff_edge_left = report->s_right_left -
353 (report->s_left_left + report->s_left_middle +
354 report->space + report->q_left_right + report->q_right_left );
355 report->diff_edge_right = report->s_left_right -
356 (report->s_right_right+ report->s_right_middle +
357 report->space + report->q_left_right + report->q_right_left );
358 if(PrintDetails())
359 printReport(report, NcbiCerr);
360
361
362
363 // result = diff<150;
364 // subject's hypothetical or not does not play a role
365 result = result_left > 0 && result_right > 0; // && report->s_name.find("hypothetical")==string::npos;
366 return result;
367 }
has_blast_hits(const CBioseq & seq)368 bool CReadBlastApp::has_blast_hits(const CBioseq& seq)
369 {
370 bool result;
371 if(PrintDetails()) NcbiCerr << "has_blast_hits?" << NcbiEndl;
372 result=seq.GetAnnot().size()>1;
373 if(PrintDetails()) NcbiCerr << result << NcbiEndl;
374 return result;
375 }
376
377
378
379