1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 * Author: Ning Ma
26 *
27 */
28
29 /** @file igblast.cpp
30 * Implementation of CIgBlast.
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <algo/blast/igblast/igblast.hpp>
35 #include <algo/blast/api/local_blast.hpp>
36 #include <algo/blast/api/bl2seq.hpp>
37 #include <algo/blast/api/remote_blast.hpp>
38 #include <algo/blast/api/objmgr_query_data.hpp>
39 #include <objtools/alnmgr/alnmap.hpp>
40 #include <objtools/alnmgr/alnvec.hpp>
41 #include <algo/blast/composition_adjustment/composition_constants.h>
42 #include <objmgr/object_manager.hpp>
43
44
45 /** @addtogroup AlgoBlast
46 *
47 * @{
48 */
49
50 BEGIN_NCBI_SCOPE
51 USING_SCOPE(objects);
52 BEGIN_SCOPE(blast)
53
54 static int max_allowed_VJ_distance_with_D = 225;
55 static int max_allowed_VJ_distance_without_D = 50;
56 static int max_allowed_VD_distance = 120;
57 static int max_allowed_j_deletion = 32;
58 static int extend_length5end = 30;
59 static int extend_length3end = 15;
60 static int max_J_length = 70;
61 static int max_allowed_V_end_to_J_end = max_allowed_VJ_distance_with_D + max_J_length;
62 static int max_v_j_overlap = 7;
63 static int j_wordsize = 7;
64
s_ReadLinesFromFile(const string & fn,vector<string> & lines)65 static void s_ReadLinesFromFile(const string& fn, vector<string>& lines)
66 {
67 CNcbiIfstream fs(fn.c_str(), IOS_BASE::in);
68 lines.clear();
69
70 if (CFile(fn).Exists() && ! fs.fail()) {
71 char line[256];
72 while(true) {
73 fs.getline(line, 256);
74 if (fs.eof()) break;
75 if (line[0] == '#') continue;
76 string l(line);
77 lines.push_back(l);
78 }
79 }
80 fs.close();
81 };
82
CIgAnnotationInfo(CConstRef<CIgBlastOptions> & ig_opt)83 CIgAnnotationInfo::CIgAnnotationInfo(CConstRef<CIgBlastOptions> &ig_opt)
84 {
85 vector<string> lines;
86
87 // read domain info from pdm or ndm file
88 const string suffix = (ig_opt->m_IsProtein) ? ".pdm." : ".ndm.";
89 string fn(SeqDB_ResolveDbPath(ig_opt->m_IgDataPath + "/" + ig_opt->m_Origin + "/"
90 + ig_opt->m_Origin + suffix + ig_opt->m_DomainSystem));
91 if (fn == "") {
92 NCBI_THROW(CBlastException, eInvalidArgument,
93 "Domain annotation data file could not be found in [internal_data] directory");
94 }
95 s_ReadLinesFromFile(fn, lines);
96 int index = 0;
97 ITERATE(vector<string>, l, lines) {
98 vector<string> tokens;
99 NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
100 if (!tokens.empty()) {
101 m_DomainIndex[tokens[0]] = index;
102 for (int i=1; i<11; ++i) {
103 m_DomainData.push_back(NStr::StringToInt(tokens[i]));
104 }
105 index += 10;
106 m_DomainChainType[tokens[0]] = tokens[11];
107 int frame = NStr::StringToInt(tokens[12]);
108 if (frame != -1) {
109 m_FrameOffset[tokens[0]] = frame;
110 }
111 }
112 }
113
114 // read frame info from aux files
115 if (ig_opt->m_IsProtein) return;
116 fn = ig_opt->m_AuxFilename;
117 s_ReadLinesFromFile(fn, lines);
118 if (lines.size() == 0) {
119 ERR_POST(Warning << "Auxilary data file could not be found");
120 }
121 ITERATE(vector<string>, l, lines) {
122 vector<string> tokens;
123 NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
124 if (!tokens.empty()) {
125 int frame = NStr::StringToInt(tokens[1]);
126 if (frame != -1) {
127 m_FrameOffset[tokens[0]] = frame;
128 }
129 if (tokens.size() == 3) { //just backward compatible as there was no such field
130 m_DJChainType[tokens[0]] = tokens[2];
131 } else if (tokens.size() == 4) { //just backward compatible as there was no such field
132 m_DJChainType[tokens[0]] = tokens[2];
133 m_JDomainInfo[tokens[0]] = NStr::StringToInt(tokens[3]);
134 } else if (tokens.size() == 5) { //just backward compatible as there was no such field
135 m_DJChainType[tokens[0]] = tokens[2];
136 m_JDomainInfo[tokens[0]] = NStr::StringToInt(tokens[3]);
137 m_Fwr4EndOffset[tokens[0]] = NStr::StringToInt(tokens[4]);
138 }
139
140 }
141 }
142 };
143
x_ScreenByAlignLength(CRef<CSearchResultSet> & results,int length)144 void CIgBlast::x_ScreenByAlignLength(CRef<CSearchResultSet> & results, int length){
145 NON_CONST_ITERATE(CSearchResultSet, result, *results) {
146 if ((*result)->HasAlignments()) {
147 CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
148 CSeq_align_set::Tdata::iterator it = align_list.begin();
149 while (it != align_list.end()) {
150 if((int)((*it)->GetAlignLength()) - (int)((*it)->GetTotalGapCount(0)) < length){
151 it = align_list.erase(it);
152 } else {
153 ++it;
154 }
155 }
156 }
157 }
158 }
159
160
x_ExtendAlign5end(CRef<CSearchResultSet> & results)161 void CIgBlast::x_ExtendAlign5end(CRef<CSearchResultSet> & results){
162
163 NON_CONST_ITERATE(CSearchResultSet, result, *results) {
164
165 if ((*result)->HasAlignments()) {
166 CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
167 int desired_len = 0;
168 int actual_len = 0;
169 int top_hit_actual_len = 0;
170 int count = 0;
171 ENa_strand extend_strand = eNa_strand_plus;
172 int highest_score = 0;
173
174 NON_CONST_ITERATE(CSeq_align_set::Tdata, align, align_list) {
175
176 // cerr << "before=" << MSerial_AsnText << **align << endl;
177
178 //extend germline match up to some positions at 5' end. Extend length is
179 //set by comparing to top hit or top hit equivalents
180
181 int score = 0;
182 (*align)->GetNamedScore(CSeq_align::eScore_Score, score);
183
184 if (score >= highest_score) { //top hits
185 highest_score = score;
186 extend_strand = (*align)->GetSeqStrand(0);
187 desired_len = min(extend_length5end,
188 (*align)->GetSegs().GetDenseg().GetStarts()[1]);
189
190 if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
191 int query_len = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength();
192 int allowed_len = min ((*align)->GetSegs().GetDenseg().GetStarts()[1],
193 query_len - ((*align)->GetSegs().GetDenseg().GetStarts()[0] +
194 (int)(*align)->GetSegs().GetDenseg().GetLens()[0]));
195 top_hit_actual_len = min(desired_len, allowed_len);
196
197
198 } else {
199
200 top_hit_actual_len = min(desired_len,
201 min((*align)->GetSegs().GetDenseg().GetStarts()[0],
202 (*align)->GetSegs().GetDenseg().GetStarts()[1]));
203
204 }
205 }
206
207 if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
208 int query_len = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength();
209 int allowed_len = min ((*align)->GetSegs().GetDenseg().GetStarts()[1],
210 query_len - ((*align)->GetSegs().GetDenseg().GetStarts()[0] +
211 (int)(*align)->GetSegs().GetDenseg().GetLens()[0]));
212 actual_len = min(top_hit_actual_len, min(desired_len, allowed_len));
213
214
215 } else {
216
217 actual_len = min(top_hit_actual_len, min(desired_len,
218 min((*align)->GetSegs().GetDenseg().GetStarts()[0],
219 (*align)->GetSegs().GetDenseg().GetStarts()[1])));
220
221 }
222
223 count ++;
224 //only extend if it has the same strand as the top hit
225 if (actual_len > 0 && (*align)->GetSeqStrand(0) == extend_strand) {
226 if (extend_strand == eNa_strand_minus) {
227
228 (*align)->SetSegs().SetDenseg().SetStarts()[1] -= actual_len;
229 (*align)->SetSegs().SetDenseg().SetLens()[0] += actual_len;
230
231 } else {
232
233 (*align)->SetSegs().SetDenseg().SetStarts()[0] -= actual_len;
234 (*align)->SetSegs().SetDenseg().SetStarts()[1] -= actual_len;
235 (*align)->SetSegs().SetDenseg().SetLens()[0] += actual_len;
236
237 }
238 }
239 // cerr << "after=" << MSerial_AsnText << **align << endl;
240 }
241
242 }
243 }
244 }
245
x_ExtendAlign3end(CRef<CSearchResultSet> & results)246 void CIgBlast::x_ExtendAlign3end(CRef<CSearchResultSet> & results){
247
248 NON_CONST_ITERATE(CSearchResultSet, result, *results) {
249
250 if ((*result)->HasAlignments()) {
251 CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
252 int desired_len = 0;
253 int actual_len = 0;
254 int top_hit_actual_len = 0;
255 ENa_strand extend_strand = eNa_strand_plus;
256 int highest_score = 0;
257
258 NON_CONST_ITERATE(CSeq_align_set::Tdata, align, align_list) {
259
260 // cerr << "before=" << MSerial_AsnText << **align << endl;
261
262 //extend germline match up to some positions at 5' end. Extend length is
263 //set by comparing to top hit or top hit equivalents
264
265 int score = 0;
266 (*align)->GetNamedScore(CSeq_align::eScore_Score, score);
267
268 if (score >= highest_score) { //top hits
269 highest_score = score;
270 extend_strand = (*align)->GetSeqStrand(0);
271
272 int j_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(1)).GetBioseqLength() - 1;
273 int j_align_stop = (*align)->GetSegs().GetDenseg().GetSeqStop(1);
274 desired_len = min(extend_length3end,
275 j_stop - j_align_stop);
276
277 if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
278
279 int query_align_start = (*align)->GetSegs().GetDenseg().GetSeqStart(0);
280 int allowed_query_length = query_align_start;
281
282 top_hit_actual_len = min(desired_len, allowed_query_length);
283 } else {
284 int query_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength() - 1;
285 int allowed_query_length = query_stop - (*align)->GetSegs().GetDenseg().GetSeqStop(0);
286 top_hit_actual_len = min(desired_len, allowed_query_length);
287
288 }
289 }
290
291 if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
292
293 int query_align_start = (*align)->GetSegs().GetDenseg().GetSeqStart(0);
294 int allowed_query_length = query_align_start;
295 actual_len = min(allowed_query_length, top_hit_actual_len);
296
297 } else {
298 int query_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength() - 1;
299 int allowed_query_length = query_stop - (*align)->GetSegs().GetDenseg().GetSeqStop(0);
300 actual_len = min(top_hit_actual_len, allowed_query_length);
301
302 }
303
304 //only extend if it has the same strand as the top hit
305 if (actual_len > 0 && (*align)->GetSeqStrand(0) == extend_strand) {
306 if (extend_strand == eNa_strand_minus) {
307
308 int num_seg = (*align)->GetSegs().GetDenseg().GetNumseg();
309 int num_dim = (*align)->GetSegs().GetDenseg().GetDim();
310 (*align)->SetSegs().SetDenseg().SetStarts()[num_seg*num_dim - 2] -= actual_len;
311 (*align)->SetSegs().SetDenseg().SetLens()[num_seg-1] += actual_len;
312
313 } else {
314 int num_seg = (*align)->GetSegs().GetDenseg().GetNumseg();
315 (*align)->SetSegs().SetDenseg().SetLens()[num_seg-1] += actual_len;
316
317 }
318 }
319 // cerr << "after=" << MSerial_AsnText << **align << endl;
320 }
321
322 }
323 }
324 }
325
326 CRef<CSearchResultSet>
Run()327 CIgBlast::Run()
328 {
329 vector<CRef <CIgAnnotation> > annots;
330 CRef<CSearchResultSet> final_results;
331 CRef<IQueryFactory> qf;
332 CRef<CBlastOptionsHandle> opts_hndl(CBlastOptionsFactory
333 ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
334 CRef<CSearchResultSet> results[4], result;
335
336 /*** search V germline */
337 {
338 x_SetupVSearch(qf, opts_hndl);
339 CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[0]);
340 blast.SetNumberOfThreads(m_NumThreads);
341 results[0] = blast.Run();
342 if (m_IgOptions->m_ExtendAlign5end){
343 x_ExtendAlign5end(results[0]);
344 }
345 x_ScreenByAlignLength(results[0], m_IgOptions->m_MinVLength);
346 x_ConvertResultType(results[0]);
347 s_SortResultsByEvalue(results[0]);
348 x_AnnotateV(results[0], annots);
349 }
350
351 /*** search internal V for domain annotation */
352 {
353 //restore default settings for internal db search
354 if (m_IgOptions->m_IsProtein) {
355 opts_hndl->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
356 } else {
357 opts_hndl->SetOptions().SetMismatchPenalty(-1);
358 opts_hndl->SetOptions().SetWordSize(9);
359 opts_hndl->SetOptions().SetGapOpeningCost(4);
360 opts_hndl->SetOptions().SetGapExtensionCost(1);
361 }
362 opts_hndl->SetEvalueThreshold(20);
363 opts_hndl->SetHitlistSize(20); // use a larger number to ensure annotation
364 CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[3]);
365 blast.SetNumberOfThreads(m_NumThreads);
366 results[3] = blast.Run();
367 if (m_IgOptions->m_ExtendAlign5end){
368 x_ExtendAlign5end(results[3]);
369 }
370 x_ScreenByAlignLength(results[3], m_IgOptions->m_MinVLength);
371 s_SortResultsByEvalue(results[3]);
372 x_AnnotateDomain(results[0], results[3], annots);
373 }
374
375 opts_hndl.Reset(CBlastOptionsFactory
376 ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
377
378 /*** search DJ germline */
379 int num_genes = (m_IgOptions->m_IsProtein) ? 1 : 3;
380 if (num_genes > 1) {
381
382 for (int gene = 1; gene < num_genes; ++gene) {
383 x_SetupDJSearch(annots, qf, opts_hndl, gene);
384 CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[gene]);
385 try {
386 blast.SetNumberOfThreads(m_NumThreads);
387 results[gene] = blast.Run();
388 if (gene == 2){
389 if (m_IgOptions->m_ExtendAlign3end){
390 x_ExtendAlign3end(results[gene]);
391 }
392 x_ScreenByAlignLength(results[gene], m_IgOptions->m_MinJLength);
393 }
394 x_ConvertResultType(results[gene]);
395 } catch(...) {
396 num_genes = 1;
397 break;
398 }
399 }
400 x_ProcessDJResult(results[0], results[1], results[2], annots);
401
402 if (m_IgOptions->m_DetectOverlap && m_IgOptions->m_J_penalty == -3 && m_IgOptions->m_D_penalty == -4){
403 x_AnnotateDJ(results[1], results[2], annots);
404 } else {
405 x_AnnotateJ(results[2], annots);
406 //redo d gene search and not allow dj overlap
407 x_SetupNoOverlapDSearch(annots, results[1], qf, opts_hndl, 1);
408 CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[1]);
409 try {
410 blast.SetNumberOfThreads(m_NumThreads);
411 results[1] = blast.Run();
412
413 x_ConvertResultType(results[1]);
414 } catch(...) {
415 cerr << "blast failed" << endl;
416 }
417 x_ProcessDGeneResult(results[0], results[1], results[2],annots);
418 x_AnnotateD(results[1], annots);
419 }
420 }
421
422 /*** collect germline search results */
423 for (int gene = 0; gene < num_genes; ++gene) {
424 s_AppendResults(results[gene], m_IgOptions->m_NumAlign[gene], gene, final_results);
425 }
426
427 /*** search user specified db */
428 bool skipped = false;
429 if (m_IsLocal) {
430 if (&(*m_LocalDb) != &(*(m_IgOptions->m_Db[0]))) {
431 x_SetupDbSearch(annots, qf);
432 CLocalBlast blast(qf, m_Options, m_LocalDb);
433 blast.SetNumberOfThreads(m_NumThreads);
434 result = blast.Run();
435 } else {
436 skipped = true;
437 }
438 } else {
439 x_SetupDbSearch(annots, qf);
440 CRef<CRemoteBlast> blast;
441 if (m_RemoteDb.NotEmpty()) {
442 _ASSERT(m_Subject.Empty());
443 blast.Reset(new CRemoteBlast(qf, m_Options, *m_RemoteDb));
444 if(m_EntrezQuery != NcbiEmptyString){
445 blast->SetEntrezQuery(m_EntrezQuery.c_str());
446 }
447 } else {
448 blast.Reset(new CRemoteBlast(qf, m_Options, m_Subject));
449 }
450 blast->Submit();
451 m_RID=blast->GetRID();
452 GetDiagContext().Extra().Print("RID", m_RID);
453 result = blast->GetResultSet();
454 }
455 if (! skipped) {
456 x_ConvertResultType(result);
457 s_SortResultsByEvalue(result);
458 s_AppendResults(result, -1, -1, final_results);
459 }
460
461 /*** set chain type info */
462 x_SetChainType(final_results, annots);
463
464 /*** attach annotation info back to the results */
465 x_SetAnnotation(annots, final_results);
466
467 return final_results;
468 };
469
470 // Compare two seqaligns according to their evalue and coverage and name
471 //compare name since blast does not guarantee order of same score hits
s_CompareSeqAlignByScoreAndName(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)472 static bool s_CompareSeqAlignByScoreAndName(const CRef<CSeq_align> &x, const CRef<CSeq_align> &y)
473 {
474 int sx, sy;
475 x->GetNamedScore(CSeq_align::eScore_Score, sx);
476 y->GetNamedScore(CSeq_align::eScore_Score, sy);
477 if (sx != sy) return (sx > sy);
478
479 sx = x->GetAlignLength();
480 sy = y->GetAlignLength();
481 if (sx != sy) {
482 return (sx >= sy);
483 }
484
485 string x_id = NcbiEmptyString;
486 string y_id = NcbiEmptyString;
487 x->GetSeq_id(1).GetLabel(&x_id, CSeq_id::eContent);
488 y->GetSeq_id(1).GetLabel(&y_id, CSeq_id::eContent);
489 return (x_id < y_id);
490
491 };
492
493
494
x_SetupVSearch(CRef<IQueryFactory> & qf,CRef<CBlastOptionsHandle> & opts_hndl)495 void CIgBlast::x_SetupVSearch(CRef<IQueryFactory> &qf,
496 CRef<CBlastOptionsHandle> &opts_hndl)
497 {
498 CBlastOptions & opts = opts_hndl->SetOptions();
499 if (m_IgOptions->m_IsProtein) {
500 opts.SetCompositionBasedStats(eNoCompositionBasedStats);
501 } else {
502 int penalty = m_IgOptions->m_V_penalty;
503 opts.SetMatchReward(1);
504 opts.SetMismatchPenalty(penalty);
505 opts.SetWordSize(m_Options->GetOptions().GetWordSize());
506 if (penalty == -1) {
507 opts.SetGapOpeningCost(4);
508 opts.SetGapExtensionCost(1);
509 }
510 }
511 opts_hndl->SetEvalueThreshold(m_Options->GetOptions().GetEvalueThreshold());
512 opts_hndl->SetFilterString("F");
513 opts_hndl->SetHitlistSize(15+ m_IgOptions->m_NumAlign[0]);
514 qf.Reset(new CObjMgr_QueryFactory(*m_Query));
515
516 };
517
x_SetupDJSearch(const vector<CRef<CIgAnnotation>> & annots,CRef<IQueryFactory> & qf,CRef<CBlastOptionsHandle> & opts_hndl,int db_type)518 void CIgBlast::x_SetupDJSearch(const vector<CRef <CIgAnnotation> > &annots,
519 CRef<IQueryFactory> &qf,
520 CRef<CBlastOptionsHandle> &opts_hndl,
521 int db_type)
522 {
523 // Only igblastn will search DJ
524 CBlastOptions & opts = opts_hndl->SetOptions();
525 opts.SetMatchReward(1);
526 if (db_type == 2){ //J genes are longer so if can afford more reliable identification
527 opts.SetWordSize(j_wordsize);
528 opts.SetMismatchPenalty(m_IgOptions->m_J_penalty);
529 } else {
530 opts.SetWordSize(m_IgOptions->m_Min_D_match);
531 opts.SetMismatchPenalty(m_IgOptions->m_D_penalty);
532 }
533
534 opts.SetGapOpeningCost(5);
535 opts.SetGapExtensionCost(2);
536 opts_hndl->SetEvalueThreshold((db_type == 2) ? 1000.0 : 100000.0);
537 opts_hndl->SetFilterString("F");
538 opts_hndl->SetHitlistSize(max(max(50,
539 m_IgOptions->m_NumAlign[1]),
540 m_IgOptions->m_NumAlign[2]));
541
542 // Mask query for D, J search
543 int iq = 0;
544 ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
545 CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
546 CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
547 int len = query->GetLength();
548 if ((*annot)->m_GeneInfo[0] == -1) {
549 // This is not a germline sequence. Mask it out
550 TMaskedQueryRegions mask_list;
551 CRef<CSeqLocInfo> mask(
552 new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
553 mask_list.push_back(mask);
554 m_Query->SetMaskedRegions(iq, mask_list);
555 } else {
556 // Excluding the V gene except the last max_v_j_overlap bp for D and J gene search;
557 // also limit the J match to max_allowed_V_end_to_J_end beyond V gene.
558 int v_overlap;
559 if (m_IgOptions->m_DetectOverlap && m_IgOptions->m_J_penalty == -3 && m_IgOptions->m_D_penalty == -4) {
560 v_overlap = max_v_j_overlap;
561 } else {
562 v_overlap = 0;
563 }
564 bool ms = (*annot)->m_MinusStrand;
565 int begin = (ms)?
566 (*annot)->m_GeneInfo[0] - max_allowed_V_end_to_J_end: (*annot)->m_GeneInfo[1] - 1 - v_overlap;
567 int end = (ms)?
568 (*annot)->m_GeneInfo[0] + v_overlap: (*annot)->m_GeneInfo[1] + max_allowed_V_end_to_J_end;
569 if (begin > 0 && begin <= len-1) {
570 CRef<CSeqLocInfo> mask(
571 new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin), 0));
572 m_Query->AddMask(iq, mask);
573 }
574 if (end < len -1 && end >= 0) {
575 CRef<CSeqLocInfo> mask(
576 new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
577 m_Query->AddMask(iq, mask);
578 }
579 }
580 ++iq;
581 }
582
583 // Generate query factory
584 qf.Reset(new CObjMgr_QueryFactory(*m_Query));
585 };
586
587
x_SetupNoOverlapDSearch(const vector<CRef<CIgAnnotation>> & annots,CRef<CSearchResultSet> & previous_d_results,CRef<IQueryFactory> & qf,CRef<CBlastOptionsHandle> & opts_hndl,int db_type)588 void CIgBlast::x_SetupNoOverlapDSearch(const vector<CRef <CIgAnnotation> > &annots,
589 CRef<CSearchResultSet> &previous_d_results,
590 CRef<IQueryFactory> &qf,
591 CRef<CBlastOptionsHandle> &opts_hndl,
592 int db_type)
593 {
594 // Only igblastn will search DJ
595 CBlastOptions & opts = opts_hndl->SetOptions();
596 opts.SetMatchReward(1);
597 opts.SetWordSize(m_IgOptions->m_Min_D_match);
598 opts.SetMismatchPenalty(m_IgOptions->m_D_penalty);
599 opts.SetGapOpeningCost(5);
600 opts.SetGapExtensionCost(2);
601 opts_hndl->SetEvalueThreshold(100000.0);
602 opts_hndl->SetFilterString("F");
603 opts_hndl->SetHitlistSize(max(max(50,
604 m_IgOptions->m_NumAlign[1]),
605 m_IgOptions->m_NumAlign[2]));
606
607 // Mask query for D
608 int iq = 0;
609 ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
610 CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
611 CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
612 int len = query->GetLength();
613 CRef<CSeq_align_set> align_d(0);
614 if ((*previous_d_results)[iq].HasAlignments()){
615 align_d = (*previous_d_results)[iq].SetSeqAlign();
616 }
617
618 if ((*annot)->m_GeneInfo[0] == -1 || !align_d || align_d.Empty() || align_d->IsEmpty()) {
619 // This is not a ig sequence or there is no d gene per previous search. Mask it out
620 TMaskedQueryRegions mask_list;
621 CRef<CSeqLocInfo> mask(
622 new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
623 mask_list.push_back(mask);
624 m_Query->SetMaskedRegions(iq, mask_list);
625 } else {
626 // Excluding the V gene and J gene
627 bool ms = (*annot)->m_MinusStrand;
628 int v_end_or_j_begin = (ms)?
629 max((*annot)->m_GeneInfo[0] - max_allowed_V_end_to_J_end, (*annot)->m_GeneInfo[5] - 1): (*annot)->m_GeneInfo[1] -1;
630 int j_begin_or_v_end = (ms)?
631 (*annot)->m_GeneInfo[0]: min((*annot)->m_GeneInfo[4], (*annot)->m_GeneInfo[1] + max_allowed_V_end_to_J_end);
632 if (v_end_or_j_begin > 0) {
633 CRef<CSeqLocInfo> mask(
634 new CSeqLocInfo(new CSeq_interval(*q_id, 0, v_end_or_j_begin), 0));
635 m_Query->AddMask(iq, mask);
636 }
637 if (j_begin_or_v_end < len-1 && j_begin_or_v_end > 0) {
638 CRef<CSeqLocInfo> mask(
639 new CSeqLocInfo(new CSeq_interval(*q_id, j_begin_or_v_end, len-1), 0));
640 m_Query->AddMask(iq, mask);
641 }
642 }
643 ++iq;
644 }
645
646 // Generate query factory
647 qf.Reset(new CObjMgr_QueryFactory(*m_Query));
648 };
649
x_SetupDbSearch(vector<CRef<CIgAnnotation>> & annots,CRef<IQueryFactory> & qf)650 void CIgBlast::x_SetupDbSearch(vector<CRef <CIgAnnotation> > &annots,
651 CRef<IQueryFactory> &qf)
652 {
653 // Options already passed in as m_Options. Only set up (mask) the query
654 int iq = 0;
655 ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
656 CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
657 CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
658 int len = query->GetLength();
659 TMaskedQueryRegions mask_list;
660 if ((*annot)->m_GeneInfo[0] ==-1) {
661 // This is not a germline sequence. Mask it out
662 CRef<CSeqLocInfo> mask(
663 new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
664 mask_list.push_back(mask);
665 } else if (m_IgOptions->m_FocusV) {
666 // Restrict to V gene
667 int begin = (*annot)->m_GeneInfo[0];
668 int end = (*annot)->m_GeneInfo[1];
669 if (begin > 0) {
670 CRef<CSeqLocInfo> mask(
671 new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin-1), 0));
672 mask_list.push_back(mask);
673 }
674 if (end < len) {
675 CRef<CSeqLocInfo> mask(
676 new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
677 mask_list.push_back(mask);
678 }
679 }
680 m_Query->SetMaskedRegions(iq, mask_list);
681 ++iq;
682 }
683 qf.Reset(new CObjMgr_QueryFactory(*m_Query));
684 };
685
686 // Compare the second seqalign to see if it is as good as the first one
s_IsSeqAlignAsGood(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)687 static bool s_IsSeqAlignAsGood(const CRef<CSeq_align> &x,
688 const CRef<CSeq_align> &y)
689 {
690 double sx, sy;
691 x->GetNamedScore(CSeq_align::eScore_EValue, sx);
692 y->GetNamedScore(CSeq_align::eScore_EValue, sy);
693 if (sx < 0.999999 * sy || sy < 0.999999 * sx) return false;
694 int ix, iy;
695 x->GetNamedScore(CSeq_align::eScore_Score, ix);
696 y->GetNamedScore(CSeq_align::eScore_Score, iy);
697 if (ix > iy) return false;
698 int dx, dy;
699 dx = x->GetAlignLength();
700 dy = y->GetAlignLength();
701 return (dx <= dy);
702 }
703
704 // Remove lcl| from seqid label
s_RemoveLocalPrefix(const string & sid)705 static string s_RemoveLocalPrefix(const string & sid)
706 {
707 if (sid.substr(0, 4) == "lcl|") return(sid.substr(4, sid.length()));
708 return sid;
709 }
710
s_MakeTopHitsId(const CSeq_align_set::Tdata & align_list,int num_align)711 static string s_MakeTopHitsId(const CSeq_align_set::Tdata& align_list, int num_align) {
712 string ids = NcbiEmptyString;
713 CRef<CSeq_align> align = align_list.front();
714 int count = 0;
715 ITERATE(CSeq_align_set::Tdata, it, align_list) {
716
717 if (count < num_align && s_IsSeqAlignAsGood(align, (*it))) {
718 string this_id = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
719 if (ids.find(this_id) == string::npos) {
720
721 //no redundant id
722 if (ids != NcbiEmptyString) {
723 ids += ",";
724 }
725 ids += this_id;
726 count ++;
727 }
728 } else {
729 break;
730 }
731 }
732 return ids;
733 }
734
x_AnnotateV(CRef<CSearchResultSet> & results,vector<CRef<CIgAnnotation>> & annots)735 void CIgBlast::x_AnnotateV(CRef<CSearchResultSet> &results,
736 vector<CRef <CIgAnnotation> > &annots)
737 {
738 ITERATE(CSearchResultSet, result, *results) {
739
740 CIgAnnotation *annot = new CIgAnnotation();
741 annots.push_back(CRef<CIgAnnotation>(annot));
742
743 if ((*result)->HasAlignments()) {
744 const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
745 CRef<CSeq_align> align = align_list.front();
746 annot->m_GeneInfo[0] = align->GetSeqStart(0);
747 annot->m_GeneInfo[1] = align->GetSeqStop(0)+1;
748 annot->m_TopGeneIds[0] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[0]);
749 }
750 }
751 };
752
753 // Test if the alignment is already in the align_list
s_SeqAlignInSet(CSeq_align_set::Tdata & align_list,CRef<CSeq_align> & align)754 static bool s_SeqAlignInSet(CSeq_align_set::Tdata & align_list, CRef<CSeq_align> &align)
755 {
756 ITERATE(CSeq_align_set::Tdata, it, align_list) {
757 if ((*it)->GetSeq_id(1).Match(align->GetSeq_id(1)) &&
758 (*it)->GetSeqStart(1) == align->GetSeqStart(1) &&
759 (*it)->GetSeqStop(1) == align->GetSeqStop(1)) return true;
760 }
761 return false;
762 };
763
764 // Compare two seqaligns according to their evalue and coverage
s_CompareSeqAlignByEvalue(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)765 static bool s_CompareSeqAlignByEvalue(const CRef<CSeq_align> &x,
766 const CRef<CSeq_align> &y)
767 {
768 double sx, sy;
769 x->GetNamedScore(CSeq_align::eScore_EValue, sx);
770 y->GetNamedScore(CSeq_align::eScore_EValue, sy);
771 if (sx < 0.999999 * sy) return true;
772 if (sy < 0.999999 * sx) return false;
773 int ix, iy;
774 x->GetNamedScore(CSeq_align::eScore_Score, ix);
775 y->GetNamedScore(CSeq_align::eScore_Score, iy);
776 if (ix != iy) return (ix > iy);
777
778 int dx, dy;
779 dx = x->GetAlignLength();
780 dy = y->GetAlignLength();
781 if (dx != dy) {
782 return (dx >= dy);
783 }
784 string x_id = NcbiEmptyString;
785 string y_id = NcbiEmptyString;
786 x->GetSeq_id(1).GetLabel(&x_id, CSeq_id::eContent);
787 y->GetSeq_id(1).GetLabel(&y_id, CSeq_id::eContent);
788 return (x_id < y_id);
789 };
790
791 // Compare two seqaligns according to their evalue and coverage
s_CompareSeqAlignByScore(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)792 static bool s_CompareSeqAlignByScore(const CRef<CSeq_align> &x, const CRef<CSeq_align> &y)
793 {
794 int sx, sy;
795 x->GetNamedScore(CSeq_align::eScore_Score, sx);
796 y->GetNamedScore(CSeq_align::eScore_Score, sy);
797 if (sx != sy) return (sx > sy);
798 sx = x->GetAlignLength();
799 sy = y->GetAlignLength();
800 return (sx >= sy);
801
802 };
803
804
805
806 // Test if D and J annotation not compatible
s_DJNotCompatible(const CSeq_align & d,const CSeq_align & j,bool ms,int margin)807 static bool s_DJNotCompatible(const CSeq_align &d, const CSeq_align &j, bool ms, int margin)
808 {
809 int ds = d.GetSeqStart(0);
810 int de = d.GetSeqStop(0);
811 int js = j.GetSeqStart(0);
812 int je = j.GetSeqStop(0);
813
814 //D gene needs to have minimal match in addition to overlap with J gene
815 //D gene needs to end before J gene ends
816 if (ms) {
817 if (ds < js || de < je + margin) return true;
818 } else {
819 if (ds > js - margin || de > je) return true;
820 }
821 return false;
822 };
823
824 /*
825 static bool s_IsTopMatchJD(CSearchResults& res_J, CIgAnnotationInfo& annotation_info){
826 bool result = true; //default
827 CRef<CSeq_align_set> align_J;
828 if (res_J.HasAlignments()) {
829 align_J.Reset(const_cast<CSeq_align_set *>
830 (&*(res_J.GetSeqAlign())));
831 CSeq_align_set::Tdata & align_list = align_J->Set();
832 CSeq_align_set::Tdata::iterator it = align_list.begin();
833 int prev_score = 0;
834 result = false;
835 while (it != align_list.end()) {
836 int current_score;
837 (*it)->GetNamedScore(CSeq_align::eScore_Score, current_score);
838 if(current_score >= prev_score){
839 string j_id;
840 (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
841 string j_chain_type = annotation_info.GetDJChainType(j_id);
842 if (j_chain_type == "N/A"){
843 //assume J gene id style
844
845 string sid = NStr::ToUpper(j_id);
846 if (sid.substr(0, 2) == "TR" && sid[3] == 'J') {
847 j_chain_type = "J" + sid.substr(2,1);
848 } else if (sid[0] == 'J') {
849 j_chain_type = sid.substr(0,2);
850 }
851 }
852 if (j_chain_type == "JD"){
853 result = true;
854 break;
855 }
856
857 } else {
858 break;
859 }
860 prev_score = current_score;
861 ++it;
862 }
863
864 }
865 return result;
866 };
867 */
x_FindDJAln(CRef<CSeq_align_set> & align_D,CRef<CSeq_align_set> & align_J,string q_ct,bool q_ms,ENa_strand q_st,int q_ve,int iq,bool va_or_vd_as_heavy_chain)868 void CIgBlast::x_FindDJAln(CRef<CSeq_align_set>& align_D,
869 CRef<CSeq_align_set>& align_J,
870 string q_ct,
871 bool q_ms,
872 ENa_strand q_st,
873 int q_ve,
874 int iq,
875 bool va_or_vd_as_heavy_chain) {
876
877 int allowed_VJ_distance = max_allowed_VJ_distance_with_D;
878 /* preprocess D */
879 if (align_D && !align_D->Get().empty()) {
880 CSeq_align_set::Tdata & align_list = align_D->Set();
881 CSeq_align_set::Tdata::iterator it = align_list.begin();
882 /* chain type test */
883 if (q_ct!="VH" && q_ct!="VD" && q_ct!="VA" && q_ct!="VB" ) {
884 while (it != align_list.end()) {
885 it = align_list.erase(it);
886 }
887 allowed_VJ_distance = max_allowed_VJ_distance_without_D;
888 } else if (q_ct =="VA" || q_ct =="VD") {
889 if (va_or_vd_as_heavy_chain) {
890 //VA could behave like VD and is allowed to rearrange to JA or DD/JD
891 q_ct = "VD";
892 //annot->m_ChainType[0] = "VD";
893 } else {
894 q_ct = "VA";
895 while (it != align_list.end()) {
896 it = align_list.erase(it);
897 }
898 allowed_VJ_distance = max_allowed_VJ_distance_without_D;
899 }
900 }
901 //test compatability between V and D
902 it = align_list.begin();
903 while (it != align_list.end()) {
904 bool keep = true;
905 /* chain type test */
906 if (q_ct!="N/A") {
907 char s_ct = q_ct[1];
908 string d_id;
909 (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
910 string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
911 if (d_chain_type != "N/A"){
912 if (d_chain_type[1] != q_ct[1]) keep = false;
913 } else { //assume D gene id style
914 string sid = (*it)->GetSeq_id(1).AsFastaString();
915 sid = NStr::ToUpper(sid);
916 if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
917 if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
918 && sid[3] == 'D') {
919 s_ct = sid[2];
920 }
921 if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
922 if (s_ct != q_ct[1]) keep = false;
923 }
924 }
925
926 /* remove failed seq_align */
927 if (!keep) it = align_list.erase(it);
928 else ++it;
929 }
930
931
932 /* strand test */
933 bool strand_found = false;
934 ITERATE(CSeq_align_set::Tdata, it, align_list) {
935 if ((*it)->GetSeqStrand(0) == q_st) {
936 strand_found = true;
937 break;
938 }
939 }
940 if (strand_found) {
941 it = align_list.begin();
942 while (it != align_list.end()) {
943 if ((*it)->GetSeqStrand(0) != q_st) {
944 it = align_list.erase(it);
945 } else ++it;
946 }
947 }
948 /* v end test */
949 it = align_list.begin();
950 while (it != align_list.end()) {
951 bool keep = false;
952 int q_ds = (*it)->GetSeqStart(0);
953 int q_de = (*it)->GetSeqStop(0);
954 if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - m_IgOptions->m_Min_D_match);
955 else keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + m_IgOptions->m_Min_D_match);
956 if (!keep) it = align_list.erase(it);
957 else ++it;
958 }
959 /* sort according to score */
960 align_list.sort(s_CompareSeqAlignByScoreAndName);
961 }
962
963 /* preprocess J */
964 if (align_J && !align_J->Get().empty()) {
965 CSeq_align_set::Tdata & align_list = align_J->Set();
966 CSeq_align_set::Tdata::iterator it = align_list.begin();
967 while (it != align_list.end()) {
968 bool keep = true;
969 /* chain type test */
970 if (q_ct!="N/A") {
971 char s_ct = q_ct[1];
972 string j_id;
973 (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
974 string j_chain_type = m_AnnotationInfo.GetDJChainType(j_id);
975 if (j_chain_type != "N/A"){
976 if (j_chain_type[1] != q_ct[1]) keep = false;
977 } else { //assume J gene id style
978 string sid = (*it)->GetSeq_id(1).AsFastaString();
979 sid = NStr::ToUpper(sid);
980 if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
981 if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
982 && sid[3] == 'J') {
983 s_ct = sid[2];
984 } else if (sid[0] == 'J') {
985 s_ct = sid[1];
986 }
987 if (s_ct!='H' && s_ct!='L' && s_ct!='K' &&
988 s_ct!='A' && s_ct!='B' && s_ct!='D' && s_ct!='G') s_ct = q_ct[1];
989 if (s_ct != q_ct[1]) keep = false;
990 }
991 } else {
992 keep = false;
993 }
994 /* strand test */
995 if ((*it)->GetSeqStrand(0) != q_st) keep = false;
996 /* subject start test */
997 if ((*it)->GetSeqStart(1) > max_allowed_j_deletion) keep = false;
998 /* v end test */
999 int q_js = (*it)->GetSeqStart(0);
1000 int q_je = (*it)->GetSeqStop(0);
1001 if (q_ms) {
1002 if (q_je < q_ve - allowed_VJ_distance || q_js > q_ve - j_wordsize) keep = false;
1003 } else {
1004 if (q_js > q_ve + allowed_VJ_distance || q_je < q_ve + j_wordsize) keep = false;
1005 }
1006 /* remove failed seq_align */
1007 if (!keep) it = align_list.erase(it);
1008 else ++it;
1009 }
1010 /* sort according to score */
1011 align_list.sort(s_CompareSeqAlignByScoreAndName);
1012 }
1013
1014 /* which one to keep, D or J? */
1015 if (align_D.NotEmpty() && !align_D->IsEmpty() &&
1016 align_J.NotEmpty() && !align_J->IsEmpty()) {
1017 CSeq_align_set::Tdata & al_D = align_D->Set();
1018 CSeq_align_set::Tdata & al_J = align_J->Set();
1019 CSeq_align_set::Tdata::iterator it;
1020 bool keep_J = s_CompareSeqAlignByScore(*(al_J.begin()), *(al_D.begin()));
1021 if (keep_J) {
1022 it = al_D.begin();
1023 while (it != al_D.end()) {
1024 if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms, m_IgOptions->m_Min_D_match)) {
1025 it = al_D.erase(it);
1026 } else ++it;
1027 }
1028
1029 if (m_IgOptions->m_DetectOverlap && m_IgOptions->m_J_penalty == -3 &&
1030 m_IgOptions->m_D_penalty == -4) {
1031 //deleting j only for overlap case otherwise it's handeled later
1032 if (align_D.NotEmpty() && !align_D->IsEmpty()){
1033 it = al_J.begin();
1034 while (it != al_J.end()) {
1035 if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1036 it = al_J.erase(it);
1037 } else ++it;
1038 }
1039 }
1040 }
1041 } else {
1042 it = al_J.begin();
1043
1044 while (it != al_J.end()) {
1045 if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1046 it = al_J.erase(it);
1047 } else ++it;
1048 }
1049 if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1050 it = al_D.begin();
1051 while (it != al_D.end()) {
1052 if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms, m_IgOptions->m_Min_D_match)) {
1053 it = al_D.erase(it);
1054 } else ++it;
1055 }
1056
1057 }
1058 }
1059
1060 }
1061
1062 }
1063
1064
x_FindDJ(CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,CRef<CIgAnnotation> & annot,CRef<CSeq_align_set> & align_D,CRef<CSeq_align_set> & align_J,string q_ct,bool q_ms,ENa_strand q_st,int q_ve,int iq)1065 void CIgBlast::x_FindDJ(CRef<CSearchResultSet>& results_D,
1066 CRef<CSearchResultSet>& results_J,
1067 CRef <CIgAnnotation>& annot,
1068 CRef<CSeq_align_set>& align_D,
1069 CRef<CSeq_align_set>& align_J,
1070 string q_ct,
1071 bool q_ms,
1072 ENa_strand q_st,
1073 int q_ve,
1074 int iq) {
1075
1076 CRef<CSeq_align_set> original_align_D(new CSeq_align_set);
1077 CRef<CSeq_align_set> original_align_J(new CSeq_align_set);
1078
1079 /* preprocess D */
1080 CSearchResults& res_D = (*results_D)[iq];
1081 if (res_D.HasAlignments()) {
1082
1083 align_D.Reset(const_cast<CSeq_align_set *>
1084 (&*(res_D.GetSeqAlign())));
1085 original_align_D->Assign(*align_D);
1086
1087 }
1088
1089 /* preprocess J */
1090 CSearchResults& res_J = (*results_J)[iq];
1091 if (res_J.HasAlignments()) {
1092 align_J.Reset(const_cast<CSeq_align_set *>
1093 (&*(res_J.GetSeqAlign())));
1094 original_align_J->Assign(*align_J);
1095
1096 }
1097 //try as VA
1098 x_FindDJAln(align_D, align_J, q_ct, q_ms, q_st, q_ve, iq, false);
1099 if ((original_align_D.NotEmpty() && !original_align_D->Get().empty()) && (q_ct =="VA" || q_ct =="VD")) {
1100
1101 annot->m_ChainType[0] = "VA";
1102 //try as VD
1103 x_FindDJAln(original_align_D, original_align_J, q_ct, q_ms, q_st, q_ve, iq, true);
1104 int as_heavy_chain_score = 0;
1105 int as_light_chain_score = 0;
1106 int d_score = 0;
1107 if(original_align_J.NotEmpty() && !original_align_J->Get().empty()){
1108 original_align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_heavy_chain_score);
1109 }
1110
1111 if(original_align_D.NotEmpty() && !original_align_D->Get().empty()){
1112 original_align_D->Get().front()->GetNamedScore(CSeq_align::eScore_Score, d_score);
1113 }
1114 if (align_J.NotEmpty() && !align_J->Get().empty()){
1115 align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_light_chain_score);
1116 }
1117
1118
1119 if (as_heavy_chain_score + d_score> as_light_chain_score){
1120 if (align_D.NotEmpty() && original_align_D.NotEmpty()){
1121 align_D->Assign(*original_align_D);
1122 }
1123 if (align_J.NotEmpty() && original_align_J.NotEmpty()){
1124 align_J->Assign(*original_align_J);
1125 }
1126
1127 annot->m_ChainType[0] = "VD";
1128 }
1129
1130 }
1131
1132 }
1133
x_FillJDomain(CRef<CSeq_align> & align,CRef<CIgAnnotation> & annot)1134 void CIgBlast::x_FillJDomain(CRef<CSeq_align> & align, CRef <CIgAnnotation> & annot){
1135 string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1136 int j_cdr3end = m_AnnotationInfo.GetJDomain(sid);
1137 int subject_start = align->GetSeqStart(1);
1138 int subject_end = align->GetSeqStop(1);
1139 //don't try if j starts after cdr3 ends as we don't know for sure where the boundry is
1140 if (j_cdr3end > 0 && subject_start - j_cdr3end <= 1) {
1141 CAlnMap j_map(align->GetSegs().GetDenseg());
1142
1143 //+1 actaully is in fwr4 already...need to do this so that a insertion right in front
1144 // of fwr4 can be handled.
1145 annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1,
1146 max(subject_start, min(j_cdr3end + 1,
1147 subject_end)),
1148 IAlnExplorer::eRight);
1149
1150 if (align->GetSeqStrand(0) == eNa_strand_minus) {
1151 annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1152 }
1153
1154 //deduct one back to in CDR3
1155 if (subject_end > j_cdr3end) {
1156 annot->m_JDomain[1] --;
1157 }
1158 //allow missed alignment to the first bp and deduce the cdr3 by gapless extension backwards
1159 } else if (j_cdr3end > 0 && subject_start - j_cdr3end <= 2) {
1160 CAlnMap j_map(align->GetSegs().GetDenseg());
1161
1162 //+1 actaully is in fwr4 already...need to do this so that a insertion right in front
1163 // of fwr4 can be handled.
1164 annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1, subject_start, IAlnExplorer::eRight);
1165
1166 if (align->GetSeqStrand(0) == eNa_strand_minus) {
1167 annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1168 }
1169
1170 //deduct diff back to be in CDR3
1171 if (subject_end > j_cdr3end) {
1172 annot->m_JDomain[1] = annot->m_JDomain[1] - (subject_start - j_cdr3end);
1173 }
1174 }
1175
1176 //fwr4
1177 if (annot->m_JDomain[1] > 0) {
1178 int j_fwr4end_offset = m_AnnotationInfo.GetFwr4EndOffset(sid);
1179 annot->m_JDomain[4] = j_fwr4end_offset;
1180 if (j_fwr4end_offset >= 0) {
1181 int j_fwr4end = m_Scope->GetBioseqHandle(align->GetSeq_id(1)).GetBioseqLength() - j_fwr4end_offset - 1;
1182 CAlnMap j_map(align->GetSegs().GetDenseg());
1183
1184 annot->m_JDomain[3] = j_map.GetSeqPosFromSeqPos(0, 1, min(j_fwr4end, subject_end), IAlnExplorer::eRight);
1185
1186
1187 if (align->GetSeqStrand(0) == eNa_strand_minus) {
1188 annot->m_JDomain[3] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[3] - 1;
1189 }
1190 //cdr3 domain at the end of alignment
1191 if (annot->m_JDomain[1] == annot->m_JDomain[3]) {
1192 annot->m_JDomain[3] = -1;
1193 }
1194 }
1195 }
1196
1197
1198 }
1199
x_ProcessDGeneResult(CRef<CSearchResultSet> & results_V,CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1200 void CIgBlast::x_ProcessDGeneResult(CRef<CSearchResultSet>& results_V,
1201 CRef<CSearchResultSet>& results_D,
1202 CRef<CSearchResultSet>& results_J,
1203 vector<CRef <CIgAnnotation> > &annots) {
1204
1205 int iq = 0;
1206 NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1207 string q_ct = (*annot)->m_ChainType[0];
1208 bool q_ms = (*annot)->m_MinusStrand;
1209 ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
1210 int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
1211
1212 CRef<CSeq_align_set> align_D (0);
1213 CSearchResults& res_D = (*results_D)[iq];
1214 if (res_D.HasAlignments()) {
1215 align_D = res_D.SetSeqAlign();
1216 }
1217
1218 // preprocess D
1219 if (align_D && !align_D.Empty() && !align_D->IsEmpty()) {
1220 CSeq_align_set::Tdata & align_list = align_D->Set();
1221 CSeq_align_set::Tdata::iterator it = align_list.begin();
1222
1223 //test compatability between V and D
1224 it = align_list.begin();
1225 while (it != align_list.end()) {
1226 bool keep = true;
1227 // chain type test
1228 if (q_ct!="N/A") {
1229 char s_ct = q_ct[1];
1230 string d_id;
1231 (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
1232 string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
1233 if (d_chain_type != "N/A"){
1234 if (d_chain_type[1] != q_ct[1]) keep = false;
1235 } else { //assume D gene id style
1236 string sid = (*it)->GetSeq_id(1).AsFastaString();
1237 sid = NStr::ToUpper(sid);
1238 if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
1239 if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
1240 && sid[3] == 'D') {
1241 s_ct = sid[2];
1242 }
1243 if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
1244 if (s_ct != q_ct[1]) keep = false;
1245 }
1246 }
1247
1248 //remove failed seq_align
1249 if (!keep) it = align_list.erase(it);
1250 else ++it;
1251 }
1252
1253
1254 //strand test
1255 bool strand_found = false;
1256 ITERATE(CSeq_align_set::Tdata, it, align_list) {
1257 if ((*it)->GetSeqStrand(0) == q_st) {
1258 strand_found = true;
1259 break;
1260 }
1261 }
1262 if (strand_found) {
1263 it = align_list.begin();
1264 while (it != align_list.end()) {
1265 if ((*it)->GetSeqStrand(0) != q_st) {
1266 it = align_list.erase(it);
1267 } else ++it;
1268 }
1269 }
1270 //v end test
1271 it = align_list.begin();
1272 while (it != align_list.end()) {
1273 bool keep = false;
1274 int q_ds = (*it)->GetSeqStart(0);
1275 int q_de = (*it)->GetSeqStop(0);
1276 if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - m_IgOptions->m_Min_D_match);
1277 else keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + m_IgOptions->m_Min_D_match);
1278 if (!keep) it = align_list.erase(it);
1279 else ++it;
1280 }
1281 // sort according to score
1282 align_list.sort(s_CompareSeqAlignByScoreAndName);
1283
1284 /* process J */
1285 CRef<CSeq_align_set> align_J (0);
1286 CSearchResults& res_J = (*results_J)[iq];
1287 if (res_J.HasAlignments()) {
1288 align_J = res_J.SetSeqAlign();
1289 }
1290 if (align_J && align_J.NotEmpty() && !align_J->IsEmpty() && !align_list.empty()) {
1291
1292 CSeq_align_set::Tdata & al_J = align_J->Set();
1293 CSeq_align_set::Tdata::iterator it = al_J.begin();
1294 while (it != al_J.end()) {
1295 if (s_DJNotCompatible(*(align_list.front()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1296 it = al_J.erase(it);
1297 } else ++it;
1298 }
1299 }
1300 }
1301
1302 iq ++;
1303 }
1304 }
1305
x_ProcessDJResult(CRef<CSearchResultSet> & results_V,CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1306 void CIgBlast::x_ProcessDJResult(CRef<CSearchResultSet>& results_V,
1307 CRef<CSearchResultSet>& results_D,
1308 CRef<CSearchResultSet>& results_J,
1309 vector<CRef <CIgAnnotation> > &annots) {
1310
1311 int iq = 0;
1312 NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1313 string q_ct = (*annot)->m_ChainType[0];
1314 bool q_ms = (*annot)->m_MinusStrand;
1315 ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
1316 int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
1317
1318 CRef<CSeq_align_set> align_D, align_J;
1319
1320 x_FindDJ( results_D, results_J, *annot, align_D, align_J, q_ct, q_ms, q_st, q_ve, iq);
1321 iq ++;
1322 }
1323 }
1324
x_AnnotateD(CRef<CSearchResultSet> & results_D,vector<CRef<CIgAnnotation>> & annots)1325 void CIgBlast::x_AnnotateD(CRef<CSearchResultSet> &results_D,
1326 vector<CRef <CIgAnnotation> > &annots)
1327 {
1328
1329 int iq = 0;
1330 NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1331
1332 string q_ct = (*annot)->m_ChainType[0];
1333 CSearchResults& res_d = (*results_D)[iq];
1334 CConstRef<CSeq_align_set> align_D = res_d.GetSeqAlign();
1335 if (align_D && !align_D.Empty() && !align_D->IsEmpty()) {
1336 const CSeq_align_set::Tdata& align_list = align_D->Get();
1337 CRef<CSeq_align> align = align_list.front();
1338 (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
1339 (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
1340 (*annot)->m_TopGeneIds[1] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[1]);
1341 }
1342
1343
1344 /* next set of results */
1345 ++iq;
1346 }
1347 };
1348
x_AnnotateJ(CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1349 void CIgBlast::x_AnnotateJ(CRef<CSearchResultSet> &results_J,
1350 vector<CRef <CIgAnnotation> > &annots)
1351 {
1352 int iq = 0;
1353 NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1354
1355 string q_ct = (*annot)->m_ChainType[0];
1356 bool q_ms = (*annot)->m_MinusStrand;
1357
1358 const CSearchResults& res_j = (*results_J)[iq];
1359 CConstRef<CSeq_align_set> align_J = res_j.GetSeqAlign();
1360
1361 /* annotate J */
1362 if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1363 const CSeq_align_set::Tdata & align_list = align_J->Get();
1364 CRef<CSeq_align> align = align_list.front();
1365 x_FillJDomain(align, *annot);
1366 (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
1367 (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
1368 string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1369 int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1370 if (frame_offset >= 0) {
1371 int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
1372 (*annot)->m_FrameInfo[2] = (q_ms) ?
1373 align->GetSeqStop(0) + frame_adj
1374 : align->GetSeqStart(0) - frame_adj;
1375 }
1376 (*annot)->m_TopGeneIds[2] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[2]);
1377 }
1378
1379 /* next set of results */
1380 ++iq;
1381 }
1382 };
1383
x_AnnotateDJ(CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1384 void CIgBlast::x_AnnotateDJ(CRef<CSearchResultSet> &results_D,
1385 CRef<CSearchResultSet> &results_J,
1386 vector<CRef <CIgAnnotation> > &annots)
1387 {
1388 int iq = 0;
1389 NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1390
1391 string q_ct = (*annot)->m_ChainType[0];
1392 bool q_ms = (*annot)->m_MinusStrand;
1393
1394 const CSearchResults& res_j = (*results_J)[iq];
1395 const CSearchResults& res_d = (*results_D)[iq];
1396 CConstRef<CSeq_align_set> align_D = res_d.GetSeqAlign();
1397 CConstRef<CSeq_align_set> align_J = res_j.GetSeqAlign();
1398
1399 /* annotate D */
1400 if (align_D.NotEmpty() && !align_D->IsEmpty()) {
1401 const CSeq_align_set::Tdata & align_list = align_D->Get();
1402 CRef<CSeq_align> align = align_list.front();
1403 (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
1404 (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
1405 (*annot)->m_TopGeneIds[1] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[1]);
1406
1407 }
1408
1409 /* annotate J */
1410 if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1411 const CSeq_align_set::Tdata & align_list = align_J->Get();
1412 CRef<CSeq_align> align = align_list.front();
1413 x_FillJDomain(align, *annot);
1414 (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
1415 (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
1416 string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1417 int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1418 if (frame_offset >= 0) {
1419 int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
1420 (*annot)->m_FrameInfo[2] = (q_ms) ?
1421 align->GetSeqStop(0) + frame_adj
1422 : align->GetSeqStart(0) - frame_adj;
1423 }
1424 (*annot)->m_TopGeneIds[2] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[2]);
1425 }
1426
1427 /* next set of results */
1428 ++iq;
1429 }
1430 };
1431
1432 // query chain type and domain is annotated by germline alignment
x_AnnotateDomain(CRef<CSearchResultSet> & gl_results,CRef<CSearchResultSet> & dm_results,vector<CRef<CIgAnnotation>> & annots)1433 void CIgBlast::x_AnnotateDomain(CRef<CSearchResultSet> &gl_results,
1434 CRef<CSearchResultSet> &dm_results,
1435 vector<CRef <CIgAnnotation> > &annots)
1436 {
1437 CRef<CObjectManager> mgr = CObjectManager::GetInstance();
1438 CScope scope_q(*mgr), scope_s(*mgr);
1439 CRef<CSeqDB> db_V, db_domain;
1440 bool annotate_subject = false;
1441 if (m_IgOptions->m_Db[0]->IsBlastDb()) {
1442 string db_name_V = m_IgOptions->m_Db[0]->GetDatabaseName();
1443 string db_name_domain = m_IgOptions->m_Db[3]->GetDatabaseName();
1444 CSeqDB::ESeqType db_type = (m_IgOptions->m_IsProtein)?
1445 CSeqDB::eProtein : CSeqDB::eNucleotide;
1446 db_V.Reset(new CSeqDB(db_name_V, db_type));
1447 if (db_name_V == db_name_domain) {
1448 db_domain.Reset(&(*db_V));
1449 } else {
1450 db_domain.Reset(new CSeqDB(db_name_domain, db_type));
1451 }
1452 annotate_subject = true;
1453 }
1454
1455 int iq = 0;
1456 ITERATE(CSearchResultSet, result, *dm_results) {
1457
1458 CIgAnnotation *annot = &*(annots[iq]);
1459 annot->m_ChainType.push_back("N/A"); // Assuming non-ig sequence first
1460 annot->m_ChainTypeToShow = "N/A";
1461 if ((*result)->HasAlignments() && (*gl_results)[iq].HasAlignments()) {
1462
1463
1464 CConstRef<CSeq_align> master_align =
1465 (*gl_results)[iq].GetSeqAlign()->Get().front();
1466 CAlnMap q_map(master_align->GetSegs().GetDenseg());
1467
1468 if (master_align->GetSeqStrand(0) == eNa_strand_minus) {
1469 annot->m_MinusStrand = true;
1470 }
1471
1472 int q_ends[2], q_dir;
1473
1474 if (annot->m_MinusStrand) {
1475 q_ends[1] = master_align->GetSeqStart(0);
1476 q_ends[0] = master_align->GetSeqStop(0);
1477 q_dir = -1;
1478
1479 } else {
1480 q_ends[0] = master_align->GetSeqStart(0);
1481 q_ends[1] = master_align->GetSeqStop(0);
1482 q_dir = 1;
1483 }
1484
1485 const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
1486
1487 ITERATE(CSeq_align_set::Tdata, it, align_list) {
1488
1489 string sid = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
1490 annot->m_ChainType[0] = m_AnnotationInfo.GetDomainChainType(sid);
1491 annot->m_ChainTypeToShow = annot->m_ChainType[0];
1492 int domain_info[10];
1493
1494 if (m_AnnotationInfo.GetDomainInfo(sid, domain_info)) {
1495
1496
1497 CAlnMap s_map((*it)->GetSegs().GetDenseg());
1498 int s_start = (*it)->GetSeqStart(1);
1499 int s_stop = (*it)->GetSeqStop(1);
1500
1501 CRef<CAlnMap> d_map;
1502 int d_start = -1;
1503 int d_stop = -1;
1504
1505 int start, stop;
1506
1507 if (annotate_subject) {
1508 CRef<CBioseq> seq_q = db_domain->SeqidToBioseq((*it)->GetSeq_id(1));
1509 CBioseq_Handle hdl_q = scope_q.AddBioseq(*seq_q);
1510 CRef<CBioseq> seq_s = db_V->SeqidToBioseq(master_align->GetSeq_id(1));
1511 CBioseq_Handle hdl_s = scope_s.AddBioseq(*seq_s);
1512 CSeq_loc query, subject;
1513 query.SetWhole();
1514 query.SetId((*it)->GetSeq_id(1));
1515 subject.SetWhole();
1516 subject.SetId(master_align->GetSeq_id(1));
1517 SSeqLoc q_loc(&query, &scope_q);
1518 SSeqLoc s_loc(&subject, &scope_s);
1519 CBl2Seq bl2seq(q_loc, s_loc, (m_IgOptions->m_IsProtein)? eBlastp: eBlastn);
1520 const CSearchResults& result = (*(bl2seq.RunEx()))[0];
1521 if (result.HasAlignments()) {
1522 CConstRef<CSeq_align> subject_align = result.GetSeqAlign()->Get().front();
1523 d_map.Reset(new CAlnMap(subject_align->GetSegs().GetDenseg()));
1524 d_start = subject_align->GetSeqStart(0);
1525 d_stop = subject_align->GetSeqStop(0);
1526 }
1527 scope_q.RemoveBioseq(hdl_q);
1528 scope_s.RemoveBioseq(hdl_s);
1529 }
1530
1531 for (int i =0; i<10; i+=2) {
1532
1533 start = domain_info[i] - 1;
1534 stop = domain_info[i+1] - 1;
1535
1536 if (start <= d_stop && stop >= d_start) {
1537 int start_copy = start;
1538 int stop_copy = stop;
1539 if (start_copy < d_start) start_copy = d_start;
1540 if (stop_copy > d_stop) stop_copy = d_stop;
1541 if (start_copy <= stop_copy) {
1542 if (i>0 && annot->m_DomainInfo_S[i-1]>=0) {
1543 annot->m_DomainInfo_S[i] = annot->m_DomainInfo_S[i-1] + 1;
1544 } else {
1545 annot->m_DomainInfo_S[i] =
1546 d_map->GetSeqPosFromSeqPos(1, 0, start_copy, IAlnExplorer::eForward);
1547 }
1548 annot->m_DomainInfo_S[i+1] =
1549 d_map->GetSeqPosFromSeqPos(1, 0, stop_copy, IAlnExplorer::eBackwards);
1550 }
1551 }
1552
1553 if (start > s_stop || stop < s_start) continue;
1554
1555 if (start < s_start) start = s_start;
1556
1557 if (stop > s_stop) stop = s_stop;
1558
1559 if (start > stop) continue;
1560
1561 start = s_map.GetSeqPosFromSeqPos(0, 1, start, IAlnExplorer::eForward);
1562 stop = s_map.GetSeqPosFromSeqPos(0, 1, stop, IAlnExplorer::eBackwards);
1563
1564 if ((start - q_ends[1])*q_dir > 0 || (stop - q_ends[0])*q_dir < 0) continue;
1565
1566 if ((start - q_ends[0])*q_dir < 0) start = q_ends[0];
1567
1568 if ((stop - q_ends[1])*q_dir > 0) stop = q_ends[1];
1569
1570 if ((start - stop)*q_dir > 0) continue;
1571
1572 start = q_map.GetSeqPosFromSeqPos(1, 0, start, IAlnExplorer::eForward);
1573 stop = q_map.GetSeqPosFromSeqPos(1, 0, stop, IAlnExplorer::eBackwards);
1574
1575 start = q_map.GetSeqPosFromSeqPos(0, 1, start);
1576 stop = q_map.GetSeqPosFromSeqPos(0, 1, stop);
1577
1578 if ((start - stop)*q_dir > 0) continue;
1579
1580 annot->m_DomainInfo[i] = start;
1581 annot->m_DomainInfo[i+1] = stop;
1582 }
1583
1584
1585
1586 // extension of the first and last annotated domain (if any)
1587 int i = 0;
1588 int extension = 0;
1589 while (i<10 && annot->m_DomainInfo[i] < 0) i+=2;
1590 if (i < 10 && domain_info[i] > 0) {
1591 extension = (domain_info[i] - 1 -
1592 s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
1593 IAlnExplorer::eBackwards))*q_dir;
1594 annot->m_DomainInfo[i] += extension;
1595 //this does not get reversed like m_DomainInfo
1596 annot->m_DomainInfo_S[i] -= abs(extension);
1597
1598 if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
1599 if (annot->m_DomainInfo_S[i] < 0) annot->m_DomainInfo_S[i] = 0;
1600
1601 i+=2;
1602 while (i<10 && annot->m_DomainInfo[i] >=0) {
1603 annot->m_DomainInfo[i] = annot->m_DomainInfo[i-1] + q_dir;
1604 i+=2;
1605 }
1606 i = 9;
1607 while (i>0 && annot->m_DomainInfo[i] < 0) i-=2;
1608 if (i >= 0) {
1609 annot->m_DomainInfo[i] += (domain_info[i] - 1 -
1610 s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
1611 IAlnExplorer::eForward))*q_dir;
1612 if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
1613 }
1614 }
1615
1616 // any extra alignments after FWR3 are attributed to CDR3
1617 start = annot->m_DomainInfo[9];
1618
1619 if (start >= 0 && (start - q_ends[1])*q_dir < 0) {
1620 start = q_map.GetSeqPosFromSeqPos(1, 0, start+q_dir, IAlnExplorer::eForward);
1621 start = q_map.GetSeqPosFromSeqPos(0, 1, start);
1622
1623 if ((start - q_ends[1])*q_dir <= 0) {
1624 annot->m_DomainInfo[10] = start;
1625 annot->m_DomainInfo[11] = q_ends[1];
1626 }
1627 }
1628 // annotate the query frame offset
1629 int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1630
1631 if (frame_offset >= 0) {
1632 int q_start = (*it)->GetSeqStart(0);
1633 int q_stop = (*it)->GetSeqStop(0);
1634 int q_mid = q_start + q_stop;
1635 int q_dif = q_stop - q_start;
1636 int frame_adj = (3 - ((*it)->GetSeqStart(1) + 3 - frame_offset) % 3) %3;
1637 annot->m_FrameInfo[0] = (q_mid - q_dir *q_dif)/2 + q_dir * frame_adj;
1638
1639 //counting frame from fwr3 end, not the V end since we need to ignore a few bases
1640 //in the CDR3 to allow any insertion or deletion at V gene end
1641 if (annot->m_DomainInfo[9] > 0) {
1642 int fwr3_stop = annot->m_DomainInfo[9];
1643
1644 if (annot->m_MinusStrand) {
1645
1646 q_start = max(q_start, fwr3_stop);
1647 q_mid = q_start + q_stop;
1648 q_dif = q_stop - q_start;
1649 frame_adj = (s_map.GetSeqPosFromSeqPos(1, 0, q_start, IAlnExplorer::eBackwards) + 3 - frame_offset) % 3;
1650 } else {
1651 q_stop = min(q_stop, fwr3_stop);
1652 q_mid = q_start + q_stop;
1653 q_dif = q_stop - q_start;
1654 frame_adj = (s_map.GetSeqPosFromSeqPos(1, 0, q_stop, IAlnExplorer::eBackwards) + 3 - frame_offset) % 3;
1655 }
1656 } else {
1657 frame_adj = ((*it)->GetSeqStop(1) + 3 - frame_offset) % 3;
1658 }
1659
1660 annot->m_FrameInfo[1] = (q_mid + q_dir *q_dif)/2 - q_dir * frame_adj;
1661 }
1662 break;
1663
1664 }
1665 }
1666 }
1667 ++iq;
1668 }
1669 };
1670
x_SetChainType(CRef<CSearchResultSet> & results,vector<CRef<CIgAnnotation>> & annots)1671 void CIgBlast::x_SetChainType(CRef<CSearchResultSet> &results,
1672 vector<CRef <CIgAnnotation> > &annots)
1673 {
1674 int iq = 0;
1675 ITERATE(CSearchResultSet, result, *results) {
1676
1677 CIgAnnotation *annot = &*(annots[iq++]);
1678
1679 if ((*result)->HasAlignments()) {
1680 int num_aligns = (*result)->GetSeqAlign()->Size();
1681 CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
1682 (const_cast<CSearchResults *>(&**result));
1683 for (int i=0; i<ig_result->m_NumActualV; ++i, --num_aligns) {
1684 annot->m_ChainType.push_back("V");
1685 }
1686 for (int i=0; i<ig_result->m_NumActualD; ++i, --num_aligns) {
1687 annot->m_ChainType.push_back("D");
1688 }
1689 for (int i=0; i<ig_result->m_NumActualJ; ++i, --num_aligns) {
1690 annot->m_ChainType.push_back("J");
1691 }
1692 for (int i=0; i<num_aligns; ++i) {
1693 annot->m_ChainType.push_back("N/A");
1694 }
1695 }
1696 }
1697 };
1698
s_SortResultsByEvalue(CRef<CSearchResultSet> & results)1699 void CIgBlast::s_SortResultsByEvalue(CRef<CSearchResultSet>& results)
1700 {
1701 ITERATE(CSearchResultSet, result, *results) {
1702 if ((*result)->HasAlignments()) {
1703 CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
1704 (&*((*result)->GetSeqAlign())));
1705 CSeq_align_set::Tdata & align_list = align->Set();
1706 align_list.sort(s_CompareSeqAlignByEvalue);
1707 }
1708 }
1709 };
1710
1711 // convert sequencecomparison to database mode
x_ConvertResultType(CRef<CSearchResultSet> & result)1712 void CIgBlast::x_ConvertResultType(CRef<CSearchResultSet> &result)
1713 {
1714 if (result->GetResultType() != eSequenceComparison) {
1715 return;
1716 }
1717
1718 int num_queries = m_Query->Size();
1719 int num_results = result->GetNumResults();
1720 int ir = 0;
1721 CSearchResultSet *retv = new CSearchResultSet();
1722
1723 for (int iq = 0; iq< num_queries && ir< num_results; ++iq) {
1724
1725 CSearchResults &res = (*result)[ir++];
1726 CRef<CBlastAncillaryData> ancillary = res.GetAncillaryData();
1727 TQueryMessages errmsg = res.GetErrors();
1728 CConstRef<CSeq_id> rid = res.GetSeqId();
1729 CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
1730 (&*(res.GetSeqAlign())));
1731 CSeq_align_set::Tdata & align_list = align->Set();
1732
1733 CConstRef<CSeq_id> qid = m_Query->GetBlastSearchQuery(iq)->GetQueryId();
1734 while(!qid->Match(*rid)) {
1735 CRef<CSeq_align_set> empty;
1736 CRef<CSearchResults> r(new CSearchResults(qid, empty, errmsg, ancillary));
1737 retv->push_back(r);
1738 qid = m_Query->GetBlastSearchQuery(++iq)->GetQueryId();
1739 }
1740
1741 while(ir < num_results && (*result)[ir].GetSeqId()->Match(*qid)) {
1742 CSearchResults &add_res = (*result)[ir++];
1743 CRef<CSeq_align_set> add;
1744 add.Reset(const_cast<CSeq_align_set *>
1745 (&*(add_res.GetSeqAlign())));
1746 CSeq_align_set::Tdata & add_list = add->Set();
1747 align_list.insert(align_list.end(), add_list.begin(), add_list.end());
1748 }
1749 CRef<CSearchResults> r(new CSearchResults(qid, align, errmsg, ancillary));
1750 retv->push_back(r);
1751 }
1752
1753 result.Reset(retv);
1754 };
1755
s_AppendResults(CRef<CSearchResultSet> & results,int num_aligns,int gene,CRef<CSearchResultSet> & final_results)1756 void CIgBlast::s_AppendResults(CRef<CSearchResultSet> &results,
1757 int num_aligns,
1758 int gene,
1759 CRef<CSearchResultSet> &final_results)
1760 {
1761 bool new_result = (final_results.Empty());
1762 if (new_result) {
1763 final_results.Reset(new CSearchResultSet());
1764 }
1765
1766 int iq = 0;
1767 ITERATE(CSearchResultSet, result, *results) {
1768
1769 CRef<CSeq_align_set> align;
1770 int actual_align = 0;
1771
1772 if ((*result)->HasAlignments()) {
1773 align.Reset(const_cast<CSeq_align_set *>
1774 (&*((*result)->GetSeqAlign())));
1775
1776 // keep only the first num_alignments
1777 if (num_aligns >= 0) {
1778 CSeq_align_set::Tdata & align_list = align->Set();
1779 if (align_list.size() > (CSeq_align_set::Tdata::size_type)num_aligns) {
1780 CSeq_align_set::Tdata::iterator it = align_list.begin();
1781 for (int i=0; i<num_aligns; ++i) ++it;
1782 align_list.erase(it, align_list.end());
1783 actual_align = num_aligns;
1784 } else {
1785 actual_align = align_list.size();
1786 }
1787 }
1788 }
1789
1790 TQueryMessages errmsg = (*result)->GetErrors();
1791 CConstRef<CSeq_id> query = (*result)->GetSeqId();
1792
1793 CIgBlastResults *ig_result;
1794 if (new_result) {
1795 // TODO maybe we need the db ancillary instead?
1796 CRef<CBlastAncillaryData> ancillary = (*result)->GetAncillaryData();
1797 ig_result = new CIgBlastResults(query, align, errmsg, ancillary);
1798 CRef<CSearchResults> r(ig_result);
1799 final_results->push_back(r);
1800 } else {
1801 while( !(*final_results)[iq].GetSeqId()->Match(*query)) ++iq;
1802 ig_result = dynamic_cast<CIgBlastResults *> (&(*final_results)[iq]);
1803 if (!align.Empty()) {
1804 CSeq_align_set::Tdata & ig_list = ig_result->SetSeqAlign()->Set();
1805 CSeq_align_set::Tdata & align_list = align->Set();
1806
1807 if (gene < 0) {
1808 // Remove duplicate seq_aligns
1809 CSeq_align_set::Tdata::iterator it = align_list.begin();
1810 while (it != align_list.end()) {
1811 if (s_SeqAlignInSet(ig_list, *it)) it = align_list.erase(it);
1812 else ++it;
1813 }
1814 }
1815
1816 if (!align_list.empty()) {
1817 ig_list.insert(ig_list.end(), align_list.begin(), align_list.end());
1818 ig_result->GetErrors().Combine(errmsg);
1819 }
1820 }
1821 }
1822
1823 switch(gene) {
1824 case 0: ig_result->m_NumActualV = actual_align; break;
1825 case 1: ig_result->m_NumActualD = actual_align; break;
1826 case 2: ig_result->m_NumActualJ = actual_align; break;
1827 default: break;
1828 }
1829 }
1830 };
1831
x_SetAnnotation(vector<CRef<CIgAnnotation>> & annots,CRef<CSearchResultSet> & final_results)1832 void CIgBlast::x_SetAnnotation(vector<CRef <CIgAnnotation> > &annots,
1833 CRef<CSearchResultSet> &final_results)
1834 {
1835 int iq = 0;
1836 NON_CONST_ITERATE(CSearchResultSet, result, *final_results) {
1837 CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
1838 (const_cast<CSearchResults *>(&**result));
1839 CIgAnnotation *annot = &*(annots[iq++]);
1840 ig_result->SetIgAnnotation().Reset(annot);
1841 if (annot->m_GeneInfo[4] < 0 && m_IgOptions->m_MinJLength > 0) { //no J
1842 if ((*result)->HasAlignments()){
1843 (*result)->SetSeqAlign()->Set().clear();
1844 }
1845 }
1846 }
1847 };
1848
1849 END_SCOPE(blast)
1850 END_NCBI_SCOPE
1851
1852 /* @} */
1853