1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author:  Ning Ma
26  *
27  */
28 
29 /** @file igblast.cpp
30  * Implementation of CIgBlast.
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/blast/igblast/igblast.hpp>
35 #include <algo/blast/api/local_blast.hpp>
36 #include <algo/blast/api/bl2seq.hpp>
37 #include <algo/blast/api/remote_blast.hpp>
38 #include <algo/blast/api/objmgr_query_data.hpp>
39 #include <objtools/alnmgr/alnmap.hpp>
40 #include <objtools/alnmgr/alnvec.hpp>
41 #include <algo/blast/composition_adjustment/composition_constants.h>
42 #include <objmgr/object_manager.hpp>
43 
44 
45 /** @addtogroup AlgoBlast
46  *
47  * @{
48  */
49 
50 BEGIN_NCBI_SCOPE
51 USING_SCOPE(objects);
52 BEGIN_SCOPE(blast)
53 
54 static int max_allowed_VJ_distance_with_D = 225;
55 static int max_allowed_VJ_distance_without_D = 50;
56 static int max_allowed_VD_distance = 120;
57 static int max_allowed_j_deletion = 32;
58 static int extend_length5end = 30;
59 static int extend_length3end = 15;
60 static int max_J_length = 70;
61 static int max_allowed_V_end_to_J_end = max_allowed_VJ_distance_with_D + max_J_length;
62 static int max_v_j_overlap = 7;
63 static int j_wordsize = 7;
64 
s_ReadLinesFromFile(const string & fn,vector<string> & lines)65 static void s_ReadLinesFromFile(const string& fn, vector<string>& lines)
66 {
67     CNcbiIfstream fs(fn.c_str(), IOS_BASE::in);
68     lines.clear();
69 
70     if (CFile(fn).Exists() && ! fs.fail()) {
71         char line[256];
72         while(true) {
73             fs.getline(line, 256);
74             if (fs.eof()) break;
75             if (line[0] == '#') continue;
76             string l(line);
77             lines.push_back(l);
78         }
79     }
80     fs.close();
81 };
82 
CIgAnnotationInfo(CConstRef<CIgBlastOptions> & ig_opt)83 CIgAnnotationInfo::CIgAnnotationInfo(CConstRef<CIgBlastOptions> &ig_opt)
84 {
85     vector<string> lines;
86 
87     // read domain info from pdm or ndm file
88     const string suffix = (ig_opt->m_IsProtein) ? ".pdm." : ".ndm.";
89     string fn(SeqDB_ResolveDbPath(ig_opt->m_IgDataPath + "/" + ig_opt->m_Origin + "/"
90                                + ig_opt->m_Origin + suffix + ig_opt->m_DomainSystem));
91     if (fn == "") {
92         NCBI_THROW(CBlastException,  eInvalidArgument,
93               "Domain annotation data file could not be found in [internal_data] directory");
94     }
95     s_ReadLinesFromFile(fn, lines);
96     int index = 0;
97     ITERATE(vector<string>, l, lines) {
98         vector<string> tokens;
99         NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
100         if (!tokens.empty()) {
101             m_DomainIndex[tokens[0]] = index;
102             for (int i=1; i<11; ++i) {
103                 m_DomainData.push_back(NStr::StringToInt(tokens[i]));
104             }
105             index += 10;
106             m_DomainChainType[tokens[0]] = tokens[11];
107             int frame = NStr::StringToInt(tokens[12]);
108             if (frame != -1) {
109                 m_FrameOffset[tokens[0]] = frame;
110             }
111         }
112     }
113 
114     // read frame info from aux files
115     if (ig_opt->m_IsProtein) return;
116     fn = ig_opt->m_AuxFilename;
117     s_ReadLinesFromFile(fn, lines);
118     if (lines.size() == 0) {
119         ERR_POST(Warning << "Auxilary data file could not be found");
120     }
121     ITERATE(vector<string>, l, lines) {
122         vector<string> tokens;
123         NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
124         if (!tokens.empty()) {
125             int frame = NStr::StringToInt(tokens[1]);
126             if (frame != -1) {
127                 m_FrameOffset[tokens[0]] = frame;
128             }
129             if (tokens.size() == 3) { //just backward compatible as there was no such field
130                 m_DJChainType[tokens[0]] = tokens[2];
131             } else if (tokens.size() == 4) { //just backward compatible as there was no such field
132                 m_DJChainType[tokens[0]] = tokens[2];
133                 m_JDomainInfo[tokens[0]] = NStr::StringToInt(tokens[3]);
134             }  else if (tokens.size() == 5) { //just backward compatible as there was no such field
135                 m_DJChainType[tokens[0]] = tokens[2];
136                 m_JDomainInfo[tokens[0]] = NStr::StringToInt(tokens[3]);
137                 m_Fwr4EndOffset[tokens[0]] = NStr::StringToInt(tokens[4]);
138             }
139 
140         }
141     }
142 };
143 
x_ScreenByAlignLength(CRef<CSearchResultSet> & results,int length)144 void CIgBlast::x_ScreenByAlignLength(CRef<CSearchResultSet> & results, int length){
145     NON_CONST_ITERATE(CSearchResultSet, result, *results) {
146         if ((*result)->HasAlignments()) {
147             CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
148             CSeq_align_set::Tdata::iterator it = align_list.begin();
149             while (it != align_list.end()) {
150                 if((int)((*it)->GetAlignLength()) - (int)((*it)->GetTotalGapCount(0)) < length){
151                     it = align_list.erase(it);
152                 } else {
153                     ++it;
154                 }
155             }
156         }
157     }
158 }
159 
160 
x_ExtendAlign5end(CRef<CSearchResultSet> & results)161 void CIgBlast::x_ExtendAlign5end(CRef<CSearchResultSet> & results){
162 
163     NON_CONST_ITERATE(CSearchResultSet, result, *results) {
164 
165         if ((*result)->HasAlignments()) {
166             CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
167             int desired_len = 0;
168             int actual_len = 0;
169             int top_hit_actual_len = 0;
170             int count = 0;
171             ENa_strand extend_strand = eNa_strand_plus;
172             int highest_score = 0;
173 
174             NON_CONST_ITERATE(CSeq_align_set::Tdata, align, align_list) {
175 
176                 // cerr << "before=" << MSerial_AsnText << **align << endl;
177 
178                 //extend germline match up to some positions at 5' end.  Extend length is
179                 //set by comparing to top hit or top hit equivalents
180 
181                 int score = 0;
182                 (*align)->GetNamedScore(CSeq_align::eScore_Score, score);
183 
184                 if (score >= highest_score)  { //top hits
185                     highest_score = score;
186                     extend_strand = (*align)->GetSeqStrand(0);
187                     desired_len = min(extend_length5end,
188                                       (*align)->GetSegs().GetDenseg().GetStarts()[1]);
189 
190                     if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
191                         int query_len = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength();
192                         int allowed_len = min ((*align)->GetSegs().GetDenseg().GetStarts()[1],
193                                                query_len - ((*align)->GetSegs().GetDenseg().GetStarts()[0] +
194                                                             (int)(*align)->GetSegs().GetDenseg().GetLens()[0]));
195                         top_hit_actual_len = min(desired_len, allowed_len);
196 
197 
198                     } else {
199 
200                         top_hit_actual_len = min(desired_len,
201                                          min((*align)->GetSegs().GetDenseg().GetStarts()[0],
202                                              (*align)->GetSegs().GetDenseg().GetStarts()[1]));
203 
204                     }
205                 }
206 
207                 if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
208                     int query_len = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength();
209                     int allowed_len = min ((*align)->GetSegs().GetDenseg().GetStarts()[1],
210                                            query_len - ((*align)->GetSegs().GetDenseg().GetStarts()[0] +
211                                                         (int)(*align)->GetSegs().GetDenseg().GetLens()[0]));
212                     actual_len = min(top_hit_actual_len, min(desired_len, allowed_len));
213 
214 
215                 } else {
216 
217                     actual_len = min(top_hit_actual_len, min(desired_len,
218                                                              min((*align)->GetSegs().GetDenseg().GetStarts()[0],
219                                                                  (*align)->GetSegs().GetDenseg().GetStarts()[1])));
220 
221                 }
222 
223                 count ++;
224                 //only extend if it has the same strand as the top hit
225                 if (actual_len > 0 && (*align)->GetSeqStrand(0) == extend_strand) {
226                     if (extend_strand == eNa_strand_minus) {
227 
228                         (*align)->SetSegs().SetDenseg().SetStarts()[1] -= actual_len;
229                         (*align)->SetSegs().SetDenseg().SetLens()[0] += actual_len;
230 
231                     } else {
232 
233                         (*align)->SetSegs().SetDenseg().SetStarts()[0] -= actual_len;
234                         (*align)->SetSegs().SetDenseg().SetStarts()[1] -= actual_len;
235                         (*align)->SetSegs().SetDenseg().SetLens()[0] += actual_len;
236 
237                     }
238                 }
239                 // cerr << "after=" << MSerial_AsnText << **align << endl;
240             }
241 
242         }
243     }
244 }
245 
x_ExtendAlign3end(CRef<CSearchResultSet> & results)246 void CIgBlast::x_ExtendAlign3end(CRef<CSearchResultSet> & results){
247 
248     NON_CONST_ITERATE(CSearchResultSet, result, *results) {
249 
250         if ((*result)->HasAlignments()) {
251             CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
252             int desired_len = 0;
253             int actual_len = 0;
254             int top_hit_actual_len = 0;
255             ENa_strand extend_strand = eNa_strand_plus;
256             int highest_score = 0;
257 
258             NON_CONST_ITERATE(CSeq_align_set::Tdata, align, align_list) {
259 
260                 // cerr << "before=" << MSerial_AsnText << **align << endl;
261 
262                 //extend germline match up to some positions at 5' end.  Extend length is
263                 //set by comparing to top hit or top hit equivalents
264 
265                 int score = 0;
266                 (*align)->GetNamedScore(CSeq_align::eScore_Score, score);
267 
268                 if (score >= highest_score)  { //top hits
269                     highest_score = score;
270                     extend_strand = (*align)->GetSeqStrand(0);
271 
272                     int j_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(1)).GetBioseqLength() - 1;
273                     int j_align_stop = (*align)->GetSegs().GetDenseg().GetSeqStop(1);
274                     desired_len = min(extend_length3end,
275                                       j_stop - j_align_stop);
276 
277                     if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
278 
279                         int query_align_start = (*align)->GetSegs().GetDenseg().GetSeqStart(0);
280                         int allowed_query_length = query_align_start;
281 
282                         top_hit_actual_len = min(desired_len, allowed_query_length);
283                     } else {
284                         int query_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength() - 1;
285                         int allowed_query_length = query_stop - (*align)->GetSegs().GetDenseg().GetSeqStop(0);
286                         top_hit_actual_len = min(desired_len, allowed_query_length);
287 
288                     }
289                 }
290 
291                 if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
292 
293                     int query_align_start = (*align)->GetSegs().GetDenseg().GetSeqStart(0);
294                     int allowed_query_length = query_align_start;
295                     actual_len = min(allowed_query_length, top_hit_actual_len);
296 
297                 } else {
298                     int query_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength() - 1;
299                     int allowed_query_length = query_stop - (*align)->GetSegs().GetDenseg().GetSeqStop(0);
300                     actual_len = min(top_hit_actual_len, allowed_query_length);
301 
302                 }
303 
304                 //only extend if it has the same strand as the top hit
305                 if (actual_len > 0 && (*align)->GetSeqStrand(0) == extend_strand) {
306                     if (extend_strand == eNa_strand_minus) {
307 
308                       int num_seg = (*align)->GetSegs().GetDenseg().GetNumseg();
309                       int num_dim = (*align)->GetSegs().GetDenseg().GetDim();
310                       (*align)->SetSegs().SetDenseg().SetStarts()[num_seg*num_dim - 2] -= actual_len;
311                       (*align)->SetSegs().SetDenseg().SetLens()[num_seg-1] += actual_len;
312 
313                     } else {
314                         int num_seg = (*align)->GetSegs().GetDenseg().GetNumseg();
315                         (*align)->SetSegs().SetDenseg().SetLens()[num_seg-1] += actual_len;
316 
317                     }
318                 }
319                 // cerr << "after=" << MSerial_AsnText << **align << endl;
320             }
321 
322         }
323     }
324 }
325 
326 CRef<CSearchResultSet>
Run()327 CIgBlast::Run()
328 {
329     vector<CRef <CIgAnnotation> > annots;
330     CRef<CSearchResultSet> final_results;
331     CRef<IQueryFactory> qf;
332     CRef<CBlastOptionsHandle> opts_hndl(CBlastOptionsFactory
333            ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
334     CRef<CSearchResultSet> results[4], result;
335 
336     /*** search V germline */
337     {
338         x_SetupVSearch(qf, opts_hndl);
339         CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[0]);
340          blast.SetNumberOfThreads(m_NumThreads);
341         results[0] = blast.Run();
342         if (m_IgOptions->m_ExtendAlign5end){
343             x_ExtendAlign5end(results[0]);
344         }
345         x_ScreenByAlignLength(results[0], m_IgOptions->m_MinVLength);
346         x_ConvertResultType(results[0]);
347         s_SortResultsByEvalue(results[0]);
348         x_AnnotateV(results[0], annots);
349     }
350 
351     /*** search internal V for domain annotation */
352     {
353         //restore default settings for internal db search
354         if (m_IgOptions->m_IsProtein) {
355             opts_hndl->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
356         } else {
357             opts_hndl->SetOptions().SetMismatchPenalty(-1);
358             opts_hndl->SetOptions().SetWordSize(9);
359             opts_hndl->SetOptions().SetGapOpeningCost(4);
360             opts_hndl->SetOptions().SetGapExtensionCost(1);
361         }
362         opts_hndl->SetEvalueThreshold(20);
363         opts_hndl->SetHitlistSize(20);  // use a larger number to ensure annotation
364         CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[3]);
365         blast.SetNumberOfThreads(m_NumThreads);
366         results[3] = blast.Run();
367         if (m_IgOptions->m_ExtendAlign5end){
368             x_ExtendAlign5end(results[3]);
369         }
370         x_ScreenByAlignLength(results[3], m_IgOptions->m_MinVLength);
371         s_SortResultsByEvalue(results[3]);
372         x_AnnotateDomain(results[0], results[3], annots);
373     }
374 
375     opts_hndl.Reset(CBlastOptionsFactory
376                      ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
377 
378     /*** search DJ germline */
379     int num_genes =  (m_IgOptions->m_IsProtein) ? 1 : 3;
380     if (num_genes > 1) {
381 
382         for (int gene = 1; gene < num_genes; ++gene) {
383             x_SetupDJSearch(annots, qf, opts_hndl, gene);
384             CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[gene]);
385             try {
386                 blast.SetNumberOfThreads(m_NumThreads);
387                 results[gene] = blast.Run();
388                 if (gene == 2){
389                     if (m_IgOptions->m_ExtendAlign3end){
390                         x_ExtendAlign3end(results[gene]);
391                     }
392                     x_ScreenByAlignLength(results[gene], m_IgOptions->m_MinJLength);
393                 }
394                 x_ConvertResultType(results[gene]);
395             } catch(...) {
396                 num_genes = 1;
397                 break;
398             }
399         }
400         x_ProcessDJResult(results[0], results[1], results[2], annots);
401 
402         if (m_IgOptions->m_DetectOverlap && m_IgOptions->m_J_penalty == -3 && m_IgOptions->m_D_penalty == -4){
403             x_AnnotateDJ(results[1], results[2],  annots);
404         } else {
405             x_AnnotateJ(results[2],  annots);
406             //redo d gene search and not allow dj overlap
407             x_SetupNoOverlapDSearch(annots, results[1], qf, opts_hndl, 1);
408             CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[1]);
409             try {
410                 blast.SetNumberOfThreads(m_NumThreads);
411                 results[1] = blast.Run();
412 
413                 x_ConvertResultType(results[1]);
414             } catch(...) {
415                 cerr << "blast failed" << endl;
416             }
417             x_ProcessDGeneResult(results[0], results[1], results[2],annots);
418             x_AnnotateD(results[1], annots);
419         }
420     }
421 
422     /*** collect germline search results */
423     for (int gene = 0; gene  < num_genes; ++gene) {
424         s_AppendResults(results[gene], m_IgOptions->m_NumAlign[gene], gene, final_results);
425     }
426 
427     /*** search user specified db */
428     bool skipped = false;
429     if (m_IsLocal) {
430         if (&(*m_LocalDb) != &(*(m_IgOptions->m_Db[0]))) {
431             x_SetupDbSearch(annots, qf);
432             CLocalBlast blast(qf, m_Options, m_LocalDb);
433             blast.SetNumberOfThreads(m_NumThreads);
434             result = blast.Run();
435         } else {
436             skipped = true;
437         }
438     } else {
439         x_SetupDbSearch(annots, qf);
440         CRef<CRemoteBlast> blast;
441         if (m_RemoteDb.NotEmpty()) {
442             _ASSERT(m_Subject.Empty());
443             blast.Reset(new CRemoteBlast(qf, m_Options, *m_RemoteDb));
444             if(m_EntrezQuery != NcbiEmptyString){
445                 blast->SetEntrezQuery(m_EntrezQuery.c_str());
446             }
447         } else {
448             blast.Reset(new CRemoteBlast(qf, m_Options, m_Subject));
449         }
450         blast->Submit();
451         m_RID=blast->GetRID();
452         GetDiagContext().Extra().Print("RID", m_RID);
453         result = blast->GetResultSet();
454     }
455     if (! skipped) {
456         x_ConvertResultType(result);
457         s_SortResultsByEvalue(result);
458         s_AppendResults(result, -1, -1, final_results);
459     }
460 
461     /*** set chain type info */
462     x_SetChainType(final_results, annots);
463 
464     /*** attach annotation info back to the results */
465     x_SetAnnotation(annots, final_results);
466 
467     return final_results;
468 };
469 
470 // Compare two seqaligns according to their evalue and coverage and name
471 //compare name since blast does not guarantee order of same score hits
s_CompareSeqAlignByScoreAndName(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)472 static bool s_CompareSeqAlignByScoreAndName(const CRef<CSeq_align> &x, const CRef<CSeq_align> &y)
473 {
474     int sx, sy;
475     x->GetNamedScore(CSeq_align::eScore_Score, sx);
476     y->GetNamedScore(CSeq_align::eScore_Score, sy);
477     if (sx != sy) return (sx > sy);
478 
479     sx = x->GetAlignLength();
480     sy = y->GetAlignLength();
481     if (sx != sy) {
482         return (sx >= sy);
483     }
484 
485     string x_id = NcbiEmptyString;
486     string y_id = NcbiEmptyString;
487     x->GetSeq_id(1).GetLabel(&x_id, CSeq_id::eContent);
488     y->GetSeq_id(1).GetLabel(&y_id, CSeq_id::eContent);
489     return (x_id < y_id);
490 
491 };
492 
493 
494 
x_SetupVSearch(CRef<IQueryFactory> & qf,CRef<CBlastOptionsHandle> & opts_hndl)495 void CIgBlast::x_SetupVSearch(CRef<IQueryFactory>       &qf,
496                               CRef<CBlastOptionsHandle> &opts_hndl)
497 {
498     CBlastOptions & opts = opts_hndl->SetOptions();
499     if (m_IgOptions->m_IsProtein) {
500         opts.SetCompositionBasedStats(eNoCompositionBasedStats);
501     } else {
502         int penalty = m_IgOptions->m_V_penalty;
503         opts.SetMatchReward(1);
504         opts.SetMismatchPenalty(penalty);
505         opts.SetWordSize(m_Options->GetOptions().GetWordSize());
506         if (penalty == -1) {
507             opts.SetGapOpeningCost(4);
508             opts.SetGapExtensionCost(1);
509         }
510     }
511     opts_hndl->SetEvalueThreshold(m_Options->GetOptions().GetEvalueThreshold());
512     opts_hndl->SetFilterString("F");
513     opts_hndl->SetHitlistSize(15+ m_IgOptions->m_NumAlign[0]);
514     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
515 
516 };
517 
x_SetupDJSearch(const vector<CRef<CIgAnnotation>> & annots,CRef<IQueryFactory> & qf,CRef<CBlastOptionsHandle> & opts_hndl,int db_type)518 void CIgBlast::x_SetupDJSearch(const vector<CRef <CIgAnnotation> > &annots,
519                                CRef<IQueryFactory>           &qf,
520                                CRef<CBlastOptionsHandle>     &opts_hndl,
521                                int db_type)
522 {
523     // Only igblastn will search DJ
524     CBlastOptions & opts = opts_hndl->SetOptions();
525     opts.SetMatchReward(1);
526     if (db_type == 2){ //J genes are longer so if can afford more reliable identification
527         opts.SetWordSize(j_wordsize);
528         opts.SetMismatchPenalty(m_IgOptions->m_J_penalty);
529     } else {
530         opts.SetWordSize(m_IgOptions->m_Min_D_match);
531         opts.SetMismatchPenalty(m_IgOptions->m_D_penalty);
532     }
533 
534     opts.SetGapOpeningCost(5);
535     opts.SetGapExtensionCost(2);
536     opts_hndl->SetEvalueThreshold((db_type == 2) ? 1000.0 : 100000.0);
537     opts_hndl->SetFilterString("F");
538     opts_hndl->SetHitlistSize(max(max(50,
539                m_IgOptions->m_NumAlign[1]),
540                m_IgOptions->m_NumAlign[2]));
541 
542     // Mask query for D, J search
543     int iq = 0;
544     ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
545         CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
546         CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
547         int len = query->GetLength();
548         if ((*annot)->m_GeneInfo[0] == -1) {
549             // This is not a germline sequence.  Mask it out
550             TMaskedQueryRegions mask_list;
551             CRef<CSeqLocInfo> mask(
552                   new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
553             mask_list.push_back(mask);
554             m_Query->SetMaskedRegions(iq, mask_list);
555         } else {
556             // Excluding the V gene except the last max_v_j_overlap bp for D and J gene search;
557             // also limit the J match to max_allowed_V_end_to_J_end beyond V gene.
558             int v_overlap;
559             if (m_IgOptions->m_DetectOverlap && m_IgOptions->m_J_penalty == -3 && m_IgOptions->m_D_penalty == -4) {
560                 v_overlap = max_v_j_overlap;
561             } else {
562                 v_overlap = 0;
563             }
564             bool ms = (*annot)->m_MinusStrand;
565             int begin = (ms)?
566               (*annot)->m_GeneInfo[0] - max_allowed_V_end_to_J_end: (*annot)->m_GeneInfo[1] - 1 - v_overlap;
567             int end = (ms)?
568               (*annot)->m_GeneInfo[0] + v_overlap: (*annot)->m_GeneInfo[1] + max_allowed_V_end_to_J_end;
569             if (begin > 0 && begin <= len-1) {
570                 CRef<CSeqLocInfo> mask(
571                   new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin), 0));
572                 m_Query->AddMask(iq, mask);
573             }
574             if (end < len -1 && end >= 0) {
575                 CRef<CSeqLocInfo> mask(
576                   new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
577                 m_Query->AddMask(iq, mask);
578             }
579         }
580         ++iq;
581     }
582 
583     // Generate query factory
584     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
585 };
586 
587 
x_SetupNoOverlapDSearch(const vector<CRef<CIgAnnotation>> & annots,CRef<CSearchResultSet> & previous_d_results,CRef<IQueryFactory> & qf,CRef<CBlastOptionsHandle> & opts_hndl,int db_type)588 void CIgBlast::x_SetupNoOverlapDSearch(const vector<CRef <CIgAnnotation> > &annots,
589                                        CRef<CSearchResultSet>        &previous_d_results,
590                                        CRef<IQueryFactory>           &qf,
591                                        CRef<CBlastOptionsHandle>     &opts_hndl,
592                                        int db_type)
593 {
594     // Only igblastn will search DJ
595     CBlastOptions & opts = opts_hndl->SetOptions();
596     opts.SetMatchReward(1);
597     opts.SetWordSize(m_IgOptions->m_Min_D_match);
598     opts.SetMismatchPenalty(m_IgOptions->m_D_penalty);
599     opts.SetGapOpeningCost(5);
600     opts.SetGapExtensionCost(2);
601     opts_hndl->SetEvalueThreshold(100000.0);
602     opts_hndl->SetFilterString("F");
603     opts_hndl->SetHitlistSize(max(max(50,
604                                       m_IgOptions->m_NumAlign[1]),
605                                   m_IgOptions->m_NumAlign[2]));
606 
607     // Mask query for D
608     int iq = 0;
609     ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
610         CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
611         CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
612         int len = query->GetLength();
613         CRef<CSeq_align_set> align_d(0);
614         if ((*previous_d_results)[iq].HasAlignments()){
615             align_d =  (*previous_d_results)[iq].SetSeqAlign();
616         }
617 
618         if ((*annot)->m_GeneInfo[0] == -1 || !align_d || align_d.Empty() || align_d->IsEmpty()) {
619             // This is not a ig sequence or there is no d gene per previous search.  Mask it out
620             TMaskedQueryRegions mask_list;
621             CRef<CSeqLocInfo> mask(
622                   new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
623             mask_list.push_back(mask);
624             m_Query->SetMaskedRegions(iq, mask_list);
625         } else {
626             // Excluding the V gene and J gene
627             bool ms = (*annot)->m_MinusStrand;
628             int v_end_or_j_begin = (ms)?
629                 max((*annot)->m_GeneInfo[0] - max_allowed_V_end_to_J_end, (*annot)->m_GeneInfo[5] - 1): (*annot)->m_GeneInfo[1] -1;
630             int j_begin_or_v_end = (ms)?
631                 (*annot)->m_GeneInfo[0]: min((*annot)->m_GeneInfo[4], (*annot)->m_GeneInfo[1] + max_allowed_V_end_to_J_end);
632             if (v_end_or_j_begin > 0) {
633                 CRef<CSeqLocInfo> mask(
634                   new CSeqLocInfo(new CSeq_interval(*q_id, 0, v_end_or_j_begin), 0));
635                 m_Query->AddMask(iq, mask);
636             }
637             if (j_begin_or_v_end < len-1 && j_begin_or_v_end > 0) {
638                 CRef<CSeqLocInfo> mask(
639                   new CSeqLocInfo(new CSeq_interval(*q_id, j_begin_or_v_end, len-1), 0));
640                 m_Query->AddMask(iq, mask);
641             }
642         }
643         ++iq;
644     }
645 
646     // Generate query factory
647     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
648 };
649 
x_SetupDbSearch(vector<CRef<CIgAnnotation>> & annots,CRef<IQueryFactory> & qf)650 void CIgBlast::x_SetupDbSearch(vector<CRef <CIgAnnotation> > &annots,
651                                CRef<IQueryFactory>           &qf)
652 {
653     // Options already passed in as m_Options.  Only set up (mask) the query
654     int iq = 0;
655     ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
656         CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
657         CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
658         int len = query->GetLength();
659         TMaskedQueryRegions mask_list;
660         if ((*annot)->m_GeneInfo[0] ==-1) {
661             // This is not a germline sequence.  Mask it out
662             CRef<CSeqLocInfo> mask(
663                       new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
664             mask_list.push_back(mask);
665         } else if (m_IgOptions->m_FocusV) {
666             // Restrict to V gene
667             int begin = (*annot)->m_GeneInfo[0];
668             int end = (*annot)->m_GeneInfo[1];
669             if (begin > 0) {
670                 CRef<CSeqLocInfo> mask(
671                       new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin-1), 0));
672                 mask_list.push_back(mask);
673             }
674             if (end < len) {
675                 CRef<CSeqLocInfo> mask(
676                       new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
677                 mask_list.push_back(mask);
678             }
679         }
680         m_Query->SetMaskedRegions(iq, mask_list);
681         ++iq;
682     }
683     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
684 };
685 
686 // Compare the second seqalign to see if it is as good as the first one
s_IsSeqAlignAsGood(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)687 static bool s_IsSeqAlignAsGood(const CRef<CSeq_align> &x,
688                                const CRef<CSeq_align> &y)
689 {
690     double sx, sy;
691     x->GetNamedScore(CSeq_align::eScore_EValue, sx);
692     y->GetNamedScore(CSeq_align::eScore_EValue, sy);
693     if (sx < 0.999999 * sy || sy < 0.999999 * sx) return false;
694     int ix, iy;
695     x->GetNamedScore(CSeq_align::eScore_Score, ix);
696     y->GetNamedScore(CSeq_align::eScore_Score, iy);
697     if (ix > iy) return false;
698     int dx, dy;
699     dx = x->GetAlignLength();
700     dy = y->GetAlignLength();
701     return (dx <= dy);
702 }
703 
704 // Remove lcl| from seqid label
s_RemoveLocalPrefix(const string & sid)705 static string s_RemoveLocalPrefix(const string & sid)
706 {
707     if (sid.substr(0, 4) == "lcl|") return(sid.substr(4, sid.length()));
708     return sid;
709 }
710 
s_MakeTopHitsId(const CSeq_align_set::Tdata & align_list,int num_align)711 static string s_MakeTopHitsId(const CSeq_align_set::Tdata& align_list, int num_align) {
712     string ids = NcbiEmptyString;
713     CRef<CSeq_align> align = align_list.front();
714     int count = 0;
715     ITERATE(CSeq_align_set::Tdata, it, align_list) {
716 
717         if (count < num_align && s_IsSeqAlignAsGood(align, (*it))) {
718             string this_id = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
719             if (ids.find(this_id) == string::npos) {
720 
721                 //no redundant id
722                 if (ids != NcbiEmptyString) {
723                     ids += ",";
724                 }
725                 ids += this_id;
726                 count ++;
727             }
728         } else {
729             break;
730         }
731     }
732     return ids;
733 }
734 
x_AnnotateV(CRef<CSearchResultSet> & results,vector<CRef<CIgAnnotation>> & annots)735 void CIgBlast::x_AnnotateV(CRef<CSearchResultSet>        &results,
736                            vector<CRef <CIgAnnotation> > &annots)
737 {
738     ITERATE(CSearchResultSet, result, *results) {
739 
740         CIgAnnotation *annot = new CIgAnnotation();
741         annots.push_back(CRef<CIgAnnotation>(annot));
742 
743         if ((*result)->HasAlignments()) {
744             const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
745             CRef<CSeq_align> align = align_list.front();
746             annot->m_GeneInfo[0] = align->GetSeqStart(0);
747             annot->m_GeneInfo[1] = align->GetSeqStop(0)+1;
748             annot->m_TopGeneIds[0] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[0]);
749         }
750     }
751 };
752 
753 // Test if the alignment is already in the align_list
s_SeqAlignInSet(CSeq_align_set::Tdata & align_list,CRef<CSeq_align> & align)754 static bool s_SeqAlignInSet(CSeq_align_set::Tdata & align_list, CRef<CSeq_align> &align)
755 {
756     ITERATE(CSeq_align_set::Tdata, it, align_list) {
757         if ((*it)->GetSeq_id(1).Match(align->GetSeq_id(1)) &&
758             (*it)->GetSeqStart(1) == align->GetSeqStart(1) &&
759             (*it)->GetSeqStop(1) == align->GetSeqStop(1)) return true;
760     }
761     return false;
762 };
763 
764 // Compare two seqaligns according to their evalue and coverage
s_CompareSeqAlignByEvalue(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)765 static bool s_CompareSeqAlignByEvalue(const CRef<CSeq_align> &x,
766                                       const CRef<CSeq_align> &y)
767 {
768     double sx, sy;
769     x->GetNamedScore(CSeq_align::eScore_EValue, sx);
770     y->GetNamedScore(CSeq_align::eScore_EValue, sy);
771     if (sx < 0.999999 * sy) return true;
772     if (sy < 0.999999 * sx) return false;
773     int ix, iy;
774     x->GetNamedScore(CSeq_align::eScore_Score, ix);
775     y->GetNamedScore(CSeq_align::eScore_Score, iy);
776     if (ix != iy) return (ix > iy);
777 
778     int dx, dy;
779     dx = x->GetAlignLength();
780     dy = y->GetAlignLength();
781     if (dx != dy) {
782         return (dx >= dy);
783     }
784     string x_id = NcbiEmptyString;
785     string y_id = NcbiEmptyString;
786     x->GetSeq_id(1).GetLabel(&x_id, CSeq_id::eContent);
787     y->GetSeq_id(1).GetLabel(&y_id, CSeq_id::eContent);
788     return (x_id < y_id);
789 };
790 
791 // Compare two seqaligns according to their evalue and coverage
s_CompareSeqAlignByScore(const CRef<CSeq_align> & x,const CRef<CSeq_align> & y)792 static bool s_CompareSeqAlignByScore(const CRef<CSeq_align> &x, const CRef<CSeq_align> &y)
793 {
794     int sx, sy;
795     x->GetNamedScore(CSeq_align::eScore_Score, sx);
796     y->GetNamedScore(CSeq_align::eScore_Score, sy);
797     if (sx != sy) return (sx > sy);
798     sx = x->GetAlignLength();
799     sy = y->GetAlignLength();
800     return (sx >= sy);
801 
802 };
803 
804 
805 
806 // Test if D and J annotation not compatible
s_DJNotCompatible(const CSeq_align & d,const CSeq_align & j,bool ms,int margin)807 static bool s_DJNotCompatible(const CSeq_align &d, const CSeq_align &j, bool ms, int margin)
808 {
809     int ds = d.GetSeqStart(0);
810     int de = d.GetSeqStop(0);
811     int js = j.GetSeqStart(0);
812     int je = j.GetSeqStop(0);
813 
814     //D gene needs to have minimal match in addition to overlap with J gene
815     //D gene needs to end before J gene ends
816     if (ms) {
817         if (ds < js || de < je + margin) return true;
818     } else {
819         if (ds > js - margin || de > je) return true;
820     }
821     return false;
822 };
823 
824 /*
825 static bool s_IsTopMatchJD(CSearchResults& res_J, CIgAnnotationInfo& annotation_info){
826     bool result = true; //default
827     CRef<CSeq_align_set> align_J;
828     if (res_J.HasAlignments()) {
829         align_J.Reset(const_cast<CSeq_align_set *>
830                       (&*(res_J.GetSeqAlign())));
831         CSeq_align_set::Tdata & align_list = align_J->Set();
832         CSeq_align_set::Tdata::iterator it = align_list.begin();
833         int prev_score = 0;
834         result = false;
835         while (it != align_list.end()) {
836             int current_score;
837             (*it)->GetNamedScore(CSeq_align::eScore_Score, current_score);
838             if(current_score >= prev_score){
839                 string j_id;
840                 (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
841                 string j_chain_type = annotation_info.GetDJChainType(j_id);
842                 if (j_chain_type == "N/A"){
843                     //assume J gene id style
844 
845                     string sid = NStr::ToUpper(j_id);
846                     if (sid.substr(0, 2) == "TR" && sid[3] == 'J') {
847                         j_chain_type = "J" + sid.substr(2,1);
848                     } else if (sid[0] == 'J') {
849                         j_chain_type = sid.substr(0,2);
850                     }
851                 }
852                 if (j_chain_type == "JD"){
853                     result = true;
854                     break;
855                 }
856 
857             } else {
858                 break;
859             }
860             prev_score = current_score;
861             ++it;
862         }
863 
864     }
865     return result;
866 };
867 */
x_FindDJAln(CRef<CSeq_align_set> & align_D,CRef<CSeq_align_set> & align_J,string q_ct,bool q_ms,ENa_strand q_st,int q_ve,int iq,bool va_or_vd_as_heavy_chain)868 void CIgBlast::x_FindDJAln(CRef<CSeq_align_set>& align_D,
869                            CRef<CSeq_align_set>& align_J,
870                            string q_ct,
871                            bool q_ms,
872                            ENa_strand q_st,
873                            int q_ve,
874                            int iq,
875                            bool va_or_vd_as_heavy_chain) {
876 
877     int allowed_VJ_distance = max_allowed_VJ_distance_with_D;
878         /* preprocess D */
879         if (align_D && !align_D->Get().empty()) {
880             CSeq_align_set::Tdata & align_list = align_D->Set();
881             CSeq_align_set::Tdata::iterator it = align_list.begin();
882             /* chain type test */
883             if (q_ct!="VH" && q_ct!="VD" && q_ct!="VA" && q_ct!="VB" ) {
884                 while (it != align_list.end()) {
885                     it = align_list.erase(it);
886                 }
887                 allowed_VJ_distance = max_allowed_VJ_distance_without_D;
888             } else if (q_ct =="VA" || q_ct =="VD") {
889                 if (va_or_vd_as_heavy_chain) {
890                     //VA could behave like VD and is allowed to rearrange to JA or DD/JD
891                     q_ct = "VD";
892                     //annot->m_ChainType[0] = "VD";
893                 } else {
894                     q_ct = "VA";
895                     while (it != align_list.end()) {
896                         it = align_list.erase(it);
897                     }
898                     allowed_VJ_distance = max_allowed_VJ_distance_without_D;
899                 }
900             }
901             //test compatability between V and D
902             it = align_list.begin();
903             while (it != align_list.end()) {
904                 bool keep = true;
905                 /* chain type test */
906                 if (q_ct!="N/A") {
907                     char s_ct = q_ct[1];
908                     string d_id;
909                     (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
910                     string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
911                     if (d_chain_type != "N/A"){
912                         if (d_chain_type[1] != q_ct[1]) keep = false;
913                     } else { //assume D gene id style
914                         string sid = (*it)->GetSeq_id(1).AsFastaString();
915                         sid = NStr::ToUpper(sid);
916                         if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
917                         if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
918                             && sid[3] == 'D') {
919                             s_ct = sid[2];
920                         }
921                         if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
922                         if (s_ct != q_ct[1]) keep = false;
923                     }
924                 }
925 
926                 /* remove failed seq_align */
927                 if (!keep) it = align_list.erase(it);
928                 else ++it;
929             }
930 
931 
932             /* strand test */
933             bool strand_found = false;
934             ITERATE(CSeq_align_set::Tdata, it, align_list) {
935                 if ((*it)->GetSeqStrand(0) == q_st) {
936                     strand_found = true;
937                     break;
938                 }
939             }
940             if (strand_found) {
941                 it = align_list.begin();
942                 while (it != align_list.end()) {
943                     if ((*it)->GetSeqStrand(0) != q_st) {
944                         it = align_list.erase(it);
945                     } else ++it;
946                 }
947             }
948             /* v end test */
949             it = align_list.begin();
950             while (it != align_list.end()) {
951                 bool keep = false;
952                 int q_ds = (*it)->GetSeqStart(0);
953                 int q_de = (*it)->GetSeqStop(0);
954                 if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - m_IgOptions->m_Min_D_match);
955                 else      keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + m_IgOptions->m_Min_D_match);
956                 if (!keep) it = align_list.erase(it);
957                 else ++it;
958             }
959             /* sort according to score */
960             align_list.sort(s_CompareSeqAlignByScoreAndName);
961         }
962 
963         /* preprocess J */
964         if (align_J && !align_J->Get().empty()) {
965             CSeq_align_set::Tdata & align_list = align_J->Set();
966             CSeq_align_set::Tdata::iterator it = align_list.begin();
967             while (it != align_list.end()) {
968                 bool keep = true;
969                 /* chain type test */
970                 if (q_ct!="N/A") {
971                     char s_ct = q_ct[1];
972                     string j_id;
973                     (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
974                     string j_chain_type = m_AnnotationInfo.GetDJChainType(j_id);
975                     if (j_chain_type != "N/A"){
976                         if (j_chain_type[1] != q_ct[1]) keep = false;
977                     } else { //assume J gene id style
978                         string sid = (*it)->GetSeq_id(1).AsFastaString();
979                         sid = NStr::ToUpper(sid);
980                         if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
981                         if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
982                             && sid[3] == 'J') {
983                             s_ct = sid[2];
984                         } else if (sid[0] == 'J') {
985                             s_ct = sid[1];
986                         }
987                         if (s_ct!='H' && s_ct!='L' && s_ct!='K' &&
988                             s_ct!='A' && s_ct!='B' && s_ct!='D' && s_ct!='G') s_ct = q_ct[1];
989                         if (s_ct != q_ct[1]) keep = false;
990                     }
991                 } else {
992                     keep = false;
993                 }
994                 /* strand test */
995                 if ((*it)->GetSeqStrand(0) != q_st) keep = false;
996                 /* subject start test */
997                 if ((*it)->GetSeqStart(1) > max_allowed_j_deletion) keep = false;
998                 /* v end test */
999                 int q_js = (*it)->GetSeqStart(0);
1000                 int q_je = (*it)->GetSeqStop(0);
1001                 if (q_ms) {
1002                     if (q_je < q_ve - allowed_VJ_distance  || q_js > q_ve - j_wordsize) keep = false;
1003                 } else {
1004                     if (q_js > q_ve + allowed_VJ_distance || q_je < q_ve + j_wordsize) keep = false;
1005                 }
1006                 /* remove failed seq_align */
1007                 if (!keep) it = align_list.erase(it);
1008                 else ++it;
1009             }
1010             /* sort according to score */
1011             align_list.sort(s_CompareSeqAlignByScoreAndName);
1012         }
1013 
1014         /* which one to keep, D or J? */
1015         if (align_D.NotEmpty() && !align_D->IsEmpty() &&
1016             align_J.NotEmpty() && !align_J->IsEmpty()) {
1017             CSeq_align_set::Tdata & al_D = align_D->Set();
1018             CSeq_align_set::Tdata & al_J = align_J->Set();
1019             CSeq_align_set::Tdata::iterator it;
1020             bool keep_J = s_CompareSeqAlignByScore(*(al_J.begin()), *(al_D.begin()));
1021             if (keep_J) {
1022                 it = al_D.begin();
1023                 while (it != al_D.end()) {
1024                     if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms, m_IgOptions->m_Min_D_match)) {
1025                         it = al_D.erase(it);
1026                     } else ++it;
1027                 }
1028 
1029                 if (m_IgOptions->m_DetectOverlap && m_IgOptions->m_J_penalty == -3 &&
1030                     m_IgOptions->m_D_penalty == -4) {
1031                     //deleting j only for overlap case otherwise it's handeled later
1032                     if (align_D.NotEmpty() && !align_D->IsEmpty()){
1033                         it = al_J.begin();
1034                         while (it != al_J.end()) {
1035                             if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1036                                 it = al_J.erase(it);
1037                             } else ++it;
1038                         }
1039                     }
1040                 }
1041             } else {
1042                 it = al_J.begin();
1043 
1044                 while (it != al_J.end()) {
1045                     if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1046                         it = al_J.erase(it);
1047                     } else ++it;
1048                 }
1049                 if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1050                     it = al_D.begin();
1051                     while (it != al_D.end()) {
1052                         if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms, m_IgOptions->m_Min_D_match)) {
1053                             it = al_D.erase(it);
1054                         } else ++it;
1055                     }
1056 
1057                 }
1058             }
1059 
1060         }
1061 
1062 }
1063 
1064 
x_FindDJ(CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,CRef<CIgAnnotation> & annot,CRef<CSeq_align_set> & align_D,CRef<CSeq_align_set> & align_J,string q_ct,bool q_ms,ENa_strand q_st,int q_ve,int iq)1065 void CIgBlast::x_FindDJ(CRef<CSearchResultSet>& results_D,
1066                         CRef<CSearchResultSet>& results_J,
1067                         CRef <CIgAnnotation>& annot,
1068                         CRef<CSeq_align_set>& align_D,
1069                         CRef<CSeq_align_set>& align_J,
1070                         string q_ct,
1071                         bool q_ms,
1072                         ENa_strand q_st,
1073                         int q_ve,
1074                         int iq) {
1075 
1076     CRef<CSeq_align_set> original_align_D(new CSeq_align_set);
1077     CRef<CSeq_align_set> original_align_J(new CSeq_align_set);
1078 
1079         /* preprocess D */
1080         CSearchResults& res_D = (*results_D)[iq];
1081         if (res_D.HasAlignments()) {
1082 
1083             align_D.Reset(const_cast<CSeq_align_set *>
1084                                            (&*(res_D.GetSeqAlign())));
1085             original_align_D->Assign(*align_D);
1086 
1087         }
1088 
1089         /* preprocess J */
1090         CSearchResults& res_J = (*results_J)[iq];
1091         if (res_J.HasAlignments()) {
1092             align_J.Reset(const_cast<CSeq_align_set *>
1093                                            (&*(res_J.GetSeqAlign())));
1094             original_align_J->Assign(*align_J);
1095 
1096         }
1097         //try as VA
1098         x_FindDJAln(align_D, align_J, q_ct, q_ms, q_st, q_ve, iq, false);
1099         if ((original_align_D.NotEmpty() && !original_align_D->Get().empty()) && (q_ct =="VA" || q_ct =="VD")) {
1100 
1101             annot->m_ChainType[0] = "VA";
1102             //try as VD
1103             x_FindDJAln(original_align_D, original_align_J, q_ct, q_ms, q_st, q_ve, iq, true);
1104             int as_heavy_chain_score = 0;
1105             int as_light_chain_score = 0;
1106             int d_score = 0;
1107             if(original_align_J.NotEmpty() && !original_align_J->Get().empty()){
1108                 original_align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_heavy_chain_score);
1109             }
1110 
1111             if(original_align_D.NotEmpty() && !original_align_D->Get().empty()){
1112                 original_align_D->Get().front()->GetNamedScore(CSeq_align::eScore_Score, d_score);
1113             }
1114             if (align_J.NotEmpty() && !align_J->Get().empty()){
1115                 align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_light_chain_score);
1116             }
1117 
1118 
1119             if (as_heavy_chain_score + d_score> as_light_chain_score){
1120                 if (align_D.NotEmpty() && original_align_D.NotEmpty()){
1121                     align_D->Assign(*original_align_D);
1122                 }
1123                 if (align_J.NotEmpty() && original_align_J.NotEmpty()){
1124                     align_J->Assign(*original_align_J);
1125                 }
1126 
1127                 annot->m_ChainType[0] = "VD";
1128             }
1129 
1130         }
1131 
1132 }
1133 
x_FillJDomain(CRef<CSeq_align> & align,CRef<CIgAnnotation> & annot)1134 void CIgBlast::x_FillJDomain(CRef<CSeq_align> & align, CRef <CIgAnnotation> & annot){
1135     string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1136     int j_cdr3end = m_AnnotationInfo.GetJDomain(sid);
1137     int subject_start = align->GetSeqStart(1);
1138     int subject_end = align->GetSeqStop(1);
1139     //don't try if j starts after cdr3 ends as we don't know for sure where the boundry is
1140     if (j_cdr3end > 0 && subject_start - j_cdr3end <= 1) {
1141         CAlnMap j_map(align->GetSegs().GetDenseg());
1142 
1143         //+1 actaully is in fwr4 already...need to do this so that a insertion right in front
1144         // of fwr4 can be handled.
1145         annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1,
1146                                                         max(subject_start, min(j_cdr3end + 1,
1147                                                                                subject_end)),
1148                                                         IAlnExplorer::eRight);
1149 
1150         if (align->GetSeqStrand(0) == eNa_strand_minus) {
1151             annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1152         }
1153 
1154         //deduct one back to in CDR3
1155         if (subject_end > j_cdr3end) {
1156             annot->m_JDomain[1] --;
1157         }
1158         //allow missed alignment to the first bp and deduce the cdr3 by gapless extension backwards
1159     } else if (j_cdr3end > 0 && subject_start - j_cdr3end <= 2) {
1160         CAlnMap j_map(align->GetSegs().GetDenseg());
1161 
1162         //+1 actaully is in fwr4 already...need to do this so that a insertion right in front
1163         // of fwr4 can be handled.
1164         annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1, subject_start, IAlnExplorer::eRight);
1165 
1166         if (align->GetSeqStrand(0) == eNa_strand_minus) {
1167             annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1168         }
1169 
1170         //deduct diff back to be in CDR3
1171         if (subject_end > j_cdr3end) {
1172             annot->m_JDomain[1] = annot->m_JDomain[1] - (subject_start - j_cdr3end);
1173         }
1174     }
1175 
1176     //fwr4
1177     if (annot->m_JDomain[1] > 0) {
1178         int j_fwr4end_offset = m_AnnotationInfo.GetFwr4EndOffset(sid);
1179         annot->m_JDomain[4] = j_fwr4end_offset;
1180         if (j_fwr4end_offset >= 0) {
1181             int j_fwr4end = m_Scope->GetBioseqHandle(align->GetSeq_id(1)).GetBioseqLength() - j_fwr4end_offset - 1;
1182             CAlnMap j_map(align->GetSegs().GetDenseg());
1183 
1184             annot->m_JDomain[3] = j_map.GetSeqPosFromSeqPos(0, 1, min(j_fwr4end, subject_end), IAlnExplorer::eRight);
1185 
1186 
1187             if (align->GetSeqStrand(0) == eNa_strand_minus)  {
1188                 annot->m_JDomain[3] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[3] - 1;
1189             }
1190             //cdr3 domain at the end of alignment
1191             if (annot->m_JDomain[1] == annot->m_JDomain[3]) {
1192                 annot->m_JDomain[3] = -1;
1193             }
1194         }
1195     }
1196 
1197 
1198 }
1199 
x_ProcessDGeneResult(CRef<CSearchResultSet> & results_V,CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1200 void CIgBlast::x_ProcessDGeneResult(CRef<CSearchResultSet>& results_V,
1201                                     CRef<CSearchResultSet>& results_D,
1202                                     CRef<CSearchResultSet>& results_J,
1203                                     vector<CRef <CIgAnnotation> > &annots) {
1204 
1205     int iq = 0;
1206     NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1207         string q_ct = (*annot)->m_ChainType[0];
1208         bool q_ms = (*annot)->m_MinusStrand;
1209         ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
1210         int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
1211 
1212         CRef<CSeq_align_set> align_D (0);
1213         CSearchResults& res_D = (*results_D)[iq];
1214         if (res_D.HasAlignments()) {
1215             align_D = res_D.SetSeqAlign();
1216         }
1217 
1218         // preprocess D
1219         if (align_D && !align_D.Empty() && !align_D->IsEmpty()) {
1220             CSeq_align_set::Tdata & align_list = align_D->Set();
1221             CSeq_align_set::Tdata::iterator it = align_list.begin();
1222 
1223             //test compatability between V and D
1224             it = align_list.begin();
1225             while (it != align_list.end()) {
1226                 bool keep = true;
1227                 // chain type test
1228                 if (q_ct!="N/A") {
1229                     char s_ct = q_ct[1];
1230                     string d_id;
1231                     (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
1232                     string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
1233                     if (d_chain_type != "N/A"){
1234                         if (d_chain_type[1] != q_ct[1]) keep = false;
1235                     } else { //assume D gene id style
1236                         string sid = (*it)->GetSeq_id(1).AsFastaString();
1237                         sid = NStr::ToUpper(sid);
1238                         if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
1239                         if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
1240                             && sid[3] == 'D') {
1241                             s_ct = sid[2];
1242                         }
1243                         if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
1244                         if (s_ct != q_ct[1]) keep = false;
1245                     }
1246                 }
1247 
1248                 //remove failed seq_align
1249                 if (!keep) it = align_list.erase(it);
1250                 else ++it;
1251             }
1252 
1253 
1254             //strand test
1255             bool strand_found = false;
1256             ITERATE(CSeq_align_set::Tdata, it, align_list) {
1257                 if ((*it)->GetSeqStrand(0) == q_st) {
1258                     strand_found = true;
1259                     break;
1260                 }
1261             }
1262             if (strand_found) {
1263                 it = align_list.begin();
1264                 while (it != align_list.end()) {
1265                     if ((*it)->GetSeqStrand(0) != q_st) {
1266                         it = align_list.erase(it);
1267                     } else ++it;
1268                 }
1269             }
1270             //v end test
1271             it = align_list.begin();
1272             while (it != align_list.end()) {
1273                 bool keep = false;
1274                 int q_ds = (*it)->GetSeqStart(0);
1275                 int q_de = (*it)->GetSeqStop(0);
1276                 if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - m_IgOptions->m_Min_D_match);
1277                 else      keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + m_IgOptions->m_Min_D_match);
1278                 if (!keep) it = align_list.erase(it);
1279                 else ++it;
1280             }
1281             // sort according to score
1282             align_list.sort(s_CompareSeqAlignByScoreAndName);
1283 
1284             /* process J */
1285             CRef<CSeq_align_set> align_J (0);
1286             CSearchResults& res_J = (*results_J)[iq];
1287             if (res_J.HasAlignments()) {
1288                 align_J = res_J.SetSeqAlign();
1289             }
1290             if (align_J && align_J.NotEmpty() && !align_J->IsEmpty() && !align_list.empty()) {
1291 
1292                 CSeq_align_set::Tdata & al_J = align_J->Set();
1293                 CSeq_align_set::Tdata::iterator it = al_J.begin();
1294                 while (it != al_J.end()) {
1295                     if (s_DJNotCompatible(*(align_list.front()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1296                         it = al_J.erase(it);
1297                     } else ++it;
1298                 }
1299             }
1300         }
1301 
1302         iq ++;
1303     }
1304 }
1305 
x_ProcessDJResult(CRef<CSearchResultSet> & results_V,CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1306 void CIgBlast::x_ProcessDJResult(CRef<CSearchResultSet>& results_V,
1307                                  CRef<CSearchResultSet>& results_D,
1308                                  CRef<CSearchResultSet>& results_J,
1309                                  vector<CRef <CIgAnnotation> > &annots) {
1310 
1311     int iq = 0;
1312     NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1313         string q_ct = (*annot)->m_ChainType[0];
1314         bool q_ms = (*annot)->m_MinusStrand;
1315         ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
1316         int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
1317 
1318         CRef<CSeq_align_set> align_D, align_J;
1319 
1320         x_FindDJ( results_D, results_J, *annot, align_D, align_J, q_ct, q_ms, q_st, q_ve, iq);
1321         iq ++;
1322     }
1323 }
1324 
x_AnnotateD(CRef<CSearchResultSet> & results_D,vector<CRef<CIgAnnotation>> & annots)1325 void CIgBlast::x_AnnotateD(CRef<CSearchResultSet>        &results_D,
1326                            vector<CRef <CIgAnnotation> > &annots)
1327 {
1328 
1329     int iq = 0;
1330     NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1331 
1332         string q_ct = (*annot)->m_ChainType[0];
1333         CSearchResults& res_d = (*results_D)[iq];
1334         CConstRef<CSeq_align_set> align_D = res_d.GetSeqAlign();
1335         if (align_D && !align_D.Empty() && !align_D->IsEmpty()) {
1336             const CSeq_align_set::Tdata& align_list = align_D->Get();
1337             CRef<CSeq_align> align = align_list.front();
1338             (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
1339             (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
1340             (*annot)->m_TopGeneIds[1] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[1]);
1341         }
1342 
1343 
1344         /* next set of results */
1345         ++iq;
1346     }
1347 };
1348 
x_AnnotateJ(CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1349 void CIgBlast::x_AnnotateJ(CRef<CSearchResultSet>        &results_J,
1350                            vector<CRef <CIgAnnotation> > &annots)
1351 {
1352     int iq = 0;
1353     NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1354 
1355         string q_ct = (*annot)->m_ChainType[0];
1356         bool q_ms = (*annot)->m_MinusStrand;
1357 
1358         const CSearchResults& res_j = (*results_J)[iq];
1359         CConstRef<CSeq_align_set> align_J = res_j.GetSeqAlign();
1360 
1361         /* annotate J */
1362         if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1363             const CSeq_align_set::Tdata & align_list = align_J->Get();
1364             CRef<CSeq_align> align = align_list.front();
1365             x_FillJDomain(align, *annot);
1366             (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
1367             (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
1368             string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1369             int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1370             if (frame_offset >= 0) {
1371                 int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
1372                 (*annot)->m_FrameInfo[2] = (q_ms) ?
1373                                            align->GetSeqStop(0)  + frame_adj
1374                                          : align->GetSeqStart(0) - frame_adj;
1375             }
1376             (*annot)->m_TopGeneIds[2] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[2]);
1377         }
1378 
1379         /* next set of results */
1380         ++iq;
1381     }
1382 };
1383 
x_AnnotateDJ(CRef<CSearchResultSet> & results_D,CRef<CSearchResultSet> & results_J,vector<CRef<CIgAnnotation>> & annots)1384 void CIgBlast::x_AnnotateDJ(CRef<CSearchResultSet>        &results_D,
1385                             CRef<CSearchResultSet>        &results_J,
1386                             vector<CRef <CIgAnnotation> > &annots)
1387 {
1388     int iq = 0;
1389     NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1390 
1391         string q_ct = (*annot)->m_ChainType[0];
1392         bool q_ms = (*annot)->m_MinusStrand;
1393 
1394         const CSearchResults& res_j = (*results_J)[iq];
1395         const CSearchResults& res_d = (*results_D)[iq];
1396         CConstRef<CSeq_align_set> align_D = res_d.GetSeqAlign();
1397         CConstRef<CSeq_align_set> align_J = res_j.GetSeqAlign();
1398 
1399         /* annotate D */
1400         if (align_D.NotEmpty() && !align_D->IsEmpty()) {
1401             const CSeq_align_set::Tdata & align_list = align_D->Get();
1402             CRef<CSeq_align> align = align_list.front();
1403             (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
1404             (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
1405             (*annot)->m_TopGeneIds[1] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[1]);
1406 
1407         }
1408 
1409         /* annotate J */
1410         if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1411             const CSeq_align_set::Tdata & align_list = align_J->Get();
1412             CRef<CSeq_align> align = align_list.front();
1413             x_FillJDomain(align, *annot);
1414             (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
1415             (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
1416             string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1417             int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1418             if (frame_offset >= 0) {
1419                 int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
1420                 (*annot)->m_FrameInfo[2] = (q_ms) ?
1421                                            align->GetSeqStop(0)  + frame_adj
1422                                          : align->GetSeqStart(0) - frame_adj;
1423             }
1424             (*annot)->m_TopGeneIds[2] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[2]);
1425         }
1426 
1427         /* next set of results */
1428         ++iq;
1429     }
1430 };
1431 
1432 // query chain type and domain is annotated by germline alignment
x_AnnotateDomain(CRef<CSearchResultSet> & gl_results,CRef<CSearchResultSet> & dm_results,vector<CRef<CIgAnnotation>> & annots)1433 void CIgBlast::x_AnnotateDomain(CRef<CSearchResultSet>        &gl_results,
1434                                 CRef<CSearchResultSet>        &dm_results,
1435                                 vector<CRef <CIgAnnotation> > &annots)
1436 {
1437     CRef<CObjectManager> mgr = CObjectManager::GetInstance();
1438     CScope scope_q(*mgr), scope_s(*mgr);
1439     CRef<CSeqDB> db_V, db_domain;
1440     bool annotate_subject = false;
1441     if (m_IgOptions->m_Db[0]->IsBlastDb()) {
1442         string db_name_V = m_IgOptions->m_Db[0]->GetDatabaseName();
1443         string db_name_domain = m_IgOptions->m_Db[3]->GetDatabaseName();
1444         CSeqDB::ESeqType db_type = (m_IgOptions->m_IsProtein)?
1445                                    CSeqDB::eProtein : CSeqDB::eNucleotide;
1446         db_V.Reset(new CSeqDB(db_name_V, db_type));
1447         if (db_name_V == db_name_domain) {
1448             db_domain.Reset(&(*db_V));
1449         } else {
1450             db_domain.Reset(new CSeqDB(db_name_domain, db_type));
1451         }
1452         annotate_subject = true;
1453     }
1454 
1455     int iq = 0;
1456     ITERATE(CSearchResultSet, result, *dm_results) {
1457 
1458         CIgAnnotation *annot = &*(annots[iq]);
1459         annot->m_ChainType.push_back("N/A");  // Assuming non-ig sequence first
1460         annot->m_ChainTypeToShow = "N/A";
1461         if ((*result)->HasAlignments() && (*gl_results)[iq].HasAlignments()) {
1462 
1463 
1464             CConstRef<CSeq_align> master_align =
1465                             (*gl_results)[iq].GetSeqAlign()->Get().front();
1466             CAlnMap q_map(master_align->GetSegs().GetDenseg());
1467 
1468             if (master_align->GetSeqStrand(0) == eNa_strand_minus) {
1469                 annot->m_MinusStrand = true;
1470             }
1471 
1472             int q_ends[2], q_dir;
1473 
1474             if (annot->m_MinusStrand) {
1475                 q_ends[1] = master_align->GetSeqStart(0);
1476                 q_ends[0] = master_align->GetSeqStop(0);
1477                 q_dir = -1;
1478 
1479             } else {
1480                 q_ends[0] = master_align->GetSeqStart(0);
1481                 q_ends[1] = master_align->GetSeqStop(0);
1482                 q_dir = 1;
1483             }
1484 
1485             const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
1486 
1487             ITERATE(CSeq_align_set::Tdata, it, align_list) {
1488 
1489                 string sid = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
1490                 annot->m_ChainType[0] = m_AnnotationInfo.GetDomainChainType(sid);
1491                 annot->m_ChainTypeToShow = annot->m_ChainType[0];
1492                 int domain_info[10];
1493 
1494                 if (m_AnnotationInfo.GetDomainInfo(sid, domain_info)) {
1495 
1496 
1497                     CAlnMap s_map((*it)->GetSegs().GetDenseg());
1498                     int s_start = (*it)->GetSeqStart(1);
1499                     int s_stop = (*it)->GetSeqStop(1);
1500 
1501                     CRef<CAlnMap> d_map;
1502                     int d_start = -1;
1503                     int d_stop = -1;
1504 
1505                     int start, stop;
1506 
1507                     if (annotate_subject) {
1508                         CRef<CBioseq> seq_q = db_domain->SeqidToBioseq((*it)->GetSeq_id(1));
1509                         CBioseq_Handle hdl_q = scope_q.AddBioseq(*seq_q);
1510                         CRef<CBioseq> seq_s = db_V->SeqidToBioseq(master_align->GetSeq_id(1));
1511                         CBioseq_Handle hdl_s = scope_s.AddBioseq(*seq_s);
1512                         CSeq_loc query, subject;
1513                         query.SetWhole();
1514                         query.SetId((*it)->GetSeq_id(1));
1515                         subject.SetWhole();
1516                         subject.SetId(master_align->GetSeq_id(1));
1517                         SSeqLoc q_loc(&query, &scope_q);
1518                         SSeqLoc s_loc(&subject, &scope_s);
1519                         CBl2Seq bl2seq(q_loc, s_loc, (m_IgOptions->m_IsProtein)? eBlastp: eBlastn);
1520                         const CSearchResults& result = (*(bl2seq.RunEx()))[0];
1521                         if (result.HasAlignments()) {
1522                             CConstRef<CSeq_align> subject_align = result.GetSeqAlign()->Get().front();
1523                             d_map.Reset(new CAlnMap(subject_align->GetSegs().GetDenseg()));
1524                             d_start = subject_align->GetSeqStart(0);
1525                             d_stop = subject_align->GetSeqStop(0);
1526                         }
1527                         scope_q.RemoveBioseq(hdl_q);
1528                         scope_s.RemoveBioseq(hdl_s);
1529                     }
1530 
1531                     for (int i =0; i<10; i+=2) {
1532 
1533                         start = domain_info[i] - 1;
1534                         stop = domain_info[i+1] - 1;
1535 
1536                         if (start <= d_stop && stop >= d_start) {
1537                             int start_copy = start;
1538                             int stop_copy = stop;
1539                             if (start_copy < d_start) start_copy = d_start;
1540                             if (stop_copy > d_stop) stop_copy = d_stop;
1541                             if (start_copy <= stop_copy) {
1542                                 if (i>0 && annot->m_DomainInfo_S[i-1]>=0) {
1543                                     annot->m_DomainInfo_S[i] = annot->m_DomainInfo_S[i-1] + 1;
1544                                 } else {
1545                                     annot->m_DomainInfo_S[i] =
1546                                        d_map->GetSeqPosFromSeqPos(1, 0, start_copy, IAlnExplorer::eForward);
1547                                 }
1548                                 annot->m_DomainInfo_S[i+1] =
1549                                    d_map->GetSeqPosFromSeqPos(1, 0, stop_copy, IAlnExplorer::eBackwards);
1550                             }
1551                         }
1552 
1553                         if (start > s_stop || stop < s_start) continue;
1554 
1555                         if (start < s_start) start = s_start;
1556 
1557                         if (stop > s_stop) stop = s_stop;
1558 
1559                         if (start > stop) continue;
1560 
1561                         start = s_map.GetSeqPosFromSeqPos(0, 1, start, IAlnExplorer::eForward);
1562                         stop = s_map.GetSeqPosFromSeqPos(0, 1, stop, IAlnExplorer::eBackwards);
1563 
1564                         if ((start - q_ends[1])*q_dir > 0 || (stop - q_ends[0])*q_dir < 0) continue;
1565 
1566                         if ((start - q_ends[0])*q_dir < 0) start = q_ends[0];
1567 
1568                         if ((stop - q_ends[1])*q_dir > 0) stop = q_ends[1];
1569 
1570                         if ((start - stop)*q_dir > 0) continue;
1571 
1572                         start = q_map.GetSeqPosFromSeqPos(1, 0, start, IAlnExplorer::eForward);
1573                         stop = q_map.GetSeqPosFromSeqPos(1, 0, stop, IAlnExplorer::eBackwards);
1574 
1575                         start = q_map.GetSeqPosFromSeqPos(0, 1, start);
1576                         stop = q_map.GetSeqPosFromSeqPos(0, 1, stop);
1577 
1578                         if ((start - stop)*q_dir > 0) continue;
1579 
1580                         annot->m_DomainInfo[i] = start;
1581                         annot->m_DomainInfo[i+1] = stop;
1582                     }
1583 
1584 
1585 
1586                     // extension of the first and last annotated domain (if any)
1587                     int i = 0;
1588                     int extension = 0;
1589                     while (i<10 && annot->m_DomainInfo[i] < 0) i+=2;
1590                     if (i < 10 && domain_info[i] > 0) {
1591                         extension = (domain_info[i] - 1 -
1592                                      s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
1593                                                                IAlnExplorer::eBackwards))*q_dir;
1594                         annot->m_DomainInfo[i] += extension;
1595                         //this does not get reversed like m_DomainInfo
1596                         annot->m_DomainInfo_S[i] -= abs(extension);
1597 
1598                         if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
1599                         if (annot->m_DomainInfo_S[i] < 0) annot->m_DomainInfo_S[i] = 0;
1600 
1601                         i+=2;
1602                         while (i<10 && annot->m_DomainInfo[i] >=0) {
1603                             annot->m_DomainInfo[i] = annot->m_DomainInfo[i-1] + q_dir;
1604                             i+=2;
1605                         }
1606                         i = 9;
1607                         while (i>0 && annot->m_DomainInfo[i] < 0) i-=2;
1608                         if (i >= 0) {
1609                             annot->m_DomainInfo[i] += (domain_info[i] - 1 -
1610                                                        s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
1611                                                                                  IAlnExplorer::eForward))*q_dir;
1612                             if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
1613                         }
1614                     }
1615 
1616                     // any extra alignments after FWR3 are attributed to CDR3
1617                     start = annot->m_DomainInfo[9];
1618 
1619                     if (start >= 0 && (start - q_ends[1])*q_dir < 0) {
1620                         start = q_map.GetSeqPosFromSeqPos(1, 0, start+q_dir, IAlnExplorer::eForward);
1621                         start = q_map.GetSeqPosFromSeqPos(0, 1, start);
1622 
1623                         if ((start - q_ends[1])*q_dir <= 0) {
1624                             annot->m_DomainInfo[10] = start;
1625                             annot->m_DomainInfo[11] = q_ends[1];
1626                         }
1627                     }
1628                     // annotate the query frame offset
1629                     int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1630 
1631                     if (frame_offset >= 0) {
1632                         int q_start = (*it)->GetSeqStart(0);
1633                         int q_stop = (*it)->GetSeqStop(0);
1634                         int q_mid = q_start + q_stop;
1635                         int q_dif = q_stop - q_start;
1636                         int frame_adj = (3 - ((*it)->GetSeqStart(1) + 3 - frame_offset) % 3) %3;
1637                         annot->m_FrameInfo[0] = (q_mid - q_dir *q_dif)/2 + q_dir * frame_adj;
1638 
1639                         //counting frame from fwr3 end, not the V end since we need to ignore a few bases
1640                         //in the CDR3 to allow any insertion or deletion at V gene end
1641                         if (annot->m_DomainInfo[9] > 0) {
1642                             int fwr3_stop = annot->m_DomainInfo[9];
1643 
1644                             if (annot->m_MinusStrand) {
1645 
1646                                 q_start = max(q_start, fwr3_stop);
1647                                 q_mid = q_start + q_stop;
1648                                 q_dif = q_stop - q_start;
1649                                 frame_adj = (s_map.GetSeqPosFromSeqPos(1, 0, q_start, IAlnExplorer::eBackwards) + 3 - frame_offset) % 3;
1650                             } else {
1651                                 q_stop = min(q_stop, fwr3_stop);
1652                                 q_mid = q_start + q_stop;
1653                                 q_dif = q_stop - q_start;
1654                                 frame_adj = (s_map.GetSeqPosFromSeqPos(1, 0, q_stop, IAlnExplorer::eBackwards) + 3 - frame_offset) % 3;
1655                             }
1656                         } else {
1657                             frame_adj = ((*it)->GetSeqStop(1) + 3 - frame_offset) % 3;
1658                         }
1659 
1660                         annot->m_FrameInfo[1] = (q_mid + q_dir *q_dif)/2 - q_dir * frame_adj;
1661                     }
1662                     break;
1663 
1664                 }
1665             }
1666         }
1667         ++iq;
1668     }
1669 };
1670 
x_SetChainType(CRef<CSearchResultSet> & results,vector<CRef<CIgAnnotation>> & annots)1671 void CIgBlast::x_SetChainType(CRef<CSearchResultSet> &results,
1672                               vector<CRef <CIgAnnotation> > &annots)
1673 {
1674     int iq = 0;
1675     ITERATE(CSearchResultSet, result, *results) {
1676 
1677         CIgAnnotation *annot = &*(annots[iq++]);
1678 
1679         if ((*result)->HasAlignments()) {
1680             int num_aligns = (*result)->GetSeqAlign()->Size();
1681             CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
1682                                      (const_cast<CSearchResults *>(&**result));
1683             for (int i=0; i<ig_result->m_NumActualV; ++i, --num_aligns) {
1684                  annot->m_ChainType.push_back("V");
1685             }
1686             for (int i=0; i<ig_result->m_NumActualD; ++i, --num_aligns) {
1687                  annot->m_ChainType.push_back("D");
1688             }
1689             for (int i=0; i<ig_result->m_NumActualJ; ++i, --num_aligns) {
1690                  annot->m_ChainType.push_back("J");
1691             }
1692             for (int i=0; i<num_aligns; ++i) {
1693                  annot->m_ChainType.push_back("N/A");
1694             }
1695         }
1696     }
1697 };
1698 
s_SortResultsByEvalue(CRef<CSearchResultSet> & results)1699 void CIgBlast::s_SortResultsByEvalue(CRef<CSearchResultSet>& results)
1700 {
1701     ITERATE(CSearchResultSet, result, *results) {
1702         if ((*result)->HasAlignments()) {
1703             CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
1704                                    (&*((*result)->GetSeqAlign())));
1705             CSeq_align_set::Tdata & align_list = align->Set();
1706             align_list.sort(s_CompareSeqAlignByEvalue);
1707         }
1708     }
1709 };
1710 
1711 // convert sequencecomparison to database mode
x_ConvertResultType(CRef<CSearchResultSet> & result)1712 void CIgBlast::x_ConvertResultType(CRef<CSearchResultSet> &result)
1713 {
1714     if (result->GetResultType() != eSequenceComparison) {
1715         return;
1716     }
1717 
1718     int num_queries = m_Query->Size();
1719     int num_results = result->GetNumResults();
1720     int ir = 0;
1721     CSearchResultSet *retv = new CSearchResultSet();
1722 
1723     for (int iq = 0; iq< num_queries && ir< num_results; ++iq) {
1724 
1725         CSearchResults &res = (*result)[ir++];
1726         CRef<CBlastAncillaryData> ancillary = res.GetAncillaryData();
1727         TQueryMessages errmsg = res.GetErrors();
1728         CConstRef<CSeq_id> rid = res.GetSeqId();
1729         CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
1730                           (&*(res.GetSeqAlign())));
1731         CSeq_align_set::Tdata & align_list = align->Set();
1732 
1733         CConstRef<CSeq_id> qid = m_Query->GetBlastSearchQuery(iq)->GetQueryId();
1734         while(!qid->Match(*rid)) {
1735             CRef<CSeq_align_set> empty;
1736             CRef<CSearchResults> r(new CSearchResults(qid, empty, errmsg, ancillary));
1737             retv->push_back(r);
1738             qid = m_Query->GetBlastSearchQuery(++iq)->GetQueryId();
1739         }
1740 
1741         while(ir < num_results && (*result)[ir].GetSeqId()->Match(*qid)) {
1742             CSearchResults &add_res = (*result)[ir++];
1743             CRef<CSeq_align_set> add;
1744             add.Reset(const_cast<CSeq_align_set *>
1745                           (&*(add_res.GetSeqAlign())));
1746             CSeq_align_set::Tdata & add_list = add->Set();
1747             align_list.insert(align_list.end(), add_list.begin(), add_list.end());
1748         }
1749         CRef<CSearchResults> r(new CSearchResults(qid, align, errmsg, ancillary));
1750         retv->push_back(r);
1751     }
1752 
1753     result.Reset(retv);
1754 };
1755 
s_AppendResults(CRef<CSearchResultSet> & results,int num_aligns,int gene,CRef<CSearchResultSet> & final_results)1756 void CIgBlast::s_AppendResults(CRef<CSearchResultSet> &results,
1757                                int                     num_aligns,
1758                                int                     gene,
1759                                CRef<CSearchResultSet> &final_results)
1760 {
1761     bool  new_result = (final_results.Empty());
1762     if (new_result) {
1763         final_results.Reset(new CSearchResultSet());
1764     }
1765 
1766     int iq = 0;
1767     ITERATE(CSearchResultSet, result, *results) {
1768 
1769         CRef<CSeq_align_set> align;
1770         int actual_align = 0;
1771 
1772         if ((*result)->HasAlignments()) {
1773             align.Reset(const_cast<CSeq_align_set *>
1774                                    (&*((*result)->GetSeqAlign())));
1775 
1776             // keep only the first num_alignments
1777             if (num_aligns >= 0) {
1778                 CSeq_align_set::Tdata & align_list = align->Set();
1779                 if (align_list.size() > (CSeq_align_set::Tdata::size_type)num_aligns) {
1780                     CSeq_align_set::Tdata::iterator it = align_list.begin();
1781                     for (int i=0; i<num_aligns; ++i) ++it;
1782                     align_list.erase(it, align_list.end());
1783                     actual_align = num_aligns;
1784                 } else {
1785                     actual_align = align_list.size();
1786                 }
1787             }
1788         }
1789 
1790         TQueryMessages errmsg = (*result)->GetErrors();
1791         CConstRef<CSeq_id> query = (*result)->GetSeqId();
1792 
1793         CIgBlastResults *ig_result;
1794         if (new_result) {
1795             // TODO maybe we need the db ancillary instead?
1796             CRef<CBlastAncillaryData> ancillary = (*result)->GetAncillaryData();
1797             ig_result = new CIgBlastResults(query, align, errmsg, ancillary);
1798             CRef<CSearchResults> r(ig_result);
1799             final_results->push_back(r);
1800         } else {
1801             while( !(*final_results)[iq].GetSeqId()->Match(*query)) ++iq;
1802             ig_result = dynamic_cast<CIgBlastResults *> (&(*final_results)[iq]);
1803             if (!align.Empty()) {
1804                 CSeq_align_set::Tdata & ig_list = ig_result->SetSeqAlign()->Set();
1805                 CSeq_align_set::Tdata & align_list = align->Set();
1806 
1807                 if (gene < 0) {
1808                     // Remove duplicate seq_aligns
1809                     CSeq_align_set::Tdata::iterator it = align_list.begin();
1810                     while (it != align_list.end()) {
1811                         if (s_SeqAlignInSet(ig_list, *it)) it = align_list.erase(it);
1812                         else ++it;
1813                     }
1814                 }
1815 
1816                 if (!align_list.empty()) {
1817                     ig_list.insert(ig_list.end(), align_list.begin(), align_list.end());
1818                     ig_result->GetErrors().Combine(errmsg);
1819                 }
1820             }
1821         }
1822 
1823         switch(gene) {
1824         case 0: ig_result->m_NumActualV = actual_align; break;
1825         case 1: ig_result->m_NumActualD = actual_align; break;
1826         case 2: ig_result->m_NumActualJ = actual_align; break;
1827         default: break;
1828         }
1829     }
1830 };
1831 
x_SetAnnotation(vector<CRef<CIgAnnotation>> & annots,CRef<CSearchResultSet> & final_results)1832 void CIgBlast::x_SetAnnotation(vector<CRef <CIgAnnotation> > &annots,
1833                                CRef<CSearchResultSet> &final_results)
1834 {
1835     int iq = 0;
1836     NON_CONST_ITERATE(CSearchResultSet, result, *final_results) {
1837         CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
1838             (const_cast<CSearchResults *>(&**result));
1839         CIgAnnotation *annot = &*(annots[iq++]);
1840         ig_result->SetIgAnnotation().Reset(annot);
1841         if (annot->m_GeneInfo[4] < 0 && m_IgOptions->m_MinJLength > 0) { //no J
1842             if ((*result)->HasAlignments()){
1843                 (*result)->SetSeqAlign()->Set().clear();
1844             }
1845         }
1846     }
1847 };
1848 
1849 END_SCOPE(blast)
1850 END_NCBI_SCOPE
1851 
1852 /* @} */
1853