1 /*  $Id: igblast.hpp 625641 2021-02-17 18:15:12Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Ning Ma
27  *
28  */
29 
30 /// @file igblast.hpp
31 /// Declares CIgBlast, the C++ API for the IG-BLAST engine.
32 
33 #ifndef ALGO_BLAST_IGBLAST___IGBLAST__HPP
34 #define ALGO_BLAST_IGBLAST___IGBLAST__HPP
35 
36 #include <algo/blast/api/setup_factory.hpp>
37 #include <algo/blast/api/uniform_search.hpp>
38 #include <algo/blast/api/local_db_adapter.hpp>
39 #include <objmgr/scope.hpp>
40 
41 /** @addtogroup AlgoBlast
42  *
43  * @{
44  */
45 
46 BEGIN_NCBI_SCOPE
47 
48 BEGIN_SCOPE(blast)
49 
50 /// Keeps track of the version of IgBLAST in the NCBI C++ toolkit.
51 /// Used to perform run-time version checks
52 ///
53 /// For reference, please refer to http://apr.apache.org/versioning.html
54 class CIgBlastVersion : public CVersionInfo {
55 public:
CIgBlastVersion()56     CIgBlastVersion()
57         : CVersionInfo(1, 17, 1) {}
58 };
59 
60 class IQueryFactory;
61 
62 class CIgBlastOptions : public CObject
63 {
64 public:
65     // the germline database search must be carried out locally
66     bool m_IsProtein;                // search molecular type
67     string m_Origin;                 // the origin of species
68     string m_DomainSystem;           // domain system for annotation
69     string m_SequenceType;           //ig or tcr?
70     int m_Min_D_match;               //the word size for D gene search
71     int m_V_penalty;                 //the mismatch penalty for V gene search
72     int m_D_penalty;                 //the mismatch penalty for D gene search
73     int m_J_penalty;                 //the mismatch penalty for J gene search
74     string m_AuxFilename;            // auxulary file name
75     string m_IgDataPath;             // internal data path
76     CRef<CLocalDbAdapter> m_Db[4];   // user specified germline database
77                                      // 0-2: - user specified V, D, J
78                                      // 3:   - the default V gl db
79     int  m_NumAlign[3];              // number of VDJ alignments to show
80     bool m_FocusV;                   // should alignment restrict to V
81     bool m_Translate;                // should translation be displayed
82     bool m_ExtendAlign5end;
83     bool m_ExtendAlign3end;
84     int m_MinVLength;
85     int m_MinJLength;
86     bool m_DetectOverlap;
87     list<string> m_AirrField;
88 };
89 
90 class CIgAnnotation : public CObject
91 {
92 public:
93     bool m_MinusStrand;              // hit is on minus strand of the query
94     vector<string> m_TopGeneIds;     // Top match germline gene ID
95     vector<string> m_ChainType;      // chain types of the query ([0]) and subjects ([1:])
96     string m_ChainTypeToShow;        // chain type to show to user.  Normally this is
97                                      //the same as m_ChainType[0] but could be different
98                                      // in case o TCRA/D chains which can use both JA and JD
99     int m_GeneInfo[6];               // The (start) and (end offset + 1) for VDJ
100     int m_FrameInfo[3];              // Coding frame start offset for V start, V end,
101                                      // and V start.
102     int m_DomainInfo[12];            // The (start) and (end offset) for FWR1,
103                                      // CDR1, FWR2, CDR2, FWR3, CDR3 domains
104                                      // note: the first and last domains are be extended
105     int m_DomainInfo_S[10];          // The (start) and (end offset) for FWR1,
106                                      // CDR1, FWR2, CDR2, FWR3, CDR3 domains on topV sequence
107 
108     int m_JDomain[5];                // CDr3 start, stop, FWR4 start, stop, extra number of bases past last J codon (i.e., m_Fwr4EndOffset in CIgAnnotationInfo
109 
110     /// Constructor
CIgAnnotation()111     CIgAnnotation()
112         : m_MinusStrand (false)
113     {
114         for (int i=0; i<3; i++) m_TopGeneIds.push_back("N/A");
115         for (int i=0; i<6; i++) m_GeneInfo[i] = -1;
116         for (int i=0; i<3; i++) m_FrameInfo[i] = -1;
117         for (int i=0; i<12; i++) m_DomainInfo[i] = -1;
118         for (int i=0; i<10; i++) m_DomainInfo_S[i] = -1;
119         for (int i=0; i<5; i++) m_JDomain[i] = -1;
120     }
121 
122 };
123 
124 class CIgAnnotationInfo
125 {
126 public:
127     CIgAnnotationInfo(CConstRef<CIgBlastOptions> &ig_options);
128 
GetDomainInfo(const string sid,int * domain_info)129     bool GetDomainInfo(const string sid, int * domain_info) {
130         if (m_DomainIndex.find(sid) != m_DomainIndex.end()) {
131             int index = m_DomainIndex[sid];
132             for (int i=0; i<10; ++i) {
133                 domain_info[i] = m_DomainData[index + i];
134             }
135             return true;
136         }
137         return false;
138     }
139 
GetDomainChainType(const string sid)140     const string GetDomainChainType(const string sid) {
141         if (m_DomainChainType.find(sid) != m_DomainChainType.end()) {
142             return m_DomainChainType[sid];
143         }
144         return "N/A";
145     }
146 
GetFrameOffset(const string sid)147     int GetFrameOffset(const string sid) {
148         if (m_FrameOffset.find(sid) != m_FrameOffset.end()) {
149             return m_FrameOffset[sid];
150         }
151         return -1;
152     }
GetJDomain(const string & sid)153     int GetJDomain(const string& sid) {
154         if (m_JDomainInfo.find(sid) != m_JDomainInfo.end()) {
155             return m_JDomainInfo[sid];
156         }
157         return -1;
158 
159     }
160 
GetFwr4EndOffset(const string & sid)161     int GetFwr4EndOffset(const string& sid) {
162         if (m_Fwr4EndOffset.find(sid) != m_Fwr4EndOffset.end()) {
163             return m_Fwr4EndOffset[sid];
164         }
165         return -1;
166 
167     }
168 
GetDJChainType(const string sid)169     const string GetDJChainType(const string sid) {
170         if (m_DJChainType.find(sid) != m_DJChainType.end()) {
171             return m_DJChainType[sid];
172         }
173         return "N/A";
174     }
175 
176 private:
177     map<string, int> m_DomainIndex;
178     vector<int> m_DomainData;
179     map<string, string> m_DomainChainType;
180     map<string, int> m_FrameOffset;
181     map<string, string> m_DJChainType;
182     map<string, int>  m_JDomainInfo;
183     map<string, int>  m_Fwr4EndOffset;  //extra number of bases past J end
184 };
185 
186 class CIgBlastResults : public CSearchResults
187 {
188 public:
189 
190     int m_NumActualV;
191     int m_NumActualD;
192     int m_NumActualJ;
193 
GetIgAnnotation() const194     const CRef<CIgAnnotation> & GetIgAnnotation() const {
195         return m_Annotation;
196     }
197 
SetIgAnnotation()198     CRef<CIgAnnotation> & SetIgAnnotation() {
199         return m_Annotation;
200     }
201 
SetSeqAlign()202     CRef<CSeq_align_set> & SetSeqAlign() {
203         return m_Alignment;
204     }
205 
206     /// Constructor
207     /// @param query List of query identifiers [in]
208     /// @param align alignments for a single query sequence [in]
209     /// @param errs error messages for this query sequence [in]
210     /// @param ancillary_data Miscellaneous output from the blast engine [in]
211     /// @param query_masks Mask locations for this query [in]
212     /// @param rid RID (if applicable, else empty string) [in]
CIgBlastResults(CConstRef<objects::CSeq_id> query,CRef<objects::CSeq_align_set> align,const TQueryMessages & errs,CRef<CBlastAncillaryData> ancillary_data)213     CIgBlastResults(CConstRef<objects::CSeq_id>   query,
214                     CRef<objects::CSeq_align_set> align,
215                     const TQueryMessages         &errs,
216                     CRef<CBlastAncillaryData>     ancillary_data)
217            : CSearchResults(query, align, errs, ancillary_data),
218              m_NumActualV(0), m_NumActualD(0), m_NumActualJ(0) { }
219 
220 private:
221     CRef<CIgAnnotation> m_Annotation;
222 };
223 
224 class CIgBlast : public CObject
225 {
226 public:
227     /// Local Igblast search API
228     /// @param query_factory  Concatenated query sequences [in]
229     /// @param blastdb        Adapter to the BLAST database to search [in]
230     /// @param options        Blast search options [in]
231     /// @param ig_options     Additional Ig-BLAST specific options [in]
CIgBlast(CRef<CBlastQueryVector> query_factory,CRef<CLocalDbAdapter> blastdb,CRef<CBlastOptionsHandle> options,CConstRef<CIgBlastOptions> ig_options,CRef<CScope> scope)232     CIgBlast(CRef<CBlastQueryVector> query_factory,
233              CRef<CLocalDbAdapter> blastdb,
234              CRef<CBlastOptionsHandle> options,
235              CConstRef<CIgBlastOptions> ig_options,
236              CRef<CScope> scope)
237        : m_IsLocal(true),
238          m_NumThreads(1),
239          m_Query(query_factory),
240          m_LocalDb(blastdb),
241          m_Options(options),
242          m_IgOptions(ig_options),
243          m_AnnotationInfo(ig_options),
244         m_Scope(scope) {m_RID= NcbiEmptyString; }
245 
246     /// Remote Igblast search API
247     /// @param query_factory  Concatenated query sequences [in]
248     /// @param blastdb        Remote BLAST database to search [in]
249     /// @param subjects       Subject sequences to search [in]
250     /// @param options        Blast search options [in]
251     /// @param ig_options     Additional Ig-BLAST specific options [in]
CIgBlast(CRef<CBlastQueryVector> query_factory,CRef<CSearchDatabase> blastdb,CRef<IQueryFactory> subjects,CRef<CBlastOptionsHandle> options,CConstRef<CIgBlastOptions> ig_options,string entrez_query,CRef<CScope> scope)252     CIgBlast(CRef<CBlastQueryVector> query_factory,
253              CRef<CSearchDatabase> blastdb,
254              CRef<IQueryFactory>   subjects,
255              CRef<CBlastOptionsHandle> options,
256              CConstRef<CIgBlastOptions> ig_options,
257              string entrez_query,
258              CRef<CScope> scope)
259        : m_IsLocal(false),
260          m_NumThreads(1),
261          m_Query(query_factory),
262          m_Subject(subjects),
263          m_RemoteDb(blastdb),
264          m_Options(options),
265          m_IgOptions(ig_options),
266          m_AnnotationInfo(ig_options),
267          m_EntrezQuery(entrez_query),
268          m_Scope(scope){ m_RID= NcbiEmptyString; }
269 
270     /// Destructor
~CIgBlast()271     ~CIgBlast() {};
272 
273     /// Run the Ig-BLAST engine
274     CRef<CSearchResultSet> Run();
275 
276     /// Set MT mode
SetNumberOfThreads(size_t nthreads)277     void SetNumberOfThreads(size_t nthreads) {
278         m_NumThreads = nthreads;
279     }
280 
GetRid()281     string GetRid() {
282         return m_RID;
283     }
284 private:
285 
286     bool m_IsLocal;
287     size_t m_NumThreads;
288     CRef<CBlastQueryVector> m_Query;
289     CRef<IQueryFactory> m_Subject;
290     CRef<CLocalDbAdapter> m_LocalDb;
291     CRef<CSearchDatabase> m_RemoteDb;
292     CRef<CBlastOptionsHandle> m_Options;
293     CConstRef<CIgBlastOptions> m_IgOptions;
294     CIgAnnotationInfo m_AnnotationInfo;
295     string m_EntrezQuery;
296     CRef<CScope> m_Scope;
297     string m_RID; //remote rid
298 
299     /// Prohibit copy constructor
300     CIgBlast(const CIgBlast& rhs);
301 
302     /// Prohibit assignment operator
303     CIgBlast& operator=(const CIgBlast& rhs);
304 
305     /// Prepare blast option handle and query for V germline database search
306     void x_SetupVSearch(CRef<IQueryFactory>           &qf,
307                         CRef<CBlastOptionsHandle>     &opts_hndl);
308 
309     /// Prepare blast option handle and query for D, J germline database search
310     void x_SetupDJSearch(const vector<CRef <CIgAnnotation> > &annots,
311                          CRef<IQueryFactory>           &qf,
312                          CRef<CBlastOptionsHandle>     &opts_hndl,
313                          int db_type);
314     void x_SetupNoOverlapDSearch(const vector<CRef <CIgAnnotation> > &annots,
315                                  CRef<CSearchResultSet>        &results,
316                                  CRef<IQueryFactory>           &qf,
317                                  CRef<CBlastOptionsHandle>     &opts_hndl,
318                                  int db_type);
319 
320     /// Prepare blast option handle and query for specified database search
321     void x_SetupDbSearch(vector<CRef <CIgAnnotation> > &annot,
322                          CRef<IQueryFactory>           &qf);
323 
324     /// Annotate the V gene based on blast results
325     void x_AnnotateV(CRef<CSearchResultSet>        &results,
326                      vector<CRef <CIgAnnotation> > &annot);
327 
328     /// Annotate the D and J genes based on blast results
329     void x_AnnotateDJ(CRef<CSearchResultSet>        &results_D,
330                       CRef<CSearchResultSet>        &results_J,
331                       vector<CRef <CIgAnnotation> > &annot);
332 
333     void x_AnnotateD(CRef<CSearchResultSet>        &results_D,
334                      vector<CRef <CIgAnnotation> > &annot);
335     void x_AnnotateJ(CRef<CSearchResultSet>        &results_J,
336                      vector<CRef <CIgAnnotation> > &annot);
337 
338     /// Annotate the query chaintype and domains based on blast results
339     void x_AnnotateDomain(CRef<CSearchResultSet>        &gl_results,
340                           CRef<CSearchResultSet>        &dm_results,
341                           vector<CRef <CIgAnnotation> > &annot);
342 
343     /// Set the subject chain type and frame info
344     void x_SetChainType(CRef<CSearchResultSet>        &results,
345                         vector<CRef <CIgAnnotation> > &annot);
346 
347     /// Convert bl2seq result to database search mode
348     void x_ConvertResultType(CRef<CSearchResultSet>  &results);
349 
350     /// Sort blast results according to evalue
351     static void s_SortResultsByEvalue(CRef<CSearchResultSet> &results);
352 
353     /// Append blast results to the final results
354     static void s_AppendResults(CRef<CSearchResultSet> &results,
355                                 int                     num_aligns,
356                                 int                     gene,
357                                 CRef<CSearchResultSet> &final_results);
358 
359 
360     /// Append annotation info to the final results
361     void x_SetAnnotation(vector<CRef <CIgAnnotation> > &annot,
362                                 CRef<CSearchResultSet> &final_results);
363 
364     void x_FindDJ(CRef<CSearchResultSet>& results_D,
365                   CRef<CSearchResultSet>& results_J,
366                   CRef <CIgAnnotation> & annot,
367                   CRef<CSeq_align_set>& align_D,
368                   CRef<CSeq_align_set>& align_J,
369                   string q_ct,
370                   bool q_ms,
371                   ENa_strand q_st,
372                   int q_ve,
373                   int iq);
374 
375     void x_FindDJAln(CRef<CSeq_align_set>& align_D,
376                      CRef<CSeq_align_set>& align_J,
377                      string q_ct,
378                      bool q_ms,
379                      ENa_strand q_st,
380                      int q_ve,
381                      int iq,
382                      bool va_or_vd_as_heavy_chain);
383 
384     void x_ExtendAlign5end(CRef<CSearchResultSet> & results);
385     void x_ExtendAlign3end(CRef<CSearchResultSet> & results);
386     void x_ScreenByAlignLength(CRef<CSearchResultSet> & results, int length);
387     void x_FillJDomain(CRef<CSeq_align> & align, CRef <CIgAnnotation> & annot);
388     void x_ProcessDJResult(CRef<CSearchResultSet>& results_V,
389                            CRef<CSearchResultSet>& results_D,
390                            CRef<CSearchResultSet>& results_J,
391                            vector<CRef <CIgAnnotation> > &annots);
392     void x_ProcessDGeneResult(CRef<CSearchResultSet>& results_V,
393                               CRef<CSearchResultSet>& results_D,
394                               CRef<CSearchResultSet>& results_J,
395                               vector<CRef <CIgAnnotation> > &annots);
396 
397 };
398 
399 END_SCOPE(blast)
400 END_NCBI_SCOPE
401 
402 /* @} */
403 
404 #endif  /* ALGO_BLAST_IGBLAST___IGBLAST__HPP */
405