1 /* $Id: igblast.hpp 625641 2021-02-17 18:15:12Z ivanov $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Ning Ma 27 * 28 */ 29 30 /// @file igblast.hpp 31 /// Declares CIgBlast, the C++ API for the IG-BLAST engine. 32 33 #ifndef ALGO_BLAST_IGBLAST___IGBLAST__HPP 34 #define ALGO_BLAST_IGBLAST___IGBLAST__HPP 35 36 #include <algo/blast/api/setup_factory.hpp> 37 #include <algo/blast/api/uniform_search.hpp> 38 #include <algo/blast/api/local_db_adapter.hpp> 39 #include <objmgr/scope.hpp> 40 41 /** @addtogroup AlgoBlast 42 * 43 * @{ 44 */ 45 46 BEGIN_NCBI_SCOPE 47 48 BEGIN_SCOPE(blast) 49 50 /// Keeps track of the version of IgBLAST in the NCBI C++ toolkit. 51 /// Used to perform run-time version checks 52 /// 53 /// For reference, please refer to http://apr.apache.org/versioning.html 54 class CIgBlastVersion : public CVersionInfo { 55 public: CIgBlastVersion()56 CIgBlastVersion() 57 : CVersionInfo(1, 17, 1) {} 58 }; 59 60 class IQueryFactory; 61 62 class CIgBlastOptions : public CObject 63 { 64 public: 65 // the germline database search must be carried out locally 66 bool m_IsProtein; // search molecular type 67 string m_Origin; // the origin of species 68 string m_DomainSystem; // domain system for annotation 69 string m_SequenceType; //ig or tcr? 70 int m_Min_D_match; //the word size for D gene search 71 int m_V_penalty; //the mismatch penalty for V gene search 72 int m_D_penalty; //the mismatch penalty for D gene search 73 int m_J_penalty; //the mismatch penalty for J gene search 74 string m_AuxFilename; // auxulary file name 75 string m_IgDataPath; // internal data path 76 CRef<CLocalDbAdapter> m_Db[4]; // user specified germline database 77 // 0-2: - user specified V, D, J 78 // 3: - the default V gl db 79 int m_NumAlign[3]; // number of VDJ alignments to show 80 bool m_FocusV; // should alignment restrict to V 81 bool m_Translate; // should translation be displayed 82 bool m_ExtendAlign5end; 83 bool m_ExtendAlign3end; 84 int m_MinVLength; 85 int m_MinJLength; 86 bool m_DetectOverlap; 87 list<string> m_AirrField; 88 }; 89 90 class CIgAnnotation : public CObject 91 { 92 public: 93 bool m_MinusStrand; // hit is on minus strand of the query 94 vector<string> m_TopGeneIds; // Top match germline gene ID 95 vector<string> m_ChainType; // chain types of the query ([0]) and subjects ([1:]) 96 string m_ChainTypeToShow; // chain type to show to user. Normally this is 97 //the same as m_ChainType[0] but could be different 98 // in case o TCRA/D chains which can use both JA and JD 99 int m_GeneInfo[6]; // The (start) and (end offset + 1) for VDJ 100 int m_FrameInfo[3]; // Coding frame start offset for V start, V end, 101 // and V start. 102 int m_DomainInfo[12]; // The (start) and (end offset) for FWR1, 103 // CDR1, FWR2, CDR2, FWR3, CDR3 domains 104 // note: the first and last domains are be extended 105 int m_DomainInfo_S[10]; // The (start) and (end offset) for FWR1, 106 // CDR1, FWR2, CDR2, FWR3, CDR3 domains on topV sequence 107 108 int m_JDomain[5]; // CDr3 start, stop, FWR4 start, stop, extra number of bases past last J codon (i.e., m_Fwr4EndOffset in CIgAnnotationInfo 109 110 /// Constructor CIgAnnotation()111 CIgAnnotation() 112 : m_MinusStrand (false) 113 { 114 for (int i=0; i<3; i++) m_TopGeneIds.push_back("N/A"); 115 for (int i=0; i<6; i++) m_GeneInfo[i] = -1; 116 for (int i=0; i<3; i++) m_FrameInfo[i] = -1; 117 for (int i=0; i<12; i++) m_DomainInfo[i] = -1; 118 for (int i=0; i<10; i++) m_DomainInfo_S[i] = -1; 119 for (int i=0; i<5; i++) m_JDomain[i] = -1; 120 } 121 122 }; 123 124 class CIgAnnotationInfo 125 { 126 public: 127 CIgAnnotationInfo(CConstRef<CIgBlastOptions> &ig_options); 128 GetDomainInfo(const string sid,int * domain_info)129 bool GetDomainInfo(const string sid, int * domain_info) { 130 if (m_DomainIndex.find(sid) != m_DomainIndex.end()) { 131 int index = m_DomainIndex[sid]; 132 for (int i=0; i<10; ++i) { 133 domain_info[i] = m_DomainData[index + i]; 134 } 135 return true; 136 } 137 return false; 138 } 139 GetDomainChainType(const string sid)140 const string GetDomainChainType(const string sid) { 141 if (m_DomainChainType.find(sid) != m_DomainChainType.end()) { 142 return m_DomainChainType[sid]; 143 } 144 return "N/A"; 145 } 146 GetFrameOffset(const string sid)147 int GetFrameOffset(const string sid) { 148 if (m_FrameOffset.find(sid) != m_FrameOffset.end()) { 149 return m_FrameOffset[sid]; 150 } 151 return -1; 152 } GetJDomain(const string & sid)153 int GetJDomain(const string& sid) { 154 if (m_JDomainInfo.find(sid) != m_JDomainInfo.end()) { 155 return m_JDomainInfo[sid]; 156 } 157 return -1; 158 159 } 160 GetFwr4EndOffset(const string & sid)161 int GetFwr4EndOffset(const string& sid) { 162 if (m_Fwr4EndOffset.find(sid) != m_Fwr4EndOffset.end()) { 163 return m_Fwr4EndOffset[sid]; 164 } 165 return -1; 166 167 } 168 GetDJChainType(const string sid)169 const string GetDJChainType(const string sid) { 170 if (m_DJChainType.find(sid) != m_DJChainType.end()) { 171 return m_DJChainType[sid]; 172 } 173 return "N/A"; 174 } 175 176 private: 177 map<string, int> m_DomainIndex; 178 vector<int> m_DomainData; 179 map<string, string> m_DomainChainType; 180 map<string, int> m_FrameOffset; 181 map<string, string> m_DJChainType; 182 map<string, int> m_JDomainInfo; 183 map<string, int> m_Fwr4EndOffset; //extra number of bases past J end 184 }; 185 186 class CIgBlastResults : public CSearchResults 187 { 188 public: 189 190 int m_NumActualV; 191 int m_NumActualD; 192 int m_NumActualJ; 193 GetIgAnnotation() const194 const CRef<CIgAnnotation> & GetIgAnnotation() const { 195 return m_Annotation; 196 } 197 SetIgAnnotation()198 CRef<CIgAnnotation> & SetIgAnnotation() { 199 return m_Annotation; 200 } 201 SetSeqAlign()202 CRef<CSeq_align_set> & SetSeqAlign() { 203 return m_Alignment; 204 } 205 206 /// Constructor 207 /// @param query List of query identifiers [in] 208 /// @param align alignments for a single query sequence [in] 209 /// @param errs error messages for this query sequence [in] 210 /// @param ancillary_data Miscellaneous output from the blast engine [in] 211 /// @param query_masks Mask locations for this query [in] 212 /// @param rid RID (if applicable, else empty string) [in] CIgBlastResults(CConstRef<objects::CSeq_id> query,CRef<objects::CSeq_align_set> align,const TQueryMessages & errs,CRef<CBlastAncillaryData> ancillary_data)213 CIgBlastResults(CConstRef<objects::CSeq_id> query, 214 CRef<objects::CSeq_align_set> align, 215 const TQueryMessages &errs, 216 CRef<CBlastAncillaryData> ancillary_data) 217 : CSearchResults(query, align, errs, ancillary_data), 218 m_NumActualV(0), m_NumActualD(0), m_NumActualJ(0) { } 219 220 private: 221 CRef<CIgAnnotation> m_Annotation; 222 }; 223 224 class CIgBlast : public CObject 225 { 226 public: 227 /// Local Igblast search API 228 /// @param query_factory Concatenated query sequences [in] 229 /// @param blastdb Adapter to the BLAST database to search [in] 230 /// @param options Blast search options [in] 231 /// @param ig_options Additional Ig-BLAST specific options [in] CIgBlast(CRef<CBlastQueryVector> query_factory,CRef<CLocalDbAdapter> blastdb,CRef<CBlastOptionsHandle> options,CConstRef<CIgBlastOptions> ig_options,CRef<CScope> scope)232 CIgBlast(CRef<CBlastQueryVector> query_factory, 233 CRef<CLocalDbAdapter> blastdb, 234 CRef<CBlastOptionsHandle> options, 235 CConstRef<CIgBlastOptions> ig_options, 236 CRef<CScope> scope) 237 : m_IsLocal(true), 238 m_NumThreads(1), 239 m_Query(query_factory), 240 m_LocalDb(blastdb), 241 m_Options(options), 242 m_IgOptions(ig_options), 243 m_AnnotationInfo(ig_options), 244 m_Scope(scope) {m_RID= NcbiEmptyString; } 245 246 /// Remote Igblast search API 247 /// @param query_factory Concatenated query sequences [in] 248 /// @param blastdb Remote BLAST database to search [in] 249 /// @param subjects Subject sequences to search [in] 250 /// @param options Blast search options [in] 251 /// @param ig_options Additional Ig-BLAST specific options [in] CIgBlast(CRef<CBlastQueryVector> query_factory,CRef<CSearchDatabase> blastdb,CRef<IQueryFactory> subjects,CRef<CBlastOptionsHandle> options,CConstRef<CIgBlastOptions> ig_options,string entrez_query,CRef<CScope> scope)252 CIgBlast(CRef<CBlastQueryVector> query_factory, 253 CRef<CSearchDatabase> blastdb, 254 CRef<IQueryFactory> subjects, 255 CRef<CBlastOptionsHandle> options, 256 CConstRef<CIgBlastOptions> ig_options, 257 string entrez_query, 258 CRef<CScope> scope) 259 : m_IsLocal(false), 260 m_NumThreads(1), 261 m_Query(query_factory), 262 m_Subject(subjects), 263 m_RemoteDb(blastdb), 264 m_Options(options), 265 m_IgOptions(ig_options), 266 m_AnnotationInfo(ig_options), 267 m_EntrezQuery(entrez_query), 268 m_Scope(scope){ m_RID= NcbiEmptyString; } 269 270 /// Destructor ~CIgBlast()271 ~CIgBlast() {}; 272 273 /// Run the Ig-BLAST engine 274 CRef<CSearchResultSet> Run(); 275 276 /// Set MT mode SetNumberOfThreads(size_t nthreads)277 void SetNumberOfThreads(size_t nthreads) { 278 m_NumThreads = nthreads; 279 } 280 GetRid()281 string GetRid() { 282 return m_RID; 283 } 284 private: 285 286 bool m_IsLocal; 287 size_t m_NumThreads; 288 CRef<CBlastQueryVector> m_Query; 289 CRef<IQueryFactory> m_Subject; 290 CRef<CLocalDbAdapter> m_LocalDb; 291 CRef<CSearchDatabase> m_RemoteDb; 292 CRef<CBlastOptionsHandle> m_Options; 293 CConstRef<CIgBlastOptions> m_IgOptions; 294 CIgAnnotationInfo m_AnnotationInfo; 295 string m_EntrezQuery; 296 CRef<CScope> m_Scope; 297 string m_RID; //remote rid 298 299 /// Prohibit copy constructor 300 CIgBlast(const CIgBlast& rhs); 301 302 /// Prohibit assignment operator 303 CIgBlast& operator=(const CIgBlast& rhs); 304 305 /// Prepare blast option handle and query for V germline database search 306 void x_SetupVSearch(CRef<IQueryFactory> &qf, 307 CRef<CBlastOptionsHandle> &opts_hndl); 308 309 /// Prepare blast option handle and query for D, J germline database search 310 void x_SetupDJSearch(const vector<CRef <CIgAnnotation> > &annots, 311 CRef<IQueryFactory> &qf, 312 CRef<CBlastOptionsHandle> &opts_hndl, 313 int db_type); 314 void x_SetupNoOverlapDSearch(const vector<CRef <CIgAnnotation> > &annots, 315 CRef<CSearchResultSet> &results, 316 CRef<IQueryFactory> &qf, 317 CRef<CBlastOptionsHandle> &opts_hndl, 318 int db_type); 319 320 /// Prepare blast option handle and query for specified database search 321 void x_SetupDbSearch(vector<CRef <CIgAnnotation> > &annot, 322 CRef<IQueryFactory> &qf); 323 324 /// Annotate the V gene based on blast results 325 void x_AnnotateV(CRef<CSearchResultSet> &results, 326 vector<CRef <CIgAnnotation> > &annot); 327 328 /// Annotate the D and J genes based on blast results 329 void x_AnnotateDJ(CRef<CSearchResultSet> &results_D, 330 CRef<CSearchResultSet> &results_J, 331 vector<CRef <CIgAnnotation> > &annot); 332 333 void x_AnnotateD(CRef<CSearchResultSet> &results_D, 334 vector<CRef <CIgAnnotation> > &annot); 335 void x_AnnotateJ(CRef<CSearchResultSet> &results_J, 336 vector<CRef <CIgAnnotation> > &annot); 337 338 /// Annotate the query chaintype and domains based on blast results 339 void x_AnnotateDomain(CRef<CSearchResultSet> &gl_results, 340 CRef<CSearchResultSet> &dm_results, 341 vector<CRef <CIgAnnotation> > &annot); 342 343 /// Set the subject chain type and frame info 344 void x_SetChainType(CRef<CSearchResultSet> &results, 345 vector<CRef <CIgAnnotation> > &annot); 346 347 /// Convert bl2seq result to database search mode 348 void x_ConvertResultType(CRef<CSearchResultSet> &results); 349 350 /// Sort blast results according to evalue 351 static void s_SortResultsByEvalue(CRef<CSearchResultSet> &results); 352 353 /// Append blast results to the final results 354 static void s_AppendResults(CRef<CSearchResultSet> &results, 355 int num_aligns, 356 int gene, 357 CRef<CSearchResultSet> &final_results); 358 359 360 /// Append annotation info to the final results 361 void x_SetAnnotation(vector<CRef <CIgAnnotation> > &annot, 362 CRef<CSearchResultSet> &final_results); 363 364 void x_FindDJ(CRef<CSearchResultSet>& results_D, 365 CRef<CSearchResultSet>& results_J, 366 CRef <CIgAnnotation> & annot, 367 CRef<CSeq_align_set>& align_D, 368 CRef<CSeq_align_set>& align_J, 369 string q_ct, 370 bool q_ms, 371 ENa_strand q_st, 372 int q_ve, 373 int iq); 374 375 void x_FindDJAln(CRef<CSeq_align_set>& align_D, 376 CRef<CSeq_align_set>& align_J, 377 string q_ct, 378 bool q_ms, 379 ENa_strand q_st, 380 int q_ve, 381 int iq, 382 bool va_or_vd_as_heavy_chain); 383 384 void x_ExtendAlign5end(CRef<CSearchResultSet> & results); 385 void x_ExtendAlign3end(CRef<CSearchResultSet> & results); 386 void x_ScreenByAlignLength(CRef<CSearchResultSet> & results, int length); 387 void x_FillJDomain(CRef<CSeq_align> & align, CRef <CIgAnnotation> & annot); 388 void x_ProcessDJResult(CRef<CSearchResultSet>& results_V, 389 CRef<CSearchResultSet>& results_D, 390 CRef<CSearchResultSet>& results_J, 391 vector<CRef <CIgAnnotation> > &annots); 392 void x_ProcessDGeneResult(CRef<CSearchResultSet>& results_V, 393 CRef<CSearchResultSet>& results_D, 394 CRef<CSearchResultSet>& results_J, 395 vector<CRef <CIgAnnotation> > &annots); 396 397 }; 398 399 END_SCOPE(blast) 400 END_NCBI_SCOPE 401 402 /* @} */ 403 404 #endif /* ALGO_BLAST_IGBLAST___IGBLAST__HPP */ 405