1 /* $Id: blast_objmgr_priv.hpp 516747 2016-10-17 19:00:07Z boratyng $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Christiam Camacho / Kevin Bealer 27 * 28 */ 29 30 /** @file blast_objmgr_priv.hpp 31 * Definitions which are dependant on the NCBI C++ Object Manager 32 */ 33 34 #ifndef ALGO_BLAST_API___BLAST_OBJMGR_PRIV__HPP 35 #define ALGO_BLAST_API___BLAST_OBJMGR_PRIV__HPP 36 37 #include "blast_setup.hpp" 38 #include <algo/blast/api/sseqloc.hpp> 39 #include <algo/blast/api/blast_seqinfosrc.hpp> 40 #include <algo/blast/api/blast_types.hpp> 41 42 /** @addtogroup AlgoBlast 43 * 44 * @{ 45 */ 46 47 BEGIN_NCBI_SCOPE 48 49 BEGIN_SCOPE(objects) 50 class CSeq_loc; 51 class CScope; 52 class CBioseq; 53 class CSeq_align_set; 54 class CPssmWithParameters; 55 END_SCOPE(objects) 56 57 BEGIN_SCOPE(blast) 58 59 class CBlastOptions; 60 class CPSIBlastOptionsHandle; 61 62 /// Implements the object manager dependant version of the IBlastQuerySource 63 class NCBI_XBLAST_EXPORT CBlastQuerySourceOM : public IBlastQuerySource { 64 public: 65 /// Constructor which takes a TSeqLocVector 66 /// 67 /// This version assumes the masking information (if any) was 68 /// provided with the TSeqLocVector. 69 /// 70 /// @param v vector of SSeqLoc structures containing the queries [in] 71 /// @param prog program type of this search [in] 72 CBlastQuerySourceOM(TSeqLocVector & v, EBlastProgramType prog); 73 74 /// Constructor which takes a TSeqLocVector 75 /// 76 /// This version will compute masking information with dust. 77 /// 78 /// @param v vector of SSeqLoc structures containing the queries [in] 79 /// @param opts BLAST algorithm options [in] 80 /// @note that the v argument might be changed with the filtering locations 81 CBlastQuerySourceOM(TSeqLocVector & v, const CBlastOptions* opts); 82 83 /// Constructor which takes a CBlastQueryVector 84 /// 85 /// This version assumes the masking information (if any) was 86 /// provided with the CBlastQueryVector. 87 /// 88 /// @param v Object containing the queries, scopes and masking info [in] 89 /// @param prog type of program to run [in] 90 CBlastQuerySourceOM(CBlastQueryVector & v, EBlastProgramType prog); 91 92 /// Constructor which takes a CBlastQueryVector 93 /// 94 /// This version will compute masking information with dust. 95 /// 96 /// @param v Object containing the queries, scopes and masking info [in] 97 /// @param opts BLAST algorithm options [in] 98 CBlastQuerySourceOM(CBlastQueryVector & v, const CBlastOptions* opts); 99 100 /// dtor which determines if the internal pointer to its data should be 101 /// deleted or not. 102 virtual ~CBlastQuerySourceOM(); 103 104 /// Return strand for a sequence 105 /// @param i index of the sequence in the sequence container [in] 106 virtual objects::ENa_strand GetStrand(int i) const; 107 108 /// Return the filtered (masked) regions for a sequence 109 /// @param i index of the sequence in the sequence container [in] 110 virtual CConstRef<objects::CSeq_loc> GetMask(int i); 111 112 /// Return the filtered (masked) regions for a sequence 113 /// @param i index of the sequence in the sequence container [in] 114 virtual TMaskedQueryRegions GetMaskedRegions(int i); 115 116 /// Return the CSeq_loc associated with a sequence 117 /// @param i index of the sequence in the sequence container [in] 118 virtual CConstRef<objects::CSeq_loc> GetSeqLoc(int i) const; 119 120 /// Return the sequence identifier associated with a sequence 121 /// @param index index of the sequence in the sequence container [in] 122 virtual const objects::CSeq_id* GetSeqId(int index) const; 123 124 /// Retrieve the genetic code associated with a sequence 125 /// @param index index of the sequence in the sequence container [in] 126 virtual Uint4 GetGeneticCodeId(int index) const; 127 128 /// Return the sequence data for a sequence 129 /// @param i index of the sequence in the sequence container [in] 130 /// @param encoding desired encoding [in] 131 /// @param strand strand to fetch [in] 132 /// @param sentinel specifies to use or not to use sentinel bytes around 133 /// sequence data. Note that this is ignored for proteins, as in the 134 /// CORE of BLAST, proteins always have sentinel bytes [in] 135 /// @param warnings if not NULL, warnings will be returned in this string 136 /// [in|out] 137 /// @return SBlastSequence structure containing sequence data requested 138 virtual SBlastSequence GetBlastSequence(int i, 139 EBlastEncoding encoding, 140 objects::ENa_strand strand, 141 ESentinelType sentinel, 142 string* warnings = 0) const; 143 /// Return the length of a sequence 144 /// @param i index of the sequence in the sequence container [in] 145 virtual TSeqPos GetLength(int i) const; 146 /// Return the number of elements in the sequence container 147 virtual TSeqPos Size() const; 148 149 /// Return the title of a sequence 150 /// @param index index of the sequence in the sequence container [in] 151 virtual string GetTitle(int index) const; 152 153 /// Is this sequence followed by a mate (for mapping short reads) IsFirstOfAPair(int index) const154 NCBI_DEPRECATED virtual bool IsFirstOfAPair(int index) const 155 { NCBI_THROW(CException, eInvalid, "Function " 156 "CBlasyQuerySourceOM::IsFirstOfAPair was not implemented");} 157 158 /// Get segment information (for mapping paired short reads) GetSegmentInfo(int index) const159 virtual int GetSegmentInfo(int index) const 160 { NCBI_THROW(CException, eInvalid, "Function " 161 "CBlasyQuerySourceOM::GetSegmentInfo was not implemented");} 162 163 164 protected: 165 /// Reference to input CBlastQueryVector (or empty if not used) 166 CRef<CBlastQueryVector> m_QueryVector; 167 168 /// Reference to input TSeqLocVector (or NULL if not used) 169 TSeqLocVector* m_TSeqLocVector; 170 171 /// flag to determine if the member above should or not be deleted in the 172 /// destructor 173 bool m_OwnTSeqLocVector; 174 175 /// BLAST algorithm options 176 const CBlastOptions* m_Options; 177 178 private: 179 /// this flag allows for lazy initialization of the masking locations 180 bool m_CalculatedMasks; 181 182 /// BLAST program variable 183 EBlastProgramType m_Program; 184 185 /// Performs filtering on the query sequences to calculate the masked 186 /// locations 187 void x_CalculateMasks(); 188 189 /// Tries to extract the genetic code using the CScope, if it succeeds, 190 /// it supercedes what's specified in the 191 /// {SSeqLoc,CBlastSearchQuery}::genetic_code_id field 192 void x_AutoDetectGeneticCodes(void); 193 }; 194 195 /** Allocates the query information structure and fills the context 196 * offsets, in case of multiple queries, frames or strands. If query seqids 197 * cannot be resolved, they will be ignored as warnings will be issued in 198 * blast::SetupQueries. This version takes a TSeqLocVector. 199 * NB: effective length will be assigned inside the engine. 200 * @param queries Vector of query locations [in] 201 * @param prog program type from the CORE's point of view [in] 202 * @param strand_opt Unless the strand option is set to single strand, the 203 * actual CSeq_locs in the TSeqLocVector dictate which strand to use 204 * during the search [in] 205 * @param qinfo Allocated query info structure [out] 206 */ 207 NCBI_XBLAST_EXPORT 208 void 209 SetupQueryInfo(TSeqLocVector& queries, 210 EBlastProgramType prog, 211 objects::ENa_strand strand_opt, 212 BlastQueryInfo** qinfo); 213 214 /** Allocates the query information structure and fills the context 215 * offsets, in case of multiple queries, frames or strands. If query seqids 216 * cannot be resolved, they will be ignored as warnings will be issued in 217 * blast::SetupQueries. This version takes a CBlastQueryVector. 218 * NB: effective length will be assigned inside the engine. 219 * @param queries Vector of query locations [in] 220 * @param prog program type from the CORE's point of view [in] 221 * @param strand_opt Unless the strand option is set to single strand, the 222 * actual CSeq_locs in the CBlastQueryVector dictate which strand to use 223 * during the search [in] 224 * @param qinfo Allocated query info structure [out] 225 */ 226 NCBI_XBLAST_EXPORT 227 void 228 SetupQueryInfo(const CBlastQueryVector & queries, 229 EBlastProgramType prog, 230 objects::ENa_strand strand_opt, 231 BlastQueryInfo** qinfo); 232 233 /// Populates BLAST_SequenceBlk with sequence data for use in CORE BLAST 234 /// 235 /// @param queries vector of blast::SSeqLoc structures [in] 236 /// @param qinfo BlastQueryInfo structure to obtain context information [in] 237 /// @param seqblk Structure to save sequence data, allocated in this 238 /// function [out] 239 /// @param messages object to save warnings/errors for all queries [out] 240 /// @param prog program type from the CORE's point of view [in] 241 /// @param strand_opt Unless the strand option is set to single strand, the 242 /// actual CSeq_locs in the TSeqLocVector dictate which strand to use 243 /// during the search [in] 244 245 NCBI_XBLAST_EXPORT 246 void 247 SetupQueries(TSeqLocVector& queries, 248 BlastQueryInfo* qinfo, 249 BLAST_SequenceBlk** seqblk, 250 EBlastProgramType prog, 251 objects::ENa_strand strand_opt, 252 TSearchMessages& messages); 253 254 /** Sets up internal subject data structure for the BLAST search. 255 * 256 * This uses the TSeqLocVector to create subject data structures. 257 * 258 * @param subjects Vector of subject locations [in] 259 * @param program BLAST program [in] 260 * @param seqblk_vec Vector of subject sequence data structures [out] 261 * @param max_subjlen Maximal length of the subject sequences [out] 262 */ 263 NCBI_XBLAST_EXPORT 264 void 265 SetupSubjects(TSeqLocVector& subjects, 266 EBlastProgramType program, 267 vector<BLAST_SequenceBlk*>* seqblk_vec, 268 unsigned int* max_subjlen); 269 270 /** Retrieves a sequence using the object manager. 271 * @param sl seqloc of the sequence to obtain [in] 272 * @param encoding encoding for the sequence retrieved. 273 * Supported encodings include: eBlastEncodingNcbi2na, 274 * eBlastEncodingNcbi4na, eBlastEncodingNucleotide, and 275 * eBlastEncodingProtein. [in] 276 * @param scope Scope from which the sequences are retrieved [in] 277 * @param strand strand to retrieve (applies to nucleotide only). 278 * N.B.: When requesting the eBlastEncodingNcbi2na, only the plus strand 279 * is retrieved, because BLAST only requires one strand on the subject 280 * sequences (as in BLAST databases). [in] 281 * @param sentinel Use eSentinels to guard nucleotide sequence with sentinel 282 * bytes (ignored for protein sequences, which always have sentinels) 283 * When using eBlastEncodingNcbi2na, this argument should be set to 284 * eNoSentinels as a sentinel byte cannot be represented in this 285 * encoding. [in] 286 * @param warnings Used to emit warnings when fetching sequence (e.g.: 287 * replacement of invalid characters). Parameter must be allocated by 288 * caller of this function and warnings will be appended. [out] 289 * @throws CBlastException, CSeqVectorException, CException 290 * @return pair containing the buffer and its length. 291 */ 292 NCBI_XBLAST_EXPORT 293 SBlastSequence 294 GetSequence(const objects::CSeq_loc& sl, EBlastEncoding encoding, 295 objects::CScope* scope, 296 objects::ENa_strand strand = objects::eNa_strand_plus, 297 ESentinelType sentinel = eSentinels, 298 std::string* warnings = NULL); 299 300 /** Converts a TSeqLocVector into a CPacked_seqint. Note that a consequence of 301 * this is that the CSeq_loc-s specified in the TSeqLocVector cannot be 302 * of type mix or packed int 303 * @param sequences data to convert from [in] 304 * @return CPacked_seqint containing data from previous sequences 305 */ 306 NCBI_XBLAST_EXPORT 307 CRef<objects::CPacked_seqint> 308 TSeqLocVector2Packed_seqint(const TSeqLocVector& sequences); 309 310 END_SCOPE(blast) 311 END_NCBI_SCOPE 312 313 /* @} */ 314 315 #endif /* ALGO_BLAST_API___BLAST_OBJMGR_PRIV__HPP */ 316