1 /*  $Id: blast_objmgr_priv.hpp 516747 2016-10-17 19:00:07Z boratyng $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Christiam Camacho / Kevin Bealer
27  *
28  */
29 
/** @file blast_objmgr_priv.hpp
 * Definitions which are dependent on the NCBI C++ Object Manager
 */
33 
34 #ifndef ALGO_BLAST_API___BLAST_OBJMGR_PRIV__HPP
35 #define ALGO_BLAST_API___BLAST_OBJMGR_PRIV__HPP
36 
37 #include "blast_setup.hpp"
38 #include <algo/blast/api/sseqloc.hpp>
39 #include <algo/blast/api/blast_seqinfosrc.hpp>
40 #include <algo/blast/api/blast_types.hpp>
41 
42 /** @addtogroup AlgoBlast
43  *
44  * @{
45  */
46 
47 BEGIN_NCBI_SCOPE
48 
49 BEGIN_SCOPE(objects)
50     class CSeq_loc;
51     class CScope;
52     class CBioseq;
53     class CSeq_align_set;
54     class CPssmWithParameters;
55 END_SCOPE(objects)
56 
57 BEGIN_SCOPE(blast)
58 
// Forward declarations of BLAST option classes.
class CPSIBlastOptionsHandle;
class CBlastOptions;
61 
62 /// Implements the object manager dependant version of the IBlastQuerySource
63 class NCBI_XBLAST_EXPORT CBlastQuerySourceOM : public IBlastQuerySource {
64 public:
65     /// Constructor which takes a TSeqLocVector
66     ///
67     /// This version assumes the masking information (if any) was
68     /// provided with the TSeqLocVector.
69     ///
70     /// @param v vector of SSeqLoc structures containing the queries [in]
71     /// @param prog program type of this search [in]
72     CBlastQuerySourceOM(TSeqLocVector & v, EBlastProgramType prog);
73 
74     /// Constructor which takes a TSeqLocVector
75     ///
76     /// This version will compute masking information with dust.
77     ///
78     /// @param v vector of SSeqLoc structures containing the queries [in]
79     /// @param opts BLAST algorithm options [in]
80     /// @note that the v argument might be changed with the filtering locations
81     CBlastQuerySourceOM(TSeqLocVector & v, const CBlastOptions* opts);
82 
83     /// Constructor which takes a CBlastQueryVector
84     ///
85     /// This version assumes the masking information (if any) was
86     /// provided with the CBlastQueryVector.
87     ///
88     /// @param v Object containing the queries, scopes and masking info [in]
89     /// @param prog type of program to run [in]
90     CBlastQuerySourceOM(CBlastQueryVector & v, EBlastProgramType prog);
91 
92     /// Constructor which takes a CBlastQueryVector
93     ///
94     /// This version will compute masking information with dust.
95     ///
96     /// @param v Object containing the queries, scopes and masking info [in]
97     /// @param opts BLAST algorithm options [in]
98     CBlastQuerySourceOM(CBlastQueryVector & v, const CBlastOptions* opts);
99 
100     /// dtor which determines if the internal pointer to its data should be
101     /// deleted or not.
102     virtual ~CBlastQuerySourceOM();
103 
104     /// Return strand for a sequence
105     /// @param i index of the sequence in the sequence container [in]
106     virtual objects::ENa_strand GetStrand(int i) const;
107 
108     /// Return the filtered (masked) regions for a sequence
109     /// @param i index of the sequence in the sequence container [in]
110     virtual CConstRef<objects::CSeq_loc> GetMask(int i);
111 
112     /// Return the filtered (masked) regions for a sequence
113     /// @param i index of the sequence in the sequence container [in]
114     virtual TMaskedQueryRegions GetMaskedRegions(int i);
115 
116     /// Return the CSeq_loc associated with a sequence
117     /// @param i index of the sequence in the sequence container [in]
118     virtual CConstRef<objects::CSeq_loc> GetSeqLoc(int i) const;
119 
120     /// Return the sequence identifier associated with a sequence
121     /// @param index index of the sequence in the sequence container [in]
122     virtual const objects::CSeq_id* GetSeqId(int index) const;
123 
124     /// Retrieve the genetic code associated with a sequence
125     /// @param index index of the sequence in the sequence container [in]
126     virtual Uint4 GetGeneticCodeId(int index) const;
127 
128     /// Return the sequence data for a sequence
129     /// @param i index of the sequence in the sequence container [in]
130     /// @param encoding desired encoding [in]
131     /// @param strand strand to fetch [in]
132     /// @param sentinel specifies to use or not to use sentinel bytes around
133     ///        sequence data. Note that this is ignored for proteins, as in the
134     ///        CORE of BLAST, proteins always have sentinel bytes [in]
135     /// @param warnings if not NULL, warnings will be returned in this string
136     ///        [in|out]
137     /// @return SBlastSequence structure containing sequence data requested
138     virtual SBlastSequence GetBlastSequence(int i,
139                                             EBlastEncoding encoding,
140                                             objects::ENa_strand strand,
141                                             ESentinelType sentinel,
142                                             string* warnings = 0) const;
143     /// Return the length of a sequence
144     /// @param i index of the sequence in the sequence container [in]
145     virtual TSeqPos GetLength(int i) const;
146     /// Return the number of elements in the sequence container
147     virtual TSeqPos Size() const;
148 
149     /// Return the title of a sequence
150     /// @param index index of the sequence in the sequence container [in]
151     virtual string GetTitle(int index) const;
152 
153     /// Is this sequence followed by a mate (for mapping short reads)
IsFirstOfAPair(int index) const154     NCBI_DEPRECATED virtual bool IsFirstOfAPair(int index) const
155     { NCBI_THROW(CException, eInvalid, "Function "
156                  "CBlasyQuerySourceOM::IsFirstOfAPair was not implemented");}
157 
158     /// Get segment information (for mapping paired short reads)
GetSegmentInfo(int index) const159     virtual int GetSegmentInfo(int index) const
160     { NCBI_THROW(CException, eInvalid, "Function "
161                  "CBlasyQuerySourceOM::GetSegmentInfo was not implemented");}
162 
163 
164 protected:
165     /// Reference to input CBlastQueryVector (or empty if not used)
166     CRef<CBlastQueryVector> m_QueryVector;
167 
168     /// Reference to input TSeqLocVector (or NULL if not used)
169     TSeqLocVector* m_TSeqLocVector;
170 
171     /// flag to determine if the member above should or not be deleted in the
172     /// destructor
173     bool m_OwnTSeqLocVector;
174 
175     /// BLAST algorithm options
176     const CBlastOptions* m_Options;
177 
178 private:
179     /// this flag allows for lazy initialization of the masking locations
180     bool m_CalculatedMasks;
181 
182     /// BLAST program variable
183     EBlastProgramType   m_Program;
184 
185     /// Performs filtering on the query sequences to calculate the masked
186     /// locations
187     void x_CalculateMasks();
188 
189     /// Tries to extract the genetic code using the CScope, if it succeeds,
190     /// it supercedes what's specified in the
191     /// {SSeqLoc,CBlastSearchQuery}::genetic_code_id field
192     void x_AutoDetectGeneticCodes(void);
193 };
194 
195 /** Allocates the query information structure and fills the context
196  * offsets, in case of multiple queries, frames or strands. If query seqids
197  * cannot be resolved, they will be ignored as warnings will be issued in
198  * blast::SetupQueries.  This version takes a TSeqLocVector.
199  * NB: effective length will be assigned inside the engine.
200  * @param queries Vector of query locations [in]
201  * @param prog program type from the CORE's point of view [in]
202  * @param strand_opt Unless the strand option is set to single strand, the
203  * actual CSeq_locs in the TSeqLocVector dictate which strand to use
204  * during the search [in]
205  * @param qinfo Allocated query info structure [out]
206  */
207 NCBI_XBLAST_EXPORT
208 void
209 SetupQueryInfo(TSeqLocVector& queries,
210                EBlastProgramType prog,
211                objects::ENa_strand strand_opt,
212                BlastQueryInfo** qinfo);
213 
214 /** Allocates the query information structure and fills the context
215  * offsets, in case of multiple queries, frames or strands. If query seqids
216  * cannot be resolved, they will be ignored as warnings will be issued in
217  * blast::SetupQueries.  This version takes a CBlastQueryVector.
218  * NB: effective length will be assigned inside the engine.
219  * @param queries Vector of query locations [in]
220  * @param prog program type from the CORE's point of view [in]
221  * @param strand_opt Unless the strand option is set to single strand, the
222  * actual CSeq_locs in the CBlastQueryVector dictate which strand to use
223  * during the search [in]
224  * @param qinfo Allocated query info structure [out]
225  */
226 NCBI_XBLAST_EXPORT
227 void
228 SetupQueryInfo(const CBlastQueryVector & queries,
229                EBlastProgramType prog,
230                objects::ENa_strand strand_opt,
231                BlastQueryInfo** qinfo);
232 
233 /// Populates BLAST_SequenceBlk with sequence data for use in CORE BLAST
234 ///
235 /// @param queries vector of blast::SSeqLoc structures [in]
236 /// @param qinfo BlastQueryInfo structure to obtain context information [in]
237 /// @param seqblk Structure to save sequence data, allocated in this
238 /// function [out]
239 /// @param messages object to save warnings/errors for all queries [out]
240 /// @param prog program type from the CORE's point of view [in]
241 /// @param strand_opt Unless the strand option is set to single strand, the
242 /// actual CSeq_locs in the TSeqLocVector dictate which strand to use
243 /// during the search [in]
244 
245 NCBI_XBLAST_EXPORT
246 void
247 SetupQueries(TSeqLocVector& queries,
248              BlastQueryInfo* qinfo,
249              BLAST_SequenceBlk** seqblk,
250              EBlastProgramType prog,
251              objects::ENa_strand strand_opt,
252              TSearchMessages& messages);
253 
254 /** Sets up internal subject data structure for the BLAST search.
255  *
256  * This uses the TSeqLocVector to create subject data structures.
257  *
258  * @param subjects Vector of subject locations [in]
259  * @param program BLAST program [in]
260  * @param seqblk_vec Vector of subject sequence data structures [out]
261  * @param max_subjlen Maximal length of the subject sequences [out]
262  */
263 NCBI_XBLAST_EXPORT
264 void
265 SetupSubjects(TSeqLocVector& subjects,
266               EBlastProgramType program,
267               vector<BLAST_SequenceBlk*>* seqblk_vec,
268               unsigned int* max_subjlen);
269 
270 /** Retrieves a sequence using the object manager.
271  * @param sl seqloc of the sequence to obtain [in]
272  * @param encoding encoding for the sequence retrieved.
273  *        Supported encodings include: eBlastEncodingNcbi2na,
274  *        eBlastEncodingNcbi4na, eBlastEncodingNucleotide, and
275  *        eBlastEncodingProtein. [in]
276  * @param scope Scope from which the sequences are retrieved [in]
277  * @param strand strand to retrieve (applies to nucleotide only).
278  *        N.B.: When requesting the eBlastEncodingNcbi2na, only the plus strand
279  *        is retrieved, because BLAST only requires one strand on the subject
280  *        sequences (as in BLAST databases). [in]
281  * @param sentinel Use eSentinels to guard nucleotide sequence with sentinel
282  *        bytes (ignored for protein sequences, which always have sentinels)
283  *        When using eBlastEncodingNcbi2na, this argument should be set to
284  *        eNoSentinels as a sentinel byte cannot be represented in this
285  *        encoding. [in]
286  * @param warnings Used to emit warnings when fetching sequence (e.g.:
287  *        replacement of invalid characters). Parameter must be allocated by
288  *        caller of this function and warnings will be appended. [out]
289  * @throws CBlastException, CSeqVectorException, CException
290  * @return pair containing the buffer and its length.
291  */
292 NCBI_XBLAST_EXPORT
293 SBlastSequence
294 GetSequence(const objects::CSeq_loc& sl, EBlastEncoding encoding,
295             objects::CScope* scope,
296             objects::ENa_strand strand = objects::eNa_strand_plus,
297             ESentinelType sentinel = eSentinels,
298             std::string* warnings = NULL);
299 
300 /** Converts a TSeqLocVector into a CPacked_seqint. Note that a consequence of
301  * this is that the CSeq_loc-s specified in the TSeqLocVector cannot be
302  * of type mix or packed int
303  * @param sequences data to convert from [in]
304  * @return CPacked_seqint containing data from previous sequences
305  */
306 NCBI_XBLAST_EXPORT
307 CRef<objects::CPacked_seqint>
308 TSeqLocVector2Packed_seqint(const TSeqLocVector& sequences);
309 
310 END_SCOPE(blast)
311 END_NCBI_SCOPE
312 
313 /* @} */
314 
315 #endif  /* ALGO_BLAST_API___BLAST_OBJMGR_PRIV__HPP */
316