1 /*  $Id: blast_results.hpp 542379 2017-07-31 13:06:45Z dicuccio $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jason Papadopoulos
27  *
28  */
29 
30 /** @file blast_results.hpp
31  * Definition of classes which constitute the results of running a BLAST
32  * search
33  */
34 
35 #ifndef ALGO_BLAST_API___BLAST_RESULTS_HPP
36 #define ALGO_BLAST_API___BLAST_RESULTS_HPP
37 
38 #include <algo/blast/core/blast_stat.h>
39 #include <algo/blast/api/blast_aux.hpp>
40 
41 /** @addtogroup AlgoBlast
42  *
43  * @{
44  */
45 
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(blast)
48 
49 /// Class used to return ancillary data from a blast search,
50 /// i.e. information that is not the list of alignment found
51 class NCBI_XBLAST_EXPORT CBlastAncillaryData : public CObject
52 {
53 
54 public:
55 
56     /// constructor
57     /// @param program_type Type of blast search [in]
58     /// @param query_number The index of the query for which
59     ///                 information will be retrieved [in]
60     /// @param sbp Score block, containing Karlin parameters [in]
61     /// @param query_info Structure with per-context information [in]
62     CBlastAncillaryData(EBlastProgramType program_type,
63                         int query_number,
64                         const BlastScoreBlk *sbp,
65                         const BlastQueryInfo *query_info);
66 
67     /** Parametrized constructor taking pairs of values for ungapped and gapped
68      * Karlin-Altschul parameters as well as the effective search space
69      * @param lambda Pair of ungapped and gapped lambda (in that order) [in]
70      * @param k Pair of ungapped and gapped k (in that order) [in]
71      * @param h Pair of ungapped and gapped h (in that order) [in]
72      * @param effective_search_space effective search space [in]
73      * @param is_psiblast true if the statistical parameters are for PSI-BLAST
74      * [in]
75      */
76     CBlastAncillaryData(pair<double, double> lambda,
77                         pair<double, double> k,
78                         pair<double, double> h,
79                         Int8 effective_search_space,
80                         bool is_psiblast = false);
81 
82     /// Destructor
83     ~CBlastAncillaryData();
84 
85     /// Copy-constructor
86     CBlastAncillaryData(const CBlastAncillaryData& rhs);
87 
88     /// Assignment operator
89     CBlastAncillaryData& operator=(const CBlastAncillaryData& rhs);
90 
91     /// Retrieve gumbel parameters
GetGumbelBlk() const92     const Blast_GumbelBlk * GetGumbelBlk() const {
93         return m_GumbelBlk;
94     }
95 
96     /// Retrieve ungapped Karlin parameters
GetUngappedKarlinBlk() const97     const Blast_KarlinBlk * GetUngappedKarlinBlk() const {
98         return m_UngappedKarlinBlk;
99     }
100 
101     /// Retrieve gapped Karlin parameters
GetGappedKarlinBlk() const102     const Blast_KarlinBlk * GetGappedKarlinBlk() const {
103         return m_GappedKarlinBlk;
104     }
105 
106     /// Retrieve PSI-BLAST ungapped Karlin parameters
GetPsiUngappedKarlinBlk() const107     const Blast_KarlinBlk * GetPsiUngappedKarlinBlk() const {
108         return m_PsiUngappedKarlinBlk;
109     }
110 
111     /// Retrieve PSI-BLAST gapped Karlin parameters
GetPsiGappedKarlinBlk() const112     const Blast_KarlinBlk * GetPsiGappedKarlinBlk() const {
113         return m_PsiGappedKarlinBlk;
114     }
115     /// Retrieve the search space for this query sequence. If the
116     /// results correspond to a blastx search, the search space will
117     /// refer to protein letters
GetSearchSpace() const118     Int8 GetSearchSpace() const {
119         return m_SearchSpace;
120     }
121     /// Retrieve the length adjustment for boundary conditions
GetLengthAdjustment() const122     Int8 GetLengthAdjustment() const {
123         return m_LengthAdjustment;
124     }
125     /// Set the length adjustment for boundary conditions
SetLengthAdjustment(int len_adj)126     void SetLengthAdjustment(int len_adj){
127         m_LengthAdjustment = len_adj;
128     }
129 private:
130     /// Gumbel parameters for one query
131     Blast_GumbelBlk *m_GumbelBlk;
132 
133     /// Ungapped Karlin parameters for one query
134     Blast_KarlinBlk *m_UngappedKarlinBlk;
135 
136     /// Gapped Karlin parameters for one query
137     Blast_KarlinBlk *m_GappedKarlinBlk;
138 
139     /// PSI-BLAST ungapped Karlin parameters for one query (if applicable)
140     Blast_KarlinBlk *m_PsiUngappedKarlinBlk;
141 
142     /// PSI-BLAST gapped Karlin parameters for one query (if applicable)
143     Blast_KarlinBlk *m_PsiGappedKarlinBlk;
144 
145     /// Search space used when calculating e-values for one query
146 
147     Int8 m_SearchSpace;
148 
149     /// Length adjustment for boundary conditions
150     Int8 m_LengthAdjustment;
151 
152     /// Workhorse for copy constructor and assignment operator
153     /// @param other object to copy [in]
154     void do_copy(const CBlastAncillaryData& other);
155 };
156 
157 
158 /// Search Results for One Query.
159 ///
160 /// This class encapsulates all the search results and related data
161 /// corresponding to one of the input queries.
162 
163 class NCBI_XBLAST_EXPORT CSearchResults : public CObject {
164 public:
165 
166     /// Constructor
167     /// @param query List of query identifiers [in]
168     /// @param align alignments for a single query sequence [in]
169     /// @param errs error messages for this query sequence [in]
170     /// @param ancillary_data Miscellaneous output from the blast engine [in]
171     /// @param query_masks Mask locations for this query [in]
172     /// @param rid RID (if applicable, else empty string) [in]
173     CSearchResults(CConstRef<objects::CSeq_id>     query,
174                    CRef<objects::CSeq_align_set>   align,
175                    const TQueryMessages          & errs,
176                    CRef<CBlastAncillaryData>       ancillary_data,
177                    const TMaskedQueryRegions     * query_masks = NULL,
178                    const string                  & rid = kEmptyStr,
179                    const SPHIQueryInfo           * phi_query_info = NULL);
180 
181     /// Our destructor
182     ~CSearchResults();
183 
184     /// Sets the RID for these results
185     /// @param rid RID to set [in]
SetRID(const string & rid)186     void SetRID(const string& rid) { m_RID.assign(rid); }
187 
188     /// Returns the RID for these results (if applicable), otherwise returns an
189     /// empty string
GetRID() const190     string GetRID() const { return m_RID; }
191 
192     /// Accessor for the Seq-align results
GetSeqAlign() const193     CConstRef<objects::CSeq_align_set> GetSeqAlign() const
194     {
195         return m_Alignment;
196     }
197 
198     /// Only intended to be used if you need to edit the seqlign. Otherwise
199     ///use GetSeqAlign()
SetSeqAlign()200     CRef<objects::CSeq_align_set> SetSeqAlign()
201     {
202         return m_Alignment;
203     }
204 
205     /// Return true if there are any alignments for this query
206     bool HasAlignments() const;
207 
208     /// Accessor for the query's sequence identifier
209     CConstRef<objects::CSeq_id> GetSeqId() const;
210 
211     /// Accessor for the query's search ancillary
GetAncillaryData() const212     CRef<CBlastAncillaryData> GetAncillaryData() const
213     {
214         return m_AncillaryData;
215     }
216 
217     /// Accessor for the error/warning messsages for this query
218     /// @param min_severity minimum severity to report errors [in]
219     TQueryMessages GetErrors(int min_severity = eBlastSevError) const;
220 
221     /// Returns true if there are errors among the results for this object
222     bool HasErrors() const;
223     /// Returns true if there are warnings among the results for this object
224     bool HasWarnings() const;
225 
226     /// Retrieve a string with the query identifier followed by the errors
227     /// produced, returns a empty string if HasErrors() returns false.
228     string GetErrorStrings() const;
229     /// Retrieve a string with the query identifier followed by the warnings
230     /// produced, returns a empty string if HasWarnings() returns false.
231     string GetWarningStrings() const;
232 
233     /// Retrieve the query regions which were masked by BLAST
234     /// @param flt_query_regions the return value [in|out]
235     void GetMaskedQueryRegions(TMaskedQueryRegions& flt_query_regions) const;
236 
237     /// Mutator for the masked query regions, intended to be used by internal
238     /// BLAST APIs to populate this object
239     /// @param flt_query_regions the input value [in]
240     void SetMaskedQueryRegions(const TMaskedQueryRegions& flt_query_regions);
241 
242     /// Retrieve the masked locations for the subject sequences in the
243     /// contained alignment
244     /// @param subj_masks masked locations [out]
245     void GetSubjectMasks(TSeqLocInfoVector& subj_masks) const;
246 
247     /// Set the masked locations for the subject sequences in the
248     /// contained alignment
249     /// @param subj_masks masked locations [in]
250     void SetSubjectMasks(const TSeqLocInfoVector& subj_masks);
251 
252     /// Retrieves PHI-BLAST information about pattern on query.
GetPhiQueryInfo() const253     const SPHIQueryInfo * GetPhiQueryInfo() const {
254          return m_PhiQueryInfo;
255     }
256 
257     /// Trim align_set size
258     /// @parm size  max num of alignments to keep
259     ///				(0 will erase all)
260     void TrimSeqAlign(objects::CSeq_align_set::Tdata::size_type max_size);
261 
262 protected:
263     /// this query's id
264     CConstRef<objects::CSeq_id> m_QueryId;
265 
266     /// alignments for this query
267     CRef<objects::CSeq_align_set> m_Alignment;
268 
269     /// error/warning messages for this query
270     TQueryMessages m_Errors;
271 
272     /// this query's masked regions
273     TMaskedQueryRegions m_Masks;
274 
275     /// the matching subjects masks
276     TSeqLocInfoVector m_SubjectMasks;
277 
278     /// non-alignment ancillary data for this query
279     CRef<CBlastAncillaryData> m_AncillaryData;
280 
281     /// The RID, if applicable (otherwise it's empty)
282     string m_RID;
283 
284     /// PHI-BLAST information.
285     SPHIQueryInfo *m_PhiQueryInfo;
286 
287 private:
288     /// Prohibit copy constructor
289     CSearchResults(const CSearchResults& rhs);
290     /// Prohibit assignment operator
291     CSearchResults& operator=(const CSearchResults& rhs);
292 };
293 
294 
295 /// Search Results for All Queries.
296 ///
297 /// This class encapsulates all of the search results and related data
298 /// from a search, it supports BLAST database and Bl2Seq searches and provides
299 /// a convenient way of accessing the results from BLAST.
300 ///
301 /// @note When representing BLAST database results, there are
302 /// CSearchResultSet::NumQueries() objects of type CSearchResultSet::value_type
303 /// in this object. When representing Bl2Seq results, there are
304 /// (CSearchResultSet::NumQueries() * number of subjects) objects of type
305 /// CSearchResultSet::value_type in this object.
306 
307 class NCBI_XBLAST_EXPORT CSearchResultSet : public CObject {
308 public:
309     /// data type contained by this container
310     typedef CRef<CSearchResults> value_type;
311 
312     /// List of query ids.
313     typedef vector< CConstRef<objects::CSeq_id> > TQueryIdVector;
314 
315     /// size_type type definition
316     typedef vector<value_type>::size_type size_type;
317 
318     /// typedef for a vector of CRef<CBlastAncillaryData>
319     typedef vector< CRef<CBlastAncillaryData> > TAncillaryVector;
320 
321     /// const_iterator type definition
322     typedef vector<value_type>::const_iterator const_iterator;
323 
324     /// iterator type definition
325     typedef vector<value_type>::iterator iterator;
326 
327     /// Simplest constructor
328     CSearchResultSet(EResultType res_type = eDatabaseSearch);
329 
330     /// Parametrized constructor
331     /// @param aligns vector of all queries' alignments [in]
332     /// @param msg_vec vector of all queries' messages [in]
333     /// @param res_type result type stored in this object [in]
334     CSearchResultSet(TSeqAlignVector aligns,
335                      TSearchMessages msg_vec,
336                      EResultType res_type = eDatabaseSearch);
337 
338     /// Parametrized constructor
339     /// @param ids vector of all queries' ids [in]
340     /// @param aligns vector of all queries' alignments [in]
341     /// @param msg_vec vector of all queries' messages [in]
342     /// @param ancillary_data vector of per-query search ancillary data [in]
343     /// @param masks Mask locations for this query [in]
344     /// @param res_type result type stored in this object [in]
345     /// @note this constructor assumes that the ids, msg_vec, and
346     /// ancillary_data vectors are of the SAME size as the aligns vector. The
347     /// masks vector can be of the same size as aligns or have as many elements
348     /// as there were queries in the search and they will be adjusted as
349     /// necessary.
350     CSearchResultSet(TQueryIdVector  ids,
351                      TSeqAlignVector aligns,
352                      TSearchMessages msg_vec,
353                      TAncillaryVector  ancillary_data =
354                      TAncillaryVector(),
355                      const TSeqLocInfoVector* masks = NULL,
356                      EResultType res_type = eDatabaseSearch,
357                      const SPHIQueryInfo* phi_query_info = NULL);
358 
359     /// Allow array-like access with integer indices to CSearchResults
360     /// contained by this object
361     /// @param i query sequence index if result type is eDatabaseSearch,
362     /// otherwise it's the query-subject index [in]
operator [](size_type i)363     CSearchResults & operator[](size_type i) {
364         return *m_Results[i];
365     }
366 
367     /// Allow array-like access with integer indices to const CSearchResults
368     /// contained by this object
369     /// @param i query sequence index if result type is eDatabaseSearch,
370     /// otherwise it's the query-subject index [in]
operator [](size_type i) const371     const CSearchResults & operator[](size_type i) const {
372         return *m_Results[i];
373     }
374 
375     /// Retrieve results for a query-subject pair
376     /// contained by this object
377     /// @param qi query sequence index [in]
378     /// @param si subject sequence index [in]
379     /// @note it only works for results of type eSequenceComparison
380     CSearchResults & GetResults(size_type qi, size_type si);
381 
382     /// Retrieve results for a query-subject pair
383     /// @param qi query sequence index [in]
384     /// @param si subject sequence index [in]
385     /// @note it only works for results of type eSequenceComparison
386     const CSearchResults & GetResults(size_type qi, size_type si) const;
387 
388     /// Allow array-like access with CSeq_id indices to CSearchResults
389     /// contained by this object
390     /// @param ident query sequence identifier [in]
391     /// @note it only works for results of type eDatabaseSearch
392     CRef<CSearchResults> operator[](const objects::CSeq_id & ident);
393 
394     /// Allow array-like access with CSeq_id indices to const CSearchResults
395     /// contained by this object
396     /// @param ident query sequence identifier [in]
397     /// @note it only works for results of type eDatabaseSearch
398     CConstRef<CSearchResults> operator[](const objects::CSeq_id & ident) const;
399 
400     /// Return the number of results contained by this object
401     /// @note this returns the number of queries for results of type
402     /// eDatabaseSearch and (number of queries * number of subjects) for results
403     /// of type eSequenceComparison
GetNumResults() const404     size_type GetNumResults() const
405     {
406         return m_Results.size();
407     }
408 
409     /// Return the number of unique query ID's represented by this object
GetNumQueries() const410     size_type GetNumQueries() const
411     {
412         return m_NumQueries;
413     }
414 
415     /// Sets the filtered query regions. If results are of type
416     /// eSequenceComparison, the masks can be one for each query and they will
417     /// be duplicated as necessary to meet this class' pre-conditions.
418     void SetFilteredQueryRegions(const TSeqLocInfoVector& masks);
419     /// Retrieves the filtered query regions
420     TSeqLocInfoVector GetFilteredQueryRegions() const;
421 
422     /// Identical to GetNumResults, provided to facilitate STL-style iteration
423     /// @sa note in GetNumResults
size() const424     size_type size() const { return GetNumResults(); }
425 
426     /// Returns whether this container is empty or not.
empty() const427     bool empty() const { return size() == 0; }
428 
429     /// Returns const_iterator to beginning of container, provided to
430     /// facilitate STL-style iteration
begin() const431     const_iterator begin() const { return m_Results.begin(); }
432 
433     /// Returns const_iterator to end of container, provided to
434     /// facilitate STL-style iteration
end() const435     const_iterator end() const { return m_Results.end(); }
436 
437     /// Returns iterator to beginning of container, provided to
438     /// facilitate STL-style iteration
begin()439     iterator begin() { return m_Results.begin(); }
440 
441     /// Returns iterator to end of container, provided to
442     /// facilitate STL-style iteration
end()443     iterator end() { return m_Results.end(); }
444 
445     /// Clears the contents of this object
clear()446     void clear() {
447         m_NumQueries = 0;
448         m_Results.clear();
449     }
450 
451     /// Add a value to the back of this container
452     /// @param element element to add [in]
453     void push_back(value_type& element);
454 
455     /// Get the type of results contained in this object
GetResultType() const456     EResultType GetResultType() const { return m_ResultType; }
457 
458     /// Sets the RID for these results
459     /// @param rid RID to set [in]
460     void SetRID(const string& rid);
461 
462 private:
463     /// Initialize the result set.
464     void x_Init(TQueryIdVector& queries,
465                 TSeqAlignVector                       aligns,
466                 TSearchMessages                       msg_vec,
467                 TAncillaryVector                      ancillary_data,
468                 const TSeqLocInfoVector*              query_masks,
469                 const SPHIQueryInfo*                  phi_query_info = NULL);
470 
471     /// Type of results stored in this object
472     EResultType m_ResultType;
473 
474     /// Number of queries
475     size_type m_NumQueries;
476 
477     /// Vector of results.
478     vector< CRef<CSearchResults> > m_Results;
479 
480     /// True if this object contains PHI-BLAST results
481     bool m_IsPhiBlast;
482 
483     /// Stores the masked query regions, for convenience and usage in CBl2Seq
484     TSeqLocInfoVector m_QueryMasks;
485 };
486 
487 END_SCOPE(blast)
488 END_NCBI_SCOPE
489 
490 /* @} */
491 
492 #endif /* ALGO_BLAST_API___BLAST_RESULTS_HPP */
493