1 #ifndef ALGO_BLAST_API__PSI_PSSM_INPUT__HPP
2 #define ALGO_BLAST_API__PSI_PSSM_INPUT__HPP
3 
4 /*  $Id: psi_pssm_input.hpp 341202 2011-10-18 12:21:49Z fongah2 $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Christiam Camacho
30  *
31  */
32 
33 /** @file psi_pssm_input.hpp
34  * Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
35  */
36 
37 #include <corelib/ncbiobj.hpp>
38 #include <algo/blast/api/blast_aux.hpp>
39 #include <algo/blast/api/pssm_input.hpp>
40 #include <objmgr/scope.hpp>
41 
42 /// Forward declaration for unit test classes
43 class CPssmCreateTestFixture;
44 
45 /** @addtogroup AlgoBlast
46  *
47  * @{
48  */
49 
50 BEGIN_NCBI_SCOPE
51 
52 // Forward declarations in objects scope
53 BEGIN_SCOPE(objects)
54 #ifndef SKIP_DOXYGEN_PROCESSING
55     class CSeq_align_set;
56     class CDense_seg;
57 #endif /* SKIP_DOXYGEN_PROCESSING */
58 END_SCOPE(objects)
59 
60 BEGIN_SCOPE(blast)
61 
62 /// Implements the interface to retrieve data for the last 2 stages of the PSSM
63 /// creation. Note that none of the data is owned by this class, it simply
64 /// returns pointers to the data it is passed
65 class NCBI_XBLAST_EXPORT CPsiBlastInputFreqRatios : public IPssmInputFreqRatios
66 {
67 public:
68     /**
69      * @brief Constructor
70      *
71      * @param query pointer to query sequence data [in]
72      * @param query_length length of the query sequence data [in]
73      * @param freq_ratios matrix of frequency ratios [in]
74      * @param matrix_name name of underlying scoring matrix used [in]
75      * @param gap_existence cost to open a gap, if zero default from IPssmInputData used.
76      * @param gap_extension cost to open a gap, if zero default from IPssmInputData used.
77      */
CPsiBlastInputFreqRatios(const unsigned char * query,unsigned int query_length,const CNcbiMatrix<double> & freq_ratios,const char * matrix_name=NULL,int gap_existence=0,int gap_extension=0,double impala_scale_factor=0)78     CPsiBlastInputFreqRatios(const unsigned char* query,
79                              unsigned int query_length,
80                              const CNcbiMatrix<double>& freq_ratios,
81                              const char* matrix_name = NULL,
82                              int gap_existence = 0,
83                              int gap_extension = 0,
84                              double impala_scale_factor = 0)
85         : m_Query(const_cast<unsigned char*>(query)),
86           m_QueryLength(query_length),
87           m_MatrixName(matrix_name),
88           m_GapExistence(gap_existence),
89           m_GapExtension(gap_extension),
90           m_FreqRatios(freq_ratios),
91           m_ImpalaScaleFactor(impala_scale_factor)
92     {}
93 
94     /// No-op as we assume the data is passed in to the constructor
Process()95     void Process() {}
96 
97     /// @inheritDoc
GetQuery()98     unsigned char* GetQuery() { return m_Query; }
99 
100     /// Get the query's length
GetQueryLength()101     unsigned int GetQueryLength() { return m_QueryLength; }
102 
103     /// Obtain the name of the underlying matrix to use when building the PSSM
GetMatrixName()104     const char* GetMatrixName() {
105         return m_MatrixName ? m_MatrixName :
106             IPssmInputFreqRatios::GetMatrixName();
107     }
108 
109    /// Obtain the gap existence value to use when building the PSSM
GetGapExistence()110     int GetGapExistence() {
111          return m_GapExistence
112          ? m_GapExistence
113          : IPssmInputFreqRatios::GetGapExistence();
114     }
115 
116     /// Obtain the gap extension value to use when building the PSSM
GetGapExtension()117     int GetGapExtension() {
118          return m_GapExtension
119          ? m_GapExtension
120          : IPssmInputFreqRatios::GetGapExtension();
121     }
122 
123     /// Obtain the IMPALA Scale Factor value to use when building the PSSM
GetImpalaScaleFactor()124     double GetImpalaScaleFactor() {
125              return m_ImpalaScaleFactor
126              ? m_ImpalaScaleFactor
127              : IPssmInputFreqRatios::GetImpalaScaleFactor();
128         }
129 
130     /// Obtain a matrix of frequency ratios with this->GetQueryLength() columns
131     /// and BLASTAA_SIZE rows
GetData()132     const CNcbiMatrix<double>& GetData() {
133         return m_FreqRatios;
134     }
135 
136 private:
137     /// Query sequence data
138     unsigned char*      m_Query;
139     /// Length of query sequence data
140     unsigned int        m_QueryLength;
141     /// Name of underlying scoring matrix
142     const char*         m_MatrixName;
143     /// Gap existence penalty
144     int                 m_GapExistence;
145     /// Gap extension penalty
146     int                 m_GapExtension;
147     /// Frequency ratios
148     CNcbiMatrix<double> m_FreqRatios;
149 
150     // IMPALA Scale Factor
151     double	m_ImpalaScaleFactor;
152 };
153 
154 /// This class is a concrete strategy for IPssmInputData, and it
155 /// implements the traditional PSI-BLAST algorithm for building a multiple
156 /// sequence alignment from a list of pairwise alignments using the C++ object
157 /// manager.
158 class NCBI_XBLAST_EXPORT CPsiBlastInputData : public IPssmInputData
159 {
160 public:
161     /// Construct a concrete strategy, used to configure the CPssmEngine object
162     /// @param query query sequence for the alignment in ncbistdaa encoding.
163     /// @param query_length length of the sequence above.
164     /// @param sset pairwise alignment produced by BLAST where query was the
165     /// query sequence.
166     /// @param scope object manager scope from which to retrieve sequence data
167     /// [in]
168     /// @param opts options to be used in the PSSM engine
169     /// @param matrix_name name of the substitution matrix to use to build PSSM
170     /// If not provided, the default implementation of
171     /// IPssmInputData::GetMatrixName() will be returned
172     /// @param gap_existence cost to open a gap, if zero default from IPssmInputData used.
173     /// @param gap_extension cost to open a gap, if zero default from IPssmInputData used.
174     /// @param diags diagnostics data requests for the PSSM engine
175     CPsiBlastInputData(const unsigned char* query,
176                        unsigned int query_length,
177                        CConstRef<objects::CSeq_align_set> sset,
178                        CRef<objects::CScope> scope,
179                        const PSIBlastOptions& opts,
180                        const char* matrix_name = NULL,
181                        int gap_existence = 0,
182                        int gap_opening = 0,
183                        const PSIDiagnosticsRequest* diags = NULL,
184                        const string& query_title = "");
185 
186     /// virtual destructor
187     virtual ~CPsiBlastInputData();
188 
189     /// The work to process the alignment is done here
190     void Process();
191 
192     /// Get the query sequence used as master for the multiple sequence
193     /// alignment in ncbistdaa encoding.
194     unsigned char* GetQuery();
195 
196     /// Get the query's length
197     unsigned int GetQueryLength();
198 
199     /// Obtain the multiple sequence alignment structure
200     PSIMsa* GetData();
201 
202     /// Obtain the options for the PSSM engine
203     const PSIBlastOptions* GetOptions();
204 
205     /// Obtain the name of the underlying matrix to use when building the PSSM
206     const char* GetMatrixName();
207 
208    /// Obtain the gap existence value to use when building the PSSM
GetGapExistence()209     int GetGapExistence() {
210          return m_GapExistence
211          ? m_GapExistence
212          : IPssmInputData::GetGapExistence();
213     }
214 
215     /// Obtain the gap extension value to use when building the PSSM
GetGapExtension()216     int GetGapExtension() {
217          return m_GapExtension
218          ? m_GapExtension
219          : IPssmInputData::GetGapExtension();
220     }
221 
222     /// Obtain the diagnostics data that is requested from the PSSM engine
223     const PSIDiagnosticsRequest* GetDiagnosticsRequest();
224 
225     /// @inheritDoc
GetQueryForPssm()226     CRef<objects::CBioseq> GetQueryForPssm() {
227         return m_QueryBioseq;
228     }
229 
230 private:
231 
232     /// Pointer to query sequence
233     unsigned char*                  m_Query;
234     /// Title of query
235     string                          m_QueryTitle;
236     /// Scope where to retrieve the sequences in the aligment from
237     CRef<objects::CScope>           m_Scope;
238     /// Structure representing the multiple sequence alignment
239     PSIMsa*                         m_Msa;
240     /// Multiple sequence alignment dimensions
241     PSIMsaDimensions                m_MsaDimensions;
242     /// Pairwise alignment result of a BLAST search
243     CConstRef<objects::CSeq_align_set>   m_SeqAlignSet;
244     /// Algorithm options
245     PSIBlastOptions                 m_Opts;
246     /// Diagnostics request structure
247     PSIDiagnosticsRequest*          m_DiagnosticsRequest;
248     /// Underlying matrix to use
249     string                          m_MatrixName;
250     /// Gap existence paramter used.
251     int                             m_GapExistence;
252     /// Gap extension paramter used.
253     int                             m_GapExtension;
254     /// Query as CBioseq for PSSM
255     CRef<objects::CBioseq>          m_QueryBioseq;
256 
257     /////////////////////////// Auxiliary functions ///////////////////////////
258 
259     /// Tries to fetch the sequence data for the subject for the segments
260     /// specified in the Dense-seg. If the sequence cannot be retrieved from the
261     /// scope, a warning is printed and an empty string is returned in
262     /// sequence_data
263     /// @param ds dense seg for which the sequence data is needed [in]
264     /// @param scope scope from which to obtain the sequence data [in]
265     /// @param sequence_data string which will contain the sequence data.
266     static void
267     x_GetSubjectSequence(const objects::CDense_seg& ds, objects::CScope& scope,
268                          string& sequence_data);
269 
270     /// Examines the sequence alignment and keeps track of those hits which
271     /// have an HSP with an e-value below the inclusion threshold specified in
272     /// the PSIBlastOptions structure.
273     /// @return number of hits which qualify for constructing the multiple
274     /// sequence alignment structure
275     unsigned int
276     x_CountAndSelectQualifyingAlignments();
277 
278     /// Populates the multiple alignment data structure
279     void x_ExtractAlignmentData();
280     // First implementation of use_best_align option from old toolkit. Should
281     // be implemented as a subclass of this one?
282     //void x_ExtractAlignmentDataUseBestAlign();
283 
284     /// Copies query sequence data to multiple alignment data structure
285     void x_CopyQueryToMsa();
286 
287     /// Returns the number of sequences that make up the multiple sequence
288     /// alignment
289     /// @throws CBlastException if this number hasn't been calculated yet (need
290     /// to invoke Process() first!)
291     unsigned int GetNumAlignedSequences() const;
292 
293     /// Iterates over the Dense-seg passed in and extracts alignment
294     /// information to multiple alignment data structure.
295     /// @param denseg source alignment segment (HSP) [in]
296     /// @param msa_index index of the sequence aligned with the query in the
297     ///        desc_matrix field of the m_AlignmentData data member [in]
298     /// @param evalue evalue for this sequence aligned with the query (used for
299     /// debugging only) [in]
300     /// @param bit_score bit score for this sequence aligned with the query
301     /// (used for debugging only) [in]
302     void x_ProcessDenseg(const objects::CDense_seg& denseg,
303                          unsigned int msa_index,
304                          double evalue, double bit_score);
305 
306     /// Extracts the query bioseq from m_SeqAlignSet
307     void x_ExtractQueryForPssm();
308 
309     /// unit test class
310     friend class ::CPssmCreateTestFixture;
311 
312 private:
313     /// prohibit copy constructor
314     CPsiBlastInputData(const CPsiBlastInputData&);
315     /// prohibit assignment operator
316     CPsiBlastInputData& operator=(const CPsiBlastInputData&);
317 };
318 
319 END_SCOPE(blast)
320 END_NCBI_SCOPE
321 
322 /* @} */
323 
324 #endif  /* ALGO_BLAST_API__PSI_PSSM_INPUT_HPP */
325