1 #ifndef ALGO_BLAST_API__PSSM_ENGINE__HPP
2 #define ALGO_BLAST_API__PSSM_ENGINE__HPP
3 
4 /*  $Id: pssm_engine.hpp 575325 2018-11-27 18:22:00Z ucko $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Christiam Camacho
30  *
31  */
32 
33 /** @file pssm_engine.hpp
34  * C++ API for the PSI-BLAST PSSM engine.
35  */
36 
37 #include <corelib/ncbiobj.hpp>
38 #include <algo/blast/api/blast_aux.hpp>
39 #include <algo/blast/api/pssm_input.hpp>
40 #include <algo/blast/api/blast_exception.hpp>
41 #include <algo/blast/api/blast_results.hpp> // for CBlastAncillaryData
42 #include <algo/blast/api/cdd_pssm_input.hpp>
43 
44 // Forward declarations
45 class CPssmCreateTestFixture;      // unit test class
46 
47 /** @addtogroup AlgoBlast
48  *
49  * @{
50  */
51 
52 BEGIN_NCBI_SCOPE
53 
54 BEGIN_SCOPE(objects)
55     /// forward declaration of ASN.1 object containing PSSM (scoremat.asn)
56     class CPssmWithParameters;
57 END_SCOPE(objects)
58 
59 BEGIN_SCOPE(blast)
60 
61 /// Exception class for the CPssmEngine class
62 class CPssmEngineException : public CBlastException
63 {
64 public:
65     /// Types of exceptions generated by the CPssmEngine class
66     enum EErrCode {
67         eNullInputData,
68         eInvalidInputData
69     };
70 
71     /// Translate from the error code value to its string representation.
GetErrCodeString(void) const72     virtual const char* GetErrCodeString(void) const override {
73         switch (GetErrCode()) {
74         case eNullInputData:    return "eNullInputData";
75         case eInvalidInputData: return "eInvalidInputData";
76         default:                return CException::GetErrCodeString();
77         }
78     }
79 #ifndef SKIP_DOXYGEN_PROCESSING
80     NCBI_EXCEPTION_DEFAULT(CPssmEngineException, CBlastException);
81 #endif /* SKIP_DOXYGEN_PROCESSING */
82 };
83 
84 /// Computes a PSSM as specified in PSI-BLAST.
85 ///
86 /// This class must be configured with a concrete strategy for it to obtain
87 /// its input data.
88 /// The following example uses the CPsiBlastInputData concrete strategy:
89 ///
90 /// @code
91 /// ...
92 /// CPsiBlastInputData pssm_strategy(query, query_length, alignment,
93 ///                                  object_manager_scope, psi_blast_options);
94 /// CPssmEngine pssm_engine(&pssm_strategy);
95 /// CRef<CPssmWithParameters> scoremat = pssm_engine.Run();
96 /// ...
97 /// @endcode
98 
99 class NCBI_XBLAST_EXPORT CPssmEngine : public CObject
100 {
101 public:
102     /// Constructor to configure the PSSM engine with a PSSM input data
103     /// strategy object
104     /// Checks that no data returned by the IPssmInputData interface is NULL
105     /// @throws CPssmEngineException if validation fails. Does not test the
106     /// GetData() method as this is only populated after Process() is called.
107     CPssmEngine(IPssmInputData* input);
108 
109     /// Constructor to perform the last 2 stages of the PSSM creation algorithm
110     /// Checks that no data returned by the IPssmInputFreqRatios interface is
111     /// NULL
112     /// @throws CPssmEngineException if validation fails
113     CPssmEngine(IPssmInputFreqRatios* input);
114 
115     /// Constructor to configure the PSSM engine with a PSSM input data
116     /// strategy object for CDD-based PSSM computation
117     CPssmEngine(IPssmInputCdd* input);
118 
119     /// Destructor
120     ~CPssmEngine();
121 
122     /// Sets the Karlin & Altschul parameters in the BlastScoreBlk to be used
123     /// in PSSM generation. This should be used when performing PSI-BLAST
124     /// iterations, but it's not necessary for a single PSSM construction
125     /// @param ancillary_data BLAST ancillary data from a previous iteration
126     void SetUngappedStatisticalParams(CConstRef<CBlastAncillaryData>
127                                       ancillary_data);
128 
129     /// Runs the PSSM engine to compute the PSSM
130     CRef<objects::CPssmWithParameters> Run();
131 
132 private:
133     // Note: only one of the two pointers below should be non-NULL, as this
134     // determines which API from the C PSSM engine core to call
135 
136     /// Handle to strategy to process raw PSSM input data
137     IPssmInputData*         m_PssmInput;
138     /// Pointer to input data to create PSSM from frequency ratios
139     IPssmInputFreqRatios*   m_PssmInputFreqRatios;
140     /// Blast score block structure
141     CBlastScoreBlk          m_ScoreBlk;
142 
143     /// Pointer to strategy to process raw PSSM input data
144     /// Note: Only one m_PssmInput* should be non-NULL
145     IPssmInputCdd* m_PssmInputCdd;
146 
147     /// Copies query sequence and adds protein sentinel bytes at the beginning
148     /// and at the end of the sequence.
149     /// @param query sequence to copy [in]
150     /// @param query_length length of the sequence above [in]
151     /// @throws CBlastException if does not have enough memory
152     /// @return copy of query guarded by protein sentinel bytes
153     static unsigned char*
154     x_GuardProteinQuery(const unsigned char* query,
155                         unsigned int query_length);
156 
157     /// Initialiazes the core BlastQueryInfo structure for a single protein
158     /// sequence.
159     /// @todo this should be moved to the core of BLAST
160     /// @param query_length length of the sequence above [in]
161     BlastQueryInfo*
162     x_InitializeQueryInfo(unsigned int query_length);
163 
164     /// Initializes the BlastScoreBlk data member required to run the PSSM
165     /// engine.
166     /// @todo this should be moved to the core of BLAST
167     /// @param query sequence [in]
168     /// @param query_length length of the sequence above [in]
169     /// @param matrix_name name of the underlying scoring matrix to use [in]
170     /// @param gap_existence cost to open a gap
171     /// @param gap_extension cost to open a gap
172     /// @throws CBlastException if does not have enough memory or if there was
173     /// an error when setting up the return value
174     /// @todo add an overloaded version of this method which takes an already
175     /// constructed BlastScoreBlk*
176     void
177     x_InitializeScoreBlock(const unsigned char* query,
178                            unsigned int query_length,
179                            const char* matrix_name,
180                            int gap_existence,
181                            int gap_extension);
182 
183     /// Private interface to retrieve query sequence from its data source
184     /// interface
185     unsigned char* x_GetQuery() const;
186 
187     /// Private interface to retrieve query length from its data source
188     /// interface
189     unsigned int x_GetQueryLength() const;
190 
191     /// Private interface to retrieve matrix name from its data source
192     /// interface
193     const char* x_GetMatrixName() const;
194 
195     /// Private interface to retrieve gap existence cost from data source
196     int x_GetGapExistence() const;
197 
198     /// Private interface to retrieve gap extension cost from data source
199     int x_GetGapExtension() const;
200 
201     /// Using IPssmInputData as a delegate to provide input data in the form of
202     /// a multiple sequence alignment, creates a PSSM using the CORE C PSSM
203     /// engine API
204     CRef<objects::CPssmWithParameters>
205     x_CreatePssmFromMsa();
206 
207     /// Using IPssmInputFreqRatios as a delegate to provide the input PSSM's
208     /// frequency ratios, creates a PSSM using the CORE C PSSM engine API
209     CRef<objects::CPssmWithParameters>
210     x_CreatePssmFromFreqRatios();
211 
212     /// Using IPssmInputCdd as a delegate to provide data in the form of
213     /// multiple alignment of CDs, creates PSSM using the CORE C PSSM
214     /// engine API
215     CRef<objects::CPssmWithParameters>
216     x_CreatePssmFromCDD();
217 
218     /// Converts the PSIMatrix structure into a ASN.1 CPssmWithParameters object
219     /// @param pssm input PSIMatrix structure [in]
220     /// @param opts options to be used in the PSSM engine [in]
221     /// @param matrix_name name of the underlying scoring matrix used
222     /// @param diagnostics contains diagnostics data from PSSM creation process
223     /// to save into the return value [in]
224     /// @return CPssmWithParameters object with equivalent contents to
225     /// those in pssm argument
226     static CRef<objects::CPssmWithParameters>
227     x_PSIMatrix2Asn1(const PSIMatrix* pssm,
228                      const char* matrix_name,
229                      const PSIBlastOptions* opts = NULL,
230                      const PSIDiagnosticsResponse* diagnostics = NULL);
231 
232     /// Convert a PSSM return status into a string
233     /// @param error_code return value of a PSSM engine function as defined in
234     /// blast_psi_priv.h [in]
235     /// @return string containing a description of the error
236     static std::string
237     x_ErrorCodeToString(int error_code);
238 
239     /// Default constructor available for derived test classes
CPssmEngine()240     CPssmEngine() {}
241     /// Prohibit copy constructor
242     CPssmEngine(const CPssmEngine& rhs);
243     /// Prohibit assignment operator
244     CPssmEngine& operator=(const CPssmEngine& rhs);
245 
246     /// unit test class
247     friend class ::CPssmCreateTestFixture;
248 };
249 
250 /// Auxiliary class to convert data encoded in the PSSM to CNcbiMatrix
251 class NCBI_XBLAST_EXPORT CScorematPssmConverter
252 {
253 public:
254     /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of
255     /// what is stored in the BlastScoreBlk) containing scores
256     /// @param pssm PSSM to extract data from [in]
257     /// @throws std::runtime_error if scores are not available
258     static CNcbiMatrix<int>*
259     GetScores(const objects::CPssmWithParameters& pssm);
260 
261     /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of
262     /// what is stored in the BlastScoreBlk) containing frequency ratios
263     /// @param pssm PSSM to extract data from [in]
264     /// @throws std::runtime_error if frequency ratios are not available
265     static CNcbiMatrix<double>*
266     GetFreqRatios(const objects::CPssmWithParameters& pssm);
267 
268     /// Returns the information content per position of the PSSM
269     /// @param pssm PSSM to extract data from [in]
270     /// @param retval vector containing the information content or an empty
271     /// vector if this data is not available [in|out]
272     static void
273     GetInformationContent(const objects::CPssmWithParameters& pssm,
274                           vector<double>& retval);
275 
276     /// Returns the relative gapless PSSM column weights to pseudocounts
277     /// for the provided PSSM
278     /// @param pssm PSSM to extract data from [in]
279     /// @param retval vector containing the gapless column weights or an empty
280     /// vector if this data is not available [in|out]
281     static void
282     GetGaplessColumnWeights(const objects::CPssmWithParameters& pssm,
283                             vector<double>& retval);
284 
285     /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of
286     /// what is stored in the BlastScoreBlk) containing the residue frequencies
287     /// per position of the PSSM
288     /// @param pssm PSSM to extract data from [in]
289     /// @return NULL if residue frequencies are not available
290     static CNcbiMatrix<int>*
291     GetResidueFrequencies(const objects::CPssmWithParameters& pssm);
292 
293     /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of
294     /// what is stored in the BlastScoreBlk) containing the weighted residue
295     /// frequencies per position of the PSSM
296     /// @param pssm PSSM to extract data from [in]
297     /// @return NULL if weighted residue frequencies are not available
298     static CNcbiMatrix<double>*
299     GetWeightedResidueFrequencies(const objects::CPssmWithParameters& pssm);
300 
301     /// Data used in sequence weights computation
302     /// @param pssm PSSM to extract data from [in]
303     /// @param retval vector containing the sigma values or an empty
304     /// vector if this data is not available [in|out]
305     static void
306     GetSigma(const objects::CPssmWithParameters& pssm, vector<double>& retval);
307 
308     /// Length of the aligned regions per position of the query sequence
309     /// @param pssm PSSM to extract data from [in]
310     /// @param retval vector containing the interval sizes or an empty
311     /// vector if this data is not available [in|out]
312     static void
313     GetIntervalSizes(const objects::CPssmWithParameters& pssm,
314                      vector<int>& retval);
315 
316     /// Gets the number of matching sequences per position of the PSSM
317     /// @param pssm PSSM to extract data from [in]
318     /// @param retval vector containing the number of matching sequences or an
319     /// empty vector if this data is not available [in|out]
320     static void
321     GetNumMatchingSeqs(const objects::CPssmWithParameters& pssm,
322                        vector<int>& retval);
323 };
324 
325 END_SCOPE(blast)
326 END_NCBI_SCOPE
327 
328 /* @} */
329 
330 #endif  /* ALGO_BLAST_API__PSSM_ENGINE__HPP */
331