1 #ifndef ALGO_BLAST_API__PSSM_ENGINE__HPP 2 #define ALGO_BLAST_API__PSSM_ENGINE__HPP 3 4 /* $Id: pssm_engine.hpp 575325 2018-11-27 18:22:00Z ucko $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Christiam Camacho 30 * 31 */ 32 33 /** @file pssm_engine.hpp 34 * C++ API for the PSI-BLAST PSSM engine. 35 */ 36 37 #include <corelib/ncbiobj.hpp> 38 #include <algo/blast/api/blast_aux.hpp> 39 #include <algo/blast/api/pssm_input.hpp> 40 #include <algo/blast/api/blast_exception.hpp> 41 #include <algo/blast/api/blast_results.hpp> // for CBlastAncillaryData 42 #include <algo/blast/api/cdd_pssm_input.hpp> 43 44 // Forward declarations 45 class CPssmCreateTestFixture; // unit test class 46 47 /** @addtogroup AlgoBlast 48 * 49 * @{ 50 */ 51 52 BEGIN_NCBI_SCOPE 53 54 BEGIN_SCOPE(objects) 55 /// forward declaration of ASN.1 object containing PSSM (scoremat.asn) 56 class CPssmWithParameters; 57 END_SCOPE(objects) 58 59 BEGIN_SCOPE(blast) 60 61 /// Exception class for the CPssmEngine class 62 class CPssmEngineException : public CBlastException 63 { 64 public: 65 /// Types of exceptions generated by the CPssmEngine class 66 enum EErrCode { 67 eNullInputData, 68 eInvalidInputData 69 }; 70 71 /// Translate from the error code value to its string representation. GetErrCodeString(void) const72 virtual const char* GetErrCodeString(void) const override { 73 switch (GetErrCode()) { 74 case eNullInputData: return "eNullInputData"; 75 case eInvalidInputData: return "eInvalidInputData"; 76 default: return CException::GetErrCodeString(); 77 } 78 } 79 #ifndef SKIP_DOXYGEN_PROCESSING 80 NCBI_EXCEPTION_DEFAULT(CPssmEngineException, CBlastException); 81 #endif /* SKIP_DOXYGEN_PROCESSING */ 82 }; 83 84 /// Computes a PSSM as specified in PSI-BLAST. 85 /// 86 /// This class must be configured with a concrete strategy for it to obtain 87 /// its input data. 88 /// The following example uses the CPsiBlastInputData concrete strategy: 89 /// 90 /// @code 91 /// ... 92 /// CPsiBlastInputData pssm_strategy(query, query_length, alignment, 93 /// object_manager_scope, psi_blast_options); 94 /// CPssmEngine pssm_engine(&pssm_strategy); 95 /// CRef<CPssmWithParameters> scoremat = pssm_engine.Run(); 96 /// ... 97 /// @endcode 98 99 class NCBI_XBLAST_EXPORT CPssmEngine : public CObject 100 { 101 public: 102 /// Constructor to configure the PSSM engine with a PSSM input data 103 /// strategy object 104 /// Checks that no data returned by the IPssmInputData interface is NULL 105 /// @throws CPssmEngineException if validation fails. Does not test the 106 /// GetData() method as this is only populated after Process() is called. 107 CPssmEngine(IPssmInputData* input); 108 109 /// Constructor to perform the last 2 stages of the PSSM creation algorithm 110 /// Checks that no data returned by the IPssmInputFreqRatios interface is 111 /// NULL 112 /// @throws CPssmEngineException if validation fails 113 CPssmEngine(IPssmInputFreqRatios* input); 114 115 /// Constructor to configure the PSSM engine with a PSSM input data 116 /// strategy object for CDD-based PSSM computation 117 CPssmEngine(IPssmInputCdd* input); 118 119 /// Destructor 120 ~CPssmEngine(); 121 122 /// Sets the Karlin & Altschul parameters in the BlastScoreBlk to be used 123 /// in PSSM generation. This should be used when performing PSI-BLAST 124 /// iterations, but it's not necessary for a single PSSM construction 125 /// @param ancillary_data BLAST ancillary data from a previous iteration 126 void SetUngappedStatisticalParams(CConstRef<CBlastAncillaryData> 127 ancillary_data); 128 129 /// Runs the PSSM engine to compute the PSSM 130 CRef<objects::CPssmWithParameters> Run(); 131 132 private: 133 // Note: only one of the two pointers below should be non-NULL, as this 134 // determines which API from the C PSSM engine core to call 135 136 /// Handle to strategy to process raw PSSM input data 137 IPssmInputData* m_PssmInput; 138 /// Pointer to input data to create PSSM from frequency ratios 139 IPssmInputFreqRatios* m_PssmInputFreqRatios; 140 /// Blast score block structure 141 CBlastScoreBlk m_ScoreBlk; 142 143 /// Pointer to strategy to process raw PSSM input data 144 /// Note: Only one m_PssmInput* should be non-NULL 145 IPssmInputCdd* m_PssmInputCdd; 146 147 /// Copies query sequence and adds protein sentinel bytes at the beginning 148 /// and at the end of the sequence. 149 /// @param query sequence to copy [in] 150 /// @param query_length length of the sequence above [in] 151 /// @throws CBlastException if does not have enough memory 152 /// @return copy of query guarded by protein sentinel bytes 153 static unsigned char* 154 x_GuardProteinQuery(const unsigned char* query, 155 unsigned int query_length); 156 157 /// Initialiazes the core BlastQueryInfo structure for a single protein 158 /// sequence. 159 /// @todo this should be moved to the core of BLAST 160 /// @param query_length length of the sequence above [in] 161 BlastQueryInfo* 162 x_InitializeQueryInfo(unsigned int query_length); 163 164 /// Initializes the BlastScoreBlk data member required to run the PSSM 165 /// engine. 166 /// @todo this should be moved to the core of BLAST 167 /// @param query sequence [in] 168 /// @param query_length length of the sequence above [in] 169 /// @param matrix_name name of the underlying scoring matrix to use [in] 170 /// @param gap_existence cost to open a gap 171 /// @param gap_extension cost to open a gap 172 /// @throws CBlastException if does not have enough memory or if there was 173 /// an error when setting up the return value 174 /// @todo add an overloaded version of this method which takes an already 175 /// constructed BlastScoreBlk* 176 void 177 x_InitializeScoreBlock(const unsigned char* query, 178 unsigned int query_length, 179 const char* matrix_name, 180 int gap_existence, 181 int gap_extension); 182 183 /// Private interface to retrieve query sequence from its data source 184 /// interface 185 unsigned char* x_GetQuery() const; 186 187 /// Private interface to retrieve query length from its data source 188 /// interface 189 unsigned int x_GetQueryLength() const; 190 191 /// Private interface to retrieve matrix name from its data source 192 /// interface 193 const char* x_GetMatrixName() const; 194 195 /// Private interface to retrieve gap existence cost from data source 196 int x_GetGapExistence() const; 197 198 /// Private interface to retrieve gap extension cost from data source 199 int x_GetGapExtension() const; 200 201 /// Using IPssmInputData as a delegate to provide input data in the form of 202 /// a multiple sequence alignment, creates a PSSM using the CORE C PSSM 203 /// engine API 204 CRef<objects::CPssmWithParameters> 205 x_CreatePssmFromMsa(); 206 207 /// Using IPssmInputFreqRatios as a delegate to provide the input PSSM's 208 /// frequency ratios, creates a PSSM using the CORE C PSSM engine API 209 CRef<objects::CPssmWithParameters> 210 x_CreatePssmFromFreqRatios(); 211 212 /// Using IPssmInputCdd as a delegate to provide data in the form of 213 /// multiple alignment of CDs, creates PSSM using the CORE C PSSM 214 /// engine API 215 CRef<objects::CPssmWithParameters> 216 x_CreatePssmFromCDD(); 217 218 /// Converts the PSIMatrix structure into a ASN.1 CPssmWithParameters object 219 /// @param pssm input PSIMatrix structure [in] 220 /// @param opts options to be used in the PSSM engine [in] 221 /// @param matrix_name name of the underlying scoring matrix used 222 /// @param diagnostics contains diagnostics data from PSSM creation process 223 /// to save into the return value [in] 224 /// @return CPssmWithParameters object with equivalent contents to 225 /// those in pssm argument 226 static CRef<objects::CPssmWithParameters> 227 x_PSIMatrix2Asn1(const PSIMatrix* pssm, 228 const char* matrix_name, 229 const PSIBlastOptions* opts = NULL, 230 const PSIDiagnosticsResponse* diagnostics = NULL); 231 232 /// Convert a PSSM return status into a string 233 /// @param error_code return value of a PSSM engine function as defined in 234 /// blast_psi_priv.h [in] 235 /// @return string containing a description of the error 236 static std::string 237 x_ErrorCodeToString(int error_code); 238 239 /// Default constructor available for derived test classes CPssmEngine()240 CPssmEngine() {} 241 /// Prohibit copy constructor 242 CPssmEngine(const CPssmEngine& rhs); 243 /// Prohibit assignment operator 244 CPssmEngine& operator=(const CPssmEngine& rhs); 245 246 /// unit test class 247 friend class ::CPssmCreateTestFixture; 248 }; 249 250 /// Auxiliary class to convert data encoded in the PSSM to CNcbiMatrix 251 class NCBI_XBLAST_EXPORT CScorematPssmConverter 252 { 253 public: 254 /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of 255 /// what is stored in the BlastScoreBlk) containing scores 256 /// @param pssm PSSM to extract data from [in] 257 /// @throws std::runtime_error if scores are not available 258 static CNcbiMatrix<int>* 259 GetScores(const objects::CPssmWithParameters& pssm); 260 261 /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of 262 /// what is stored in the BlastScoreBlk) containing frequency ratios 263 /// @param pssm PSSM to extract data from [in] 264 /// @throws std::runtime_error if frequency ratios are not available 265 static CNcbiMatrix<double>* 266 GetFreqRatios(const objects::CPssmWithParameters& pssm); 267 268 /// Returns the information content per position of the PSSM 269 /// @param pssm PSSM to extract data from [in] 270 /// @param retval vector containing the information content or an empty 271 /// vector if this data is not available [in|out] 272 static void 273 GetInformationContent(const objects::CPssmWithParameters& pssm, 274 vector<double>& retval); 275 276 /// Returns the relative gapless PSSM column weights to pseudocounts 277 /// for the provided PSSM 278 /// @param pssm PSSM to extract data from [in] 279 /// @param retval vector containing the gapless column weights or an empty 280 /// vector if this data is not available [in|out] 281 static void 282 GetGaplessColumnWeights(const objects::CPssmWithParameters& pssm, 283 vector<double>& retval); 284 285 /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of 286 /// what is stored in the BlastScoreBlk) containing the residue frequencies 287 /// per position of the PSSM 288 /// @param pssm PSSM to extract data from [in] 289 /// @return NULL if residue frequencies are not available 290 static CNcbiMatrix<int>* 291 GetResidueFrequencies(const objects::CPssmWithParameters& pssm); 292 293 /// Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of 294 /// what is stored in the BlastScoreBlk) containing the weighted residue 295 /// frequencies per position of the PSSM 296 /// @param pssm PSSM to extract data from [in] 297 /// @return NULL if weighted residue frequencies are not available 298 static CNcbiMatrix<double>* 299 GetWeightedResidueFrequencies(const objects::CPssmWithParameters& pssm); 300 301 /// Data used in sequence weights computation 302 /// @param pssm PSSM to extract data from [in] 303 /// @param retval vector containing the sigma values or an empty 304 /// vector if this data is not available [in|out] 305 static void 306 GetSigma(const objects::CPssmWithParameters& pssm, vector<double>& retval); 307 308 /// Length of the aligned regions per position of the query sequence 309 /// @param pssm PSSM to extract data from [in] 310 /// @param retval vector containing the interval sizes or an empty 311 /// vector if this data is not available [in|out] 312 static void 313 GetIntervalSizes(const objects::CPssmWithParameters& pssm, 314 vector<int>& retval); 315 316 /// Gets the number of matching sequences per position of the PSSM 317 /// @param pssm PSSM to extract data from [in] 318 /// @param retval vector containing the number of matching sequences or an 319 /// empty vector if this data is not available [in|out] 320 static void 321 GetNumMatchingSeqs(const objects::CPssmWithParameters& pssm, 322 vector<int>& retval); 323 }; 324 325 END_SCOPE(blast) 326 END_NCBI_SCOPE 327 328 /* @} */ 329 330 #endif /* ALGO_BLAST_API__PSSM_ENGINE__HPP */ 331