1 #ifndef ALGO_BLAST_API__PSI_PSSM_INPUT__HPP 2 #define ALGO_BLAST_API__PSI_PSSM_INPUT__HPP 3 4 /* $Id: psi_pssm_input.hpp 341202 2011-10-18 12:21:49Z fongah2 $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Christiam Camacho 30 * 31 */ 32 33 /** @file psi_pssm_input.hpp 34 * Defines a concrete strategy to obtain PSSM input data for PSI-BLAST. 35 */ 36 37 #include <corelib/ncbiobj.hpp> 38 #include <algo/blast/api/blast_aux.hpp> 39 #include <algo/blast/api/pssm_input.hpp> 40 #include <objmgr/scope.hpp> 41 42 /// Forward declaration for unit test classes 43 class CPssmCreateTestFixture; 44 45 /** @addtogroup AlgoBlast 46 * 47 * @{ 48 */ 49 50 BEGIN_NCBI_SCOPE 51 52 // Forward declarations in objects scope 53 BEGIN_SCOPE(objects) 54 #ifndef SKIP_DOXYGEN_PROCESSING 55 class CSeq_align_set; 56 class CDense_seg; 57 #endif /* SKIP_DOXYGEN_PROCESSING */ 58 END_SCOPE(objects) 59 60 BEGIN_SCOPE(blast) 61 62 /// Implements the interface to retrieve data for the last 2 stages of the PSSM 63 /// creation. Note that none of the data is owned by this class, it simply 64 /// returns pointers to the data it is passed 65 class NCBI_XBLAST_EXPORT CPsiBlastInputFreqRatios : public IPssmInputFreqRatios 66 { 67 public: 68 /** 69 * @brief Constructor 70 * 71 * @param query pointer to query sequence data [in] 72 * @param query_length length of the query sequence data [in] 73 * @param freq_ratios matrix of frequency ratios [in] 74 * @param matrix_name name of underlying scoring matrix used [in] 75 * @param gap_existence cost to open a gap, if zero default from IPssmInputData used. 76 * @param gap_extension cost to open a gap, if zero default from IPssmInputData used. 77 */ CPsiBlastInputFreqRatios(const unsigned char * query,unsigned int query_length,const CNcbiMatrix<double> & freq_ratios,const char * matrix_name=NULL,int gap_existence=0,int gap_extension=0,double impala_scale_factor=0)78 CPsiBlastInputFreqRatios(const unsigned char* query, 79 unsigned int query_length, 80 const CNcbiMatrix<double>& freq_ratios, 81 const char* matrix_name = NULL, 82 int gap_existence = 0, 83 int gap_extension = 0, 84 double impala_scale_factor = 0) 85 : m_Query(const_cast<unsigned char*>(query)), 86 m_QueryLength(query_length), 87 m_MatrixName(matrix_name), 88 m_GapExistence(gap_existence), 89 m_GapExtension(gap_extension), 90 m_FreqRatios(freq_ratios), 91 m_ImpalaScaleFactor(impala_scale_factor) 92 {} 93 94 /// No-op as we assume the data is passed in to the constructor Process()95 void Process() {} 96 97 /// @inheritDoc GetQuery()98 unsigned char* GetQuery() { return m_Query; } 99 100 /// Get the query's length GetQueryLength()101 unsigned int GetQueryLength() { return m_QueryLength; } 102 103 /// Obtain the name of the underlying matrix to use when building the PSSM GetMatrixName()104 const char* GetMatrixName() { 105 return m_MatrixName ? m_MatrixName : 106 IPssmInputFreqRatios::GetMatrixName(); 107 } 108 109 /// Obtain the gap existence value to use when building the PSSM GetGapExistence()110 int GetGapExistence() { 111 return m_GapExistence 112 ? m_GapExistence 113 : IPssmInputFreqRatios::GetGapExistence(); 114 } 115 116 /// Obtain the gap extension value to use when building the PSSM GetGapExtension()117 int GetGapExtension() { 118 return m_GapExtension 119 ? m_GapExtension 120 : IPssmInputFreqRatios::GetGapExtension(); 121 } 122 123 /// Obtain the IMPALA Scale Factor value to use when building the PSSM GetImpalaScaleFactor()124 double GetImpalaScaleFactor() { 125 return m_ImpalaScaleFactor 126 ? m_ImpalaScaleFactor 127 : IPssmInputFreqRatios::GetImpalaScaleFactor(); 128 } 129 130 /// Obtain a matrix of frequency ratios with this->GetQueryLength() columns 131 /// and BLASTAA_SIZE rows GetData()132 const CNcbiMatrix<double>& GetData() { 133 return m_FreqRatios; 134 } 135 136 private: 137 /// Query sequence data 138 unsigned char* m_Query; 139 /// Length of query sequence data 140 unsigned int m_QueryLength; 141 /// Name of underlying scoring matrix 142 const char* m_MatrixName; 143 /// Gap existence penalty 144 int m_GapExistence; 145 /// Gap extension penalty 146 int m_GapExtension; 147 /// Frequency ratios 148 CNcbiMatrix<double> m_FreqRatios; 149 150 // IMPALA Scale Factor 151 double m_ImpalaScaleFactor; 152 }; 153 154 /// This class is a concrete strategy for IPssmInputData, and it 155 /// implements the traditional PSI-BLAST algorithm for building a multiple 156 /// sequence alignment from a list of pairwise alignments using the C++ object 157 /// manager. 158 class NCBI_XBLAST_EXPORT CPsiBlastInputData : public IPssmInputData 159 { 160 public: 161 /// Construct a concrete strategy, used to configure the CPssmEngine object 162 /// @param query query sequence for the alignment in ncbistdaa encoding. 163 /// @param query_length length of the sequence above. 164 /// @param sset pairwise alignment produced by BLAST where query was the 165 /// query sequence. 166 /// @param scope object manager scope from which to retrieve sequence data 167 /// [in] 168 /// @param opts options to be used in the PSSM engine 169 /// @param matrix_name name of the substitution matrix to use to build PSSM 170 /// If not provided, the default implementation of 171 /// IPssmInputData::GetMatrixName() will be returned 172 /// @param gap_existence cost to open a gap, if zero default from IPssmInputData used. 173 /// @param gap_extension cost to open a gap, if zero default from IPssmInputData used. 174 /// @param diags diagnostics data requests for the PSSM engine 175 CPsiBlastInputData(const unsigned char* query, 176 unsigned int query_length, 177 CConstRef<objects::CSeq_align_set> sset, 178 CRef<objects::CScope> scope, 179 const PSIBlastOptions& opts, 180 const char* matrix_name = NULL, 181 int gap_existence = 0, 182 int gap_opening = 0, 183 const PSIDiagnosticsRequest* diags = NULL, 184 const string& query_title = ""); 185 186 /// virtual destructor 187 virtual ~CPsiBlastInputData(); 188 189 /// The work to process the alignment is done here 190 void Process(); 191 192 /// Get the query sequence used as master for the multiple sequence 193 /// alignment in ncbistdaa encoding. 194 unsigned char* GetQuery(); 195 196 /// Get the query's length 197 unsigned int GetQueryLength(); 198 199 /// Obtain the multiple sequence alignment structure 200 PSIMsa* GetData(); 201 202 /// Obtain the options for the PSSM engine 203 const PSIBlastOptions* GetOptions(); 204 205 /// Obtain the name of the underlying matrix to use when building the PSSM 206 const char* GetMatrixName(); 207 208 /// Obtain the gap existence value to use when building the PSSM GetGapExistence()209 int GetGapExistence() { 210 return m_GapExistence 211 ? m_GapExistence 212 : IPssmInputData::GetGapExistence(); 213 } 214 215 /// Obtain the gap extension value to use when building the PSSM GetGapExtension()216 int GetGapExtension() { 217 return m_GapExtension 218 ? m_GapExtension 219 : IPssmInputData::GetGapExtension(); 220 } 221 222 /// Obtain the diagnostics data that is requested from the PSSM engine 223 const PSIDiagnosticsRequest* GetDiagnosticsRequest(); 224 225 /// @inheritDoc GetQueryForPssm()226 CRef<objects::CBioseq> GetQueryForPssm() { 227 return m_QueryBioseq; 228 } 229 230 private: 231 232 /// Pointer to query sequence 233 unsigned char* m_Query; 234 /// Title of query 235 string m_QueryTitle; 236 /// Scope where to retrieve the sequences in the aligment from 237 CRef<objects::CScope> m_Scope; 238 /// Structure representing the multiple sequence alignment 239 PSIMsa* m_Msa; 240 /// Multiple sequence alignment dimensions 241 PSIMsaDimensions m_MsaDimensions; 242 /// Pairwise alignment result of a BLAST search 243 CConstRef<objects::CSeq_align_set> m_SeqAlignSet; 244 /// Algorithm options 245 PSIBlastOptions m_Opts; 246 /// Diagnostics request structure 247 PSIDiagnosticsRequest* m_DiagnosticsRequest; 248 /// Underlying matrix to use 249 string m_MatrixName; 250 /// Gap existence paramter used. 251 int m_GapExistence; 252 /// Gap extension paramter used. 253 int m_GapExtension; 254 /// Query as CBioseq for PSSM 255 CRef<objects::CBioseq> m_QueryBioseq; 256 257 /////////////////////////// Auxiliary functions /////////////////////////// 258 259 /// Tries to fetch the sequence data for the subject for the segments 260 /// specified in the Dense-seg. If the sequence cannot be retrieved from the 261 /// scope, a warning is printed and an empty string is returned in 262 /// sequence_data 263 /// @param ds dense seg for which the sequence data is needed [in] 264 /// @param scope scope from which to obtain the sequence data [in] 265 /// @param sequence_data string which will contain the sequence data. 266 static void 267 x_GetSubjectSequence(const objects::CDense_seg& ds, objects::CScope& scope, 268 string& sequence_data); 269 270 /// Examines the sequence alignment and keeps track of those hits which 271 /// have an HSP with an e-value below the inclusion threshold specified in 272 /// the PSIBlastOptions structure. 273 /// @return number of hits which qualify for constructing the multiple 274 /// sequence alignment structure 275 unsigned int 276 x_CountAndSelectQualifyingAlignments(); 277 278 /// Populates the multiple alignment data structure 279 void x_ExtractAlignmentData(); 280 // First implementation of use_best_align option from old toolkit. Should 281 // be implemented as a subclass of this one? 282 //void x_ExtractAlignmentDataUseBestAlign(); 283 284 /// Copies query sequence data to multiple alignment data structure 285 void x_CopyQueryToMsa(); 286 287 /// Returns the number of sequences that make up the multiple sequence 288 /// alignment 289 /// @throws CBlastException if this number hasn't been calculated yet (need 290 /// to invoke Process() first!) 291 unsigned int GetNumAlignedSequences() const; 292 293 /// Iterates over the Dense-seg passed in and extracts alignment 294 /// information to multiple alignment data structure. 295 /// @param denseg source alignment segment (HSP) [in] 296 /// @param msa_index index of the sequence aligned with the query in the 297 /// desc_matrix field of the m_AlignmentData data member [in] 298 /// @param evalue evalue for this sequence aligned with the query (used for 299 /// debugging only) [in] 300 /// @param bit_score bit score for this sequence aligned with the query 301 /// (used for debugging only) [in] 302 void x_ProcessDenseg(const objects::CDense_seg& denseg, 303 unsigned int msa_index, 304 double evalue, double bit_score); 305 306 /// Extracts the query bioseq from m_SeqAlignSet 307 void x_ExtractQueryForPssm(); 308 309 /// unit test class 310 friend class ::CPssmCreateTestFixture; 311 312 private: 313 /// prohibit copy constructor 314 CPsiBlastInputData(const CPsiBlastInputData&); 315 /// prohibit assignment operator 316 CPsiBlastInputData& operator=(const CPsiBlastInputData&); 317 }; 318 319 END_SCOPE(blast) 320 END_NCBI_SCOPE 321 322 /* @} */ 323 324 #endif /* ALGO_BLAST_API__PSI_PSSM_INPUT_HPP */ 325