1 #ifndef ALGO_BLAST_API__MSA_PSSM_INPUT__HPP 2 #define ALGO_BLAST_API__MSA_PSSM_INPUT__HPP 3 4 /* $Id: msa_pssm_input.hpp 221725 2011-01-25 13:50:14Z camacho $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Christiam Camacho 30 * 31 */ 32 33 /** @file msa_pssm_input.hpp 34 * Defines a concrete strategy to obtain PSSM input data for PSI-BLAST from a 35 * multiple sequence alignment file. 36 */ 37 38 #include <corelib/ncbiobj.hpp> 39 #include <algo/blast/api/blast_aux.hpp> 40 #include <algo/blast/api/pssm_input.hpp> 41 #include <objects/seqset/Seq_entry.hpp> 42 43 /** @addtogroup AlgoBlast 44 * 45 * @{ 46 */ 47 48 BEGIN_NCBI_SCOPE 49 BEGIN_SCOPE(blast) 50 51 /// This class is a concrete strategy for IPssmInputData which converts the 52 /// CLUSTALW-style output containing a multiple sequence alignment into the data 53 /// structures needed by the PSSM engine. 54 class NCBI_XBLAST_EXPORT CPsiBlastInputClustalW: public IPssmInputData 55 { 56 public: 57 /// Construct a concrete strategy, used to configure the CPssmEngine object 58 /// @param input_file Input file containing the multiple sequence 59 /// alignment. [in] 60 /// @param opts options to be used in the PSSM engine 61 /// @param matrix_name name of the substitution matrix to use to build PSSM 62 /// If not provided, the default implementation of 63 /// IPssmInputData::GetMatrixName() will be returned 64 /// @param diags diagnostics data requests for the PSSM engine 65 /// @param query query sequence for the alignment in ncbistdaa encoding. 66 /// @param query_length length of the sequence above. 67 /// @param gap_existence cost to open a gap, if zero default from IPssmInputData used. 68 /// @param gap_extension cost to open a gap, if zero default from IPssmInputData used. 69 /// @param msa_master_idx 0-based index of the multiple sequence alignment 70 /// This is an alternative way to specify the query sequence to use (i.e.: 71 /// don't use query and query_length if this is provided) [in] 72 CPsiBlastInputClustalW(CNcbiIstream& input_file, 73 const PSIBlastOptions& opts, 74 const char* matrix_name = NULL, 75 const PSIDiagnosticsRequest* diags = NULL, 76 const unsigned char* query = NULL, 77 unsigned int query_length = 0, 78 int gap_existence = 0, 79 int gap_opening = 0, 80 unsigned int msa_master_idx = 0); 81 82 /// virtual destructor 83 virtual ~CPsiBlastInputClustalW(); 84 85 /// The work to process the alignment is done here 86 void Process(); 87 88 /// Get the query sequence used as master for the multiple sequence 89 /// alignment in ncbistdaa encoding. GetQuery()90 unsigned char* GetQuery() { return m_Query.get(); } 91 92 /// Get the query's length GetQueryLength()93 unsigned int GetQueryLength() { return m_MsaDimensions.query_length; } 94 95 /// Obtain the multiple sequence alignment structure GetData()96 PSIMsa* GetData() { return m_Msa; } 97 98 /// Obtain the options for the PSSM engine GetOptions()99 const PSIBlastOptions* GetOptions() { 100 return &m_Opts; 101 } 102 103 /// Obtain the name of the underlying matrix to use when building the PSSM GetMatrixName()104 const char* GetMatrixName() { 105 return m_MatrixName.empty() 106 ? IPssmInputData::GetMatrixName() 107 : m_MatrixName.c_str(); 108 } 109 110 /// Obtain the gap existence value to use when building the PSSM GetGapExistence()111 int GetGapExistence() { 112 return m_GapExistence 113 ? m_GapExistence 114 : IPssmInputData::GetGapExistence(); 115 } 116 117 /// Obtain the gap extension value to use when building the PSSM GetGapExtension()118 int GetGapExtension() { 119 return m_GapExtension 120 ? m_GapExtension 121 : IPssmInputData::GetGapExtension(); 122 } 123 124 /// Obtain the diagnostics data that is requested from the PSSM engine GetDiagnosticsRequest()125 const PSIDiagnosticsRequest* GetDiagnosticsRequest() { 126 return m_DiagnosticsRequest; 127 } 128 129 /// @inheritDoc GetQueryForPssm()130 CRef<objects::CBioseq> GetQueryForPssm() { 131 return m_QueryBioseq; 132 } 133 134 private: 135 136 /// Pointer to query sequence 137 TAutoUint1ArrayPtr m_Query; 138 /// The raw multiple sequence alignment in ASCII read from the input file 139 vector<string> m_AsciiMsa; 140 /// Structure representing the multiple sequence alignment 141 PSIMsa* m_Msa; 142 /// Multiple sequence alignment dimensions 143 PSIMsaDimensions m_MsaDimensions; 144 /// Algorithm options 145 PSIBlastOptions m_Opts; 146 /// Diagnostics request structure 147 PSIDiagnosticsRequest* m_DiagnosticsRequest; 148 /// Underlying matrix to use 149 string m_MatrixName; 150 /// Gap existence parameter used. 151 int m_GapExistence; 152 /// Gap extension parameter used. 153 int m_GapExtension; 154 /// CSeq_entry obtained from the multiple sequence alignment 155 CRef<objects::CSeq_entry> m_SeqEntry; 156 /// Query as CBioseq for PSSM 157 CRef<objects::CBioseq> m_QueryBioseq; 158 159 /////////////////////////// Auxiliary functions /////////////////////////// 160 161 /// Reads the multiple sequence alignment from the input file 162 /// @param input_file Input file containing the multiple sequence 163 /// alignment. [in] 164 /// @post m_AsciiMsa and m_SeqEntry are not empty 165 void x_ReadAsciiMsa(CNcbiIstream& input_file); 166 167 /// Extracts the query sequence from the multiple sequence alignment, 168 /// assuming it's the first one, into m_Query 169 /// @post m_Query is not NULL and m_MsaDimensions.query_length is assigned 170 void x_ExtractQueryFromMsa(unsigned int msa_master_idx = 0); 171 172 /// Searches the query sequence (m_Query) in the aligned sequences 173 /// (m_AsciiMsa) and moves the first instance it finds to the front of this 174 /// data structure. 175 /// @throw CBlastException if the query sequence is not found. 176 void x_ValidateQueryInMsa(); 177 178 /// Copies query sequence data to multiple alignment data structure 179 void x_CopyQueryToMsa(); 180 181 /// Populates the multiple alignment data structure 182 void x_ExtractAlignmentData(); 183 184 /// Extracts the query bioseq from m_SeqEntry 185 void x_ExtractQueryForPssm(); 186 private: 187 /// prohibit copy constructor 188 CPsiBlastInputClustalW(const CPsiBlastInputClustalW&); 189 /// prohibit assignment operator 190 CPsiBlastInputClustalW& operator=(const CPsiBlastInputClustalW&); 191 }; 192 193 END_SCOPE(blast) 194 END_NCBI_SCOPE 195 196 /* @} */ 197 198 #endif /* ALGO_BLAST_API__MSA_PSSM_INPUT_HPP */ 199