1 #ifndef ALGO_BLAST_API__MSA_PSSM_INPUT__HPP
2 #define ALGO_BLAST_API__MSA_PSSM_INPUT__HPP
3 
4 /*  $Id: msa_pssm_input.hpp 221725 2011-01-25 13:50:14Z camacho $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Christiam Camacho
30  *
31  */
32 
33 /** @file msa_pssm_input.hpp
34  * Defines a concrete strategy to obtain PSSM input data for PSI-BLAST from a
35  * multiple sequence alignment file.
36  */
37 
38 #include <corelib/ncbiobj.hpp>
39 #include <algo/blast/api/blast_aux.hpp>
40 #include <algo/blast/api/pssm_input.hpp>
41 #include <objects/seqset/Seq_entry.hpp>
42 
43 /** @addtogroup AlgoBlast
44  *
45  * @{
46  */
47 
48 BEGIN_NCBI_SCOPE
49 BEGIN_SCOPE(blast)
50 
51 /// This class is a concrete strategy for IPssmInputData which converts the
52 /// CLUSTALW-style output containing a multiple sequence alignment into the data
53 /// structures needed by the PSSM engine.
54 class NCBI_XBLAST_EXPORT CPsiBlastInputClustalW: public IPssmInputData
55 {
56 public:
57     /// Construct a concrete strategy, used to configure the CPssmEngine object
58     /// @param input_file Input file containing the multiple sequence
59     /// alignment. [in]
60     /// @param opts options to be used in the PSSM engine
61     /// @param matrix_name name of the substitution matrix to use to build PSSM
62     /// If not provided, the default implementation of
63     /// IPssmInputData::GetMatrixName() will be returned
64     /// @param diags diagnostics data requests for the PSSM engine
65     /// @param query query sequence for the alignment in ncbistdaa encoding.
66     /// @param query_length length of the sequence above.
67     /// @param gap_existence cost to open a gap, if zero default from IPssmInputData used.
68     /// @param gap_extension cost to open a gap, if zero default from IPssmInputData used.
69     /// @param msa_master_idx 0-based index of the multiple sequence alignment
70     /// This is an alternative way to specify the query sequence to use (i.e.:
71     /// don't use query and query_length if this is provided) [in]
72     CPsiBlastInputClustalW(CNcbiIstream& input_file,
73                            const PSIBlastOptions& opts,
74                            const char* matrix_name = NULL,
75                            const PSIDiagnosticsRequest* diags = NULL,
76                            const unsigned char* query = NULL,
77                            unsigned int query_length = 0,
78                            int gap_existence = 0,
79                            int gap_opening = 0,
80                            unsigned int msa_master_idx = 0);
81 
82     /// virtual destructor
83     virtual ~CPsiBlastInputClustalW();
84 
85     /// The work to process the alignment is done here
86     void Process();
87 
88     /// Get the query sequence used as master for the multiple sequence
89     /// alignment in ncbistdaa encoding.
GetQuery()90     unsigned char* GetQuery() { return m_Query.get(); }
91 
92     /// Get the query's length
GetQueryLength()93     unsigned int GetQueryLength() { return m_MsaDimensions.query_length; }
94 
95     /// Obtain the multiple sequence alignment structure
GetData()96     PSIMsa* GetData() { return m_Msa; }
97 
98     /// Obtain the options for the PSSM engine
GetOptions()99     const PSIBlastOptions* GetOptions() {
100         return &m_Opts;
101     }
102 
103     /// Obtain the name of the underlying matrix to use when building the PSSM
GetMatrixName()104     const char* GetMatrixName() {
105         return m_MatrixName.empty()
106             ? IPssmInputData::GetMatrixName()
107             : m_MatrixName.c_str();
108     }
109 
110    /// Obtain the gap existence value to use when building the PSSM
GetGapExistence()111     int GetGapExistence() {
112          return m_GapExistence
113          ? m_GapExistence
114          : IPssmInputData::GetGapExistence();
115     }
116 
117     /// Obtain the gap extension value to use when building the PSSM
GetGapExtension()118     int GetGapExtension() {
119          return m_GapExtension
120          ? m_GapExtension
121          : IPssmInputData::GetGapExtension();
122     }
123 
124     /// Obtain the diagnostics data that is requested from the PSSM engine
GetDiagnosticsRequest()125     const PSIDiagnosticsRequest* GetDiagnosticsRequest() {
126         return m_DiagnosticsRequest;
127     }
128 
129     /// @inheritDoc
GetQueryForPssm()130     CRef<objects::CBioseq> GetQueryForPssm() {
131         return m_QueryBioseq;
132     }
133 
134 private:
135 
136     /// Pointer to query sequence
137     TAutoUint1ArrayPtr              m_Query;
138     /// The raw multiple sequence alignment in ASCII read from the input file
139     vector<string>                  m_AsciiMsa;
140     /// Structure representing the multiple sequence alignment
141     PSIMsa*                         m_Msa;
142     /// Multiple sequence alignment dimensions
143     PSIMsaDimensions                m_MsaDimensions;
144     /// Algorithm options
145     PSIBlastOptions                 m_Opts;
146     /// Diagnostics request structure
147     PSIDiagnosticsRequest*          m_DiagnosticsRequest;
148     /// Underlying matrix to use
149     string                          m_MatrixName;
150     /// Gap existence parameter used.
151     int                             m_GapExistence;
152     /// Gap extension parameter used.
153     int                             m_GapExtension;
154     /// CSeq_entry obtained from the multiple sequence alignment
155     CRef<objects::CSeq_entry>       m_SeqEntry;
156     /// Query as CBioseq for PSSM
157     CRef<objects::CBioseq>          m_QueryBioseq;
158 
159     /////////////////////////// Auxiliary functions ///////////////////////////
160 
161     /// Reads the multiple sequence alignment from the input file
162     /// @param input_file Input file containing the multiple sequence
163     /// alignment. [in]
164     /// @post m_AsciiMsa and m_SeqEntry are not empty
165     void x_ReadAsciiMsa(CNcbiIstream& input_file);
166 
167     /// Extracts the query sequence from the multiple sequence alignment,
168     /// assuming it's the first one, into m_Query
169     /// @post m_Query is not NULL and m_MsaDimensions.query_length is assigned
170     void x_ExtractQueryFromMsa(unsigned int msa_master_idx = 0);
171 
172     /// Searches the query sequence (m_Query) in the aligned sequences
173     /// (m_AsciiMsa) and moves the first instance it finds to the front of this
174     /// data structure.
175     /// @throw CBlastException if the query sequence is not found.
176     void x_ValidateQueryInMsa();
177 
178     /// Copies query sequence data to multiple alignment data structure
179     void x_CopyQueryToMsa();
180 
181     /// Populates the multiple alignment data structure
182     void x_ExtractAlignmentData();
183 
184     /// Extracts the query bioseq from m_SeqEntry
185     void x_ExtractQueryForPssm();
186 private:
187     /// prohibit copy constructor
188     CPsiBlastInputClustalW(const CPsiBlastInputClustalW&);
189     /// prohibit assignment operator
190     CPsiBlastInputClustalW& operator=(const CPsiBlastInputClustalW&);
191 };
192 
193 END_SCOPE(blast)
194 END_NCBI_SCOPE
195 
196 /* @} */
197 
#endif  /* ALGO_BLAST_API__MSA_PSSM_INPUT__HPP */
199