1 #ifndef ALGO_COBALT___SEQ__HPP
2 #define ALGO_COBALT___SEQ__HPP
3 
4 /* $Id: seq.hpp 330412 2011-08-11 18:56:23Z boratyng $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's offical duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: seq.hpp
32 
33 Author: Jason Papadopoulos
34 
35 Contents: Interface for CSequence class
36 
37 ******************************************************************************/
38 
39 /// @file seq.hpp
40 /// Interface for CSequence class
41 
42 #include <util/math/matrix.hpp>
43 #include <objects/seqalign/Seq_align.hpp>
44 #include <algo/blast/api/sseqloc.hpp>
45 #include <algo/align/nw/nw_aligner.hpp>
46 #include <algo/cobalt/base.hpp>
47 #include <algo/cobalt/exception.hpp>
48 
49 BEGIN_NCBI_SCOPE
50 BEGIN_SCOPE(cobalt)
51 
52 /// Class for representing protein sequences
53 class NCBI_COBALT_EXPORT CSequence
54 {
55 
56 public:
57     /// The ncbistdaa code for a gap
58     static const unsigned char kGapChar = 0;
59 
60     /// Represents the sequence as position-specific
61     /// amino acid frequencies. Matrix is of dimension
62     /// (sequence length) x kAlphabetSize
63     typedef CNcbiMatrix<double> TFreqMatrix;
64 
65     /// Default constructor: build an empty sequence
66     ///
CSequence()67     CSequence() {}
68 
69     /// Build a sequence
70     /// @param seq The input sequence.
71     ///
72     CSequence(const objects::CSeq_loc& seq, objects::CScope& scope);
73 
74     /// Replace the sequence represented by a CSequence object
75     /// @param seq The new sequence [in]
76     ///
77     void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
78 
79     /// Replace the sequence with sequence of gaps of given length
80     /// @param length Number of gaps [in]
81     ///
82     void Reset(int length);
83 
84 
85     /// Access the list of position frequencies associated
86     /// with a sequence
87     /// @return The frequency matrix
88     ///
GetFreqs()89     TFreqMatrix& GetFreqs() { return m_Freqs; }
90 
GetFreqs() const91     const TFreqMatrix& GetFreqs() const { return m_Freqs; }
92 
93     /// Access the raw sequence data, in ncbistdaa format
94     /// @return Pointer to array of sequence data
95     ///
GetSequence()96     unsigned char* GetSequence() { return &m_Sequence[0]; }
97 
98     /// Get the raw sequence data in ncbistdaa format
99     /// @return Pointer to array of sequence data
100     ///
GetSequence() const101     const unsigned char* GetSequence() const { return &m_Sequence[0]; }
102 
103     /// Access the sequence letter at a specified position
104     /// @param pos Position to access [in]
105     /// @return The sequence letter
106     ///
GetLetter(int pos) const107     unsigned char GetLetter(int pos) const { return m_Sequence[pos]; }
108 
109     /// Change letter in a given position to a given one
110     /// @param pos Position in the sequence [in]
111     /// @param letter Letter [in]
112     ///
SetLetter(int pos,unsigned char letter)113     void SetLetter(int pos, unsigned char letter) { m_Sequence[pos] = letter; }
114 
115     /// Access the sequence letter at a specified position,
116     /// and return an ASCII representation of that letter
117     /// @param pos Position to access [in]
118     /// @return The ASCII sequence letter
119     ///
120     unsigned char GetPrintableLetter(int pos) const;
121 
122     /// Get the length of the current sequence
123     /// @return Sequence length
124     ///
GetLength() const125     int GetLength() const { return m_Sequence.size(); }
126 
127     /// Given an edit script, insert gaps into a sequence. The
128     /// profile of the sequence is adjusted automatically, and
129     /// new gaps are assigned a column of all zero profile frequencies
130     /// @param transcript The edit script to apply [in]
131     /// @param gap_choice Which edit script character, eTS_Delete or
132     ///                   eTS_Insert, will cause a gap to be inserted [in]
133     ///
134     void PropagateGaps(const CNWAligner::TTranscript& transcript,
135                        CNWAligner::ETranscriptSymbol gap_choice);
136 
137     /// Insert gaps into a sequence. Gap is inserted before each given
138     /// location. The profile of the sequence is adjusted automatically,
139     /// and new gaps are assigned a column of all zero profile frequencies.
140     /// @param gap_locations Locations of single gaps
141     /// @param consider_gaps If false location n denotes before n-th letter,
142     /// otherwise simple position in a string, each location considers gaps
143     /// with smaller locations already added
144     ///
145     void InsertGaps(const vector<Uint4>& gap_locations,
146                     bool consider_gaps = false);
147 
148 
149     /// Given a collection of sequences, remove all sequence positions
150     /// where a subset of the sequences all contain a gap
151     /// @param seq The list of sequences [in]
152     /// @param index_list List of elements of 'seq' that will be
153     ///                   compressed. The other elements of 'seq' will
154     ///                   not be changed [in]
155     ///
156     static void CompressSequences(vector<CSequence>& seq,
157                                   vector<int> index_list);
158 
159     /// Create a vector of CSequence objects that represents the alignment in
160     /// given Seq_align
161     /// @param seq_align Alignment in ASN.1 format [in]
162     /// @param scope Scope [in]
163     /// @param msa Alignment as strings of residues and gaps [out]
164     ///
165     static void CreateMsa(const objects::CSeq_align& seq_align,
166                           objects::CScope& scope,
167                           vector<CSequence>& msa);
168 
169 private:
170     vector<unsigned char> m_Sequence;  ///< The sequence (ncbistdaa format)
171     TFreqMatrix m_Freqs;               ///< Position-specific frequency
172                                        ///  profile corresponding to sequence
173 };
174 
175 END_SCOPE(cobalt)
176 END_NCBI_SCOPE
177 
178 #endif // ALGO_COBALT___SEQ__HPP
179