1 #ifndef ALGO_COBALT___SEQ__HPP 2 #define ALGO_COBALT___SEQ__HPP 3 4 /* $Id: seq.hpp 330412 2011-08-11 18:56:23Z boratyng $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's offical duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * ===========================================================================*/ 28 29 /***************************************************************************** 30 31 File name: seq.hpp 32 33 Author: Jason Papadopoulos 34 35 Contents: Interface for CSequence class 36 37 ******************************************************************************/ 38 39 /// @file seq.hpp 40 /// Interface for CSequence class 41 42 #include <util/math/matrix.hpp> 43 #include <objects/seqalign/Seq_align.hpp> 44 #include <algo/blast/api/sseqloc.hpp> 45 #include <algo/align/nw/nw_aligner.hpp> 46 #include <algo/cobalt/base.hpp> 47 #include <algo/cobalt/exception.hpp> 48 49 BEGIN_NCBI_SCOPE 50 BEGIN_SCOPE(cobalt) 51 52 /// Class for representing protein sequences 53 class NCBI_COBALT_EXPORT CSequence 54 { 55 56 public: 57 /// The ncbistdaa code for a gap 58 static const unsigned char kGapChar = 0; 59 60 /// Represents the sequence as position-specific 61 /// amino acid frequencies. Matrix is of dimension 62 /// (sequence length) x kAlphabetSize 63 typedef CNcbiMatrix<double> TFreqMatrix; 64 65 /// Default constructor: build an empty sequence 66 /// CSequence()67 CSequence() {} 68 69 /// Build a sequence 70 /// @param seq The input sequence. 71 /// 72 CSequence(const objects::CSeq_loc& seq, objects::CScope& scope); 73 74 /// Replace the sequence represented by a CSequence object 75 /// @param seq The new sequence [in] 76 /// 77 void Reset(const objects::CSeq_loc& seq, objects::CScope& scope); 78 79 /// Replace the sequence with sequence of gaps of given length 80 /// @param length Number of gaps [in] 81 /// 82 void Reset(int length); 83 84 85 /// Access the list of position frequencies associated 86 /// with a sequence 87 /// @return The frequency matrix 88 /// GetFreqs()89 TFreqMatrix& GetFreqs() { return m_Freqs; } 90 GetFreqs() const91 const TFreqMatrix& GetFreqs() const { return m_Freqs; } 92 93 /// Access the raw sequence data, in ncbistdaa format 94 /// @return Pointer to array of sequence data 95 /// GetSequence()96 unsigned char* GetSequence() { return &m_Sequence[0]; } 97 98 /// Get the raw sequence data in ncbistdaa format 99 /// @return Pointer to array of sequence data 100 /// GetSequence() const101 const unsigned char* GetSequence() const { return &m_Sequence[0]; } 102 103 /// Access the sequence letter at a specified position 104 /// @param pos Position to access [in] 105 /// @return The sequence letter 106 /// GetLetter(int pos) const107 unsigned char GetLetter(int pos) const { return m_Sequence[pos]; } 108 109 /// Change letter in a given position to a given one 110 /// @param pos Position in the sequence [in] 111 /// @param letter Letter [in] 112 /// SetLetter(int pos,unsigned char letter)113 void SetLetter(int pos, unsigned char letter) { m_Sequence[pos] = letter; } 114 115 /// Access the sequence letter at a specified position, 116 /// and return an ASCII representation of that letter 117 /// @param pos Position to access [in] 118 /// @return The ASCII sequence letter 119 /// 120 unsigned char GetPrintableLetter(int pos) const; 121 122 /// Get the length of the current sequence 123 /// @return Sequence length 124 /// GetLength() const125 int GetLength() const { return m_Sequence.size(); } 126 127 /// Given an edit script, insert gaps into a sequence. The 128 /// profile of the sequence is adjusted automatically, and 129 /// new gaps are assigned a column of all zero profile frequencies 130 /// @param transcript The edit script to apply [in] 131 /// @param gap_choice Which edit script character, eTS_Delete or 132 /// eTS_Insert, will cause a gap to be inserted [in] 133 /// 134 void PropagateGaps(const CNWAligner::TTranscript& transcript, 135 CNWAligner::ETranscriptSymbol gap_choice); 136 137 /// Insert gaps into a sequence. Gap is inserted before each given 138 /// location. The profile of the sequence is adjusted automatically, 139 /// and new gaps are assigned a column of all zero profile frequencies. 140 /// @param gap_locations Locations of single gaps 141 /// @param consider_gaps If false location n denotes before n-th letter, 142 /// otherwise simple position in a string, each location considers gaps 143 /// with smaller locations already added 144 /// 145 void InsertGaps(const vector<Uint4>& gap_locations, 146 bool consider_gaps = false); 147 148 149 /// Given a collection of sequences, remove all sequence positions 150 /// where a subset of the sequences all contain a gap 151 /// @param seq The list of sequences [in] 152 /// @param index_list List of elements of 'seq' that will be 153 /// compressed. The other elements of 'seq' will 154 /// not be changed [in] 155 /// 156 static void CompressSequences(vector<CSequence>& seq, 157 vector<int> index_list); 158 159 /// Create a vector of CSequence objects that represents the alignment in 160 /// given Seq_align 161 /// @param seq_align Alignment in ASN.1 format [in] 162 /// @param scope Scope [in] 163 /// @param msa Alignment as strings of residues and gaps [out] 164 /// 165 static void CreateMsa(const objects::CSeq_align& seq_align, 166 objects::CScope& scope, 167 vector<CSequence>& msa); 168 169 private: 170 vector<unsigned char> m_Sequence; ///< The sequence (ncbistdaa format) 171 TFreqMatrix m_Freqs; ///< Position-specific frequency 172 /// profile corresponding to sequence 173 }; 174 175 END_SCOPE(cobalt) 176 END_NCBI_SCOPE 177 178 #endif // ALGO_COBALT___SEQ__HPP 179