1 /*
2   Copyright 2009 Andreas Biegert
3 
4   This file is part of the CS-BLAST package.
5 
6   The CS-BLAST package is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation, either version 3 of the License, or
9   (at your option) any later version.
10 
11   The CS-BLAST package is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15 
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #ifndef CS_ALIGNMENT_H_
21 #define CS_ALIGNMENT_H_
22 
23 #include <valarray>
24 #include <vector>
25 
26 #include "sequence.h"
27 #include "globals.h"
28 #include "matrix.h"
29 #include "vector.h"
30 
31 namespace cs {
32 
33 // Forward declarations
34 template<class Abc>
35 class Alignment;
36 
37 // Convince the compiler that operator<< is a template friend.
38 template<class Abc>
39 std::ostream& operator<< (std::ostream& out, const Alignment<Abc>& ali);
40 
41 // Supported alignment formats for in- and output.
42 enum AlignmentFormat {
43     FASTA_ALIGNMENT    = 0,
44     A2M_ALIGNMENT      = 1,
45     A3M_ALIGNMENT      = 2,
46     CLUSTAL_ALIGNMENT  = 3,
47     PSI_ALIGNMENT      = 4
48 };
49 
50 // A container class for multiple sequence alignments.
51 template<class Abc>
52 class Alignment {
53 	private:
54 	  // Row major matrix with sequences in integer representation.
55 	  Matrix<uint8_t> seqs_;
56 	  // Array with indices of all columns [0,1,2,...,num_cols-1].
57 	  std::valarray<size_t> col_idx_;
58 	  // Array with indices of match columns.
59 	  std::valarray<size_t> match_idx_;
60 	  // Array mask indicating match and insert columns.
61 	  std::valarray<bool> is_match_;
62 	  // Headers of sequences in the alignment.
63 	  std::vector<std::string> headers_;
64 	  // Name of the alignment as given by comment line in FASTA file
65 	  std::string name_;
66 
67 	  // Buffer size for reading
68 	  static const size_t kBufferSize = MB;
69 
70 	  // Initializes alignment with given headers and sequences.
71 	  void Init(const std::vector<std::string>& headers, const std::vector<std::string>& seqs);
72 
73 	  // Resize the sequence matrix and header vector to given dimensions.
74 	  void Resize(size_t num_seqs, size_t num_cols);
75 
76 	  // Fills match_idx__ with the indices of all match columns.
77 	  void SetMatchIndices();
78 
79     // Removes sequences with headers indicating non-protein sequences (secondary structure predictions)
80     void FilterSequencesByHeaders(std::vector<std::string>& headers, std::vector<std::string>& seqs);
81 
82 	  // Reads an alignment in FASTA format.
83 	  void ReadFasta(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
84 
85 	  // Reads an alignment in A2M format from given stream.
86 	  void ReadA2M(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
87 
88 	  // Reads an alignment in A3M format from given stream.
89 	  void ReadA3M(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
90 
91 	  // Helper method that reads a FASTA, A2M, or A3M formatted alignment.
92 	  void ReadFastaFlavors(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
93 
94 	  // Reads an alignment in PSI format.
95 	  void ReadPsi(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
96 
97 	  // Writes the alignment in FASTA, A2M, or A3M format to output stream.
98 	  void WriteFastaFlavors(FILE* fout, AlignmentFormat format, size_t width = 100) const;
99 
100 	  // Writes the alignment in CLUSTAL or PSI format to output stream.
101 	  void WriteClustalFlavors(FILE* fout, AlignmentFormat format, size_t width = 100) const;
102 
103   public:
104     // Constructs alignment from multi FASTA formatted alignment read from input
105     // stream.
106     Alignment(FILE* fin, AlignmentFormat format);
107 
108     // Constructs an all-gaps alignment with 'ncols' columns and 'nseqs' sequences
109     Alignment(size_t ncols, size_t nseqs);
110 
111     // Constructs an alignment from a single sequence.
112     Alignment(const Sequence<Abc>& seq);
113 
114     // All memeber are automatically destructed
~Alignment()115     ~Alignment() {}
116 
117     // Accessors for integer representation of character in MATCH column i of
118     // sequence k.
119     uint8_t* operator[](size_t i) { return seqs_[match_idx_[i]]; }
120     const uint8_t* operator[](size_t i) const { return seqs_[match_idx_[i]]; }
match(size_t i,size_t k)121     uint8_t& match(size_t i, size_t k) { return seqs_[match_idx_[i]][k]; }
match(size_t i,size_t k)122     const uint8_t& match(size_t i, size_t k) const { return seqs_[match_idx_[i]][k]; }
123 
124     // Accessors for integer representation of the character at position i
125     // (NOT match column i) of sequence k.
operator()126     uint8_t& operator() (size_t k, size_t i) { return seqs_[i][k]; }
operator()127     const uint8_t& operator() (size_t k, size_t i) const { return seqs_[i][k]; }
seq(size_t k,size_t i)128     uint8_t& seq(size_t k, size_t i) { return seqs_[i][k]; }
seq(size_t k,size_t i)129     const uint8_t& seq(size_t k, size_t i) const { return seqs_[i][k]; }
130 
131     // Returns the character at position i (NOT match column i) of sequence k.
chr(size_t k,size_t i)132     char chr(size_t k, size_t i) const { return Abc::kIntToChar[seqs_[i][k]]; }
133 
134     // Returns index of all-alignment column corresponding to match column i.
col_idx(size_t i)135     size_t col_idx(size_t i) const { return match_idx_[i]; }
136 
137     // Returns the number of sequences in the alignment.
nseqs()138     size_t nseqs() const { return seqs_.ncols(); }
139 
140     // Returns the total number of alignment columns.
ncols()141     size_t ncols() const { return seqs_.nrows(); }
142 
143     // Returns the number of match columns.
nmatch()144     size_t nmatch() const { return match_idx_.size(); }
145 
146     // Returns the number of insert columns.
ninsert()147     int ninsert() const { return ncols() - nmatch(); }
148 
149     // Returns the header of sequence k.
header(size_t k)150     std::string header(size_t k) const { return headers_[k]; }
151 
152     // Sets the header of sequence k.
set_header(size_t k,const std::string & header)153     void set_header(size_t k, const std::string& header) { headers_[k] = header; }
154 
155     // Returns the name of the alignment
name()156     std::string name() const { return name_; }
157 
158     // Sets the name of the alignment
159 //    void set_name(const std::string& header) { name_ = name; }
160 
161     // Makes all columns with a residue in sequence k match columns.
162     void AssignMatchColumnsBySequence(size_t k = 0);
163 
164     // Makes all columns with less than X% gaps match columns.
165     void AssignMatchColumnsByGapRule(double gap_threshold = 50);
166 
167     // Initializes object with an alignment in FASTA format read from given
168     // stream.
169     void Read(FILE* fin, AlignmentFormat format);
170 
171     // Writes the alignment in given format to ouput stream.
172     void Write(FILE* fout, AlignmentFormat format, size_t width = 100) const;
173 
174     // Returns true if column i is a match column.
is_match(size_t i)175     bool is_match(size_t i) const { return is_match_[i]; }
176 
177     // Removes all insert columns from the alignment.
178     void RemoveInsertColumns();
179 
180     // Merges the provided alignment with this alignment considering only sequences
181     // that are not already included in this alignment. Warning: Inserts in current
182     // alignment are lost!
183     void Merge(const Alignment<Abc>& ali);
184 
185     // Returns alignment sequence k as Sequence object without gaps.
186     Sequence<Abc> GetSequence(size_t k) const;
187 
188     // Rearranges the aligned sequences such that they appear in the same order as
189     // their unaligned counterparts in the given sequence vector.
190     void Rearrange(const std::vector<Sequence<Abc> >& seqs);
191 
192     // Prints the Alignment in A2M format for debugging.
193     friend std::ostream& operator<< <> (std::ostream& out, const Alignment<Abc>& ali);
194 
195 };  // Alignment
196 
197 
198 // Returns the alignment format corresponding to provided filename extension
199 inline AlignmentFormat AlignmentFormatFromString(const std::string& s);
200 
201 // Reads all available alignments from the input stream and puts them into vector.
202 template<class Abc>
203 static void ReadAll(FILE* fin, AlignmentFormat format, std::vector< Alignment<Abc> >& v);
204 
205 // Calculates global sequence weights by maximum entropy weighting
206 // (Henikoff&Henikoff '94).
207 template<class Abc>
208 double GlobalWeightsAndDiversity(const Alignment<Abc>& ali, Vector<double>& wg, bool neff_sum_pairs = false);
209 
210 // Calculates position-dependent sequence weights and number of effective
211 // sequences on subalignments.
212 template<class Abc>
213 Vector<double> PositionSpecificWeightsAndDiversity(const Alignment<Abc>& ali, Matrix<double>& w);
214 
215 // Converts a character to uppercase and '.' to '-'.
to_match_chr(char c)216 inline char to_match_chr(char c) {
217     return isalpha(c) ? toupper(c) : (c == '.' ? '-' : c);
218 }
219 
220 // Converts a character to lowercase and '-' to '.'.
to_insert_chr(char c)221 inline char to_insert_chr(char c) {
222     return isalpha(c) ? tolower(c) : (c == '-' ? '.' : c);
223 }
224 
225 // Predicate indicating if character belongs to match column.
match_chr(char c)226 inline bool match_chr(char c) {
227     return (isalpha(c) && isupper(c)) || c == '-';
228 }
229 
230 // Predicate indicating if character belongs to insert column.
insert_chr(char c)231 inline char insert_chr(char c) {
232     return (isalpha(c) && islower(c)) || c == '.';
233 }
234 
235 }  // namespace cs
236 
237 #endif  // CS_ALIGNMENT_H_
238