1 /*
2 Copyright 2009 Andreas Biegert
3
4 This file is part of the CS-BLAST package.
5
6 The CS-BLAST package is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 The CS-BLAST package is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef CS_ALIGNMENT_H_
21 #define CS_ALIGNMENT_H_
22
23 #include <valarray>
24 #include <vector>
25
26 #include "sequence.h"
27 #include "globals.h"
28 #include "matrix.h"
29 #include "vector.h"
30
31 namespace cs {
32
33 // Forward declarations
34 template<class Abc>
35 class Alignment;
36
37 // Convince the compiler that operator<< is a template friend.
38 template<class Abc>
39 std::ostream& operator<< (std::ostream& out, const Alignment<Abc>& ali);
40
41 // Supported alignment formats for in- and output.
42 enum AlignmentFormat {
43 FASTA_ALIGNMENT = 0,
44 A2M_ALIGNMENT = 1,
45 A3M_ALIGNMENT = 2,
46 CLUSTAL_ALIGNMENT = 3,
47 PSI_ALIGNMENT = 4
48 };
49
50 // A container class for multiple sequence alignments.
51 template<class Abc>
52 class Alignment {
53 private:
54 // Row major matrix with sequences in integer representation.
55 Matrix<uint8_t> seqs_;
56 // Array with indices of all columns [0,1,2,...,num_cols-1].
57 std::valarray<size_t> col_idx_;
58 // Array with indices of match columns.
59 std::valarray<size_t> match_idx_;
60 // Array mask indicating match and insert columns.
61 std::valarray<bool> is_match_;
62 // Headers of sequences in the alignment.
63 std::vector<std::string> headers_;
64 // Name of the alignment as given by comment line in FASTA file
65 std::string name_;
66
67 // Buffer size for reading
68 static const size_t kBufferSize = MB;
69
70 // Initializes alignment with given headers and sequences.
71 void Init(const std::vector<std::string>& headers, const std::vector<std::string>& seqs);
72
73 // Resize the sequence matrix and header vector to given dimensions.
74 void Resize(size_t num_seqs, size_t num_cols);
75
76 // Fills match_idx__ with the indices of all match columns.
77 void SetMatchIndices();
78
79 // Removes sequences with headers indicating non-protein sequences (secondary structure predictions)
80 void FilterSequencesByHeaders(std::vector<std::string>& headers, std::vector<std::string>& seqs);
81
82 // Reads an alignment in FASTA format.
83 void ReadFasta(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
84
85 // Reads an alignment in A2M format from given stream.
86 void ReadA2M(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
87
88 // Reads an alignment in A3M format from given stream.
89 void ReadA3M(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
90
91 // Helper method that reads a FASTA, A2M, or A3M formatted alignment.
92 void ReadFastaFlavors(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
93
94 // Reads an alignment in PSI format.
95 void ReadPsi(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs);
96
97 // Writes the alignment in FASTA, A2M, or A3M format to output stream.
98 void WriteFastaFlavors(FILE* fout, AlignmentFormat format, size_t width = 100) const;
99
100 // Writes the alignment in CLUSTAL or PSI format to output stream.
101 void WriteClustalFlavors(FILE* fout, AlignmentFormat format, size_t width = 100) const;
102
103 public:
104 // Constructs alignment from multi FASTA formatted alignment read from input
105 // stream.
106 Alignment(FILE* fin, AlignmentFormat format);
107
108 // Constructs an all-gaps alignment with 'ncols' columns and 'nseqs' sequences
109 Alignment(size_t ncols, size_t nseqs);
110
111 // Constructs an alignment from a single sequence.
112 Alignment(const Sequence<Abc>& seq);
113
114 // All memeber are automatically destructed
~Alignment()115 ~Alignment() {}
116
117 // Accessors for integer representation of character in MATCH column i of
118 // sequence k.
119 uint8_t* operator[](size_t i) { return seqs_[match_idx_[i]]; }
120 const uint8_t* operator[](size_t i) const { return seqs_[match_idx_[i]]; }
match(size_t i,size_t k)121 uint8_t& match(size_t i, size_t k) { return seqs_[match_idx_[i]][k]; }
match(size_t i,size_t k)122 const uint8_t& match(size_t i, size_t k) const { return seqs_[match_idx_[i]][k]; }
123
124 // Accessors for integer representation of the character at position i
125 // (NOT match column i) of sequence k.
operator()126 uint8_t& operator() (size_t k, size_t i) { return seqs_[i][k]; }
operator()127 const uint8_t& operator() (size_t k, size_t i) const { return seqs_[i][k]; }
seq(size_t k,size_t i)128 uint8_t& seq(size_t k, size_t i) { return seqs_[i][k]; }
seq(size_t k,size_t i)129 const uint8_t& seq(size_t k, size_t i) const { return seqs_[i][k]; }
130
131 // Returns the character at position i (NOT match column i) of sequence k.
chr(size_t k,size_t i)132 char chr(size_t k, size_t i) const { return Abc::kIntToChar[seqs_[i][k]]; }
133
134 // Returns index of all-alignment column corresponding to match column i.
col_idx(size_t i)135 size_t col_idx(size_t i) const { return match_idx_[i]; }
136
137 // Returns the number of sequences in the alignment.
nseqs()138 size_t nseqs() const { return seqs_.ncols(); }
139
140 // Returns the total number of alignment columns.
ncols()141 size_t ncols() const { return seqs_.nrows(); }
142
143 // Returns the number of match columns.
nmatch()144 size_t nmatch() const { return match_idx_.size(); }
145
146 // Returns the number of insert columns.
ninsert()147 int ninsert() const { return ncols() - nmatch(); }
148
149 // Returns the header of sequence k.
header(size_t k)150 std::string header(size_t k) const { return headers_[k]; }
151
152 // Sets the header of sequence k.
set_header(size_t k,const std::string & header)153 void set_header(size_t k, const std::string& header) { headers_[k] = header; }
154
155 // Returns the name of the alignment
name()156 std::string name() const { return name_; }
157
158 // Sets the name of the alignment
159 // void set_name(const std::string& header) { name_ = name; }
160
161 // Makes all columns with a residue in sequence k match columns.
162 void AssignMatchColumnsBySequence(size_t k = 0);
163
164 // Makes all columns with less than X% gaps match columns.
165 void AssignMatchColumnsByGapRule(double gap_threshold = 50);
166
167 // Initializes object with an alignment in FASTA format read from given
168 // stream.
169 void Read(FILE* fin, AlignmentFormat format);
170
171 // Writes the alignment in given format to ouput stream.
172 void Write(FILE* fout, AlignmentFormat format, size_t width = 100) const;
173
174 // Returns true if column i is a match column.
is_match(size_t i)175 bool is_match(size_t i) const { return is_match_[i]; }
176
177 // Removes all insert columns from the alignment.
178 void RemoveInsertColumns();
179
180 // Merges the provided alignment with this alignment considering only sequences
181 // that are not already included in this alignment. Warning: Inserts in current
182 // alignment are lost!
183 void Merge(const Alignment<Abc>& ali);
184
185 // Returns alignment sequence k as Sequence object without gaps.
186 Sequence<Abc> GetSequence(size_t k) const;
187
188 // Rearranges the aligned sequences such that they appear in the same order as
189 // their unaligned counterparts in the given sequence vector.
190 void Rearrange(const std::vector<Sequence<Abc> >& seqs);
191
192 // Prints the Alignment in A2M format for debugging.
193 friend std::ostream& operator<< <> (std::ostream& out, const Alignment<Abc>& ali);
194
195 }; // Alignment
196
197
198 // Returns the alignment format corresponding to provided filename extension
199 inline AlignmentFormat AlignmentFormatFromString(const std::string& s);
200
201 // Reads all available alignments from the input stream and puts them into vector.
202 template<class Abc>
203 static void ReadAll(FILE* fin, AlignmentFormat format, std::vector< Alignment<Abc> >& v);
204
205 // Calculates global sequence weights by maximum entropy weighting
206 // (Henikoff&Henikoff '94).
207 template<class Abc>
208 double GlobalWeightsAndDiversity(const Alignment<Abc>& ali, Vector<double>& wg, bool neff_sum_pairs = false);
209
210 // Calculates position-dependent sequence weights and number of effective
211 // sequences on subalignments.
212 template<class Abc>
213 Vector<double> PositionSpecificWeightsAndDiversity(const Alignment<Abc>& ali, Matrix<double>& w);
214
215 // Converts a character to uppercase and '.' to '-'.
to_match_chr(char c)216 inline char to_match_chr(char c) {
217 return isalpha(c) ? toupper(c) : (c == '.' ? '-' : c);
218 }
219
220 // Converts a character to lowercase and '-' to '.'.
to_insert_chr(char c)221 inline char to_insert_chr(char c) {
222 return isalpha(c) ? tolower(c) : (c == '-' ? '.' : c);
223 }
224
225 // Predicate indicating if character belongs to match column.
match_chr(char c)226 inline bool match_chr(char c) {
227 return (isalpha(c) && isupper(c)) || c == '-';
228 }
229
230 // Predicate indicating if character belongs to insert column.
insert_chr(char c)231 inline char insert_chr(char c) {
232 return (isalpha(c) && islower(c)) || c == '.';
233 }
234
235 } // namespace cs
236
237 #endif // CS_ALIGNMENT_H_
238