1 /*
2 Copyright 2009-2012 Andreas Biegert, Christof Angermueller
3
4 This file is part of the CS-BLAST package.
5
6 The CS-BLAST package is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 The CS-BLAST package is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef CS_COUNT_PROFILE_INL_H_
21 #define CS_COUNT_PROFILE_INL_H_
22
23 #include "count_profile.h"
24
25 #include "alignment-inl.h"
26 #include "profile-inl.h"
27 #include "sequence-inl.h"
28
29 namespace cs {
30
31 template<class Abc>
CountProfile(const Alignment<Abc> & ali,bool pos_weights,bool neff_sum_pairs)32 CountProfile<Abc>::CountProfile(const Alignment<Abc>& ali, bool pos_weights, bool neff_sum_pairs)
33 : counts(ali.nmatch(), 0.0f),
34 neff(ali.nmatch()) {
35 // Add counts and neff from alignment to count profile
36 if (pos_weights) { // use position-specific sequence weights
37 Matrix<double> w;
38 Vector<double> neff_tmp(PositionSpecificWeightsAndDiversity(ali, w));
39 assert(neff.size());
40
41 for (size_t i = 0; i < counts.length(); ++i) {
42 neff[i] = neff_tmp[i];
43 for (size_t k = 0; k < ali.nseqs(); ++k)
44 if (ali[i][k] < Abc::kAny)
45 counts[i][ali[i][k]] += w[i][k];
46 }
47 } else { // use faster global sequence weights
48 Vector<double> wg;
49 double neff_glob = GlobalWeightsAndDiversity(ali, wg, neff_sum_pairs);
50 for (size_t i = 0; i < counts.length(); ++i) {
51 neff[i] = neff_glob;
52 for (size_t k = 0; k < ali.nseqs(); ++k)
53 if (ali[i][k] < Abc::kAny)
54 counts[i][ali[i][k]] += wg[k];
55 }
56 }
57 // Normalize counts to effective number of sequences
58 Normalize(counts, neff);
59 }
60
61 template<class Abc>
Read(FILE * fin)62 void CountProfile<Abc>::Read(FILE* fin) {
63 // Parse and check header information
64 if (!StreamStartsWith(fin, "CountProfile"))
65 throw Exception("Stream does not start with class id 'CountProfile'!");
66
67 char buffer[KB];
68 cs::fgetline(buffer, KB, fin);
69 if (strstr(buffer, "NAME")) {
70 name = ReadString(buffer, "NAME", "Unable to parse count profile 'NAME'!");
71 cs::fgetline(buffer, KB, fin);
72 }
73 size_t len = ReadInt(buffer, "LENG", "Unable to parse count profile 'LENG'!");
74 cs::fgetline(buffer, KB, fin);
75 size_t nalph = ReadInt(buffer, "ALPH", "Unable to parse count profile 'ALPH'!");
76 if (nalph != Abc::kSize)
77 throw Exception("Alphabet size of serialized count profile should be %d "
78 "but is actually %d!", Abc::kSize, nalph);
79
80 // If everything went fine we can resize our data memmbers
81 counts.Resize(len);
82 neff.Resize(len);
83
84 // Read counts and effective number of sequences for each column
85 size_t i = 0;
86 const char* ptr = buffer;
87 cs::fgetline(buffer, KB, fin); // skip alphabet description line
88 while (cs::fgetline(buffer, KB, fin) && buffer[0] != '/' && buffer[1] != '/') {
89 ptr = buffer;
90 i = strtoi(ptr) - 1;
91 assert(i < len);
92 // TODO: include ANY char in seialization
93 for (size_t a = 0; a < Abc::kSize; ++a)
94 // TODO: save counts as log base e instead of log base 2
95 counts[i][a] = pow(2, static_cast<double>(-strastoi(ptr)) / kScale);
96 counts[i][Abc::kAny] = 0.0f;
97 neff[i] = static_cast<double>(strtoi(ptr)) / kScale;
98 }
99 Normalize(counts, neff); // normalize probs to counts
100 if (i != len - 1)
101 throw Exception("Count profile should have %i columns but actually has %i!",
102 len, i+1);
103 }
104
105 template<class Abc>
Write(FILE * fout)106 void CountProfile<Abc>::Write(FILE* fout) const {
107 // Print header section
108 fputs("CountProfile\n", fout);
109 if (!name.empty()) fprintf(fout, "NAME\t%s\n", name.c_str());
110 fprintf(fout, "LENG\t%zu\n", counts.length());
111 fprintf(fout, "ALPH\t%zu\n", Abc::kSize);
112
113 // Print alphabet description line
114 fputs("COUNTS", fout);
115 for (size_t a = 0; a < Abc::kSize; ++a)
116 fprintf(fout, "\t%c", Abc::kIntToChar[a]);
117 fputs("\tNEFF\n", fout);
118
119 // Print counts matrix and neff vector as negative logs scaled by 'kScale'
120 for (size_t i = 0; i < counts.length(); ++i) {
121 fprintf(fout, "%zu", i+1);
122 // TODO: include ANY char in seialization
123 for (size_t a = 0; a < Abc::kSize; ++a) {
124 // TODO: save counts as log base e instead of log base 2
125 if (counts[i][a] == 0.0) fputs("\t*", fout);
126 else fprintf(fout, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale));
127 }
128 fprintf(fout, "\t%d\n", iround(neff[i] * kScale));
129 }
130 fputs("//\n", fout);
131 }
132
133
134 template<class Abc>
Write(std::stringstream & ss)135 void CountProfile<Abc>::Write(std::stringstream& ss) const {
136 // Print header section
137 char buffer [4000];
138 sprintf(buffer, "CountProfile\n");
139 ss << buffer;
140 if (!name.empty()) {
141 sprintf(buffer, "NAME\t%s\n", name.c_str());
142 ss << buffer;
143 }
144
145 sprintf(buffer, "LENG\t%zu\n", counts.length());
146 ss << buffer;
147 sprintf(buffer, "ALPH\t%zu\n", Abc::kSize);
148 ss << buffer;
149
150 // Print alphabet description line
151 sprintf(buffer, "COUNTS");
152 ss << buffer;
153 for (size_t a = 0; a < Abc::kSize; ++a) {
154 sprintf(buffer, "\t%c", Abc::kIntToChar[a]);
155 ss << buffer;
156 }
157 sprintf(buffer, "\tNEFF\n");
158 ss << buffer;
159
160 // Print counts matrix and neff vector as negative logs scaled by 'kScale'
161 for (size_t i = 0; i < counts.length(); ++i) {
162 sprintf(buffer, "%zu", i+1);
163 ss << buffer;
164 // TODO: include ANY char in seialization
165 for (size_t a = 0; a < Abc::kSize; ++a) {
166 // TODO: save counts as log base e instead of log base 2
167 if (counts[i][a] == 0.0) {
168 sprintf(buffer, "\t*");
169 ss << buffer;
170 }
171 else {
172 sprintf(buffer, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale));
173 ss << buffer;
174 }
175 }
176 sprintf(buffer, "\t%d\n", iround(neff[i] * kScale));
177 ss << buffer;
178 }
179 sprintf(buffer, "//\n");
180 ss << buffer;
181 }
182
183
184
185 // Returns the average Neff in given count profile.
186 template<class Abc>
Neff(const CountProfile<Abc> & cp)187 inline double Neff(const CountProfile<Abc>& cp) {
188 double neff_sum = 0.0;
189 for (size_t i = 0; i < cp.neff.size(); ++i)
190 neff_sum += cp.neff[i];
191 return neff_sum / cp.neff.size();
192 }
193
194 // Builds and returns a consensus string of the given count profile by
195 // calculating at each position the alphabet character that deviates most strongly
196 // from its background probability.
197 template<class Abc>
ConsensusSequence(const CountProfile<Abc> & cp,const SubstitutionMatrix<Abc> & sm)198 std::string ConsensusSequence(const CountProfile<Abc>& cp,
199 const SubstitutionMatrix<Abc>& sm) {
200 Profile<Abc> prof(cp.counts);
201 Normalize(prof, 1.0);
202 std::string cons(prof.length(), ' ');
203
204 for (size_t i = 0; i < prof.length(); ++i) {
205 double maxw = 0.0;
206 size_t maxa = 0;
207 for (size_t a = 0; a < Abc::kSize; ++a) {
208 if (prof[i][a] - sm.p(a) > maxw) {
209 maxw = prof[i][a] - sm.p(a);
210 maxa = a;
211 }
212 }
213 cons[i] = Abc::kIntToChar[maxa];
214 }
215 return cons;
216 }
217
218 // Builds and returns a conservation string for given count profile that
219 // indicates conservation of residues by uppercase, lowercase, and '~'
220 template<class Abc>
ConservationSequence(const CountProfile<Abc> & cp,const SubstitutionMatrix<Abc> & sm)221 std::string ConservationSequence(const CountProfile<Abc>& cp,
222 const SubstitutionMatrix<Abc>& sm) {
223 static const char kMixedColChar = '~';
224
225 // Precompute similarity matrix for amino acid pairs
226 Matrix<double> sim(Abc::kSize, Abc::kSize);
227 for (size_t a = 0; a < Abc::kSize; ++a)
228 for (size_t b = 0; b < Abc::kSize; ++b)
229 sim[a][b] = sm.q(a,b) * sm.q(a,b) / sm.q(a,a) / sm.q(b,b);
230
231 // Normalize count profile
232 Profile<Abc> prof(cp.counts);
233 Normalize(prof, 1.0);
234 std::string cons(prof.length(), ' ');
235
236 // Now compute conservation string
237 for (size_t i = 0; i < prof.length(); ++i) {
238 double maxw = 0.0;
239 size_t maxa = 0;
240 for (size_t a = 0; a < Abc::kSize; ++a) {
241 if (prof[i][a] - sm.p(a) > maxw) {
242 maxw = prof[i][a] - sm.p(a);
243 maxa = a;
244 }
245 }
246
247 maxw = 0.0;
248 for (size_t b = 0; b < Abc::kSize; ++b)
249 maxw += prof[i][b] * sim[maxa][b] * sim[maxa][b];
250
251 if (maxw > 0.6)
252 cons[i] = toupper(Abc::kIntToChar[maxa]);
253 else if (maxw > 0.4)
254 cons[i] = tolower(Abc::kIntToChar[maxa]);
255 else
256 cons[i] = kMixedColChar;
257 }
258
259 return cons;
260 }
261
262 // Prints counts and neff in human-readable format for debugging.
263 template<class Abc>
264 std::ostream& operator<< (std::ostream& out, const CountProfile<Abc>& cp) {
265 out << "CountProfile" << std::endl;
266 out << "name:\t" << cp.name << std::endl;
267 for (size_t a = 0; a < Abc::kSizeAny; ++a)
268 out << "\t" << Abc::kIntToChar[a];
269 out << "\tNeff" << std::endl;
270 for (size_t i = 0; i < cp.counts.length(); ++i) {
271 out << i+1;
272 for (size_t a = 0; a < Abc::kSizeAny; ++a)
273 out << strprintf("\t%6.4f", cp.counts[i][a]);
274 out << strprintf("\t%-5.2f\n", cp.neff[i]);
275 }
276 return out;
277 }
278
279 } // namespace cs
280
281 #endif // CS_COUNT_PROFILE_INL_H_
282