1 /*
2   Copyright 2009-2012 Andreas Biegert, Christof Angermueller
3 
4   This file is part of the CS-BLAST package.
5 
6   The CS-BLAST package is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation, either version 3 of the License, or
9   (at your option) any later version.
10 
11   The CS-BLAST package is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15 
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #ifndef CS_COUNT_PROFILE_INL_H_
21 #define CS_COUNT_PROFILE_INL_H_
22 
23 #include "count_profile.h"
24 
25 #include "alignment-inl.h"
26 #include "profile-inl.h"
27 #include "sequence-inl.h"
28 
29 namespace cs {
30 
31 template<class Abc>
CountProfile(const Alignment<Abc> & ali,bool pos_weights,bool neff_sum_pairs)32 CountProfile<Abc>::CountProfile(const Alignment<Abc>& ali, bool pos_weights, bool neff_sum_pairs)
33   : counts(ali.nmatch(), 0.0f),
34     neff(ali.nmatch()) {
35   // Add counts and neff from alignment to count profile
36   if (pos_weights) {  // use position-specific sequence weights
37     Matrix<double> w;
38     Vector<double> neff_tmp(PositionSpecificWeightsAndDiversity(ali, w));
39     assert(neff.size());
40 
41     for (size_t i = 0; i < counts.length(); ++i) {
42       neff[i] = neff_tmp[i];
43       for (size_t k = 0; k < ali.nseqs(); ++k)
44         if (ali[i][k] < Abc::kAny)
45           counts[i][ali[i][k]] += w[i][k];
46     }
47   } else {  // use faster global sequence weights
48     Vector<double> wg;
49     double neff_glob = GlobalWeightsAndDiversity(ali, wg, neff_sum_pairs);
50     for (size_t i = 0; i < counts.length(); ++i) {
51       neff[i] = neff_glob;
52       for (size_t k = 0; k < ali.nseqs(); ++k)
53         if (ali[i][k] < Abc::kAny)
54           counts[i][ali[i][k]] += wg[k];
55     }
56   }
57   // Normalize counts to effective number of sequences
58   Normalize(counts, neff);
59 }
60 
61 template<class Abc>
Read(FILE * fin)62 void CountProfile<Abc>::Read(FILE* fin) {
63   // Parse and check header information
64   if (!StreamStartsWith(fin, "CountProfile"))
65       throw Exception("Stream does not start with class id 'CountProfile'!");
66 
67   char buffer[KB];
68   cs::fgetline(buffer, KB, fin);
69   if (strstr(buffer, "NAME")) {
70     name = ReadString(buffer, "NAME", "Unable to parse count profile 'NAME'!");
71     cs::fgetline(buffer, KB, fin);
72   }
73   size_t len = ReadInt(buffer, "LENG", "Unable to parse count profile 'LENG'!");
74   cs::fgetline(buffer, KB, fin);
75   size_t nalph = ReadInt(buffer, "ALPH", "Unable to parse count profile 'ALPH'!");
76   if (nalph != Abc::kSize)
77     throw Exception("Alphabet size of serialized count profile should be %d "
78                     "but is actually %d!", Abc::kSize, nalph);
79 
80   // If everything went fine we can resize our data memmbers
81   counts.Resize(len);
82   neff.Resize(len);
83 
84   // Read counts and effective number of sequences for each column
85   size_t i = 0;
86   const char* ptr = buffer;
87   cs::fgetline(buffer, KB, fin);  // skip alphabet description line
88   while (cs::fgetline(buffer, KB, fin) && buffer[0] != '/' && buffer[1] != '/') {
89     ptr = buffer;
90     i = strtoi(ptr) - 1;
91     assert(i < len);
92     // TODO: include ANY char in seialization
93     for (size_t a = 0; a < Abc::kSize; ++a)
94       // TODO: save counts as log base e instead of log base 2
95       counts[i][a] = pow(2, static_cast<double>(-strastoi(ptr)) / kScale);
96     counts[i][Abc::kAny] = 0.0f;
97     neff[i] = static_cast<double>(strtoi(ptr)) / kScale;
98   }
99   Normalize(counts, neff);  // normalize probs to counts
100   if (i != len - 1)
101     throw Exception("Count profile should have %i columns but actually has %i!",
102                     len, i+1);
103 }
104 
105 template<class Abc>
Write(FILE * fout)106 void CountProfile<Abc>::Write(FILE* fout) const {
107   // Print header section
108   fputs("CountProfile\n", fout);
109   if (!name.empty()) fprintf(fout, "NAME\t%s\n", name.c_str());
110   fprintf(fout, "LENG\t%zu\n", counts.length());
111   fprintf(fout, "ALPH\t%zu\n", Abc::kSize);
112 
113   // Print alphabet description line
114   fputs("COUNTS", fout);
115   for (size_t a = 0; a < Abc::kSize; ++a)
116     fprintf(fout, "\t%c", Abc::kIntToChar[a]);
117   fputs("\tNEFF\n", fout);
118 
119   // Print counts matrix and neff vector as negative logs scaled by 'kScale'
120   for (size_t i = 0; i < counts.length(); ++i) {
121     fprintf(fout, "%zu", i+1);
122     // TODO: include ANY char in seialization
123     for (size_t a = 0; a < Abc::kSize; ++a) {
124       // TODO: save counts as log base e instead of log base 2
125       if (counts[i][a] == 0.0) fputs("\t*", fout);
126       else fprintf(fout, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale));
127     }
128     fprintf(fout, "\t%d\n", iround(neff[i] * kScale));
129   }
130   fputs("//\n", fout);
131 }
132 
133 
134 template<class Abc>
Write(std::stringstream & ss)135 void CountProfile<Abc>::Write(std::stringstream& ss) const {
136   // Print header section
137   char buffer [4000];
138   sprintf(buffer, "CountProfile\n");
139   ss << buffer;
140   if (!name.empty()) {
141     sprintf(buffer, "NAME\t%s\n", name.c_str());
142     ss << buffer;
143   }
144 
145   sprintf(buffer, "LENG\t%zu\n", counts.length());
146   ss << buffer;
147   sprintf(buffer, "ALPH\t%zu\n", Abc::kSize);
148   ss << buffer;
149 
150   // Print alphabet description line
151   sprintf(buffer, "COUNTS");
152   ss << buffer;
153   for (size_t a = 0; a < Abc::kSize; ++a) {
154     sprintf(buffer, "\t%c", Abc::kIntToChar[a]);
155     ss << buffer;
156   }
157   sprintf(buffer, "\tNEFF\n");
158   ss << buffer;
159 
160   // Print counts matrix and neff vector as negative logs scaled by 'kScale'
161   for (size_t i = 0; i < counts.length(); ++i) {
162     sprintf(buffer, "%zu", i+1);
163     ss << buffer;
164     // TODO: include ANY char in seialization
165     for (size_t a = 0; a < Abc::kSize; ++a) {
166       // TODO: save counts as log base e instead of log base 2
167       if (counts[i][a] == 0.0) {
168         sprintf(buffer, "\t*");
169         ss << buffer;
170       }
171       else {
172         sprintf(buffer, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale));
173         ss << buffer;
174       }
175     }
176     sprintf(buffer, "\t%d\n", iround(neff[i] * kScale));
177     ss << buffer;
178   }
179   sprintf(buffer, "//\n");
180   ss << buffer;
181 }
182 
183 
184 
185 // Returns the average Neff in given count profile.
186 template<class Abc>
Neff(const CountProfile<Abc> & cp)187 inline double Neff(const CountProfile<Abc>& cp) {
188   double neff_sum = 0.0;
189   for (size_t i = 0; i < cp.neff.size(); ++i)
190     neff_sum += cp.neff[i];
191   return neff_sum / cp.neff.size();
192 }
193 
194 // Builds and returns a consensus string of the given count profile by
195 // calculating at each position the alphabet character that deviates most strongly
196 // from its background probability.
197 template<class Abc>
ConsensusSequence(const CountProfile<Abc> & cp,const SubstitutionMatrix<Abc> & sm)198 std::string ConsensusSequence(const CountProfile<Abc>& cp,
199                               const SubstitutionMatrix<Abc>& sm) {
200   Profile<Abc> prof(cp.counts);
201   Normalize(prof, 1.0);
202   std::string cons(prof.length(), ' ');
203 
204   for (size_t i = 0; i < prof.length(); ++i) {
205     double maxw = 0.0;
206     size_t maxa = 0;
207     for (size_t a = 0; a < Abc::kSize; ++a) {
208       if (prof[i][a] - sm.p(a) > maxw) {
209         maxw = prof[i][a] - sm.p(a);
210         maxa = a;
211       }
212     }
213     cons[i] = Abc::kIntToChar[maxa];
214   }
215   return cons;
216 }
217 
218 // Builds and returns a conservation string for given count profile that
219 // indicates conservation of residues by uppercase, lowercase, and '~'
220 template<class Abc>
ConservationSequence(const CountProfile<Abc> & cp,const SubstitutionMatrix<Abc> & sm)221 std::string ConservationSequence(const CountProfile<Abc>& cp,
222                                  const SubstitutionMatrix<Abc>& sm) {
223   static const char kMixedColChar = '~';
224 
225   // Precompute similarity matrix for amino acid pairs
226   Matrix<double> sim(Abc::kSize, Abc::kSize);
227   for (size_t a = 0; a < Abc::kSize; ++a)
228     for (size_t b = 0; b < Abc::kSize; ++b)
229       sim[a][b] = sm.q(a,b) * sm.q(a,b) / sm.q(a,a) / sm.q(b,b);
230 
231   // Normalize count profile
232   Profile<Abc> prof(cp.counts);
233   Normalize(prof, 1.0);
234   std::string cons(prof.length(), ' ');
235 
236   // Now compute conservation string
237   for (size_t i = 0; i < prof.length(); ++i) {
238     double maxw = 0.0;
239     size_t maxa = 0;
240     for (size_t a = 0; a < Abc::kSize; ++a) {
241       if (prof[i][a] - sm.p(a) > maxw) {
242         maxw = prof[i][a] - sm.p(a);
243         maxa = a;
244       }
245     }
246 
247     maxw = 0.0;
248     for (size_t b = 0; b < Abc::kSize; ++b)
249       maxw += prof[i][b] * sim[maxa][b] * sim[maxa][b];
250 
251     if (maxw > 0.6)
252       cons[i] = toupper(Abc::kIntToChar[maxa]);
253     else if (maxw > 0.4)
254       cons[i] = tolower(Abc::kIntToChar[maxa]);
255     else
256       cons[i] = kMixedColChar;
257   }
258 
259   return cons;
260 }
261 
262 // Prints counts and neff in human-readable format for debugging.
263 template<class Abc>
264 std::ostream& operator<< (std::ostream& out, const CountProfile<Abc>& cp) {
265   out << "CountProfile" << std::endl;
266   out << "name:\t" << cp.name << std::endl;
267   for (size_t a = 0; a < Abc::kSizeAny; ++a)
268     out << "\t" << Abc::kIntToChar[a];
269   out << "\tNeff" << std::endl;
270   for (size_t i = 0; i < cp.counts.length(); ++i) {
271     out << i+1;
272     for (size_t a = 0; a < Abc::kSizeAny; ++a)
273       out << strprintf("\t%6.4f", cp.counts[i][a]);
274     out << strprintf("\t%-5.2f\n", cp.neff[i]);
275   }
276   return out;
277 }
278 
279 }  // namespace cs
280 
281 #endif  // CS_COUNT_PROFILE_INL_H_
282