1 /***************************************************************************
2  *   Copyright (C) 2009 by BUI Quang Minh   *
3  *   minh.bui@univie.ac.at   *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20 #include "maalignment.h"
21 
readLogLL(char * fileName)22 void MaAlignment::readLogLL(char *fileName)
23 {
24 	//First read the values from inFile to a DoubleVector
25 	DoubleVector _logllVec;
26 	int siteNum = -1;
27 	string currentString;
28 	cout << "\nReading file containing site's loglikelihood: " << fileName << "...." << endl;
29     ifstream inFile;
30 	try{
31 		inFile.exceptions (ios::failbit | ios::badbit);
32 		inFile.open(fileName);
33 		/**really start reading*/
34 		//read number of sites
35 		inFile >> currentString;
36 		siteNum = convert_int(currentString.c_str());
37 		//ignore "Site_Lh"
38 		inFile >> currentString;
39 		while (!inFile.eof())
40 		{
41 			//reading each line of the file
42 			//remove the badbit
43 			inFile.exceptions (ios::badbit);
44 			if ( !(inFile >> currentString) ) break;
45 			//set the failbit again
46 			inFile.exceptions (ios::failbit | ios::badbit);
47 			_logllVec.push_back(convert_double(currentString.c_str()));
48 		}/**finish reading*/
49 		inFile.clear();
50 		inFile.exceptions (ios::failbit | ios::badbit);
51 		inFile.close();
52 	} catch(bad_alloc){
53 			outError(ERR_NO_MEMORY);
54 	} catch (const char *str){
55 			outError(str);
56 	} catch (string str){
57 			outError(str);
58 	} catch (ios::failure){
59 			outError(ERR_READ_INPUT);
60 	} catch (...){
61 			outError(ERR_READ_ANY);
62 	}
63 	if (siteNum != _logllVec.size())
64 		outError("Actual number of site's likelihoods is not consistent with the announced number in the first line.");
65 	cout << "Finish reading, now assign the logLL to the pattern:" << endl;
66 
67 	logLL.resize(getNPattern(),0.0);
68 	for (int i = 0; i < siteNum; i++)
69 	{
70 		int patIndex = getPatternID(i);
71 		if ( logLL[patIndex] == 0 )
72 			logLL[patIndex] = _logllVec[i];
73 		else
74 			if ( logLL[patIndex] != _logllVec[i] )
75 //				outError("Conflicting between the likelihoods reported for pattern", (*this)[i]);
76                 outError("Conflicting between the likelihoods reported for pattern");
77 	}
78 //	int npat = getNPattern();
79 //	cout << "Number of patterns: " << npat << endl;
80 //	for ( int j = 0; j < npat; j++ )
81 //		cout << j << "\t" << at(j) << "\t" << logLL[j] << endl;
82 	cout << "Finish assigning logLL to the patterns!" << endl;
83 }
84 
computeExpectedNorFre()85 IntVector MaAlignment::computeExpectedNorFre()
86 {
87 	IntVector expectedNorFre;
88 	if ( logLL.empty())
89 		outError("Error: log likelihood of patterns are not given!");
90 
91 	int patNum = getNPattern();
92 	int alignLen = getNSite();
93 	//resize the expectedNorFre vector
94 	expectedNorFre.resize(patNum,-1);
95 
96 	//Vector containing the likelihood of the pattern p_i
97 	DoubleVector LL(patNum,-1.0);
98 	double sumLL = 0; //sum of the likelihood of the patterns in the alignment
99 
100 	//Compute the likelihood from the logLL
101 	for ( int i = 0; i < patNum; i++ )
102 	{
103 		LL[i] = exp(logLL[i]);
104 		sumLL += LL[i];
105 	}
106 
107 	//Vector containing l_i = p_i*ell/sum_i(p_i)
108 	DoubleVector ell(patNum, -1.0);
109 	//Compute l_i
110 	for ( int i = 0; i < patNum; i++ )
111 	{
112 		ell[i] = (double)alignLen * LL[i] / sumLL;
113 	}
114 
115 
116 	//Vector containing r_i where r_0 = ell_0; r_{i+1} = ell_{i+1} + r_i - ordinaryRounding(r_i)
117 	DoubleVector r(patNum, -1.0);
118 	//Compute r_i and the expected normalized frequencies
119 	r[0] = ell[0];
120 	expectedNorFre[0] = (int)floor(ell[0]+0.5); //note that floor(_number+0.5) returns the ordinary rounding of _number
121 	int sum = expectedNorFre[0];
122 	for (int j = 1; j < patNum; j++ )
123 	{
124 		r[j] = ell[j] + r[j-1] - floor(r[j-1]+0.5);
125 		expectedNorFre[j] = (int)floor(r[j]+0.5);
126 		sum += expectedNorFre[j];
127 	}
128 
129 	//cout << "Number of patterns: " << patNum << ", sum of expected sites: " << sum << endl;
130 	return expectedNorFre;
131 }
132 
printPatObsExpFre(const char * fileName)133 void MaAlignment::printPatObsExpFre(const char *fileName)
134 {
135 	IntVector expectedNorFre = computeExpectedNorFre();
136 	printPatObsExpFre(fileName, expectedNorFre);
137 }
138 
printPatObsExpFre(const char * fileName,const IntVector expectedNorFre)139 void MaAlignment::printPatObsExpFre(const char *fileName, const IntVector expectedNorFre)
140 {
141 	try {
142 		ofstream out;
143 		out.exceptions(ios::failbit | ios::badbit);
144 		out.open(fileName);
145 		out << "Pattern\tLogLL\tObservedFre\tExpectedFre" << endl;
146 
147 		int patNum = getNPattern();
148 		int seqNum = getNSeq();
149 		int seqID;
150 
151 		for ( int i = 0; i < patNum; i++ )
152 		{
153 			for ( seqID = 0; seqID < seqNum; seqID++ ){
154 				out << convertStateBackStr(at(i)[seqID]);
155 			}
156 			out << "\t" << logLL[i] << "\t" << (*this)[i].frequency << "\t" << expectedNorFre[i] << endl;
157 		}
158 		out.close();
159 	} catch (ios::failure) {
160 		outError(ERR_WRITE_OUTPUT, fileName);
161 	}
162 }
163 
generateExpectedAlignment(MaAlignment * aln,double & prob)164 void MaAlignment::generateExpectedAlignment(MaAlignment *aln, double &prob)
165 {
166 	//cout << "In function: generating expected alignment!" << endl;
167 	IntVector expectedNorFre = aln->computeExpectedNorFre();
168 
169 	int nsite = aln->getNSite();
170 	seq_names.insert(seq_names.begin(), aln->seq_names.begin(), aln->seq_names.end());
171 	num_states = aln->num_states;
172 	site_pattern.resize(nsite, -1);
173 	clear();
174 	pattern_index.clear();
175 	VerboseMode save_mode = verbose_mode;
176 	verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
177 
178 	int patID;
179 	int site = 0;
180 	int npat = aln->getNPattern();
181 
182 	double sumFac = 0;
183 	double sumProb = 0;
184 	double fac = logFac(nsite);
185 
186 	double sumFacMax = 0;
187 	double sumProbMax = 0;
188 
189 	for (patID = 0; patID < npat; patID++) {
190 		int patFre = expectedNorFre[patID];
191 		for ( int patSite = 0; patSite < patFre; patSite++)
192 		{
193 			Pattern pat = aln->at(patID);
194 			addPattern(pat,site);
195 			site++;
196 		}
197 
198 		//to compute the probability of the new alignment given the multinomial distribution
199 		sumFac += logFac(patFre);
200 		sumProb += (double)patFre*log((double)aln->at(patID).frequency/(double)nsite);
201 
202 		//for the unconstraint maximum log likelihood
203 		sumFacMax += logFac(aln->at(patID).frequency);
204 		sumProbMax += (double)aln->at(patID).frequency*log((double)aln->at(patID).frequency/(double)nsite);
205 	}
206 	prob = fac - sumFac + sumProb;
207 
208 	double probMax = fac - sumFacMax + sumProbMax;
209 //	cout << "total number of sites: " << site << endl;
210 	verbose_mode = save_mode;
211 	countConstSite();
212 	//cout << "Finish generating expected alignment!" << endl;
213 	cout << "Logarithm of the probability of the new alignment given the multinomial distribution of the input alignment is: " << prob << endl;
214 	cout << "Maximum unconstraint (log) likelihood of the input alignment: " << probMax << endl;
215 // 	cout << "Maximum unconstraint likelihood: " << exp(probMax) << endl;
216 }
217 
218 /*void MaAlignment::multinomialProb(Alignment objectAlign, double &prob)
219 {
220 	cout << "Computing the multinomial probability of an object alignment given a reference alignment ..." << endl;
221 	//should we check for compatibility of sequence's names and sequence's order in THIS alignment and in the objectAlign??
222 	//check alignment length
223 	int nsite = getNSite();
224 	assert(nsite == objectAlign.getNSite());
225 	double sumFac = 0;
226 	double sumProb = 0;
227 	double fac = logFac(nsite);
228 	int index;
229 	for ( Alignment::iterator objectIt = objectAlign.begin(); objectIt != objectAlign.end() ; objectIt++)
230 	{
231 		PatternIntMap::iterator pat_it = pattern_index.find((*objectIt));
232 		if ( pat_it == pattern_index.end() ) //not found ==> error
233 			outError("Pattern in the object alignment is not found in the reference alignment!");
234 		sumFac += logFac((*objectIt).frequency);
235 		index = pat_it->second;
236 		sumProb += (double)(*objectIt).frequency*log((double)at(index).frequency/(double)nsite);
237 	}
238 	prob = fac - sumFac + sumProb;
239 }*/
240 
241 /*void MaAlignment::multinomialProb(AlignmentVector objectAligns, DoubleVector &probs)
242 {
243 	int num = objectAligns.size();
244 	double curProb;
245 	if (num > 0)
246 	{
247 		probs.resize(num,0);
248 		for ( int i = 0; i < num; i++ )
249 		{
250 			(*this).multinomialProb(objectAligns[i], curProb);
251 			probs[i] = curProb;
252 		}
253 	}
254 }*/
255