1 /**
2 * Author: Mark Larkin
3 *
4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
5 *
6 * Changes:
7 * Mark 30-5-2007: Changed iterationOnTreeNode function as it was adding in extra gaps
8 * at the end of an alignment.
9 */
10 #ifdef HAVE_CONFIG_H
11 #include "config.h"
12 #endif
13 #include "Iteration.h"
14 #include "../alignment/ObjectiveScore.h"
15 #include "../general/utils.h"
16 #include "../general/userparams.h"
17 #include "../tree/TreeInterface.h"
18 #include "../clustalw_version.h"
19 #include "MSA.h"
20
21 namespace clustalw
22 {
23
iterationOnTreeNode(int numSeqsProf1,int numSeqsProf2,int & prfLength1,int & prfLength2,SeqArray * seqArray)24 bool Iteration::iterationOnTreeNode(int numSeqsProf1, int numSeqsProf2, int& prfLength1,
25 int& prfLength2, SeqArray* seqArray)
26 {
27 Alignment alignmentToIterate;
28 int numSeqsInProfiles = numSeqsProf1 + numSeqsProf2;
29
30 if(numSeqsInProfiles <= 2)
31 {
32 return false;
33 }
34
35 SeqArray profileSeqs;
36 profileSeqs.resize(numSeqsInProfiles + 1);
37
38 // Copy the SeqArray!
39 for (int j = 0; ((j < numSeqsProf1 + numSeqsProf2) &&
40 (j < (int)seqArray->size())); j++)
41 {
42 profileSeqs[j + 1].clear();
43 profileSeqs[j + 1].resize(prfLength1 + 1);
44 for (int i = 0; i < prfLength1 && i < (int)(*seqArray)[j].size(); i++)
45 {
46 profileSeqs[j + 1][i + 1] = (*seqArray)[j][i];
47 }
48 }
49
50 alignmentToIterate.addSequences(&profileSeqs);
51 //userParameters->setNumIterations(numSeqsInProfiles * 2);
52
53 bool changed = false;
54 changed = removeFirstIterate(&alignmentToIterate);
55
56 if(changed)
57 {
58 SeqArray* iteratedSeqs = alignmentToIterate.getSeqArrayForRealloc();
59 string aaCodes = userParameters->getAminoAcidCodes();
60
61 int newPrf1Length = 0, newPrf2Length = 0;
62
63 for (int j = 0; j < numSeqsProf1 + numSeqsProf2; j++)
64 {
65 if(j < numSeqsProf1)
66 {
67 if(alignmentToIterate.getSeqLength(j + 1) > newPrf1Length)
68 {
69 newPrf1Length = alignmentToIterate.getSeqLength(j + 1);
70 }
71 }
72 else if(j < numSeqsProf1 + numSeqsProf2)
73 {
74 if(alignmentToIterate.getSeqLength(j + 1) > newPrf2Length)
75 {
76 newPrf2Length = alignmentToIterate.getSeqLength(j + 1);
77 }
78 }
79 }
80
81 prfLength1 = newPrf1Length; // mark 30-5-2007
82 prfLength2 = newPrf2Length; // mark 30-5-2007
83
84 for (int j = 0; j < numSeqsProf1 + numSeqsProf2; j++)
85 {
86 // I need to recalculate the prfLength1 and prfLength2
87 (*seqArray)[j].clear();
88 (*seqArray)[j].assign((*iteratedSeqs)[j + 1].begin() + 1,
89 (*iteratedSeqs)[j + 1].end());
90 (*seqArray)[j].resize(prfLength1 + extraEndElemNum, 31);
91 (*seqArray)[j][prfLength1] = ENDALN;
92 }
93 }
94
95 return true;
96 }
97
printSeqArray(SeqArray * arrayToPrint)98 void Iteration::printSeqArray(SeqArray* arrayToPrint)
99 {
100 cout << "HERE IS THE SEQARRAY\n";
101 // I need to use iterators for everything here.
102 SeqArray::iterator mainBeginIt = arrayToPrint->begin();
103 SeqArray::iterator mainEndIt = arrayToPrint->end();
104 vector<int>::iterator begin, end;
105 string aaCodes = userParameters->getAminoAcidCodes();
106
107 for(; mainBeginIt != mainEndIt; mainBeginIt++)
108 {
109 if(mainBeginIt->size() > 0)
110 {
111 begin = mainBeginIt->begin() + 1;
112 end = mainBeginIt->end();
113 for(; begin != end; begin++)
114 {
115 if(*begin < (int)aaCodes.size())
116 {
117 cout << aaCodes[*begin];
118 }
119 else
120 {
121 cout << "-";
122 }
123 }
124 cout << "\n";
125 }
126 }
127 cout << "\n\n";
128 }
129
130 /**
131 * The function removeFirstIterate is used to perform the remove first iteration
132 * strategy on the finished alignment. It optimises the score give in alignScore.
133 * For iter = 1 to numIterations
134 * For seq i = 1 to numSeqs
135 * remove seq i
136 * if either of the profiles has all gaps, remove this column.
137 * realign using profileAlign
138 * if its better, keep it. If its not better, dont keep it.
139 * @param alnPtr The alignment object.
140 * @return true if it has been successful, false if it has not been successful.
141 */
removeFirstIterate(Alignment * alnPtr)142 bool Iteration::removeFirstIterate(Alignment* alnPtr)
143 {
144 if(!alnPtr)
145 {
146 return false;
147 }
148
149 string p1TreeName;
150 p1TreeName = "";
151 string p2TreeName;
152 int nSeqs = alnPtr->getNumSeqs();
153
154 if(nSeqs <= 2)
155 {
156 return false;
157 }
158 DistMatrix distMat;
159 distMat.ResizeRect(nSeqs + 1);
160
161 ObjectiveScore scoreObj;
162 int iterate = userParameters->getDoRemoveFirstIteration();
163 userParameters->setDoRemoveFirstIteration(NONE);
164
165 double firstScore = scoreObj.getScore(alnPtr);
166 //cout << "firstScore = " << firstScore << "\n";
167 double score = 0;
168 double bestScore = firstScore;
169 double dscore;
170 int count;
171 bool scoreImproved = false;
172 bool scoreImprovedAnyIteration = false;
173 int prof1NumSeqs = 1;
174
175 // This will be used for removing gaps!!!
176 vector<int> profile1;
177 vector<int> profile2;
178 profile1.resize(nSeqs + 1, 0);
179 profile1[1] = 1;
180 profile2.resize(nSeqs + 1, 1);
181 profile2[0] = 0;
182 profile2[1] = 0;
183 vector<int> prof1Weight, prof2Weight;
184 int iterations = userParameters->getNumIterations();
185 //cout << "Max num iterations = " << iterations << "\n";
186 Alignment bestAlignSoFar;
187 TreeInterface tree;
188 // One iteration consists of removing each of the sequences, reseting all the gap
189 // only columns. If the score is better, the new alignment is kept.
190 for(int n = 1; n <= iterations; n++)
191 {
192 scoreImproved = false;
193 cout << "ITERATION " << n << " OF " << iterations << "\n";
194 for(int i = 1; i <= nSeqs; i++)
195 {
196 vector<Sequence> seqVector;
197 Alignment iterateAlign = *alnPtr;
198
199 iterateAlign.setProfile1NumSeqs(1);
200
201 // We remove the sequence i from the profile, and paste into the first position
202 // This is to make it easy to do the profile alignment.
203 vector<int> selected;
204 selected.resize(nSeqs + 1, 0);
205 selected[i] = 1;
206 seqVector = iterateAlign.cutSelectedSequencesFromAlignment(&selected);
207 iterateAlign.pasteSequencesIntoPosition(&seqVector, 0);
208
209 // Remove any gap only columns
210 iterateAlign.removeGapOnlyColsFromSelectedSeqs(&profile1);
211 iterateAlign.removeGapOnlyColsFromSelectedSeqs(&profile2);
212
213 // Calculate a simple distance matrix.
214 if(nSeqs - 1 >= 2)
215 {
216 for (int i = 1; i <= nSeqs; i++)
217 {
218 for (int j = i + 1; j <= nSeqs; j++)
219 {
220 dscore = iterateAlign.countid(i, j);
221 distMat(i, j) = (100.0 - dscore)/100.0;
222 }
223 }
224
225 /* temporary tree file
226 *
227 * can't use the safer mkstemp function here, because
228 * we just pass down the filename :(
229 */
230 char buffer[L_tmpnam];
231 tmpnam (buffer);
232 p2TreeName = buffer + string(".dnd");
233 // should test here if file is writable
234 }
235 bool success = false;
236 prof1Weight.clear();
237 prof1Weight.resize(prof1NumSeqs);
238 prof2Weight.clear();
239 prof2Weight.resize(nSeqs);
240
241 tree.getWeightsForProfileAlign(&iterateAlign, &distMat, &p1TreeName, &prof1Weight,
242 &p2TreeName, &prof2Weight, nSeqs, prof1NumSeqs,
243 false, false, &success);
244 remove(p2TreeName.c_str());
245 if(!success)
246 {
247 /* returning false only means alignment hasn't
248 * changed, but here getWeightsForProfileAlign failed,
249 * most likely because p2TreeName couldn't be read. an
250 * error will be printed to console. clustalw should
251 * then exit, FIXME: clustalx users have to sit
252 * through all error messages until someone
253 * implements a way to return an exit code and react
254 * appropriately
255 */
256 // does anyone know how to use
257 // (userParameters->getMenuFlag() ||
258 // !userParameters->getInteractive() instead?
259 char buf[1024];
260 utilityObject->myname(buf);
261 if (strcasecmp(buf, "clustalw")==0) {
262 exit(EXIT_FAILURE);
263 } else {
264 // the next two lines were here before the exit
265 // was added. keeping it for clustalx although it
266 // doesnt seem to make any sens
267 userParameters->setDoRemoveFirstIteration(iterate);
268 return false;
269 }
270 }
271
272 MSA* msaObj = new MSA();
273
274 iterateAlign.resetProfile1();
275 iterateAlign.resetProfile2();
276 // Do the profile alignment.
277 count = msaObj->doProfileAlign(&iterateAlign, &distMat,
278 &prof1Weight, &prof2Weight);
279 delete msaObj;
280 // Check if its better
281 score = scoreObj.getScore(&iterateAlign);
282 iterateAlign.setProfile1NumSeqs(0);
283
284 if(score < bestScore) // Might be a problem with this.
285 {
286 //cout << "**********************************************\n";
287 //cout << "***** Better score found using iteration *****\n";
288 //cout << "**********************************************\n";
289 bestScore = score;
290 bestAlignSoFar = iterateAlign;
291 scoreImproved = true;
292 scoreImprovedAnyIteration = true;
293 }
294 distMat.clearArray();
295 distMat.ResizeRect(nSeqs + 1);
296 }
297 if(scoreImproved == false)
298 {
299 cout << "Score was not improved in last iteration. Exiting...\n";
300 break;
301 }
302 }
303
304 //
305 // NOTE if we have improved it, then we need to update the sequences in alnPtr
306 // 1) get the unique id of seq i
307 // 2) get the sequence from new object using id
308 // 3) update the sequence in alnPtr
309 //
310 if(scoreImprovedAnyIteration) // If we need to update the alnPtr object.
311 {
312 cout << "Iteration improved Align score: " << bestScore << "\n";
313 int seqId;
314 const vector<int>* improvedSeq;
315 for(int i = 1; i <= nSeqs; i++) // For each seq in alnPtr
316 {
317 seqId = alnPtr->getUniqueId(i);
318 improvedSeq = bestAlignSoFar.getSequenceFromUniqueId(seqId);
319 alnPtr->updateSequence(i, improvedSeq);
320 }
321 }
322 cout << "FINAL score: " << bestScore << "\n";
323 userParameters->setDoRemoveFirstIteration(iterate);
324 return true; // It was successful.
325 }
326
327 }
328
329