1 #include "muscle.h"
2 #include "tree.h"
3 #include "textfile.h" // for test only
4 #include "msa.h"
5 #include "seqvect.h"
6 #include "profile.h"
7 #include "muscle_context.h"
8 #ifndef _MSC_VER
9 #include <unistd.h> // for unlink
10 #endif
11
12 #define TRACE 0
13
14 /***
15 Find subfamilies from tree by following criteria:
16
17 (a) number of leaves <= max,
18 (b) is monophyletic, i.e. most recent common ancestor is parent
19 of no more than one subfamily.
20 ***/
21
SubFamRecurse(const Tree & tree,unsigned uNodeIndex,unsigned uMaxLeafCount,unsigned SubFams[],unsigned & uSubFamCount)22 static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,
23 unsigned SubFams[], unsigned &uSubFamCount)
24 {
25 if (tree.IsLeaf(uNodeIndex))
26 return 1;
27
28 unsigned uLeft = tree.GetLeft(uNodeIndex);
29 unsigned uRight = tree.GetRight(uNodeIndex);
30 unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);
31 unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);
32
33 unsigned uLeafCount = uLeftCount + uRightCount;
34 if (uLeftCount + uRightCount > uMaxLeafCount)
35 {
36 if (uLeftCount <= uMaxLeafCount)
37 SubFams[uSubFamCount++] = uLeft;
38 if (uRightCount <= uMaxLeafCount)
39 SubFams[uSubFamCount++] = uRight;
40 }
41 else if (tree.IsRoot(uNodeIndex))
42 {
43 if (uSubFamCount != 0)
44 Quit("Error in SubFamRecurse");
45 SubFams[uSubFamCount++] = uNodeIndex;
46 }
47
48 return uLeafCount;
49 }
50
SubFam(const Tree & tree,unsigned uMaxLeafCount,unsigned SubFams[],unsigned * ptruSubFamCount)51 void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)
52 {
53 *ptruSubFamCount = 0;
54 SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);
55
56 #if TRACE
57 {
58 Log("\n");
59 Log("Tree:\n");
60 tree.LogMe();
61 //void DrawTree(const Tree &tree);
62 //DrawTree(tree);
63 Log("\n");
64 Log("%d subfams:\n", *ptruSubFamCount);
65 for (unsigned i = 0; i < *ptruSubFamCount; ++i)
66 Log(" %d=%d", i, SubFams[i]);
67 Log("\n");
68 }
69 #endif
70 }
71
72 //unsigned SubFams[9999];
73 //unsigned uSubFamCount;
74 //
75 //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
76 // {
77 // const unsigned uRoot = tree.GetRootNodeIndex();
78 // unsigned uDist = 0;
79 // while (uNodeIndex != uRoot)
80 // {
81 // ++uDist;
82 // uNodeIndex = tree.GetParent(uNodeIndex);
83 // }
84 // return uDist;
85 // }
86 //
87 //static void DrawNode(const Tree &tree, unsigned uNodeIndex)
88 // {
89 // if (!tree.IsLeaf(uNodeIndex))
90 // DrawNode(tree, tree.GetLeft(uNodeIndex));
91 //
92 // unsigned uDist = DistFromRoot(tree, uNodeIndex);
93 // for (unsigned i = 0; i < 5*uDist; ++i)
94 // Log(" ");
95 // Log("%d", uNodeIndex);
96 // for (unsigned i = 0; i < uSubFamCount; ++i)
97 // if (uNodeIndex == SubFams[i])
98 // {
99 // Log("*");
100 // break;
101 // }
102 // Log("\n");
103 //
104 // if (!tree.IsLeaf(uNodeIndex))
105 // DrawNode(tree, tree.GetRight(uNodeIndex));
106 // }
107 //
108 //static void DrawTree(const Tree &tree)
109 // {
110 // unsigned uRoot = tree.GetRootNodeIndex();
111 // DrawNode(tree, uRoot);
112 // }
113 //
114 //void TestSubFams(const char *FileName)
115 // {
116 // Tree tree;
117 // TextFile f(FileName);
118 // tree.FromFile(f);
119 // SubFam(tree, 5, SubFams, &uSubFamCount);
120 // DrawTree(tree);
121 // }
122
SetInFam(const Tree & tree,unsigned uNodeIndex,bool NodeInSubFam[])123 static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])
124 {
125 if (tree.IsLeaf(uNodeIndex))
126 return;
127 unsigned uLeft = tree.GetLeft(uNodeIndex);
128 unsigned uRight = tree.GetRight(uNodeIndex);
129 NodeInSubFam[uLeft] = true;
130 NodeInSubFam[uRight] = true;
131
132 SetInFam(tree, uLeft, NodeInSubFam);
133 SetInFam(tree, uRight, NodeInSubFam);
134 }
135
AlignSubFam(SeqVect & vAll,const Tree & GuideTree,unsigned uNodeIndex,MSA & msaOut)136 void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,
137 MSA &msaOut)
138 {
139 const unsigned uSeqCount = vAll.GetSeqCount();
140
141 const char *InTmp = "asf_in.tmp";
142 const char *OutTmp = "asf_out.tmp";
143
144 unsigned *Leaves = new unsigned[uSeqCount];
145 unsigned uLeafCount;
146 GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);
147
148 SeqVect v;
149 for (unsigned i = 0; i < uLeafCount; ++i)
150 {
151 unsigned uLeafNodeIndex = Leaves[i];
152 unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);
153 Seq &s = vAll.GetSeqById(uId);
154 v.AppendSeq(s);
155 }
156
157 #if TRACE
158 {
159 Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);
160 for (unsigned i = 0; i < uLeafCount; ++i)
161 Log(" %s", v.GetSeqName(i));
162 Log("\n");
163 }
164 #endif
165
166 TextFile fIn(InTmp, true);
167
168 v.ToFASTAFile(fIn);
169 fIn.Close();
170
171 char CmdLine[4096];
172 sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);
173 // sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);
174 system(CmdLine);
175
176 TextFile fOut(OutTmp);
177 msaOut.FromFile(fOut);
178
179 for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)
180 {
181 const char *Name = msaOut.GetSeqName(uSeqIndex);
182 unsigned uId = vAll.GetSeqIdFromName(Name);
183 msaOut.SetSeqId(uSeqIndex, uId);
184 }
185
186 unlink(InTmp);
187 unlink(OutTmp);
188
189 delete[] Leaves;
190 }
191
ProgAlignSubFams()192 void ProgAlignSubFams()
193 {
194 MuscleContext *ctx = getMuscleContext();
195
196 MSA msaOut;
197
198 SetOutputFileName(ctx->params.g_pstrOutFileName);
199 SetInputFileName(ctx->params.g_pstrInFileName);
200
201 SetMaxIters(ctx->params.g_uMaxIters);
202 SetSeqWeightMethod(ctx->params.g_SeqWeight1);
203
204 TextFile fileIn(ctx->params.g_pstrInFileName);
205 SeqVect v;
206 v.FromFASTAFile(fileIn);
207 const unsigned uSeqCount = v.Length();
208
209 if (0 == uSeqCount)
210 Quit("No sequences in input file");
211
212 ALPHA Alpha = ALPHA_Undefined;
213 switch (ctx->params.g_SeqType)
214 {
215 case SEQTYPE_Auto:
216 Alpha = v.GuessAlpha();
217 break;
218
219 case SEQTYPE_Protein:
220 Alpha = ALPHA_Amino;
221 break;
222
223 case SEQTYPE_DNA:
224 Alpha = ALPHA_DNA;
225 break;
226
227 case SEQTYPE_RNA:
228 Alpha = ALPHA_RNA;
229 break;
230
231 default:
232 Quit("Invalid seq type");
233 }
234 SetAlpha(Alpha);
235 v.FixAlpha();
236
237 PTR_SCOREMATRIX UserMatrix = 0;
238 if (0 != ctx->params.g_pstrMatrixFileName)
239 {
240 const char *FileName = ctx->params.g_pstrMatrixFileName;
241 const char *Path = getenv("MUSCLE_MXPATH");
242 if (Path != 0)
243 {
244 size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
245 char *NewFileName = new char[n];
246 sprintf(NewFileName, "%s/%s", Path, FileName);
247 FileName = NewFileName;
248 }
249 TextFile File(FileName);
250 UserMatrix = ReadMx(File);
251 ctx->alpha.g_Alpha = ALPHA_Amino;
252 ctx->params.g_PPScore = PPSCORE_SP;
253 }
254
255 SetPPScore();
256
257 if (0 != UserMatrix)
258 ctx->params.g_ptrScoreMatrix = UserMatrix;
259
260 if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
261 {
262 SetPPScore(PPSCORE_SPN);
263 ctx->params.g_Distance1 = DISTANCE_Kmer4_6;
264 }
265
266 unsigned uMinL = 0;
267 unsigned uMaxL = 0;
268 unsigned uTotL = 0;
269 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
270 {
271 unsigned L = v.GetSeq(uSeqIndex).Length();
272 uTotL += L;
273 if (uMinL == 0 || L < uMinL)
274 uMinL = L;
275 if (L > uMaxL)
276 uMaxL = L;
277 }
278
279 SetIter(1);
280 ctx->params.g_bDiags = ctx->params.g_bDiags1;
281 SetSeqStats(uSeqCount, uMinL, uMaxL, uTotL/uSeqCount);
282
283 SetMuscleSeqVect(v);
284
285 MSA::SetIdCount(uSeqCount);
286
287 // Initialize sequence ids.
288 // From this point on, ids must somehow propogate from here.
289 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
290 v.SetSeqId(uSeqIndex, uSeqIndex);
291
292 if (uSeqCount > 1)
293 MHackStart(v);
294
295 if (0 == uSeqCount)
296 {
297 msaOut.Clear();
298 return;
299 }
300
301 if (1 == uSeqCount && ALPHA_Amino == Alpha)
302 {
303 const Seq &s = v.GetSeq(0);
304 msaOut.FromSeq(s);
305 return;
306 }
307
308 Tree GuideTree;
309 TreeFromSeqVect(v, GuideTree, ctx->params.g_Cluster1, ctx->params.g_Distance1, ctx->params.g_Root1);
310 SetMuscleTree(GuideTree);
311
312 MSA msa;
313 if (ctx->params.g_bLow)
314 {
315 ProgNode *ProgNodes = 0;
316 ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
317 delete[] ProgNodes;
318 }
319 else
320 ProgressiveAlign(v, GuideTree, msa);
321 SetCurrentAlignment(msa);
322 TreeFromMSA(msa, GuideTree, ctx->params.g_Cluster2, ctx->params.g_Distance2, ctx->params.g_Root2);
323 SetMuscleTree(GuideTree);
324
325 unsigned *SubFams = new unsigned[uSeqCount];
326 unsigned uSubFamCount;
327 SubFam(GuideTree, ctx->params.g_uMaxSubFamCount, SubFams, &uSubFamCount);
328
329 SetProgressDesc("Align node");
330 const unsigned uNodeCount = 2*uSeqCount - 1;
331
332 ProgNode *ProgNodes = new ProgNode[uNodeCount];
333 bool *NodeIsSubFam = new bool[uNodeCount];
334 bool *NodeInSubFam = new bool[uNodeCount];
335
336 for (unsigned i = 0; i < uNodeCount; ++i)
337 {
338 NodeIsSubFam[i] = false;
339 NodeInSubFam[i] = false;
340 }
341
342 for (unsigned i = 0; i < uSubFamCount; ++i)
343 {
344 unsigned uNodeIndex = SubFams[i];
345 assert(uNodeIndex < uNodeCount);
346 NodeIsSubFam[uNodeIndex] = true;
347 SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
348 }
349
350 unsigned uJoin = 0;
351 unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
352 do
353 {
354 if (NodeIsSubFam[uTreeNodeIndex])
355 {
356 #if TRACE
357 Log("Node %d: align subfam\n", uTreeNodeIndex);
358 #endif
359 ProgNode &Node = ProgNodes[uTreeNodeIndex];
360 AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
361 Node.m_uLength = Node.m_MSA.GetColCount();
362 }
363 else if (!NodeInSubFam[uTreeNodeIndex])
364 {
365 #if TRACE
366 Log("Node %d: align two subfams\n", uTreeNodeIndex);
367 #endif
368 Progress(uJoin, uSubFamCount - 1);
369 ++uJoin;
370
371 const unsigned uMergeNodeIndex = uTreeNodeIndex;
372 ProgNode &Parent = ProgNodes[uMergeNodeIndex];
373
374 const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
375 const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);
376
377 ProgNode &Node1 = ProgNodes[uLeft];
378 ProgNode &Node2 = ProgNodes[uRight];
379
380 PWPath Path;
381 AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
382 Parent.m_uLength = Parent.m_MSA.GetColCount();
383
384 Node1.m_MSA.Clear();
385 Node2.m_MSA.Clear();
386 }
387 else
388 {
389 #if TRACE
390 Log("Node %d: in subfam\n", uTreeNodeIndex);
391 #endif
392 ;
393 }
394 uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
395 }
396 while (NULL_NEIGHBOR != uTreeNodeIndex);
397 ProgressStepsDone();
398
399 unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
400 ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
401
402 TextFile fOut(ctx->params.g_pstrOutFileName, true);
403 MHackEnd(RootProgNode.m_MSA);
404 RootProgNode.m_MSA.ToFile(fOut);
405
406 delete[] NodeInSubFam;
407 delete[] NodeIsSubFam;
408 delete[] ProgNodes;
409 delete[] SubFams;
410
411 ProgNodes = 0;
412 NodeInSubFam = 0;
413 NodeIsSubFam = 0;
414 SubFams = 0;
415 }
416