1 #include "muscle.h"
2 #include "tree.h"
3 #include "textfile.h"	// for test only
4 #include "msa.h"
5 #include "seqvect.h"
6 #include "profile.h"
7 #include "muscle_context.h"
8 #ifndef _MSC_VER
9 #include <unistd.h>	//	for unlink
10 #endif
11 
12 #define TRACE	0
13 
14 /***
15 Find subfamilies from tree by following criteria:
16 
17 (a) number of leaves <= max,
18 (b) is monophyletic, i.e. most recent common ancestor is parent
19 of no more than one subfamily.
20 ***/
21 
SubFamRecurse(const Tree & tree,unsigned uNodeIndex,unsigned uMaxLeafCount,unsigned SubFams[],unsigned & uSubFamCount)22 static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,
23   unsigned SubFams[], unsigned &uSubFamCount)
24 	{
25 	if (tree.IsLeaf(uNodeIndex))
26 		return 1;
27 
28 	unsigned uLeft = tree.GetLeft(uNodeIndex);
29 	unsigned uRight = tree.GetRight(uNodeIndex);
30 	unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);
31 	unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);
32 
33 	unsigned uLeafCount = uLeftCount + uRightCount;
34 	if (uLeftCount + uRightCount > uMaxLeafCount)
35 		{
36 		if (uLeftCount <= uMaxLeafCount)
37 			SubFams[uSubFamCount++] = uLeft;
38 		if (uRightCount <= uMaxLeafCount)
39 			SubFams[uSubFamCount++] = uRight;
40 		}
41 	else if (tree.IsRoot(uNodeIndex))
42 		{
43 		if (uSubFamCount != 0)
44 			Quit("Error in SubFamRecurse");
45 		SubFams[uSubFamCount++] = uNodeIndex;
46 		}
47 
48 	return uLeafCount;
49 	}
50 
SubFam(const Tree & tree,unsigned uMaxLeafCount,unsigned SubFams[],unsigned * ptruSubFamCount)51 void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)
52 	{
53 	*ptruSubFamCount = 0;
54 	SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);
55 
56 #if	TRACE
57 	{
58 	Log("\n");
59 	Log("Tree:\n");
60 	tree.LogMe();
61 	//void DrawTree(const Tree &tree);
62 	//DrawTree(tree);
63 	Log("\n");
64 	Log("%d subfams:\n", *ptruSubFamCount);
65 	for (unsigned i = 0; i < *ptruSubFamCount; ++i)
66 		Log("  %d=%d", i, SubFams[i]);
67 	Log("\n");
68 	}
69 #endif
70 	}
71 
72 //unsigned SubFams[9999];
73 //unsigned uSubFamCount;
74 //
75 //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
76 //	{
77 //	const unsigned uRoot = tree.GetRootNodeIndex();
78 //	unsigned uDist = 0;
79 //	while (uNodeIndex != uRoot)
80 //		{
81 //		++uDist;
82 //		uNodeIndex = tree.GetParent(uNodeIndex);
83 //		}
84 //	return uDist;
85 //	}
86 //
87 //static void DrawNode(const Tree &tree, unsigned uNodeIndex)
88 //	{
89 //	if (!tree.IsLeaf(uNodeIndex))
90 //		DrawNode(tree, tree.GetLeft(uNodeIndex));
91 //
92 //	unsigned uDist = DistFromRoot(tree, uNodeIndex);
93 //	for (unsigned i = 0; i < 5*uDist; ++i)
94 //		Log(" ");
95 //	Log("%d", uNodeIndex);
96 //	for (unsigned i = 0; i < uSubFamCount; ++i)
97 //		if (uNodeIndex == SubFams[i])
98 //			{
99 //			Log("*");
100 //			break;
101 //			}
102 //	Log("\n");
103 //
104 //	if (!tree.IsLeaf(uNodeIndex))
105 //		DrawNode(tree, tree.GetRight(uNodeIndex));
106 //	}
107 //
108 //static void DrawTree(const Tree &tree)
109 //	{
110 //	unsigned uRoot = tree.GetRootNodeIndex();
111 //	DrawNode(tree, uRoot);
112 //	}
113 //
114 //void TestSubFams(const char *FileName)
115 //	{
116 //	Tree tree;
117 //	TextFile f(FileName);
118 //	tree.FromFile(f);
119 //	SubFam(tree, 5, SubFams, &uSubFamCount);
120 //	DrawTree(tree);
121 //	}
122 
SetInFam(const Tree & tree,unsigned uNodeIndex,bool NodeInSubFam[])123 static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])
124 	{
125 	if (tree.IsLeaf(uNodeIndex))
126 		return;
127 	unsigned uLeft = tree.GetLeft(uNodeIndex);
128 	unsigned uRight = tree.GetRight(uNodeIndex);
129 	NodeInSubFam[uLeft] = true;
130 	NodeInSubFam[uRight] = true;
131 
132 	SetInFam(tree, uLeft, NodeInSubFam);
133 	SetInFam(tree, uRight, NodeInSubFam);
134 	}
135 
AlignSubFam(SeqVect & vAll,const Tree & GuideTree,unsigned uNodeIndex,MSA & msaOut)136 void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,
137   MSA &msaOut)
138 	{
139 	const unsigned uSeqCount = vAll.GetSeqCount();
140 
141 	const char *InTmp = "asf_in.tmp";
142 	const char *OutTmp = "asf_out.tmp";
143 
144 	unsigned *Leaves = new unsigned[uSeqCount];
145 	unsigned uLeafCount;
146 	GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);
147 
148 	SeqVect v;
149 	for (unsigned i = 0; i < uLeafCount; ++i)
150 		{
151 		unsigned uLeafNodeIndex = Leaves[i];
152 		unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);
153 		Seq &s = vAll.GetSeqById(uId);
154 		v.AppendSeq(s);
155 		}
156 
157 #if	TRACE
158 	{
159 	Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);
160 	for (unsigned i = 0; i < uLeafCount; ++i)
161 		Log(" %s", v.GetSeqName(i));
162 	Log("\n");
163 	}
164 #endif
165 
166 	TextFile fIn(InTmp, true);
167 
168 	v.ToFASTAFile(fIn);
169 	fIn.Close();
170 
171 	char CmdLine[4096];
172 	sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);
173 //	sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);
174 	system(CmdLine);
175 
176 	TextFile fOut(OutTmp);
177 	msaOut.FromFile(fOut);
178 
179 	for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)
180 		{
181 		const char *Name = msaOut.GetSeqName(uSeqIndex);
182 		unsigned uId = vAll.GetSeqIdFromName(Name);
183 		msaOut.SetSeqId(uSeqIndex, uId);
184 		}
185 
186 	unlink(InTmp);
187 	unlink(OutTmp);
188 
189 	delete[] Leaves;
190 	}
191 
ProgAlignSubFams()192 void ProgAlignSubFams()
193 	{
194     MuscleContext *ctx = getMuscleContext();
195 
196 	MSA msaOut;
197 
198 	SetOutputFileName(ctx->params.g_pstrOutFileName);
199 	SetInputFileName(ctx->params.g_pstrInFileName);
200 
201 	SetMaxIters(ctx->params.g_uMaxIters);
202 	SetSeqWeightMethod(ctx->params.g_SeqWeight1);
203 
204 	TextFile fileIn(ctx->params.g_pstrInFileName);
205 	SeqVect v;
206 	v.FromFASTAFile(fileIn);
207 	const unsigned uSeqCount = v.Length();
208 
209 	if (0 == uSeqCount)
210 		Quit("No sequences in input file");
211 
212 	ALPHA Alpha = ALPHA_Undefined;
213 	switch (ctx->params.g_SeqType)
214 		{
215 	case SEQTYPE_Auto:
216 		Alpha = v.GuessAlpha();
217 		break;
218 
219 	case SEQTYPE_Protein:
220 		Alpha = ALPHA_Amino;
221 		break;
222 
223 	case SEQTYPE_DNA:
224 		Alpha = ALPHA_DNA;
225 		break;
226 
227 	case SEQTYPE_RNA:
228 		Alpha = ALPHA_RNA;
229 		break;
230 
231 	default:
232 		Quit("Invalid seq type");
233 		}
234 	SetAlpha(Alpha);
235 	v.FixAlpha();
236 
237 	PTR_SCOREMATRIX UserMatrix = 0;
238 	if (0 != ctx->params.g_pstrMatrixFileName)
239 		{
240 		const char *FileName = ctx->params.g_pstrMatrixFileName;
241 		const char *Path = getenv("MUSCLE_MXPATH");
242 		if (Path != 0)
243 			{
244 			size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
245 			char *NewFileName = new char[n];
246 			sprintf(NewFileName, "%s/%s", Path, FileName);
247 			FileName = NewFileName;
248 			}
249 		TextFile File(FileName);
250 		UserMatrix = ReadMx(File);
251 		ctx->alpha.g_Alpha = ALPHA_Amino;
252 		ctx->params.g_PPScore = PPSCORE_SP;
253 		}
254 
255 	SetPPScore();
256 
257 	if (0 != UserMatrix)
258 		ctx->params.g_ptrScoreMatrix = UserMatrix;
259 
260 	if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
261 		{
262 		SetPPScore(PPSCORE_SPN);
263 		ctx->params.g_Distance1 = DISTANCE_Kmer4_6;
264 		}
265 
266     unsigned uMinL = 0;
267 	unsigned uMaxL = 0;
268 	unsigned uTotL = 0;
269 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
270 		{
271 		unsigned L = v.GetSeq(uSeqIndex).Length();
272 		uTotL += L;
273         if (uMinL == 0 || L < uMinL)
274             uMinL = L;
275 		if (L > uMaxL)
276 			uMaxL = L;
277 		}
278 
279 	SetIter(1);
280 	ctx->params.g_bDiags = ctx->params.g_bDiags1;
281     SetSeqStats(uSeqCount, uMinL, uMaxL, uTotL/uSeqCount);
282 
283 	SetMuscleSeqVect(v);
284 
285 	MSA::SetIdCount(uSeqCount);
286 
287 // Initialize sequence ids.
288 // From this point on, ids must somehow propogate from here.
289 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
290 		v.SetSeqId(uSeqIndex, uSeqIndex);
291 
292 	if (uSeqCount > 1)
293 		MHackStart(v);
294 
295 	if (0 == uSeqCount)
296 		{
297 		msaOut.Clear();
298 		return;
299 		}
300 
301 	if (1 == uSeqCount && ALPHA_Amino == Alpha)
302 		{
303 		const Seq &s = v.GetSeq(0);
304 		msaOut.FromSeq(s);
305 		return;
306 		}
307 
308 	Tree GuideTree;
309 	TreeFromSeqVect(v, GuideTree, ctx->params.g_Cluster1, ctx->params.g_Distance1, ctx->params.g_Root1);
310 	SetMuscleTree(GuideTree);
311 
312 	MSA msa;
313 	if (ctx->params.g_bLow)
314 		{
315 		ProgNode *ProgNodes = 0;
316 		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
317 		delete[] ProgNodes;
318 		}
319 	else
320 		ProgressiveAlign(v, GuideTree, msa);
321 	SetCurrentAlignment(msa);
322 	TreeFromMSA(msa, GuideTree, ctx->params.g_Cluster2, ctx->params.g_Distance2, ctx->params.g_Root2);
323 	SetMuscleTree(GuideTree);
324 
325 	unsigned *SubFams = new unsigned[uSeqCount];
326 	unsigned uSubFamCount;
327 	SubFam(GuideTree, ctx->params.g_uMaxSubFamCount, SubFams, &uSubFamCount);
328 
329 	SetProgressDesc("Align node");
330 	const unsigned uNodeCount = 2*uSeqCount - 1;
331 
332 	ProgNode *ProgNodes = new ProgNode[uNodeCount];
333 	bool *NodeIsSubFam = new bool[uNodeCount];
334 	bool *NodeInSubFam = new bool[uNodeCount];
335 
336 	for (unsigned i = 0; i < uNodeCount; ++i)
337 		{
338 		NodeIsSubFam[i] = false;
339 		NodeInSubFam[i] = false;
340 		}
341 
342 	for (unsigned i = 0; i < uSubFamCount; ++i)
343 		{
344 		unsigned uNodeIndex = SubFams[i];
345 		assert(uNodeIndex < uNodeCount);
346 		NodeIsSubFam[uNodeIndex] = true;
347 		SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
348 		}
349 
350 	unsigned uJoin = 0;
351 	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
352 	do
353 		{
354 		if (NodeIsSubFam[uTreeNodeIndex])
355 			{
356 #if	TRACE
357 			Log("Node %d: align subfam\n", uTreeNodeIndex);
358 #endif
359 			ProgNode &Node = ProgNodes[uTreeNodeIndex];
360 			AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
361 			Node.m_uLength = Node.m_MSA.GetColCount();
362 			}
363 		else if (!NodeInSubFam[uTreeNodeIndex])
364 			{
365 #if	TRACE
366 			Log("Node %d: align two subfams\n", uTreeNodeIndex);
367 #endif
368 			Progress(uJoin, uSubFamCount - 1);
369 			++uJoin;
370 
371 			const unsigned uMergeNodeIndex = uTreeNodeIndex;
372 			ProgNode &Parent = ProgNodes[uMergeNodeIndex];
373 
374 			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
375 			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);
376 
377 			ProgNode &Node1 = ProgNodes[uLeft];
378 			ProgNode &Node2 = ProgNodes[uRight];
379 
380 			PWPath Path;
381 			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
382 			Parent.m_uLength = Parent.m_MSA.GetColCount();
383 
384 			Node1.m_MSA.Clear();
385 			Node2.m_MSA.Clear();
386 			}
387 		else
388 			{
389 #if	TRACE
390 			Log("Node %d: in subfam\n", uTreeNodeIndex);
391 #endif
392 			;
393 			}
394 		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
395 		}
396 	while (NULL_NEIGHBOR != uTreeNodeIndex);
397 	ProgressStepsDone();
398 
399 	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
400 	ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
401 
402 	TextFile fOut(ctx->params.g_pstrOutFileName, true);
403 	MHackEnd(RootProgNode.m_MSA);
404 	RootProgNode.m_MSA.ToFile(fOut);
405 
406 	delete[] NodeInSubFam;
407 	delete[] NodeIsSubFam;
408 	delete[] ProgNodes;
409 	delete[] SubFams;
410 
411 	ProgNodes = 0;
412 	NodeInSubFam = 0;
413 	NodeIsSubFam = 0;
414 	SubFams = 0;
415 	}
416