#ifndef	MSA_h
#define MSA_h

// Upper bound associated with sequence names.
// NOTE(review): presumably the maximum name length in characters, not counting
// the terminating NUL -- confirm against the code that uses it.
const int MAX_SEQ_NAME = 63;
// Forward declarations of types that this header only names (by reference or
// pointer, or not at all); their full definitions are not needed here.
struct PathEdge;
class TextFile;
class Seq;
class ClusterNode;
class NodeCounts;
class DataBuffer;

12 class MSA
13 	{
14 public:
15 	MSA();
16 	virtual ~MSA();
17 
18 public:
19 // Ways to create an MSA
20 	void FromFile(TextFile &File);
21 	void FromFASTAFile(TextFile &File);
22 	void FromSeq(const Seq &s);
23 
24 	void ToFile(TextFile &File) const;
25 	void ToFASTAFile(TextFile &File) const;
26 	void ToMSFFile(TextFile &File, const char *ptrComment = 0) const;
27 	void ToAlnFile(TextFile &File) const;
28 	void ToHTMLFile(TextFile &File) const;
29 	void ToPhySequentialFile(TextFile &File) const;
30 	void ToPhyInterleavedFile(TextFile &File) const;
31 
32 	void SetSize(unsigned uSeqCount, unsigned uColCount);
33 	void SetSeqCount(unsigned uSeqCount);
34 	char GetChar(unsigned uSeqIndex, unsigned uIndex) const;
35 	unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const;
36 	unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const;
37 	const char *GetSeqName(unsigned uSeqIndex) const;
38 	unsigned GetSeqId(unsigned uSeqIndex) const;
39 	unsigned GetSeqIndex(unsigned uId) const;
40 	bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const;
41 	double GetOcc(unsigned uColIndex) const;
42 	void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
43 	  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
44 	  FCOUNT *fcGapExtend, FCOUNT *ptrfOcc,
45 	  FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const;
46 	bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const;
47 	bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const;
48 	bool IsGapColumn(unsigned uColIndex) const;
49 	bool ColumnHasGap(unsigned uColIndex) const;
50 	bool IsGapSeq(unsigned uSeqIndex) const;
51 
52 	void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c);
53 	void SetSeqName(unsigned uSeqIndex, const char szName[]);
54 	void SetSeqId(unsigned uSeqIndex, unsigned uId);
55 	bool HasGap() const;
56 	bool IsLegalLetter(unsigned uLetter) const;
57 	void GetSeq(unsigned uSeqIndex, Seq &seq) const;
58 	void Copy(const MSA &msa);
59 	double GetCons(unsigned uColIndex) const;
60 	double GetAvgCons() const;
61 	double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const;
62 	bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const;
63 	void DeleteCol(unsigned uColIndex);
64 	void DeleteColumns(unsigned uColIndex, unsigned uColCount);
65 	void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex);
66 	void DeleteSeq(unsigned uSeqIndex);
67 //	void DeleteEmptyCols(bool bProgress = false);
68 	bool IsEmptyCol(unsigned uColIndex) const;
69 
70 	WEIGHT GetSeqWeight(unsigned uSeqIndex) const;
71 	WEIGHT GetTotalSeqWeight() const;
72 	void SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const;
73 	void NormalizeWeights(WEIGHT wTotal) const;
74 	bool WeightsSet() const;
75 
76 	unsigned GetGCGCheckSum(unsigned uSeqIndex) const;
77 
78 	ALPHA GuessAlpha() const;
79 	void FixAlpha();
80 
81 	unsigned UniqueResidueTypes(unsigned uColIndex) const;
82 
83 	void UnWeight();
84 
85 	void GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const;
86 	void ValidateBreakMatrices() const;
87 	unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const;
88 	const char *GetSeqBuffer(unsigned uSeqIndex) const;
89 	unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const;
90 	unsigned GetSeqLength(unsigned uSeqIndex) const;
91 	void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID,
92 	  unsigned *ptruPosCount) const;
93 
94 	void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[],
95 	  int iMap2[]) const;
96 
97 	void LogMe() const;
98 	void ListWeights() const;
99 
100 	void GapInfoToDataBuffer(DataBuffer &Buffer) const;
101 	void GapInfoFromDataBuffer(const DataBuffer &Buffer);
102 	double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const;
103 
Clear()104 	void Clear()
105 		{
106 		Free();
107 		}
GetSeqCount()108 	unsigned GetSeqCount() const
109 		{
110 		return m_uSeqCount;
111 		}
GetColCount()112 	unsigned GetColCount() const
113 		{
114 		return m_uColCount;
115 		}
116 
117 	static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,
118 	  unsigned uSeqIndex2);
119 
120 	static void SetIdCount(unsigned uIdCount);
121 
122 private:
123 	friend void SetMSAWeightsMuscle(MSA &msa);
124 	friend void SetThreeWayWeightsMuscle(MSA &msa);
125 	void SetHenikoffWeightsPB() const;
126 	void SetHenikoffWeights() const;
127 	void SetGSCWeights() const;
128 	void SetUniformWeights() const;
129 	void SetClustalWWeights(const Tree &tree);
130 
131 	void Free();
132 	void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel);
133 	void ExpandCache(unsigned uSeqCount, unsigned uColCount);
134 	void CalcWeights() const;
135 	void GetNameFromFASTAAnnotationLine(const char szLine[],
136 	  char szName[], unsigned uBytes);
137 	void CopyCol(unsigned uFromCol, unsigned uToCol);
138 	unsigned CalcBLOSUMWeights(ClusterTree &BlosumCluster) const;
139 	void SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const;
140 	unsigned SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const;
141 	void SetSubtreeWeight2(const ClusterNode *ptrNode) const;
142 	void SetSubtreeGSCWeight(ClusterNode *ptrNode) const;
143 
144 	void CalcHenikoffWeightsColPB(unsigned uColIndex) const;
145 	void CalcHenikoffWeightsCol(unsigned uColIndex) const;
146 
147 private:
148 	unsigned m_uSeqCount;
149 	unsigned m_uColCount;
150 	unsigned m_uCacheSeqLength;
151 	unsigned m_uCacheSeqCount;
152 	char **m_szSeqs;
153 	char **m_szNames;
154 
155 	static unsigned m_uIdCount;
156 
157 	unsigned *m_IdToSeqIndex;
158 	unsigned *m_SeqIndexToId;
159 
160 	WEIGHT *m_Weights;
161 	};
162 
163 void SeqVectFromMSA(const MSA &msa, SeqVect &v);
164 void DeleteGappedCols(MSA &msa);
165 void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
166   MSA &msaOut);
167 void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat);
168 void MSAAppend(MSA &msa1, const MSA &msa2);
169 void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
170   MSA &msaOut);
171 void AssertMSAEq(const MSA &msa1, const MSA &msa2);
172 void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2);
173 void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,
174   MSA &msaOut);
175 void SetMSAWeightsMuscle(MSA &msa);
176 void SetClustalWWeightsMuscle(MSA &msa);
177 void SetThreeWayWeightsMuscle(MSA &msa);
178 
#endif	// MSA_h