1 #ifndef MSA_h 2 #define MSA_h 3 4 const int MAX_SEQ_NAME = 63; 5 struct PathEdge; 6 class TextFile; 7 class Seq; 8 class ClusterNode; 9 class NodeCounts; 10 class DataBuffer; 11 12 class MSA 13 { 14 public: 15 MSA(); 16 virtual ~MSA(); 17 18 public: 19 // Ways to create an MSA 20 void FromFile(TextFile &File); 21 void FromFASTAFile(TextFile &File); 22 void FromSeq(const Seq &s); 23 24 void ToFile(TextFile &File) const; 25 void ToFASTAFile(TextFile &File) const; 26 void ToMSFFile(TextFile &File, const char *ptrComment = 0) const; 27 void ToAlnFile(TextFile &File) const; 28 void ToHTMLFile(TextFile &File) const; 29 void ToPhySequentialFile(TextFile &File) const; 30 void ToPhyInterleavedFile(TextFile &File) const; 31 32 void SetSize(unsigned uSeqCount, unsigned uColCount); 33 void SetSeqCount(unsigned uSeqCount); 34 char GetChar(unsigned uSeqIndex, unsigned uIndex) const; 35 unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const; 36 unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const; 37 const char *GetSeqName(unsigned uSeqIndex) const; 38 unsigned GetSeqId(unsigned uSeqIndex) const; 39 unsigned GetSeqIndex(unsigned uId) const; 40 bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const; 41 double GetOcc(unsigned uColIndex) const; 42 void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, 43 FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, 44 FCOUNT *fcGapExtend, FCOUNT *ptrfOcc, 45 FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const; 46 bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const; 47 bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const; 48 bool IsGapColumn(unsigned uColIndex) const; 49 bool ColumnHasGap(unsigned uColIndex) const; 50 bool IsGapSeq(unsigned uSeqIndex) const; 51 52 void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c); 53 void SetSeqName(unsigned uSeqIndex, const char szName[]); 54 void SetSeqId(unsigned uSeqIndex, unsigned uId); 55 bool HasGap() const; 56 bool IsLegalLetter(unsigned uLetter) const; 57 void GetSeq(unsigned uSeqIndex, Seq &seq) const; 58 void Copy(const MSA &msa); 59 double GetCons(unsigned uColIndex) const; 60 double GetAvgCons() const; 61 double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; 62 bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const; 63 void DeleteCol(unsigned uColIndex); 64 void DeleteColumns(unsigned uColIndex, unsigned uColCount); 65 void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex); 66 void DeleteSeq(unsigned uSeqIndex); 67 // void DeleteEmptyCols(bool bProgress = false); 68 bool IsEmptyCol(unsigned uColIndex) const; 69 70 WEIGHT GetSeqWeight(unsigned uSeqIndex) const; 71 WEIGHT GetTotalSeqWeight() const; 72 void SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const; 73 void NormalizeWeights(WEIGHT wTotal) const; 74 bool WeightsSet() const; 75 76 unsigned GetGCGCheckSum(unsigned uSeqIndex) const; 77 78 ALPHA GuessAlpha() const; 79 void FixAlpha(); 80 81 unsigned UniqueResidueTypes(unsigned uColIndex) const; 82 83 void UnWeight(); 84 85 void GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const; 86 void ValidateBreakMatrices() const; 87 unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const; 88 const char *GetSeqBuffer(unsigned uSeqIndex) const; 89 unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const; 90 unsigned GetSeqLength(unsigned uSeqIndex) const; 91 void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID, 92 unsigned *ptruPosCount) const; 93 94 void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[], 95 int iMap2[]) const; 96 97 void LogMe() const; 98 void ListWeights() const; 99 100 void GapInfoToDataBuffer(DataBuffer &Buffer) const; 101 void GapInfoFromDataBuffer(const DataBuffer &Buffer); 102 double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; 103 Clear()104 void Clear() 105 { 106 Free(); 107 } GetSeqCount()108 unsigned GetSeqCount() const 109 { 110 return m_uSeqCount; 111 } GetColCount()112 unsigned GetColCount() const 113 { 114 return m_uColCount; 115 } 116 117 static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, 118 unsigned uSeqIndex2); 119 120 static void SetIdCount(unsigned uIdCount); 121 122 private: 123 friend void SetMSAWeightsMuscle(MSA &msa); 124 friend void SetThreeWayWeightsMuscle(MSA &msa); 125 void SetHenikoffWeightsPB() const; 126 void SetHenikoffWeights() const; 127 void SetGSCWeights() const; 128 void SetUniformWeights() const; 129 void SetClustalWWeights(const Tree &tree); 130 131 void Free(); 132 void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel); 133 void ExpandCache(unsigned uSeqCount, unsigned uColCount); 134 void CalcWeights() const; 135 void GetNameFromFASTAAnnotationLine(const char szLine[], 136 char szName[], unsigned uBytes); 137 void CopyCol(unsigned uFromCol, unsigned uToCol); 138 unsigned CalcBLOSUMWeights(ClusterTree &BlosumCluster) const; 139 void SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const; 140 unsigned SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const; 141 void SetSubtreeWeight2(const ClusterNode *ptrNode) const; 142 void SetSubtreeGSCWeight(ClusterNode *ptrNode) const; 143 144 void CalcHenikoffWeightsColPB(unsigned uColIndex) const; 145 void CalcHenikoffWeightsCol(unsigned uColIndex) const; 146 147 private: 148 unsigned m_uSeqCount; 149 unsigned m_uColCount; 150 unsigned m_uCacheSeqLength; 151 unsigned m_uCacheSeqCount; 152 char **m_szSeqs; 153 char **m_szNames; 154 155 static unsigned m_uIdCount; 156 157 unsigned *m_IdToSeqIndex; 158 unsigned *m_SeqIndexToId; 159 160 WEIGHT *m_Weights; 161 }; 162 163 void SeqVectFromMSA(const MSA &msa, SeqVect &v); 164 void DeleteGappedCols(MSA &msa); 165 void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, 166 MSA &msaOut); 167 void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat); 168 void MSAAppend(MSA &msa1, const MSA &msa2); 169 void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, 170 MSA &msaOut); 171 void AssertMSAEq(const MSA &msa1, const MSA &msa2); 172 void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2); 173 void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, 174 MSA &msaOut); 175 void SetMSAWeightsMuscle(MSA &msa); 176 void SetClustalWWeightsMuscle(MSA &msa); 177 void SetThreeWayWeightsMuscle(MSA &msa); 178 179 #endif // MSA_h 180