#ifndef MMSEQS_PSSM_H
#define MMSEQS_PSSM_H

#include <cstddef>
#include <string>

class BaseMatrix;
class Sequence;

// Declares the machinery for turning a multiple sequence alignment (MSA)
// into a profile / position-specific scoring matrix (PSSM).
//
// NOTE(review): the class declares a destructor and holds many raw pointer
// members, but declares no copy constructor or copy assignment operator.
// Presumably instances are never copied -- confirm against the callers.
class PSSMCalculator {
public:
    // Result bundle handed back by computePSSMFromMSA().
    //
    // The constructor only stores the pointers it is given; nothing the
    // pointers refer to is copied here.
    // NOTE(review): presumably the pointers refer to buffers owned by the
    // PSSMCalculator that produced the profile -- confirm lifetime in the
    // implementation file.
    struct Profile {
        const char *pssm;
        float *prob;
        const float *neffM;
        std::string consensus;

        // Stores the three pointers as-is and copies the consensus string.
        // The string is taken by const reference so that it is copied
        // exactly once (into the member).
        Profile(char *pssm, float *prob, float *neffM, const std::string &consensus)
            : pssm(pssm), prob(prob), neffM(neffM), consensus(consensus) {}

        // Writes a representation of this profile into `result`, using the
        // center sequence given as a raw buffer plus its length.
        // NOTE(review): the output format is defined in the implementation
        // file, not in this header.
        void toBuffer(const unsigned char *centerSequence, size_t centerSeqLen,
                      BaseMatrix &subMat, std::string &result);

        // Same as above, with the center sequence given as a Sequence object.
        void toBuffer(Sequence &centerSequence, BaseMatrix &subMat, std::string &result);
    };

    // subMat:       substitution matrix used by the calculator
    // maxSeqLength: presumably the longest query the buffers are sized for
    // maxSetSize:   presumably the largest MSA (number of sequences) the
    //               buffers are sized for; see also increaseSetSize()
    // pca, pcb:     pseudocount parameters (also passed to computePseudoCounts)
    PSSMCalculator(BaseMatrix *subMat, size_t maxSeqLength, size_t maxSetSize, float pca, float pcb);

    ~PSSMCalculator();

    // Computes a Profile from an MSA of `setSize` sequences with
    // `queryLength` columns, given as an array of sequence pointers.
    // NOTE(review): the meaning of `wg` is not visible in this header --
    // confirm in the implementation file.
    Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs,
                               bool wg);

    void printProfile(size_t queryLength);
    void printPSSM(size_t queryLength);

    // prepare pseudocounts
    static void preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts,
                                    size_t entrySize, size_t queryLength, const float **R);

    // compute pseudocounts from Neff_M -p log(p) per column
    static void computePseudoCounts(float *profile, float *frequency,
                                    float *frequency_with_pseudocounts, size_t entrySize,
                                    float *Neff_M, size_t length, float pca, float pcb);

    // Compute weight for sequence based on "Position-based Sequence Weights" (1994)
    static void computeSequenceWeights(float *seqWeight, size_t queryLength, size_t setSize,
                                       const char **msaSeqs);

private:
    BaseMatrix *subMat;

    // contains sequence weights (global)
    float *seqWeight;

    // contains MSA AA matchWeight
    float *matchWeight;

    // contains MSA AA pseudocount weight
    float *pseudocountsWeight;

    // Entropy of MSA
    float *Neff_M;

    // Profile of MSA
    float *profile;

    // PSSM contains log odds PSSM values
    char *pssm;

    // number of sequences in subalignment i (only for DEBUGGING)
    int *nseqs;

    // weight contribution value for each sequence
    float **w_contrib;
    // backing aligned memory
    unsigned char *w_contrib_backing;

    // weight of sequence k in column i, calculated from subalignment i
    float *wi;

    // number of different amino acids
    int *naa;

    float **f;

    int **n;
    // backing aligned memory
    unsigned char *n_backing;

    size_t maxSeqLength;
    size_t maxSetSize;

    // compute position-specific scoring matrix PSSM score
    // 1.) convert PFM to PPM (position probability matrix)
    //     Both PPMs assume statistical independence between positions in the pattern
    // 2.) PSSM Log odds score
    //     M_{aa,pos} = log(M_{aa,pos} / b_{aa})
    void computeLogPSSM(char *pssm, const float *profile, float bitFactor, size_t queryLength,
                        float scoreBias);

    // compute the Neff_M per column -p log(p)
    void computeNeff_M(float *frequency, float *seqWeight, float *Neff_M, size_t queryLength,
                       size_t setSize, const char **msaSeqs);

    void computeMatchWeights(float *matchWeight, float *seqWeight, size_t setSize,
                             size_t queryLength, const char **msaSeqs);

    void computeContextSpecificWeights(float *matchWeight, float *seqWeight, float *Neff_M,
                                       size_t queryLength, size_t setSize, const char **msaSeqs);

    // pseudocount parameters handed to the constructor
    float pca;
    float pcb;

    std::string computeConsensusSequence(float *pDouble, size_t queryLength, double *back,
                                         char *num2aa);

    // NOTE(review): presumably grows the per-sequence buffers when an MSA
    // exceeds maxSetSize -- confirm in the implementation file.
    void increaseSetSize(size_t newSetSize);
};

#endif //MMSEQS_PSSM_H