#ifndef MMSEQS_PSSM_H
#define MMSEQS_PSSM_H

#include <cstddef>
#include <string>

class BaseMatrix;
class Sequence;

// Declares the calculator that turns a multiple sequence alignment (MSA) into
// a Profile (see computePSSMFromMSA) together with its working buffers.
//
// NOTE(review): the class declares a destructor and holds many raw pointer
// members, but no copy constructor / copy assignment is declared here.
// Presumably instances must not be copied -- confirm against the implementation.
class PSSMCalculator {
public:
    // Result bundle returned by computePSSMFromMSA.
    // The pointer members are stored exactly as handed to the constructor;
    // Profile declares no destructor, so it does not release them itself.
    struct Profile {
        const char * pssm;
        float * prob;
        const float * neffM;
        std::string consensus;

        // Stores the three pointers as-is and keeps its own copy of `consensus`.
        Profile(char * pssm, float * prob, float * neffM, std::string consensus)
                : pssm(pssm), prob(prob), neffM(neffM), consensus(consensus) {}

        // Serialises the profile into `result`; the center sequence is given
        // either as a raw buffer plus length or as a Sequence object.
        // NOTE(review): the exact output layout is defined in the implementation file.
        void toBuffer(const unsigned char* centerSequence, size_t centerSeqLen, BaseMatrix& subMat, std::string& result);
        void toBuffer(Sequence& centerSequence, BaseMatrix& subMat, std::string& result);
    };

    // subMat:       substitution matrix used by the calculator
    // maxSeqLength: upper bound on the sequence length the calculator is set up for
    // maxSetSize:   upper bound on the number of sequences in an MSA
    // pca, pcb:     pseudocount parameters (also taken by computePseudoCounts)
    PSSMCalculator(BaseMatrix *subMat, size_t maxSeqLength, size_t maxSetSize, float pca, float pcb);

    ~PSSMCalculator();

    // Computes a Profile from `setSize` aligned sequences of length `queryLength`.
    // NOTE(review): `wg` presumably toggles global sequence weighting -- confirm.
    Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs,
                                    bool wg);

    // Debug helpers: print the profile / the PSSM for the first `queryLength` columns.
    void printProfile(size_t queryLength);
    void printPSSM(size_t queryLength);

    // prepare pseudocounts
    static void preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts, size_t entrySize, size_t queryLength, const float **R);

    // compute pseudocounts from Neff_M -p log(p) per column
    static void computePseudoCounts(float *profile, float *frequency, float *frequency_with_pseudocounts, size_t entrySize, float *Neff_M, size_t length,float pca, float pcb);

    // Compute weight for sequence based on "Position-based Sequence Weights" (1994)
    static void computeSequenceWeights(float *seqWeight, size_t queryLength, size_t setSize, const char **msaSeqs);

private:
    BaseMatrix* subMat;

    // contains sequence weights (global)
    float * seqWeight;

    // contains MSA AA matchWeight
    float * matchWeight;

    // contains MSA AA pseudocount weight
    float * pseudocountsWeight;

    // Entropy of MSA
    float * Neff_M;

    // Profile of MSA
    float * profile;

    // PSSM contains log odds PSSM values
    char * pssm;

    // number of sequences in subalignment i (only for DEBUGGING)
    int *nseqs;

    // weight contribution value for each sequence
    float **w_contrib;
    // backing aligned memory
    unsigned char *w_contrib_backing;

    // weight of sequence k in column i, calculated from subalignment i
    float *wi;

    // number of different amino acids
    int *naa;

    float **f;

    int **n;
    // backing aligned memory
    unsigned char *n_backing;

    size_t maxSeqLength;
    size_t maxSetSize;

    // compute position-specific scoring matrix PSSM score
    // 1.) convert PFM to PPM (position probability matrix)
    //     Both PPMs assume statistical independence between positions in the pattern
    // 2.) PSSM Log odds score
    //     M_{aa,pos} = log(M_{aa,pos} / b_{aa}).
    void computeLogPSSM(char *pssm, const float *profile, float bitFactor, size_t queryLength, float scoreBias);

    // compute the Neff_M per column -p log(p)
    void computeNeff_M(float *frequency, float *seqWeight, float *Neff_M, size_t queryLength, size_t setSize, char const **msaSeqs);

    // Fills `matchWeight` from the MSA using the given per-sequence weights.
    void computeMatchWeights(float * matchWeight, float * seqWeight, size_t setSize, size_t queryLength, const char **msaSeqs);

    // Context-specific variant; additionally takes the per-column Neff_M buffer.
    void computeContextSpecificWeights(float * matchWeight, float *seqWeight, float * Neff_M, size_t queryLength, size_t setSize, const char **msaSeqs);

    // pseudocount parameters passed to the constructor
    float pca;
    float pcb;

    // Builds a consensus string of `queryLength` columns.
    // NOTE(review): `back` is presumably the background distribution and
    // `num2aa` the index-to-residue table -- confirm against the implementation.
    std::string computeConsensusSequence(float *pDouble, size_t queryLength, double *back, char *num2aa);

    // NOTE(review): presumably regrows the per-sequence buffers when an MSA
    // exceeds maxSetSize -- confirm against the implementation.
    void increaseSetSize(size_t newSetSize);
};


#endif //MMSEQS_PSSM_H