1 #include "Sequence.h"
2 #include "Debug.h"
3 #include "Util.h"
4 #include "simd.h"
5 #include "SubstitutionMatrix.h"
6 #include "Parameters.h"
7 #include "MathUtil.h"
8 #include "SubstitutionMatrixProfileStates.h"
9 #include "PSSMCalculator.h"
10 
11 #include <climits> // short_max
12 #include <cstddef>
13 
Sequence(size_t maxLen,int seqType,const BaseMatrix * subMat,const unsigned int kmerSize,const bool spaced,const bool aaBiasCorrection,bool shouldAddPC,const std::string & userSpacedKmerPattern)14 Sequence::Sequence(size_t maxLen, int seqType, const BaseMatrix *subMat, const unsigned int kmerSize, const bool spaced, const bool aaBiasCorrection, bool shouldAddPC, const std::string& userSpacedKmerPattern) {
15     this->maxLen = maxLen;
16     this->numSequence = (unsigned char*)malloc(maxLen + 1);
17     this->numConsensusSequence = (unsigned char*)malloc(maxLen + 1);
18     this->aaBiasCorrection = aaBiasCorrection;
19     this->subMat = (BaseMatrix*)subMat;
20     this->spaced = spaced;
21     this->seqType = seqType;
22     std::pair<const char *, unsigned int> spacedKmerInformation;
23     if (spaced == true && userSpacedKmerPattern.empty() == false) {
24         spacedKmerInformation = parseSpacedPattern(kmerSize, spaced, userSpacedKmerPattern);
25     } else {
26         spacedKmerInformation = getSpacedPattern(spaced, kmerSize);
27     }
28     this->spacedPattern = spacedKmerInformation.first;
29     this->spacedPatternSize = spacedKmerInformation.second;
30     this->kmerSize = kmerSize;
31     this->kmerWindow = NULL;
32     this->aaPosInSpacedPattern = NULL;
33     this->shouldAddPC = shouldAddPC;
34     this->userSpacedKmerPattern = userSpacedKmerPattern;
35     if(spacedPatternSize){
36         simdKmerRegisterCnt = (kmerSize / (VECSIZE_INT*4)) + 1;
37         unsigned int simdKmerLen =  simdKmerRegisterCnt *  (VECSIZE_INT*4); // for SIMD memory alignment
38         this->kmerWindow = (unsigned char*) mem_align(ALIGN_INT, simdKmerLen * sizeof(unsigned char));
39         memset(this->kmerWindow, 0, simdKmerLen * sizeof(unsigned char));
40         this->aaPosInSpacedPattern = new unsigned char[kmerSize];
41         if(spacedPattern == NULL) {
42             Debug(Debug::ERROR) << "Sequence does not have a kmerSize (kmerSize= " << spacedPatternSize << ") to use nextKmer.\n";
43             Debug(Debug::ERROR) << "Please report this bug to the developer\n";
44             EXIT(EXIT_FAILURE);
45         }
46         size_t pos = 0;
47         for(int i = 0; i < this->spacedPatternSize; i++) {
48             if(spacedPattern[i]){
49                 aaPosInSpacedPattern[pos] = i;
50                 pos++;
51             }
52         }
53     }
54 
55     // init memory for profile search
56     if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
57 
58         profile_matrix = new ScoreMatrix*[PROFILE_AA_SIZE]; // init 20 matrix pointer (its more than enough for all kmer parameter)
59         for (size_t i = 0; i < kmerSize; i++) {
60             profile_matrix[i] = new ScoreMatrix(NULL, NULL, PROFILE_AA_SIZE, PROFILE_ROW_SIZE);
61         }
62         this->pNullBuffer           = new float[maxLen + 1];
63         this->neffM                 = new float[maxLen + 1];
64         this->profile_score         = (short *)        mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_ROW_SIZE * sizeof(short));
65         this->profile_index         = (unsigned int *) mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_ROW_SIZE * sizeof(int));
66         this->profile               = (float *)        mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_AA_SIZE * sizeof(float));
67         this->pseudocountsWeight    = (float *)        mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_ROW_SIZE * sizeof(float));
68         this->profile_for_alignment = (int8_t *)       mem_align(ALIGN_INT, (maxLen + 1) * subMat->alphabetSize * sizeof(int8_t));
69         // init profile
70         memset(this->profile_for_alignment, 0, (maxLen + 1) * subMat->alphabetSize * sizeof(int8_t));
71         memset(this->profile, 0, (maxLen + 1) * PROFILE_AA_SIZE * sizeof(float));
72         for (size_t i = 0; i < (maxLen + 1) * PROFILE_ROW_SIZE; ++i){
73             profile_score[i] = -SHRT_MAX;
74             profile_index[i] = UINT_MAX;
75         }
76     } else {
77         profile_matrix = NULL;
78     }
79     currItPos = -1;
80 }
81 
~Sequence()82 Sequence::~Sequence() {
83     delete[] spacedPattern;
84     free(numSequence);
85     free(numConsensusSequence);
86     if (kmerWindow) {
87         free(kmerWindow);
88     }
89     if (aaPosInSpacedPattern){
90         delete[] aaPosInSpacedPattern;
91     }
92     if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE)|| Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
93         for (size_t i = 0; i < kmerSize; ++i) {
94             delete profile_matrix[i];
95         }
96         delete[] profile_matrix;
97         delete[] neffM;
98         delete[] pNullBuffer;
99         free(profile);
100         free(pseudocountsWeight);
101         free(profile_score);
102         free(profile_index);
103         free(profile_for_alignment);
104     }
105 }
106 
getSpacedPattern(bool spaced,unsigned int kmerSize)107 std::pair<const char *, unsigned int> Sequence::getSpacedPattern(bool spaced, unsigned int kmerSize){
108 #define CASE(x) {case x: \
109                       if(spaced){ \
110                         pair =  std::make_pair<const char *, unsigned int>((const char *) &spaced_seed_##x, ARRAY_SIZE(spaced_seed_##x)); \
111                       }else{ \
112                         pair =  std::make_pair<const char *, unsigned int>((const char *) &seed_##x, ARRAY_SIZE(seed_##x)); \
113                       } \
114                       break; \
115                  }
116     std::pair<const char *, unsigned int> pair;
117     switch (kmerSize) {
118         case 0: // if no kmer iterator support
119             pair = std::make_pair<const char *, unsigned int>(NULL, 0);
120             break;
121         CASE(4)
122         CASE(5)
123         CASE(6)
124         CASE(7)
125         CASE(8)
126         CASE(9)
127         CASE(10)
128         CASE(11)
129         CASE(12)
130         CASE(13)
131         CASE(14)
132         CASE(15)
133         CASE(16)
134         CASE(17)
135         CASE(18)
136         CASE(19)
137         CASE(20)
138         CASE(21)
139         CASE(22)
140         CASE(23)
141         CASE(24)
142         CASE(25)
143         CASE(26)
144         CASE(27)
145         CASE(28)
146         CASE(29)
147         CASE(30)
148         default:
149 
150             char * pattern = new char[kmerSize];
151             for(size_t i = 0; i < kmerSize; i++){
152                 pattern[i]=1;
153             }
154             return std::make_pair<const char *, unsigned int>(const_cast<const char*>(pattern), static_cast<unsigned int>(kmerSize));
155 
156 //            Debug(Debug::ERROR) << "Did not find spaced pattern for kmerSize: " << kmerSize << ". \n";
157 //            Debug(Debug::ERROR) << "Please report this bug to the developer\n";
158 //            EXIT(EXIT_FAILURE);
159             break;
160     }
161     char * pattern = new char[pair.second];
162     if (pair.second > 0) {
163         memcpy(pattern, pair.first, pair.second * sizeof(char));
164     }
165     return std::make_pair<const char *, unsigned int>(const_cast<const char*>(pattern), static_cast<unsigned int>(pair.second));
166 #undef CASE
167 }
168 
parseSpacedPattern(unsigned int kmerSize,bool spaced,const std::string & spacedKmerPattern)169 std::pair<const char *, unsigned int> Sequence::parseSpacedPattern(unsigned int kmerSize, bool spaced, const std::string& spacedKmerPattern) {
170     bool spacedKmerPatternSpaced = false;
171     unsigned int spacedKmerPatternKmerSize = 0;
172     char* pattern = new char[spacedKmerPattern.size()];
173     for (size_t i = 0; i < spacedKmerPattern.size(); ++i) {
174         switch (spacedKmerPattern[i]) {
175             case '0':
176                 spacedKmerPatternSpaced = true;
177                 pattern[i] = 0;
178                 break;
179             case '1':
180                 spacedKmerPatternKmerSize += 1;
181                 pattern[i] = 1;
182                 break;
183             default:
184                 Debug(Debug::ERROR) << "Invalid character in user-specified k-mer pattern\n";
185                 EXIT(EXIT_FAILURE);
186                 break;
187         }
188     }
189     if (spacedKmerPatternKmerSize != kmerSize){
190         Debug(Debug::ERROR) << "User-specified k-mer pattern is not consistent with stated k-mer size\n";
191         EXIT(EXIT_FAILURE);
192     } else if (spacedKmerPatternSpaced != spaced) {
193         Debug(Debug::ERROR) << "User-specified k-mer pattern is not consistent with spaced k-mer true/false\n";
194         EXIT(EXIT_FAILURE);
195     }
196     return std::make_pair<const char *, unsigned int>((const char *) pattern, spacedKmerPattern.size());
197 }
198 
mapSequence(size_t id,unsigned int dbKey,const char * sequence,unsigned int seqLen,bool mapProfileScores)199 void Sequence::mapSequence(size_t id, unsigned int dbKey, const char *sequence, unsigned int seqLen, bool mapProfileScores) {
200     this->id = id;
201     this->dbKey = dbKey;
202     this->seqData = sequence;
203     if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_AMINO_ACIDS) || Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_NUCLEOTIDES)) {
204         mapSequence(sequence, seqLen);
205     } else if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_HMM_PROFILE)) {
206         mapProfile(sequence, mapProfileScores, seqLen);
207     } else if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ)) {
208         mapProfileStateSequence(sequence, seqLen);
209     }else if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
210         switch(subMat->alphabetSize) {
211             case 8:
212                 mapProfileState<8>(sequence, seqLen);
213                 break;
214             case 32:
215                 mapProfileState<32>(sequence, seqLen);
216                 break;
217             case 219:
218                 mapProfileState<219>(sequence, seqLen);
219                 break;
220             case 255:
221                 mapProfileState<255>(sequence, seqLen);
222                 break;
223             default:
224                 Debug(Debug::ERROR) << "Invalid alphabet size type!\n";
225                 EXIT(EXIT_FAILURE);
226                 break;
227         }
228     } else {
229         Debug(Debug::ERROR) << "Invalid sequence type!\n";
230         EXIT(EXIT_FAILURE);
231     }
232     currItPos = -1;
233 
234 }
235 
mapSequence(size_t id,unsigned int dbKey,std::pair<const unsigned char *,const unsigned int> data)236 void Sequence::mapSequence(size_t id, unsigned int dbKey, std::pair<const unsigned char *,const unsigned int> data){
237     this->id = id;
238     this->dbKey = dbKey;
239     if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_AMINO_ACIDS)
240         || Parameters::isEqualDbtype( this->seqType,Parameters::DBTYPE_NUCLEOTIDES)
241         || Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ)){
242         this->L = data.second;
243         if(this->L >= static_cast<int>(maxLen)){
244             numSequence = static_cast<unsigned char *>(realloc(numSequence, this->L+1));
245             maxLen = this->L;
246         }
247         memcpy(this->numSequence, data.first, this->L);
248     } else {
249         Debug(Debug::ERROR) << "Invalid sequence type!\n";
250         EXIT(EXIT_FAILURE);
251     }
252     currItPos = -1;
253 }
254 
mapProfileStateSequence(const char * profileStateSeq,unsigned int seqLen)255 void Sequence::mapProfileStateSequence(const char * profileStateSeq, unsigned int seqLen){
256     size_t l = 0;
257     size_t pos = 0;
258     unsigned char curr = profileStateSeq[pos];
259     while (curr != '\0' && l < seqLen){
260 
261         this->numSequence[l]  = curr - 1;
262 
263         l++;
264         if (l > maxLen){
265             Debug(Debug::ERROR) << "Sequence too long! Max length allowed would be " << maxLen << "\n";
266             EXIT(EXIT_FAILURE);
267         }
268         pos++;
269         curr  = profileStateSeq[pos];
270     }
271     this->L = l;
272 }
273 
274 
275 
mapProfile(const char * profileData,bool mapScores,unsigned int seqLen)276 void Sequence::mapProfile(const char * profileData, bool mapScores, unsigned int seqLen){
277     char * data = (char *) profileData;
278     size_t currPos = 0;
279     float scoreBias = 0.0;
280     // if no data exists
281     {
282         size_t l = 0;
283         while (data[currPos] != '\0' && l < maxLen  && l < seqLen){
284             for (size_t aa_idx = 0; aa_idx < PROFILE_AA_SIZE; aa_idx++) {
285                 // shift bytes back (avoids NULL byte)
286 //            short value = static_cast<short>( ^ mask);
287                 profile[l * PROFILE_AA_SIZE + aa_idx] = scoreUnmask(data[currPos + aa_idx]);
288                 //value * 4;
289             }
290 
291             float sumProb = 0.0;
292             for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++){
293                 sumProb += profile[l * PROFILE_AA_SIZE + aa];
294             }
295             if(sumProb > 0.9){
296                 MathUtil::NormalizeTo1(&profile[l * PROFILE_AA_SIZE], PROFILE_AA_SIZE);
297             }
298 
299             unsigned char queryLetter = data[currPos + PROFILE_AA_SIZE];
300             // read query sequence
301             numSequence[l] = queryLetter; // index 0 is the highst scoring one
302             unsigned char consensusLetter = data[currPos + PROFILE_AA_SIZE+1];
303             numConsensusSequence[l] = consensusLetter;
304             unsigned short neff = data[currPos + PROFILE_AA_SIZE+2];
305             neffM[l] = MathUtil::convertNeffToFloat(neff);
306             l++;
307 
308 
309             // go to begin of next entry 0, 20, 40, 60, ...
310             currPos += PROFILE_READIN_SIZE;
311         }
312         this->L = l;
313         if(l > maxLen ){
314             Debug(Debug::INFO) << "Entry " << dbKey << " is longer than max seq. len " << maxLen << "\n";
315         }
316 
317     }
318 
319 
320     // TODO: Make dependency explicit
321     float pca = Parameters::getInstance().pca;
322     if(shouldAddPC && pca  > 0.0){
323         PSSMCalculator::preparePseudoCounts(profile, pseudocountsWeight, PROFILE_AA_SIZE, L,
324                                             (const float **) subMat->subMatrixPseudoCounts);
325         float pcb = Parameters::getInstance().pcb;
326         PSSMCalculator::computePseudoCounts(profile, profile, pseudocountsWeight, PROFILE_AA_SIZE, neffM, L, pca, pcb);
327     }
328 //    printProfile();
329 
330     if (mapScores) {
331         for(int l = 0; l < this->L; l++) {
332     //        MathUtil::NormalizeTo1(&profile[l * profile_row_size], PROFILE_AA_SIZE);
333             for (size_t aa_idx = 0; aa_idx < PROFILE_AA_SIZE; aa_idx++) {
334                 float bitScore = probaToBitScore(profile[l * PROFILE_AA_SIZE + aa_idx], subMat->pBack[aa_idx]);
335                 if(bitScore<=-128){ //X state
336                     bitScore = -1;
337                 }
338                 const float bitScore8 =  bitScore * 2.0 + scoreBias;
339                 profile_score[l * PROFILE_ROW_SIZE + aa_idx] = static_cast<short>( ((bitScore8 < 0.0) ? bitScore8 - 0.5 : bitScore8 + 0.5) );
340                 profile_score[l * PROFILE_ROW_SIZE + aa_idx] = profile_score[l * PROFILE_ROW_SIZE + aa_idx] * 4;
341             }
342         }
343         //printPSSM();
344 
345         if (aaBiasCorrection == true){
346             SubstitutionMatrix::calcGlobalAaBiasCorrection(subMat, profile_score, pNullBuffer, PROFILE_ROW_SIZE, this->L);
347         }
348 
349         // sort profile scores and index for KmerGenerator (prefilter step)
350         for (int i = 0; i < this->L; i++){
351             unsigned int indexArray[PROFILE_AA_SIZE] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 };
352             Util::rankedDescSort20(&profile_score[i * PROFILE_ROW_SIZE], (unsigned int *) &indexArray);
353             memcpy(&profile_index[i * PROFILE_ROW_SIZE], &indexArray, PROFILE_AA_SIZE * sizeof(int));
354         }
355 
356         // write alignment profile
357         for (int i = 0; i < this->L; i++){
358             for (size_t aa_num = 0; aa_num < PROFILE_AA_SIZE; aa_num++) {
359                 unsigned int aa_idx = profile_index[i * PROFILE_ROW_SIZE + aa_num];
360                 profile_for_alignment[aa_idx * this-> L + i] = profile_score[i * PROFILE_ROW_SIZE + aa_num] / 4;
361             }
362         }
363         //TODO what is with the X
364     }
365     //printPSSM();
366 
367 //    printProfile();
368 }
369 
370 
371 template <int T>
mapProfileState(const char * profileState,unsigned int seqLen)372 void Sequence::mapProfileState(const char * profileState, unsigned int seqLen){
373     mapProfile(profileState, false, seqLen);
374 
375     SubstitutionMatrixProfileStates * profileStateMat = (SubstitutionMatrixProfileStates *) subMat;
376     // compute avg. amino acid probability
377     float pav[20];
378     // initialize vector of average aa freqs with pseudocounts
379     for (int a = 0; a < 20; a++){
380         pav[a] = subMat->pBack[a] * 10.0;
381     }
382     // calculate averages
383     for (int i = 0; i < L; ++i){
384         for (int a = 0; a < 20; a++){
385             pav[a] += profile[i * Sequence::PROFILE_AA_SIZE + a];
386         }
387     }
388     // Normalize vector of average aa frequencies pav[a]
389     MathUtil::NormalizeTo1(pav, Sequence::PROFILE_AA_SIZE);
390 
391     // log (S(i,k)) = log ( SUM_a p(i,a) * p(k,a) / f(a) )   k: column state, i: pos in ali, a: amino acid
392     if(profileStateMat->alphabetSize != 255 && profileStateMat->alphabetSize != 219){
393         for (int i = 0; i < L; i++){
394             for (int k = 0; k < profileStateMat->alphabetSize; k++) {
395                 // compute log score for all 32 profile states
396                 float sum = profileStateMat->scoreState(&profile[i * Sequence::PROFILE_AA_SIZE], pav, k);
397                 float pssmVal = (sum) * 10.0 * profileStateMat->getScoreNormalization();
398                 profile_score[i * PROFILE_ROW_SIZE + k] = static_cast<short>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
399             }
400         }
401 //        printProfileStatePSSM();
402 
403         if(aaBiasCorrection==true){
404             //TODO use new formular
405             SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(profile_score, PROFILE_ROW_SIZE, this->L, profileStateMat->alphabetSize);
406         }
407     //    printProfileStatePSSM();
408 
409         // sort profile scores and index for KmerGenerator (prefilter step)
410         for(int l = 0; l < this->L; l++){
411             unsigned int indexArray[T] = { 0, 1, 2, 3, 4, 5, 6, 7 };
412             switch (T) {
413                 case 8:
414                     Util::rankedDescSort8(&profile_score[l * PROFILE_ROW_SIZE], (unsigned int *) &indexArray);
415                     break;
416                 case 32:
417                     Util::rankedDescSort32(&profile_score[l * PROFILE_ROW_SIZE], (unsigned int *) &indexArray);
418                     break;
419                 default:
420                     Debug(Debug::ERROR) << "Sort for T of " << T << " is not defined \n";
421                     EXIT(EXIT_FAILURE);
422                     break;
423             }
424 
425             memcpy(&profile_index[l * PROFILE_ROW_SIZE], &indexArray, T * sizeof(int) );
426             // create consensus sequence
427     //        sequence[l] = indexArray[0]; // index 0 is the highst scoring one
428         }
429 
430         // write alignment profile
431         for(int l = 0; l < this->L; l++){
432             for(size_t aa_num = 0; aa_num < T; aa_num++) {
433                 unsigned int aa_idx = profile_index[l * PROFILE_ROW_SIZE + aa_num];
434                 float scale = 5.0*profileStateMat->getScoreNormalization();
435                 float score = static_cast<float>(profile_score[l * PROFILE_ROW_SIZE + aa_num]);
436                 float pssmVal = score/scale;
437                 profile_for_alignment[aa_idx * this->L + l] = static_cast<short>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
438             }
439         }
440     } else {
441         // write alignment profile
442         for (int l = 0; l < this->L; ++l) {
443             for (size_t aa_num = 0; aa_num < static_cast<size_t>(subMat->alphabetSize); ++aa_num) {
444                 float sum = profileStateMat->scoreState(&profile[l * Sequence::PROFILE_AA_SIZE], pav, aa_num);
445                 float pssmVal = 2.0 * sum * profileStateMat->getScoreNormalization();
446                 profile_for_alignment[aa_num * this->L + l] = static_cast<short>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
447             }
448         }
449         if(aaBiasCorrection==true){
450             SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(profile_for_alignment,this->L,profileStateMat->alphabetSize,subMat);
451         }
452 	/*
453  	//TEST with a neg bias to avoid over extension
454         for (int l = 0; l < this->L; ++l) {
455             for (size_t aa_num = 0; aa_num < static_cast<size_t>(subMat->alphabetSize); ++aa_num) {
456                 profile_for_alignment[aa_num * this->L + l] -= 1;
457             }
458         }*/
459 
460     }
461 }
462 
nextProfileKmer()463 void Sequence::nextProfileKmer() {
464     int pos = 0;
465     for (int i = 0; i < spacedPatternSize; i++) {
466         if (spacedPattern[i]) {
467             unsigned int *index = profile_index + ((currItPos + i) * PROFILE_ROW_SIZE);
468             short *score = profile_score + ((currItPos + i) * PROFILE_ROW_SIZE);
469             profile_matrix[pos]->index = index;
470             profile_matrix[pos]->score = score;
471             pos++;
472         }
473     }
474 }
475 
mapSequence(const char * sequence,unsigned int dataLen)476 void Sequence::mapSequence(const char * sequence, unsigned int dataLen){
477     size_t l = 0;
478     char curr = sequence[l];
479     if(dataLen >= maxLen){
480         numSequence = static_cast<unsigned char*>(realloc(numSequence, dataLen+1));
481         maxLen = dataLen;
482     }
483     while (curr != '\0' && curr != '\n' && l < dataLen &&  l < maxLen){
484         this->numSequence[l] = subMat->aa2num[static_cast<int>(curr)];
485         l++;
486         curr  = sequence[l];
487     }
488     this->L = l;
489 }
490 
printPSSM()491 void Sequence::printPSSM(){
492     printf("Query profile of sequence %d\n", dbKey);
493     printf("Pos ");
494     for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++) {
495         printf("%3c ", subMat->num2aa[aa]);
496     }
497     printf("Neff \n");
498     for(int i = 0; i < this->L; i++){
499         printf("%3d ", i);
500         for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++){
501             printf("%3d ", profile_for_alignment[aa * L + i] );
502 //            printf("%3d ", profile_score[i * profile_row_size + aa] );
503         }
504         printf("%.1f\n",neffM[i]);
505     }
506 }
507 
printProfileStatePSSM()508 void Sequence::printProfileStatePSSM(){
509     printf("Query profile of sequence %d\n", dbKey);
510     printf("Pos ");
511     for(int aa = 0; aa < subMat->alphabetSize; aa++) {
512         printf("%3c ", subMat->num2aa[aa]);
513     }
514     printf("\n");
515     for(int i = 0; i < this->L; i++){
516         printf("%3d ", i);
517         for(int aa = 0; aa < subMat->alphabetSize; aa++){
518 //            printf("%3d ", profile_for_alignment[aa * L + i] );
519             printf("%3d ", profile_score[i * PROFILE_ROW_SIZE + profile_index[i * PROFILE_ROW_SIZE + aa]] );
520         }
521         printf("\n");
522     }
523 }
524 
printProfile()525 void Sequence::printProfile(){
526     printf("Query profile of sequence %d\n", dbKey);
527     printf("Pos ");
528     for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++) {
529         printf("%3c ", subMat->num2aa[aa]);
530     }
531     printf("\n");
532     for(int i = 0; i < this->L; i++){
533         printf("%3d ", i);
534         for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++){
535             printf("%03.4f ", profile[i * PROFILE_AA_SIZE + aa] );
536         }
537         printf("\n");
538     }
539 }
540 
reverse()541 void Sequence::reverse() {
542     if(Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
543         short        tmpScore[PROFILE_AA_SIZE*4];
544         unsigned int tmpIndex[PROFILE_AA_SIZE*4];
545 
546         int i_curr = 0;
547         int j_curr = (this->L - 1) * PROFILE_ROW_SIZE;
548 
549         for (int i = 0; i < this->L/2; i++) {
550             memcpy(&tmpScore[0], profile_score + i_curr, PROFILE_ROW_SIZE * sizeof(short));
551             memcpy(&tmpIndex[0], profile_index + i_curr, PROFILE_ROW_SIZE * sizeof(unsigned int));
552             memcpy(profile_score + i_curr, profile_score + j_curr, PROFILE_ROW_SIZE * sizeof(short));
553             memcpy(profile_index + i_curr, profile_index + j_curr, PROFILE_ROW_SIZE * sizeof(unsigned int));
554             memcpy(profile_score + j_curr, &tmpScore[0], PROFILE_ROW_SIZE * sizeof(short));
555             memcpy(profile_index + j_curr, &tmpIndex[0], PROFILE_ROW_SIZE * sizeof(unsigned int));
556             i_curr += PROFILE_ROW_SIZE;
557             j_curr -= PROFILE_ROW_SIZE;
558         }
559     }
560     std::reverse(numSequence, numSequence + this->L); // reverse sequence
561 }
562 
print()563 void Sequence::print() {
564     std::cout << "Sequence ID " << this->id << "\n";
565     for(int i = 0; i < this->L; i++){
566         printf("%c",subMat->num2aa[this->numSequence[i]]);
567     }
568     std::cout << std::endl;
569 }
570 
extractProfileData(const char * data,const BaseMatrix & subMat,const int offset,std::string & result)571 void extractProfileData(const char* data, const BaseMatrix &subMat, const int offset, std::string &result) {
572     size_t i = 0;
573     while (data[i] != '\0'){
574         result.append(1, subMat.num2aa[(int)data[i + Sequence::PROFILE_AA_SIZE + offset]]);
575         i += Sequence::PROFILE_READIN_SIZE;
576     }
577 }
578 
extractProfileSequence(const char * data,const BaseMatrix & submat,std::string & result)579 void Sequence::extractProfileSequence(const char* data, const BaseMatrix &submat, std::string &result) {
580     extractProfileData(data, submat, 0, result);
581 }
582 
extractProfileConsensus(const char * data,const BaseMatrix & submat,std::string & result)583 void Sequence::extractProfileConsensus(const char* data, const BaseMatrix &submat, std::string &result) {
584     extractProfileData(data, submat, 1, result);
585 }
586