1 #include "Sequence.h"
2 #include "Debug.h"
3 #include "Util.h"
4 #include "simd.h"
5 #include "SubstitutionMatrix.h"
6 #include "Parameters.h"
7 #include "MathUtil.h"
8 #include "SubstitutionMatrixProfileStates.h"
9 #include "PSSMCalculator.h"
10
11 #include <climits> // short_max
12 #include <cstddef>
13
Sequence(size_t maxLen,int seqType,const BaseMatrix * subMat,const unsigned int kmerSize,const bool spaced,const bool aaBiasCorrection,bool shouldAddPC,const std::string & userSpacedKmerPattern)14 Sequence::Sequence(size_t maxLen, int seqType, const BaseMatrix *subMat, const unsigned int kmerSize, const bool spaced, const bool aaBiasCorrection, bool shouldAddPC, const std::string& userSpacedKmerPattern) {
15 this->maxLen = maxLen;
16 this->numSequence = (unsigned char*)malloc(maxLen + 1);
17 this->numConsensusSequence = (unsigned char*)malloc(maxLen + 1);
18 this->aaBiasCorrection = aaBiasCorrection;
19 this->subMat = (BaseMatrix*)subMat;
20 this->spaced = spaced;
21 this->seqType = seqType;
22 std::pair<const char *, unsigned int> spacedKmerInformation;
23 if (spaced == true && userSpacedKmerPattern.empty() == false) {
24 spacedKmerInformation = parseSpacedPattern(kmerSize, spaced, userSpacedKmerPattern);
25 } else {
26 spacedKmerInformation = getSpacedPattern(spaced, kmerSize);
27 }
28 this->spacedPattern = spacedKmerInformation.first;
29 this->spacedPatternSize = spacedKmerInformation.second;
30 this->kmerSize = kmerSize;
31 this->kmerWindow = NULL;
32 this->aaPosInSpacedPattern = NULL;
33 this->shouldAddPC = shouldAddPC;
34 this->userSpacedKmerPattern = userSpacedKmerPattern;
35 if(spacedPatternSize){
36 simdKmerRegisterCnt = (kmerSize / (VECSIZE_INT*4)) + 1;
37 unsigned int simdKmerLen = simdKmerRegisterCnt * (VECSIZE_INT*4); // for SIMD memory alignment
38 this->kmerWindow = (unsigned char*) mem_align(ALIGN_INT, simdKmerLen * sizeof(unsigned char));
39 memset(this->kmerWindow, 0, simdKmerLen * sizeof(unsigned char));
40 this->aaPosInSpacedPattern = new unsigned char[kmerSize];
41 if(spacedPattern == NULL) {
42 Debug(Debug::ERROR) << "Sequence does not have a kmerSize (kmerSize= " << spacedPatternSize << ") to use nextKmer.\n";
43 Debug(Debug::ERROR) << "Please report this bug to the developer\n";
44 EXIT(EXIT_FAILURE);
45 }
46 size_t pos = 0;
47 for(int i = 0; i < this->spacedPatternSize; i++) {
48 if(spacedPattern[i]){
49 aaPosInSpacedPattern[pos] = i;
50 pos++;
51 }
52 }
53 }
54
55 // init memory for profile search
56 if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
57
58 profile_matrix = new ScoreMatrix*[PROFILE_AA_SIZE]; // init 20 matrix pointer (its more than enough for all kmer parameter)
59 for (size_t i = 0; i < kmerSize; i++) {
60 profile_matrix[i] = new ScoreMatrix(NULL, NULL, PROFILE_AA_SIZE, PROFILE_ROW_SIZE);
61 }
62 this->pNullBuffer = new float[maxLen + 1];
63 this->neffM = new float[maxLen + 1];
64 this->profile_score = (short *) mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_ROW_SIZE * sizeof(short));
65 this->profile_index = (unsigned int *) mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_ROW_SIZE * sizeof(int));
66 this->profile = (float *) mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_AA_SIZE * sizeof(float));
67 this->pseudocountsWeight = (float *) mem_align(ALIGN_INT, (maxLen + 1) * PROFILE_ROW_SIZE * sizeof(float));
68 this->profile_for_alignment = (int8_t *) mem_align(ALIGN_INT, (maxLen + 1) * subMat->alphabetSize * sizeof(int8_t));
69 // init profile
70 memset(this->profile_for_alignment, 0, (maxLen + 1) * subMat->alphabetSize * sizeof(int8_t));
71 memset(this->profile, 0, (maxLen + 1) * PROFILE_AA_SIZE * sizeof(float));
72 for (size_t i = 0; i < (maxLen + 1) * PROFILE_ROW_SIZE; ++i){
73 profile_score[i] = -SHRT_MAX;
74 profile_index[i] = UINT_MAX;
75 }
76 } else {
77 profile_matrix = NULL;
78 }
79 currItPos = -1;
80 }
81
~Sequence()82 Sequence::~Sequence() {
83 delete[] spacedPattern;
84 free(numSequence);
85 free(numConsensusSequence);
86 if (kmerWindow) {
87 free(kmerWindow);
88 }
89 if (aaPosInSpacedPattern){
90 delete[] aaPosInSpacedPattern;
91 }
92 if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE)|| Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
93 for (size_t i = 0; i < kmerSize; ++i) {
94 delete profile_matrix[i];
95 }
96 delete[] profile_matrix;
97 delete[] neffM;
98 delete[] pNullBuffer;
99 free(profile);
100 free(pseudocountsWeight);
101 free(profile_score);
102 free(profile_index);
103 free(profile_for_alignment);
104 }
105 }
106
getSpacedPattern(bool spaced,unsigned int kmerSize)107 std::pair<const char *, unsigned int> Sequence::getSpacedPattern(bool spaced, unsigned int kmerSize){
108 #define CASE(x) {case x: \
109 if(spaced){ \
110 pair = std::make_pair<const char *, unsigned int>((const char *) &spaced_seed_##x, ARRAY_SIZE(spaced_seed_##x)); \
111 }else{ \
112 pair = std::make_pair<const char *, unsigned int>((const char *) &seed_##x, ARRAY_SIZE(seed_##x)); \
113 } \
114 break; \
115 }
116 std::pair<const char *, unsigned int> pair;
117 switch (kmerSize) {
118 case 0: // if no kmer iterator support
119 pair = std::make_pair<const char *, unsigned int>(NULL, 0);
120 break;
121 CASE(4)
122 CASE(5)
123 CASE(6)
124 CASE(7)
125 CASE(8)
126 CASE(9)
127 CASE(10)
128 CASE(11)
129 CASE(12)
130 CASE(13)
131 CASE(14)
132 CASE(15)
133 CASE(16)
134 CASE(17)
135 CASE(18)
136 CASE(19)
137 CASE(20)
138 CASE(21)
139 CASE(22)
140 CASE(23)
141 CASE(24)
142 CASE(25)
143 CASE(26)
144 CASE(27)
145 CASE(28)
146 CASE(29)
147 CASE(30)
148 default:
149
150 char * pattern = new char[kmerSize];
151 for(size_t i = 0; i < kmerSize; i++){
152 pattern[i]=1;
153 }
154 return std::make_pair<const char *, unsigned int>(const_cast<const char*>(pattern), static_cast<unsigned int>(kmerSize));
155
156 // Debug(Debug::ERROR) << "Did not find spaced pattern for kmerSize: " << kmerSize << ". \n";
157 // Debug(Debug::ERROR) << "Please report this bug to the developer\n";
158 // EXIT(EXIT_FAILURE);
159 break;
160 }
161 char * pattern = new char[pair.second];
162 if (pair.second > 0) {
163 memcpy(pattern, pair.first, pair.second * sizeof(char));
164 }
165 return std::make_pair<const char *, unsigned int>(const_cast<const char*>(pattern), static_cast<unsigned int>(pair.second));
166 #undef CASE
167 }
168
parseSpacedPattern(unsigned int kmerSize,bool spaced,const std::string & spacedKmerPattern)169 std::pair<const char *, unsigned int> Sequence::parseSpacedPattern(unsigned int kmerSize, bool spaced, const std::string& spacedKmerPattern) {
170 bool spacedKmerPatternSpaced = false;
171 unsigned int spacedKmerPatternKmerSize = 0;
172 char* pattern = new char[spacedKmerPattern.size()];
173 for (size_t i = 0; i < spacedKmerPattern.size(); ++i) {
174 switch (spacedKmerPattern[i]) {
175 case '0':
176 spacedKmerPatternSpaced = true;
177 pattern[i] = 0;
178 break;
179 case '1':
180 spacedKmerPatternKmerSize += 1;
181 pattern[i] = 1;
182 break;
183 default:
184 Debug(Debug::ERROR) << "Invalid character in user-specified k-mer pattern\n";
185 EXIT(EXIT_FAILURE);
186 break;
187 }
188 }
189 if (spacedKmerPatternKmerSize != kmerSize){
190 Debug(Debug::ERROR) << "User-specified k-mer pattern is not consistent with stated k-mer size\n";
191 EXIT(EXIT_FAILURE);
192 } else if (spacedKmerPatternSpaced != spaced) {
193 Debug(Debug::ERROR) << "User-specified k-mer pattern is not consistent with spaced k-mer true/false\n";
194 EXIT(EXIT_FAILURE);
195 }
196 return std::make_pair<const char *, unsigned int>((const char *) pattern, spacedKmerPattern.size());
197 }
198
mapSequence(size_t id,unsigned int dbKey,const char * sequence,unsigned int seqLen,bool mapProfileScores)199 void Sequence::mapSequence(size_t id, unsigned int dbKey, const char *sequence, unsigned int seqLen, bool mapProfileScores) {
200 this->id = id;
201 this->dbKey = dbKey;
202 this->seqData = sequence;
203 if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_AMINO_ACIDS) || Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_NUCLEOTIDES)) {
204 mapSequence(sequence, seqLen);
205 } else if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_HMM_PROFILE)) {
206 mapProfile(sequence, mapProfileScores, seqLen);
207 } else if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ)) {
208 mapProfileStateSequence(sequence, seqLen);
209 }else if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
210 switch(subMat->alphabetSize) {
211 case 8:
212 mapProfileState<8>(sequence, seqLen);
213 break;
214 case 32:
215 mapProfileState<32>(sequence, seqLen);
216 break;
217 case 219:
218 mapProfileState<219>(sequence, seqLen);
219 break;
220 case 255:
221 mapProfileState<255>(sequence, seqLen);
222 break;
223 default:
224 Debug(Debug::ERROR) << "Invalid alphabet size type!\n";
225 EXIT(EXIT_FAILURE);
226 break;
227 }
228 } else {
229 Debug(Debug::ERROR) << "Invalid sequence type!\n";
230 EXIT(EXIT_FAILURE);
231 }
232 currItPos = -1;
233
234 }
235
mapSequence(size_t id,unsigned int dbKey,std::pair<const unsigned char *,const unsigned int> data)236 void Sequence::mapSequence(size_t id, unsigned int dbKey, std::pair<const unsigned char *,const unsigned int> data){
237 this->id = id;
238 this->dbKey = dbKey;
239 if (Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_AMINO_ACIDS)
240 || Parameters::isEqualDbtype( this->seqType,Parameters::DBTYPE_NUCLEOTIDES)
241 || Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ)){
242 this->L = data.second;
243 if(this->L >= static_cast<int>(maxLen)){
244 numSequence = static_cast<unsigned char *>(realloc(numSequence, this->L+1));
245 maxLen = this->L;
246 }
247 memcpy(this->numSequence, data.first, this->L);
248 } else {
249 Debug(Debug::ERROR) << "Invalid sequence type!\n";
250 EXIT(EXIT_FAILURE);
251 }
252 currItPos = -1;
253 }
254
mapProfileStateSequence(const char * profileStateSeq,unsigned int seqLen)255 void Sequence::mapProfileStateSequence(const char * profileStateSeq, unsigned int seqLen){
256 size_t l = 0;
257 size_t pos = 0;
258 unsigned char curr = profileStateSeq[pos];
259 while (curr != '\0' && l < seqLen){
260
261 this->numSequence[l] = curr - 1;
262
263 l++;
264 if (l > maxLen){
265 Debug(Debug::ERROR) << "Sequence too long! Max length allowed would be " << maxLen << "\n";
266 EXIT(EXIT_FAILURE);
267 }
268 pos++;
269 curr = profileStateSeq[pos];
270 }
271 this->L = l;
272 }
273
274
275
mapProfile(const char * profileData,bool mapScores,unsigned int seqLen)276 void Sequence::mapProfile(const char * profileData, bool mapScores, unsigned int seqLen){
277 char * data = (char *) profileData;
278 size_t currPos = 0;
279 float scoreBias = 0.0;
280 // if no data exists
281 {
282 size_t l = 0;
283 while (data[currPos] != '\0' && l < maxLen && l < seqLen){
284 for (size_t aa_idx = 0; aa_idx < PROFILE_AA_SIZE; aa_idx++) {
285 // shift bytes back (avoids NULL byte)
286 // short value = static_cast<short>( ^ mask);
287 profile[l * PROFILE_AA_SIZE + aa_idx] = scoreUnmask(data[currPos + aa_idx]);
288 //value * 4;
289 }
290
291 float sumProb = 0.0;
292 for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++){
293 sumProb += profile[l * PROFILE_AA_SIZE + aa];
294 }
295 if(sumProb > 0.9){
296 MathUtil::NormalizeTo1(&profile[l * PROFILE_AA_SIZE], PROFILE_AA_SIZE);
297 }
298
299 unsigned char queryLetter = data[currPos + PROFILE_AA_SIZE];
300 // read query sequence
301 numSequence[l] = queryLetter; // index 0 is the highst scoring one
302 unsigned char consensusLetter = data[currPos + PROFILE_AA_SIZE+1];
303 numConsensusSequence[l] = consensusLetter;
304 unsigned short neff = data[currPos + PROFILE_AA_SIZE+2];
305 neffM[l] = MathUtil::convertNeffToFloat(neff);
306 l++;
307
308
309 // go to begin of next entry 0, 20, 40, 60, ...
310 currPos += PROFILE_READIN_SIZE;
311 }
312 this->L = l;
313 if(l > maxLen ){
314 Debug(Debug::INFO) << "Entry " << dbKey << " is longer than max seq. len " << maxLen << "\n";
315 }
316
317 }
318
319
320 // TODO: Make dependency explicit
321 float pca = Parameters::getInstance().pca;
322 if(shouldAddPC && pca > 0.0){
323 PSSMCalculator::preparePseudoCounts(profile, pseudocountsWeight, PROFILE_AA_SIZE, L,
324 (const float **) subMat->subMatrixPseudoCounts);
325 float pcb = Parameters::getInstance().pcb;
326 PSSMCalculator::computePseudoCounts(profile, profile, pseudocountsWeight, PROFILE_AA_SIZE, neffM, L, pca, pcb);
327 }
328 // printProfile();
329
330 if (mapScores) {
331 for(int l = 0; l < this->L; l++) {
332 // MathUtil::NormalizeTo1(&profile[l * profile_row_size], PROFILE_AA_SIZE);
333 for (size_t aa_idx = 0; aa_idx < PROFILE_AA_SIZE; aa_idx++) {
334 float bitScore = probaToBitScore(profile[l * PROFILE_AA_SIZE + aa_idx], subMat->pBack[aa_idx]);
335 if(bitScore<=-128){ //X state
336 bitScore = -1;
337 }
338 const float bitScore8 = bitScore * 2.0 + scoreBias;
339 profile_score[l * PROFILE_ROW_SIZE + aa_idx] = static_cast<short>( ((bitScore8 < 0.0) ? bitScore8 - 0.5 : bitScore8 + 0.5) );
340 profile_score[l * PROFILE_ROW_SIZE + aa_idx] = profile_score[l * PROFILE_ROW_SIZE + aa_idx] * 4;
341 }
342 }
343 //printPSSM();
344
345 if (aaBiasCorrection == true){
346 SubstitutionMatrix::calcGlobalAaBiasCorrection(subMat, profile_score, pNullBuffer, PROFILE_ROW_SIZE, this->L);
347 }
348
349 // sort profile scores and index for KmerGenerator (prefilter step)
350 for (int i = 0; i < this->L; i++){
351 unsigned int indexArray[PROFILE_AA_SIZE] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 };
352 Util::rankedDescSort20(&profile_score[i * PROFILE_ROW_SIZE], (unsigned int *) &indexArray);
353 memcpy(&profile_index[i * PROFILE_ROW_SIZE], &indexArray, PROFILE_AA_SIZE * sizeof(int));
354 }
355
356 // write alignment profile
357 for (int i = 0; i < this->L; i++){
358 for (size_t aa_num = 0; aa_num < PROFILE_AA_SIZE; aa_num++) {
359 unsigned int aa_idx = profile_index[i * PROFILE_ROW_SIZE + aa_num];
360 profile_for_alignment[aa_idx * this-> L + i] = profile_score[i * PROFILE_ROW_SIZE + aa_num] / 4;
361 }
362 }
363 //TODO what is with the X
364 }
365 //printPSSM();
366
367 // printProfile();
368 }
369
370
371 template <int T>
mapProfileState(const char * profileState,unsigned int seqLen)372 void Sequence::mapProfileState(const char * profileState, unsigned int seqLen){
373 mapProfile(profileState, false, seqLen);
374
375 SubstitutionMatrixProfileStates * profileStateMat = (SubstitutionMatrixProfileStates *) subMat;
376 // compute avg. amino acid probability
377 float pav[20];
378 // initialize vector of average aa freqs with pseudocounts
379 for (int a = 0; a < 20; a++){
380 pav[a] = subMat->pBack[a] * 10.0;
381 }
382 // calculate averages
383 for (int i = 0; i < L; ++i){
384 for (int a = 0; a < 20; a++){
385 pav[a] += profile[i * Sequence::PROFILE_AA_SIZE + a];
386 }
387 }
388 // Normalize vector of average aa frequencies pav[a]
389 MathUtil::NormalizeTo1(pav, Sequence::PROFILE_AA_SIZE);
390
391 // log (S(i,k)) = log ( SUM_a p(i,a) * p(k,a) / f(a) ) k: column state, i: pos in ali, a: amino acid
392 if(profileStateMat->alphabetSize != 255 && profileStateMat->alphabetSize != 219){
393 for (int i = 0; i < L; i++){
394 for (int k = 0; k < profileStateMat->alphabetSize; k++) {
395 // compute log score for all 32 profile states
396 float sum = profileStateMat->scoreState(&profile[i * Sequence::PROFILE_AA_SIZE], pav, k);
397 float pssmVal = (sum) * 10.0 * profileStateMat->getScoreNormalization();
398 profile_score[i * PROFILE_ROW_SIZE + k] = static_cast<short>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
399 }
400 }
401 // printProfileStatePSSM();
402
403 if(aaBiasCorrection==true){
404 //TODO use new formular
405 SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(profile_score, PROFILE_ROW_SIZE, this->L, profileStateMat->alphabetSize);
406 }
407 // printProfileStatePSSM();
408
409 // sort profile scores and index for KmerGenerator (prefilter step)
410 for(int l = 0; l < this->L; l++){
411 unsigned int indexArray[T] = { 0, 1, 2, 3, 4, 5, 6, 7 };
412 switch (T) {
413 case 8:
414 Util::rankedDescSort8(&profile_score[l * PROFILE_ROW_SIZE], (unsigned int *) &indexArray);
415 break;
416 case 32:
417 Util::rankedDescSort32(&profile_score[l * PROFILE_ROW_SIZE], (unsigned int *) &indexArray);
418 break;
419 default:
420 Debug(Debug::ERROR) << "Sort for T of " << T << " is not defined \n";
421 EXIT(EXIT_FAILURE);
422 break;
423 }
424
425 memcpy(&profile_index[l * PROFILE_ROW_SIZE], &indexArray, T * sizeof(int) );
426 // create consensus sequence
427 // sequence[l] = indexArray[0]; // index 0 is the highst scoring one
428 }
429
430 // write alignment profile
431 for(int l = 0; l < this->L; l++){
432 for(size_t aa_num = 0; aa_num < T; aa_num++) {
433 unsigned int aa_idx = profile_index[l * PROFILE_ROW_SIZE + aa_num];
434 float scale = 5.0*profileStateMat->getScoreNormalization();
435 float score = static_cast<float>(profile_score[l * PROFILE_ROW_SIZE + aa_num]);
436 float pssmVal = score/scale;
437 profile_for_alignment[aa_idx * this->L + l] = static_cast<short>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
438 }
439 }
440 } else {
441 // write alignment profile
442 for (int l = 0; l < this->L; ++l) {
443 for (size_t aa_num = 0; aa_num < static_cast<size_t>(subMat->alphabetSize); ++aa_num) {
444 float sum = profileStateMat->scoreState(&profile[l * Sequence::PROFILE_AA_SIZE], pav, aa_num);
445 float pssmVal = 2.0 * sum * profileStateMat->getScoreNormalization();
446 profile_for_alignment[aa_num * this->L + l] = static_cast<short>((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5);
447 }
448 }
449 if(aaBiasCorrection==true){
450 SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(profile_for_alignment,this->L,profileStateMat->alphabetSize,subMat);
451 }
452 /*
453 //TEST with a neg bias to avoid over extension
454 for (int l = 0; l < this->L; ++l) {
455 for (size_t aa_num = 0; aa_num < static_cast<size_t>(subMat->alphabetSize); ++aa_num) {
456 profile_for_alignment[aa_num * this->L + l] -= 1;
457 }
458 }*/
459
460 }
461 }
462
nextProfileKmer()463 void Sequence::nextProfileKmer() {
464 int pos = 0;
465 for (int i = 0; i < spacedPatternSize; i++) {
466 if (spacedPattern[i]) {
467 unsigned int *index = profile_index + ((currItPos + i) * PROFILE_ROW_SIZE);
468 short *score = profile_score + ((currItPos + i) * PROFILE_ROW_SIZE);
469 profile_matrix[pos]->index = index;
470 profile_matrix[pos]->score = score;
471 pos++;
472 }
473 }
474 }
475
mapSequence(const char * sequence,unsigned int dataLen)476 void Sequence::mapSequence(const char * sequence, unsigned int dataLen){
477 size_t l = 0;
478 char curr = sequence[l];
479 if(dataLen >= maxLen){
480 numSequence = static_cast<unsigned char*>(realloc(numSequence, dataLen+1));
481 maxLen = dataLen;
482 }
483 while (curr != '\0' && curr != '\n' && l < dataLen && l < maxLen){
484 this->numSequence[l] = subMat->aa2num[static_cast<int>(curr)];
485 l++;
486 curr = sequence[l];
487 }
488 this->L = l;
489 }
490
printPSSM()491 void Sequence::printPSSM(){
492 printf("Query profile of sequence %d\n", dbKey);
493 printf("Pos ");
494 for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++) {
495 printf("%3c ", subMat->num2aa[aa]);
496 }
497 printf("Neff \n");
498 for(int i = 0; i < this->L; i++){
499 printf("%3d ", i);
500 for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++){
501 printf("%3d ", profile_for_alignment[aa * L + i] );
502 // printf("%3d ", profile_score[i * profile_row_size + aa] );
503 }
504 printf("%.1f\n",neffM[i]);
505 }
506 }
507
printProfileStatePSSM()508 void Sequence::printProfileStatePSSM(){
509 printf("Query profile of sequence %d\n", dbKey);
510 printf("Pos ");
511 for(int aa = 0; aa < subMat->alphabetSize; aa++) {
512 printf("%3c ", subMat->num2aa[aa]);
513 }
514 printf("\n");
515 for(int i = 0; i < this->L; i++){
516 printf("%3d ", i);
517 for(int aa = 0; aa < subMat->alphabetSize; aa++){
518 // printf("%3d ", profile_for_alignment[aa * L + i] );
519 printf("%3d ", profile_score[i * PROFILE_ROW_SIZE + profile_index[i * PROFILE_ROW_SIZE + aa]] );
520 }
521 printf("\n");
522 }
523 }
524
printProfile()525 void Sequence::printProfile(){
526 printf("Query profile of sequence %d\n", dbKey);
527 printf("Pos ");
528 for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++) {
529 printf("%3c ", subMat->num2aa[aa]);
530 }
531 printf("\n");
532 for(int i = 0; i < this->L; i++){
533 printf("%3d ", i);
534 for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++){
535 printf("%03.4f ", profile[i * PROFILE_AA_SIZE + aa] );
536 }
537 printf("\n");
538 }
539 }
540
reverse()541 void Sequence::reverse() {
542 if(Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
543 short tmpScore[PROFILE_AA_SIZE*4];
544 unsigned int tmpIndex[PROFILE_AA_SIZE*4];
545
546 int i_curr = 0;
547 int j_curr = (this->L - 1) * PROFILE_ROW_SIZE;
548
549 for (int i = 0; i < this->L/2; i++) {
550 memcpy(&tmpScore[0], profile_score + i_curr, PROFILE_ROW_SIZE * sizeof(short));
551 memcpy(&tmpIndex[0], profile_index + i_curr, PROFILE_ROW_SIZE * sizeof(unsigned int));
552 memcpy(profile_score + i_curr, profile_score + j_curr, PROFILE_ROW_SIZE * sizeof(short));
553 memcpy(profile_index + i_curr, profile_index + j_curr, PROFILE_ROW_SIZE * sizeof(unsigned int));
554 memcpy(profile_score + j_curr, &tmpScore[0], PROFILE_ROW_SIZE * sizeof(short));
555 memcpy(profile_index + j_curr, &tmpIndex[0], PROFILE_ROW_SIZE * sizeof(unsigned int));
556 i_curr += PROFILE_ROW_SIZE;
557 j_curr -= PROFILE_ROW_SIZE;
558 }
559 }
560 std::reverse(numSequence, numSequence + this->L); // reverse sequence
561 }
562
print()563 void Sequence::print() {
564 std::cout << "Sequence ID " << this->id << "\n";
565 for(int i = 0; i < this->L; i++){
566 printf("%c",subMat->num2aa[this->numSequence[i]]);
567 }
568 std::cout << std::endl;
569 }
570
extractProfileData(const char * data,const BaseMatrix & subMat,const int offset,std::string & result)571 void extractProfileData(const char* data, const BaseMatrix &subMat, const int offset, std::string &result) {
572 size_t i = 0;
573 while (data[i] != '\0'){
574 result.append(1, subMat.num2aa[(int)data[i + Sequence::PROFILE_AA_SIZE + offset]]);
575 i += Sequence::PROFILE_READIN_SIZE;
576 }
577 }
578
extractProfileSequence(const char * data,const BaseMatrix & submat,std::string & result)579 void Sequence::extractProfileSequence(const char* data, const BaseMatrix &submat, std::string &result) {
580 extractProfileData(data, submat, 0, result);
581 }
582
extractProfileConsensus(const char * data,const BaseMatrix & submat,std::string & result)583 void Sequence::extractProfileConsensus(const char* data, const BaseMatrix &submat, std::string &result) {
584 extractProfileData(data, submat, 1, result);
585 }
586