1 #include <iostream>
2 #include <list>
3 #include <algorithm>
4 #include <cmath>
5
6 #include "SequenceLookup.h"
7 #include "SubstitutionMatrix.h"
8 #include "UngappedAlignment.h"
9 #include "ExtendedSubstitutionMatrix.h"
10 #include "FileUtil.h"
11
12 #include "kseq.h"
13 #include <unistd.h> // read
14
15 KSEQ_INIT(int, read)
16
17 #include "Clustering.h"
18 #include "DBReader.h"
19 #include "DBWriter.h"
20 #include "Parameters.h"
21
22 const char* binary_name = "test_diagonalscoringperformance";
23
main(int,const char **)24 int main (int, const char**) {
25 size_t kmer_size = 6;
26 Parameters& par = Parameters::getInstance();
27 SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 8.0, 0.0);
28 SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
29
30 std::string S1 = "AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL";
31 const char* S1char = S1.c_str();
32 // std::cout << S1char << "\n\n";
33 Sequence s1(10000, 0, &subMat, kmer_size, true, false);
34 s1.mapSequence(0,0,S1char, S1.size());
35
36 std::string S2 = "MLKIRYSSAFKKDLKPFQHDKSAISVINTVLKLLATGKPLPREYKEHSLKGDYIGYLECHGKPDLLLIYKRTEQEVFLYRVGSHAKLF";
37 const char* S2char = S2.c_str();
38 // std::cout << S2char << "\n\n";
39 Sequence s2(10000, 0, &subMat, kmer_size, true, false);
40 s2.mapSequence(0,0,S2char, S2.size());
41
42 FILE *fasta_file = FileUtil::openFileOrDie("/Users/mad/Documents/databases/mmseqs_benchmark/benchmarks/clustering_benchmark/db/db_full.fas", "r", true);
43 kseq_t *seq = kseq_init(fileno(fasta_file));
44 size_t dbEntrySize = 0;
45 size_t dbCnt = 0;
46 while (kseq_read(seq) >= 0) {
47 dbEntrySize += seq->seq.l;
48 dbCnt += 1;
49 }
50 SequenceLookup lookup(dbCnt*10, dbEntrySize*10);
51 //kseq_destroy(seq);
52 Sequence dbSeq(40000, 0, &subMat, kmer_size, true, false);
53 size_t id = 0;
54 size_t maxLen = 0;
55 for(size_t i = 0; i < 10; i++){
56 fclose(fasta_file);
57 fasta_file = FileUtil::openFileOrDie("/Users/mad/Documents/databases/mmseqs_benchmark/benchmarks/clustering_benchmark/db/db_full.fas", "r", true);
58 kseq_rewind(seq);
59 while (kseq_read(seq) >= 0) {
60 dbSeq.mapSequence(id,id,seq->seq.s, seq->seq.l);
61 maxLen = std::max(seq->seq.l, maxLen);
62 // if(id == 202423){
63 // std::cout << seq->seq.s << std::endl;
64 // }
65 lookup.addSequence(&dbSeq);
66
67 id += 1;
68 }
69 }
70 kseq_destroy(seq);
71 std::cout << maxLen << std::endl;
72 UngappedAlignment matcher(maxLen, &subMat, &lookup);
73 CounterResult hits[16000];
74 hits[0].id =142424;
75 hits[0].diagonal = 50;
76 hits[1].id = 191382;
77 hits[1].diagonal = 4;
78 hits[2].id = 135950;
79 hits[2].diagonal = 4;
80 hits[3].id = 63969;
81 hits[3].diagonal = 4;
82 hits[4].id = 244188;
83 hits[4].diagonal = 4;
84
85 for(size_t i = 5; i < 16; i++) {
86 hits[i].id = 159147;
87 hits[i].diagonal = 31;
88 }
89
90 float * compositionBias = new float[s1.L];
91 SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias);
92
93
94
95 matcher.processQuery(&s1,compositionBias, hits, 16);
96 std::cout << (int)hits[0].count << " ";
97 std::cout << (int)hits[1].count << " ";
98 std::cout << (int)hits[2].count << " ";
99 std::cout << (int)hits[3].count << std::endl;
100
101 matcher.processQuery(&s1, compositionBias, hits, 1);
102 matcher.processQuery(&s1, compositionBias, hits + 1, 1);
103 matcher.processQuery(&s1, compositionBias, hits + 2, 1);
104 matcher.processQuery(&s1, compositionBias, hits + 3, 1);
105
106 std::cout << (int)hits[0].count<< " ";
107 std::cout << (int)hits[1].count<< " ";
108 std::cout << (int)hits[2].count<< " ";
109 std::cout << (int)hits[3].count<< std::endl;
110 for(size_t i = 0; i < 10000; i++){
111 for(int j = 1; j < 16000; j++){
112 hits[j].id = rand()%dbCnt;
113 hits[j].diagonal = rand()%s1.L;
114 }
115 // std::reverse(hits, hits+1000);
116 matcher.processQuery(&s1, compositionBias, hits, 16000);
117 }
118 // std::cout << ExtendedSubstitutionMatrix::calcScore(s1.sequence, s1.sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore << std::endl;
119 // std::cout << (int)hits[0].diagonalScore << std::endl;
120 for(int i = 0; i < 1000; i++){
121 std::cout << hits[i].id << "\t" << (int) hits[i].diagonal << "\t" << (int)hits[i].count << std::endl;
122 }
123 delete [] compositionBias;
124 }
125