1 #include <iostream>
2 #include <list>
3 #include <algorithm>
4 #include <cmath>
5 
6 #include "SequenceLookup.h"
7 #include "SubstitutionMatrix.h"
8 #include "UngappedAlignment.h"
9 #include "ExtendedSubstitutionMatrix.h"
10 #include "FileUtil.h"
11 
12 #include "kseq.h"
13 #include <unistd.h> // read
14 
15 KSEQ_INIT(int, read)
16 
17 #include "Clustering.h"
18 #include "DBReader.h"
19 #include "DBWriter.h"
20 #include "Parameters.h"
21 
22 const char* binary_name = "test_diagonalscoringperformance";
23 
main(int,const char **)24 int main (int, const char**) {
25     size_t kmer_size = 6;
26     Parameters& par = Parameters::getInstance();
27     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 8.0, 0.0);
28     SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
29 
30     std::string S1 = "AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL";
31     const char* S1char = S1.c_str();
32 //    std::cout << S1char << "\n\n";
33     Sequence s1(10000,  0, &subMat, kmer_size, true, false);
34     s1.mapSequence(0,0,S1char, S1.size());
35 
36     std::string S2 = "MLKIRYSSAFKKDLKPFQHDKSAISVINTVLKLLATGKPLPREYKEHSLKGDYIGYLECHGKPDLLLIYKRTEQEVFLYRVGSHAKLF";
37     const char* S2char = S2.c_str();
38 //    std::cout << S2char << "\n\n";
39     Sequence s2(10000,  0, &subMat, kmer_size, true, false);
40     s2.mapSequence(0,0,S2char, S2.size());
41 
42     FILE *fasta_file = FileUtil::openFileOrDie("/Users/mad/Documents/databases/mmseqs_benchmark/benchmarks/clustering_benchmark/db/db_full.fas", "r", true);
43     kseq_t *seq = kseq_init(fileno(fasta_file));
44     size_t dbEntrySize = 0;
45     size_t dbCnt = 0;
46     while (kseq_read(seq) >= 0) {
47         dbEntrySize += seq->seq.l;
48         dbCnt += 1;
49     }
50     SequenceLookup lookup(dbCnt*10, dbEntrySize*10);
51     //kseq_destroy(seq);
52     Sequence dbSeq(40000,  0, &subMat, kmer_size, true, false);
53     size_t id = 0;
54     size_t maxLen = 0;
55     for(size_t i = 0; i < 10; i++){
56         fclose(fasta_file);
57         fasta_file = FileUtil::openFileOrDie("/Users/mad/Documents/databases/mmseqs_benchmark/benchmarks/clustering_benchmark/db/db_full.fas", "r", true);
58         kseq_rewind(seq);
59         while (kseq_read(seq) >= 0) {
60             dbSeq.mapSequence(id,id,seq->seq.s, seq->seq.l);
61             maxLen = std::max(seq->seq.l, maxLen);
62 //        if(id == 202423){
63 //            std::cout << seq->seq.s << std::endl;
64 //        }
65             lookup.addSequence(&dbSeq);
66 
67             id += 1;
68         }
69     }
70     kseq_destroy(seq);
71     std::cout << maxLen << std::endl;
72     UngappedAlignment matcher(maxLen, &subMat, &lookup);
73     CounterResult hits[16000];
74     hits[0].id =142424;
75     hits[0].diagonal = 50;
76     hits[1].id = 191382;
77     hits[1].diagonal = 4;
78     hits[2].id = 135950;
79     hits[2].diagonal = 4;
80     hits[3].id = 63969;
81     hits[3].diagonal = 4;
82     hits[4].id = 244188;
83     hits[4].diagonal = 4;
84 
85     for(size_t i = 5; i < 16; i++) {
86         hits[i].id = 159147;
87         hits[i].diagonal = 31;
88     }
89 
90     float * compositionBias = new float[s1.L];
91     SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias);
92 
93 
94 
95     matcher.processQuery(&s1,compositionBias, hits, 16);
96     std::cout << (int)hits[0].count << " ";
97     std::cout << (int)hits[1].count << " ";
98     std::cout << (int)hits[2].count << " ";
99     std::cout << (int)hits[3].count << std::endl;
100 
101     matcher.processQuery(&s1, compositionBias, hits, 1);
102     matcher.processQuery(&s1, compositionBias, hits + 1, 1);
103     matcher.processQuery(&s1, compositionBias, hits + 2, 1);
104     matcher.processQuery(&s1, compositionBias, hits + 3, 1);
105 
106     std::cout << (int)hits[0].count<< " ";
107     std::cout << (int)hits[1].count<< " ";
108     std::cout << (int)hits[2].count<< " ";
109     std::cout << (int)hits[3].count<< std::endl;
110     for(size_t i = 0; i < 10000; i++){
111         for(int j = 1; j < 16000; j++){
112             hits[j].id = rand()%dbCnt;
113             hits[j].diagonal =  rand()%s1.L;
114         }
115         //   std::reverse(hits, hits+1000);
116         matcher.processQuery(&s1, compositionBias, hits, 16000);
117     }
118 //    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.sequence, s1.sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore <<  std::endl;
119 //    std::cout << (int)hits[0].diagonalScore <<  std::endl;
120     for(int i = 0; i < 1000; i++){
121         std::cout << hits[i].id << "\t" << (int) hits[i].diagonal  << "\t" << (int)hits[i].count <<  std::endl;
122     }
123     delete [] compositionBias;
124 }
125