1 /****
2 DIAMOND protein aligner
3 Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 ****/
18
19 #ifndef DAA_FILE_H_
20 #define DAA_FILE_H_
21
22 #include <string>
23 #include <exception>
24 #include "../util/ptr_vector.h"
25 #include "../basic/config.h"
26 #include "../basic/const.h"
27 #include "../util/binary_buffer.h"
28 #include "../util/io/input_file.h"
29 #include "../basic/value.h"
30 #include "../data/reference.h"
31
32 using std::string;
33
34 struct DAA_header1
35 {
36 enum { VERSION = 1, COMPATIBILITY_VERSION = 0 };
DAA_header1DAA_header137 DAA_header1():
38 magic_number (0x3c0e53476d3ee36bllu),
39 version (VERSION)
40 { }
41 uint64_t magic_number, version;
42 };
43
mode_str(int mode)44 inline const char* mode_str(int mode)
45 {
46 static const char* mode_str[] = { 0, 0, "blastp", "blastx", "blastn" };
47 return mode_str[mode];
48 }
49
50 struct DAA_header2
51 {
DAA_header2DAA_header252 DAA_header2():
53 diamond_build(Const::build_version)
54 {
55 memset(block_type, 0, sizeof(block_type));
56 memset(block_size, 0, sizeof(block_size));
57 }
DAA_header2DAA_header258 DAA_header2(uint64_t db_seqs,
59 uint64_t db_letters,
60 int32_t gap_open,
61 int32_t gap_extend,
62 int32_t reward,
63 int32_t penalty,
64 double k,
65 double lambda,
66 double evalue,
67 const string &score_matrix,
68 unsigned mode):
69 diamond_build (Const::build_version),
70 db_seqs (db_seqs),
71 db_seqs_used (0),
72 db_letters (db_letters),
73 flags (0),
74 query_records (0),
75 mode ((int32_t)mode),
76 gap_open (gap_open),
77 gap_extend (gap_extend),
78 reward (reward),
79 penalty (penalty),
80 reserved1 (0),
81 reserved2 (0),
82 reserved3 (0),
83 k (k),
84 lambda (lambda),
85 evalue (evalue),
86 reserved5 (0)
87 {
88 memset(block_type, 0, sizeof(block_type));
89 memset(block_size, 0, sizeof(block_size));
90 memset(this->score_matrix, 0, sizeof(this->score_matrix));
91 strcpy(this->score_matrix, score_matrix.c_str());
92 }
93 typedef enum { empty = 0, alignments = 1, ref_names = 2, ref_lengths = 3 } Block_type;
94 uint64_t diamond_build, db_seqs, db_seqs_used, db_letters, flags, query_records;
95 int32_t mode, gap_open, gap_extend, reward, penalty, reserved1, reserved2, reserved3;
96 double k, lambda, evalue, reserved5;
97 char score_matrix[16];
98 uint64_t block_size[256];
99 char block_type[256];
100 };
101
102 struct DAA_file
103 {
104
DAA_fileDAA_file105 DAA_file(const string& file_name):
106 f_ (file_name),
107 query_count_ (0)
108 {
109 f_.read(&h1_, 1);
110 if(h1_.magic_number != DAA_header1().magic_number)
111 throw std::runtime_error("Input file is not a DAA file.");
112 if(h1_.version > DAA_header1::VERSION)
113 throw std::runtime_error("DAA version requires later version of DIAMOND.");
114 f_.read(&h2_, 1);
115
116 if(h2_.block_size[0] == 0)
117 throw std::runtime_error("Invalid DAA file. DIAMOND run has probably not completed successfully.");
118
119 align_mode = Align_mode(h2_.mode);
120 //ref_header.sequences = h2_.db_seqs;
121
122 f_.seek(sizeof(DAA_header1) + sizeof(DAA_header2) + (size_t)h2_.block_size[0]);
123 string s;
124 ref_name_.reserve((size_t)h2_.db_seqs_used);
125 for(uint64_t i=0;i<h2_.db_seqs_used;++i) {
126 f_ >> s;
127 ref_name_.push_back(new string(s));
128 }
129 ref_len_.resize((size_t)h2_.db_seqs_used);
130 f_.read(ref_len_.data(), (size_t)h2_.db_seqs_used);
131
132 f_.seek(sizeof(DAA_header1) + sizeof(DAA_header2));
133 }
134
~DAA_fileDAA_file135 ~DAA_file()
136 {
137 f_.close();
138 }
139
diamond_buildDAA_file140 uint64_t diamond_build() const
141 { return h2_.diamond_build; }
142
db_seqsDAA_file143 uint64_t db_seqs() const
144 { return h2_.db_seqs; }
145
db_seqs_usedDAA_file146 uint64_t db_seqs_used() const
147 { return h2_.db_seqs_used; }
148
db_lettersDAA_file149 uint64_t db_letters() const
150 { return h2_.db_letters; }
151
score_matrixDAA_file152 const char* score_matrix() const
153 { return h2_.score_matrix; }
154
gap_open_penaltyDAA_file155 int32_t gap_open_penalty() const
156 { return h2_.gap_open; }
157
gap_extension_penaltyDAA_file158 int32_t gap_extension_penalty() const
159 { return h2_.gap_extend; }
160
match_rewardDAA_file161 int32_t match_reward() const
162 { return h2_.reward; }
163
mismatch_penaltyDAA_file164 int32_t mismatch_penalty() const
165 { return h2_.penalty; }
166
query_recordsDAA_file167 uint64_t query_records() const
168 { return h2_.query_records; }
169
modeDAA_file170 unsigned mode() const
171 { return (unsigned)h2_.mode; }
172
ref_nameDAA_file173 const string& ref_name(size_t i) const
174 { return ref_name_[i]; }
175
ref_lenDAA_file176 uint32_t ref_len(size_t i) const
177 { return ref_len_[i]; }
178
lambdaDAA_file179 double lambda() const
180 {
181 return h2_.lambda;
182 }
183
kappaDAA_file184 double kappa() const
185 {
186 return h2_.k;
187 }
188
evalueDAA_file189 double evalue() const
190 {
191 return h2_.evalue;
192 }
193
block_sizeDAA_file194 size_t block_size(size_t i) const {
195 return h2_.block_size[i];
196 }
197
ref_lenDAA_file198 const vector<uint32_t>& ref_len() const {
199 return ref_len_;
200 }
201
read_query_bufferDAA_file202 bool read_query_buffer(BinaryBuffer &buf, size_t &query_num)
203 {
204 uint32_t size;
205 f_.read(&size, 1);
206 if(size == 0)
207 return false;
208 buf.clear();
209 buf.resize(size);
210 f_.read(buf.data(), size);
211 query_num = query_count_++;
212 return true;
213 }
214
215 private:
216
217 InputFile f_;
218 size_t query_count_;
219 DAA_header1 h1_;
220 DAA_header2 h2_;
221 PtrVector<string> ref_name_;
222 vector<uint32_t> ref_len_;
223
224 };
225
226 #endif /* DAA_FILE_H_ */
227