1 /****
2 DIAMOND protein aligner
3 Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
4 
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 ****/
18 
19 #ifndef DAA_FILE_H_
20 #define DAA_FILE_H_
21 
22 #include <string>
23 #include <exception>
24 #include "../util/ptr_vector.h"
25 #include "../basic/config.h"
26 #include "../basic/const.h"
27 #include "../util/binary_buffer.h"
28 #include "../util/io/input_file.h"
29 #include "../basic/value.h"
30 #include "../data/reference.h"
31 
32 using std::string;
33 
34 struct DAA_header1
35 {
36 	enum { VERSION = 1, COMPATIBILITY_VERSION = 0 };
DAA_header1DAA_header137 	DAA_header1():
38 		magic_number (0x3c0e53476d3ee36bllu),
39 		version (VERSION)
40 	{ }
41 	uint64_t magic_number, version;
42 };
43 
mode_str(int mode)44 inline const char* mode_str(int mode)
45 {
46 	static const char* mode_str[] = { 0, 0, "blastp", "blastx", "blastn" };
47 	return mode_str[mode];
48 }
49 
50 struct DAA_header2
51 {
DAA_header2DAA_header252 	DAA_header2():
53 		diamond_build(Const::build_version)
54 	{
55 		memset(block_type, 0, sizeof(block_type));
56 		memset(block_size, 0, sizeof(block_size));
57 	}
DAA_header2DAA_header258 	DAA_header2(uint64_t db_seqs,
59 			uint64_t db_letters,
60 			int32_t gap_open,
61 			int32_t gap_extend,
62 			int32_t reward,
63 			int32_t penalty,
64 			double k,
65 			double lambda,
66 			double evalue,
67 			const string &score_matrix,
68 			unsigned mode):
69 		diamond_build (Const::build_version),
70 		db_seqs (db_seqs),
71 		db_seqs_used (0),
72 		db_letters (db_letters),
73 		flags (0),
74 		query_records (0),
75 		mode ((int32_t)mode),
76 		gap_open (gap_open),
77 		gap_extend (gap_extend),
78 		reward (reward),
79 		penalty (penalty),
80 		reserved1 (0),
81 		reserved2 (0),
82 		reserved3 (0),
83 		k (k),
84 		lambda (lambda),
85 		evalue (evalue),
86 		reserved5 (0)
87 	{
88 		memset(block_type, 0, sizeof(block_type));
89 		memset(block_size, 0, sizeof(block_size));
90 		memset(this->score_matrix, 0, sizeof(this->score_matrix));
91 		strcpy(this->score_matrix, score_matrix.c_str());
92 	}
93 	typedef enum { empty = 0, alignments = 1, ref_names = 2, ref_lengths = 3 } Block_type;
94 	uint64_t diamond_build, db_seqs, db_seqs_used, db_letters, flags, query_records;
95 	int32_t mode, gap_open, gap_extend, reward, penalty, reserved1, reserved2, reserved3;
96 	double k, lambda, evalue, reserved5;
97 	char score_matrix[16];
98 	uint64_t block_size[256];
99 	char block_type[256];
100 };
101 
102 struct DAA_file
103 {
104 
DAA_fileDAA_file105 	DAA_file(const string& file_name):
106 		f_ (file_name),
107 		query_count_ (0)
108 	{
109 		f_.read(&h1_, 1);
110 		if(h1_.magic_number != DAA_header1().magic_number)
111 			throw std::runtime_error("Input file is not a DAA file.");
112 		if(h1_.version > DAA_header1::VERSION)
113 			throw std::runtime_error("DAA version requires later version of DIAMOND.");
114 		f_.read(&h2_, 1);
115 
116 		if(h2_.block_size[0] == 0)
117 			throw std::runtime_error("Invalid DAA file. DIAMOND run has probably not completed successfully.");
118 
119 		align_mode = Align_mode(h2_.mode);
120 		//ref_header.sequences = h2_.db_seqs;
121 
122 		f_.seek(sizeof(DAA_header1) + sizeof(DAA_header2) + (size_t)h2_.block_size[0]);
123 		string s;
124 		ref_name_.reserve((size_t)h2_.db_seqs_used);
125 		for(uint64_t i=0;i<h2_.db_seqs_used;++i) {
126 			f_ >> s;
127 			ref_name_.push_back(new string(s));
128 		}
129 		ref_len_.resize((size_t)h2_.db_seqs_used);
130 		f_.read(ref_len_.data(), (size_t)h2_.db_seqs_used);
131 
132 		f_.seek(sizeof(DAA_header1) + sizeof(DAA_header2));
133 	}
134 
~DAA_fileDAA_file135 	~DAA_file()
136 	{
137 		f_.close();
138 	}
139 
diamond_buildDAA_file140 	uint64_t diamond_build() const
141 	{ return h2_.diamond_build; }
142 
db_seqsDAA_file143 	uint64_t db_seqs() const
144 	{ return h2_.db_seqs; }
145 
db_seqs_usedDAA_file146 	uint64_t db_seqs_used() const
147 	{ return h2_.db_seqs_used; }
148 
db_lettersDAA_file149 	uint64_t db_letters() const
150 	{ return h2_.db_letters; }
151 
score_matrixDAA_file152 	const char* score_matrix() const
153 	{ return h2_.score_matrix; }
154 
gap_open_penaltyDAA_file155 	int32_t gap_open_penalty() const
156 	{ return h2_.gap_open; }
157 
gap_extension_penaltyDAA_file158 	int32_t gap_extension_penalty() const
159 	{ return h2_.gap_extend; }
160 
match_rewardDAA_file161 	int32_t match_reward() const
162 	{ return h2_.reward; }
163 
mismatch_penaltyDAA_file164 	int32_t mismatch_penalty() const
165 	{ return h2_.penalty; }
166 
query_recordsDAA_file167 	uint64_t query_records() const
168 	{ return h2_.query_records; }
169 
modeDAA_file170 	unsigned mode() const
171 	{ return (unsigned)h2_.mode; }
172 
ref_nameDAA_file173 	const string& ref_name(size_t i) const
174 	{ return ref_name_[i]; }
175 
ref_lenDAA_file176 	uint32_t ref_len(size_t i) const
177 	{ return ref_len_[i]; }
178 
lambdaDAA_file179 	double lambda() const
180 	{
181 		return h2_.lambda;
182 	}
183 
kappaDAA_file184 	double kappa() const
185 	{
186 		return h2_.k;
187 	}
188 
evalueDAA_file189 	double evalue() const
190 	{
191 		return h2_.evalue;
192 	}
193 
block_sizeDAA_file194 	size_t block_size(size_t i) const {
195 		return h2_.block_size[i];
196 	}
197 
ref_lenDAA_file198 	const vector<uint32_t>& ref_len() const {
199 		return ref_len_;
200 	}
201 
read_query_bufferDAA_file202 	bool read_query_buffer(BinaryBuffer &buf, size_t &query_num)
203 	{
204 		uint32_t size;
205 		f_.read(&size, 1);
206 		if(size == 0)
207 			return false;
208 		buf.clear();
209 		buf.resize(size);
210 		f_.read(buf.data(), size);
211 		query_num = query_count_++;
212 		return true;
213 	}
214 
215 private:
216 
217 	InputFile f_;
218 	size_t query_count_;
219 	DAA_header1 h1_;
220 	DAA_header2 h2_;
221 	PtrVector<string> ref_name_;
222 	vector<uint32_t> ref_len_;
223 
224 };
225 
226 #endif /* DAA_FILE_H_ */
227