1 /****
2 DIAMOND protein aligner
3 Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
4 
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 ****/
18 
19 #ifndef PACKED_TRANSCRIPT_H_
20 #define PACKED_TRANSCRIPT_H_
21 
22 #include "../util/binary_buffer.h"
23 #include "../basic/value.h"
24 #include "diagonal_segment.h"
25 #include "sequence.h"
26 
27 typedef enum { op_match = 0, op_insertion = 1, op_deletion = 2, op_substitution = 3, op_frameshift_forward = 4, op_frameshift_reverse = 5 } Edit_operation;
28 
29 struct Reversed {};
30 
31 struct Packed_operation
32 {
33 	enum { OP_BITS = 2, COUNT_BITS = 8 - OP_BITS, MAX_COUNT = (1 << COUNT_BITS) - 1 };
Packed_operationPacked_operation34 	Packed_operation(uint8_t code):
35 		code (code)
36 	{ }
Packed_operationPacked_operation37 	Packed_operation(Edit_operation op, unsigned count):
38 		code ((op<<COUNT_BITS) | count)
39 	{ }
Packed_operationPacked_operation40 	Packed_operation(Edit_operation op, Letter v) :
41 		code((op << COUNT_BITS) | (int)v)
42 	{ }
uint8_tPacked_operation43 	operator uint8_t() const
44 	{ return code; }
opPacked_operation45 	Edit_operation op() const
46 	{
47 		uint8_t o = code >> COUNT_BITS;
48 		switch (o) {
49 		case op_substitution:
50 			switch (letter()) {
51 			case (Letter)AMINO_ACID_COUNT:
52 				return op_frameshift_reverse;
53 			case (Letter)(AMINO_ACID_COUNT + 1) :
54 				return op_frameshift_forward;
55 			default:
56 				return op_substitution;
57 			}
58 		default:
59 			return (Edit_operation)o;
60 		}
61 	}
countPacked_operation62 	unsigned count() const
63 	{
64 		switch (op()) {
65 		case op_match:
66 		case op_insertion:
67 			return code & MAX_COUNT;
68 		default:
69 			return 1;
70 		}
71 	}
letterPacked_operation72 	Letter letter() const
73 	{
74 		return code&MAX_COUNT;
75 	}
terminatorPacked_operation76 	static Packed_operation terminator()
77 	{ return Packed_operation(op_match, 0u); }
frameshift_forwardPacked_operation78 	static Packed_operation frameshift_forward()
79 	{
80 		return Packed_operation(op_substitution, (unsigned)AMINO_ACID_COUNT + 1);
81 	}
frameshift_reversePacked_operation82 	static Packed_operation frameshift_reverse()
83 	{
84 		return Packed_operation(op_substitution, (unsigned)AMINO_ACID_COUNT);
85 	}
86 	uint8_t code;
87 };
88 
89 struct Combined_operation
90 {
91 	Edit_operation op;
92 	unsigned count;
93 	Letter letter;
94 };
95 
96 struct Packed_transcript
97 {
98 
99 	struct Const_iterator
100 	{
Const_iteratorPacked_transcript::Const_iterator101 		Const_iterator(const Packed_operation *op):
102 			ptr_ (op)
103 		{ gather(); }
goodPacked_transcript::Const_iterator104 		bool good() const
105 		{ return *ptr_ != Packed_operation::terminator(); }
106 		Const_iterator& operator++()
107 		{ ++ptr_; gather(); return *this; }
108 		const Combined_operation& operator*() const
109 		{ return op_; }
110 		const Combined_operation* operator->() const
111 		{ return &op_; }
112 	private:
gatherPacked_transcript::Const_iterator113 		void gather()
114 		{
115 			if(!good())
116 				return;
117 			op_.op = ptr_->op();
118 			if(op_.op == op_deletion || op_.op == op_substitution || op_.op == op_frameshift_forward || op_.op==op_frameshift_reverse) {
119 				op_.letter = ptr_->letter();
120 				op_.count = 1;
121 			} else {
122 				op_.count = 0;
123 				do {
124 					op_.count += ptr_->count();
125 					++ptr_;
126 				} while(good() && ptr_->op() == op_.op);
127 				--ptr_;
128 			}
129 		}
130 		const Packed_operation *ptr_;
131 		Combined_operation op_;
132 	};
133 
readPacked_transcript134 	void read(BinaryBuffer::Iterator &it)
135 	{
136 		data_.clear();
137 		uint8_t code;
138 		do {
139 			it >> code;
140 			data_.push_back(code);
141 		} while (code != Packed_operation::terminator());
142 	}
143 
beginPacked_transcript144 	Const_iterator begin() const
145 	{ return Const_iterator (data_.data()); }
146 
dataPacked_transcript147 	const vector<Packed_operation>& data() const
148 	{ return data_; }
149 
ptrPacked_transcript150 	const Packed_operation* ptr() const
151 	{
152 		return &data_[0];
153 	}
154 
push_backPacked_transcript155 	void push_back(Edit_operation op)
156 	{
157 		if (op == op_frameshift_forward)
158 			data_.push_back(Packed_operation::frameshift_forward());
159 		else if (op == op_frameshift_reverse)
160 			data_.push_back(Packed_operation::frameshift_reverse());
161 		else if (data_.empty() || data_.back().op() != op || (data_.back().op() == op && data_.back().count() == Packed_operation::MAX_COUNT))
162 			data_.push_back(Packed_operation(op, 1u));
163 		else
164 			++data_.back().code;
165 	}
166 
push_backPacked_transcript167 	void push_back(Edit_operation op, Letter l)
168 	{
169 		data_.push_back(Packed_operation(op, l));
170 	}
171 
push_backPacked_transcript172 	void push_back(Edit_operation op, unsigned count)
173 	{
174 		while (count > 0) {
175 			const unsigned n = std::min(count, (unsigned)Packed_operation::MAX_COUNT);
176 			data_.push_back(Packed_operation(op, n));
177 			count -= n;
178 		}
179 	}
180 
181 	void reverse(size_t begin = 0)
182 	{
183 		std::reverse(data_.begin() + begin, data_.end());
184 	}
185 
raw_lengthPacked_transcript186 	size_t raw_length() const
187 	{
188 		return data_.size();
189 	}
190 
push_terminatorPacked_transcript191 	void push_terminator()
192 	{
193 		data_.push_back(Packed_operation::terminator());
194 	}
195 
clearPacked_transcript196 	void clear()
197 	{
198 		data_.clear();
199 	}
200 
push_backPacked_transcript201 	void push_back(const Sequence &s, const Reversed&)
202 	{
203 		const int l = (int)s.length();
204 		for (int i = l - 1; i >= 0; --i)
205 			push_back(op_deletion, s[i]);
206 	}
207 
push_backPacked_transcript208 	void push_back(const Sequence &s)
209 	{
210 		const int l = (int)s.length();
211 		for (int i = 0; i < l; ++i)
212 			push_back(op_deletion, s[i]);
213 	}
214 
reservePacked_transcript215 	void reserve(size_t n)
216 	{
217 		data_.reserve(n);
218 	}
219 
220 private:
221 
222 	vector<Packed_operation> data_;
223 
224 	friend struct Hsp;
225 
226 };
227 
228 #endif /* PACKED_TRANSCRIPT_H_ */
229